mirror of
https://github.com/project-zot/zot.git
synced 2026-06-16 20:38:08 +08:00
feat(metrics): add scheduler related metrics (#2076)
Signed-off-by: Alexei Dodon <adodon@cisco.com>
This commit is contained in:
@@ -83,6 +83,53 @@ var (
|
||||
},
|
||||
[]string{"storageName", "lockType"},
|
||||
)
|
||||
// schedulerGenerators is a Prometheus counter published as
// "scheduler_generators_total" under metricsNamespace. It is incremented by
// IncSchedulerGenerators.
schedulerGenerators = promauto.NewCounter( //nolint: gochecknoglobals
|
||||
prometheus.CounterOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "scheduler_generators_total",
|
||||
Help: "Total number of generators registered in scheduler",
|
||||
},
|
||||
)
|
||||
// schedulerGeneratorsStatus is a Prometheus gauge vector published as
// "scheduler_generators_status" under metricsNamespace, with one series per
// ("priority", "state") label pair. SetSchedulerGenerators writes its values.
schedulerGeneratorsStatus = promauto.NewGaugeVec( //nolint: gochecknoglobals
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "scheduler_generators_status",
|
||||
Help: "Scheduler generators by priority & state",
|
||||
},
|
||||
[]string{"priority", "state"},
|
||||
)
|
||||
// schedulerNumWorkers is an unlabelled Prometheus gauge published as
// "scheduler_workers_total" under metricsNamespace. SetSchedulerNumWorkers
// writes its value.
// NOTE(review): promlinter is presumably silenced because the "_total" suffix
// is used on a gauge rather than a counter — confirm.
schedulerNumWorkers = promauto.NewGauge( //nolint: gochecknoglobals
|
||||
prometheus.GaugeOpts{ //nolint: promlinter
|
||||
Namespace: metricsNamespace,
|
||||
Name: "scheduler_workers_total",
|
||||
Help: "Total number of available workers to perform scheduler tasks",
|
||||
},
|
||||
)
|
||||
// schedulerWorkers is a Prometheus gauge vector published as
// "scheduler_workers" under metricsNamespace, with one series per "state"
// label value. SetSchedulerWorkers writes its values.
schedulerWorkers = promauto.NewGaugeVec( //nolint: gochecknoglobals
|
||||
prometheus.GaugeOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "scheduler_workers",
|
||||
Help: "Scheduler workers state",
|
||||
},
|
||||
[]string{"state"},
|
||||
)
|
||||
// schedulerTasksQueue is a Prometheus gauge vector published as
// "scheduler_tasksqueue_length" under metricsNamespace, with one series per
// "priority" label value. SetSchedulerTasksQueue writes its values.
schedulerTasksQueue = promauto.NewGaugeVec( //nolint: gochecknoglobals
	prometheus.GaugeOpts{
		Namespace: metricsNamespace,
		Name:      "scheduler_tasksqueue_length",
		// Help text is exposed verbatim on the metrics endpoint; typo "pe" corrected to "be".
		Help: "Number of tasks waiting in the queue to be processed by scheduler workers",
	},
	[]string{"priority"},
)
|
||||
// workersTasksDuration is a Prometheus histogram vector published as
// "scheduler_workers_tasks_duration_seconds" under metricsNamespace, labelled
// by task "name" and bucketed with GetDefaultBuckets(). Observations are
// recorded in seconds by ObserveWorkersTasksDuration.
workersTasksDuration = promauto.NewHistogramVec( //nolint: gochecknoglobals
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "scheduler_workers_tasks_duration_seconds",
|
||||
Help: "How long it takes for a worker to execute a task",
|
||||
Buckets: GetDefaultBuckets(),
|
||||
},
|
||||
[]string{"name"},
|
||||
)
|
||||
)
|
||||
|
||||
type metricServer struct {
|
||||
@@ -169,7 +216,7 @@ func IncDownloadCounter(ms MetricServer, repo string) {
|
||||
}
|
||||
|
||||
func SetStorageUsage(ms MetricServer, rootDir, repo string) {
|
||||
ms.SendMetric(func() {
|
||||
ms.ForceSendMetric(func() {
|
||||
dir := path.Join(rootDir, repo)
|
||||
repoSize, err := GetDirSize(dir)
|
||||
|
||||
@@ -196,3 +243,47 @@ func ObserveStorageLockLatency(ms MetricServer, latency time.Duration, storageNa
|
||||
storageLockLatency.WithLabelValues(storageName, lockType).Observe(latency.Seconds())
|
||||
})
|
||||
}
|
||||
|
||||
func IncSchedulerGenerators(ms MetricServer) {
|
||||
ms.ForceSendMetric(func() {
|
||||
schedulerGenerators.Inc()
|
||||
})
|
||||
}
|
||||
|
||||
func SetSchedulerGenerators(ms MetricServer, gen map[string]map[string]uint64) {
|
||||
ms.SendMetric(func() {
|
||||
for priority, states := range gen {
|
||||
for state, value := range states {
|
||||
schedulerGeneratorsStatus.WithLabelValues(priority, state).Set(float64(value))
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func SetSchedulerNumWorkers(ms MetricServer, total int) {
|
||||
ms.SendMetric(func() {
|
||||
schedulerNumWorkers.Set(float64(total))
|
||||
})
|
||||
}
|
||||
|
||||
func SetSchedulerWorkers(ms MetricServer, w map[string]int) {
|
||||
ms.SendMetric(func() {
|
||||
for state, value := range w {
|
||||
schedulerWorkers.WithLabelValues(state).Set(float64(value))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func SetSchedulerTasksQueue(ms MetricServer, tq map[string]int) {
|
||||
ms.SendMetric(func() {
|
||||
for priority, value := range tq {
|
||||
schedulerTasksQueue.WithLabelValues(priority).Set(float64(value))
|
||||
}
|
||||
})
|
||||
}
|
||||
|
||||
func ObserveWorkersTasksDuration(ms MetricServer, taskName string, duration time.Duration) {
|
||||
ms.SendMetric(func() {
|
||||
workersTasksDuration.WithLabelValues(taskName).Observe(duration.Seconds())
|
||||
})
|
||||
}
|
||||
|
||||
@@ -18,17 +18,23 @@ import (
|
||||
const (
|
||||
metricsNamespace = "zot"
|
||||
// Counters.
|
||||
httpConnRequests = metricsNamespace + ".http.requests"
|
||||
repoDownloads = metricsNamespace + ".repo.downloads"
|
||||
repoUploads = metricsNamespace + ".repo.uploads"
|
||||
httpConnRequests = metricsNamespace + ".http.requests"
|
||||
repoDownloads = metricsNamespace + ".repo.downloads"
|
||||
repoUploads = metricsNamespace + ".repo.uploads"
|
||||
schedulerGenerators = metricsNamespace + ".scheduler.generators"
|
||||
// Gauge.
|
||||
repoStorageBytes = metricsNamespace + ".repo.storage.bytes"
|
||||
serverInfo = metricsNamespace + ".info"
|
||||
repoStorageBytes = metricsNamespace + ".repo.storage.bytes"
|
||||
serverInfo = metricsNamespace + ".info"
|
||||
schedulerNumWorkers = metricsNamespace + ".scheduler.workers.total"
|
||||
schedulerWorkers = metricsNamespace + ".scheduler.workers"
|
||||
schedulerGeneratorsStatus = metricsNamespace + ".scheduler.generators.status"
|
||||
schedulerTasksQueue = metricsNamespace + ".scheduler.tasksqueue.length"
|
||||
// Summary.
|
||||
httpRepoLatencySeconds = metricsNamespace + ".http.repo.latency.seconds"
|
||||
// Histogram.
|
||||
httpMethodLatencySeconds = metricsNamespace + ".http.method.latency.seconds"
|
||||
storageLockLatencySeconds = metricsNamespace + ".storage.lock.latency.seconds"
|
||||
workersTasksDuration = metricsNamespace + ".scheduler.workers.tasks.duration.seconds"
|
||||
|
||||
metricsScrapeTimeout = 2 * time.Minute
|
||||
metricsScrapeCheckInterval = 30 * time.Second
|
||||
@@ -39,7 +45,7 @@ type metricServer struct {
|
||||
lastCheck time.Time
|
||||
reqChan chan interface{}
|
||||
cache *MetricsInfo
|
||||
cacheChan chan *MetricsInfo
|
||||
cacheChan chan MetricsCopy
|
||||
bucketsF2S map[float64]string // float64 to string conversion of buckets label
|
||||
log log.Logger
|
||||
lock *sync.RWMutex
|
||||
@@ -51,6 +57,12 @@ type MetricsInfo struct {
|
||||
Summaries []*SummaryValue
|
||||
Histograms []*HistogramValue
|
||||
}
|
||||
// MetricsCopy holds metric values by value rather than by pointer (compare
// MetricsInfo, whose slices hold pointers). Run fills one from the cache and
// sends it on cacheChan so that the receiver gets a copy of the cached values,
// which prevents a data race on the cache.
type MetricsCopy struct {
|
||||
Counters []CounterValue
|
||||
Gauges []GaugeValue
|
||||
Summaries []SummaryValue
|
||||
Histograms []HistogramValue
|
||||
}
|
||||
|
||||
// CounterValue stores info about a metric that is incremented over time,
|
||||
// such as the number of requests to an HTTP endpoint.
|
||||
@@ -118,7 +130,7 @@ func (ms *metricServer) ReceiveMetrics() interface{} {
|
||||
ms.enabled = true
|
||||
}
|
||||
ms.lock.Unlock()
|
||||
ms.cacheChan <- &MetricsInfo{}
|
||||
ms.cacheChan <- MetricsCopy{}
|
||||
|
||||
return <-ms.cacheChan
|
||||
}
|
||||
@@ -145,7 +157,29 @@ func (ms *metricServer) Run() {
|
||||
select {
|
||||
case <-ms.cacheChan:
|
||||
ms.lastCheck = time.Now()
|
||||
ms.cacheChan <- ms.cache
|
||||
// make a copy of cache values to prevent data race
|
||||
metrics := MetricsCopy{
|
||||
Counters: make([]CounterValue, len(ms.cache.Counters)),
|
||||
Gauges: make([]GaugeValue, len(ms.cache.Gauges)),
|
||||
Summaries: make([]SummaryValue, len(ms.cache.Summaries)),
|
||||
Histograms: make([]HistogramValue, len(ms.cache.Histograms)),
|
||||
}
|
||||
for i, cv := range ms.cache.Counters {
|
||||
metrics.Counters[i] = *cv
|
||||
}
|
||||
|
||||
for i, gv := range ms.cache.Gauges {
|
||||
metrics.Gauges[i] = *gv
|
||||
}
|
||||
|
||||
for i, sv := range ms.cache.Summaries {
|
||||
metrics.Summaries[i] = *sv
|
||||
}
|
||||
|
||||
for i, hv := range ms.cache.Histograms {
|
||||
metrics.Histograms[i] = *hv
|
||||
}
|
||||
ms.cacheChan <- metrics
|
||||
case m := <-ms.reqChan:
|
||||
switch v := m.(type) {
|
||||
case CounterValue:
|
||||
@@ -200,7 +234,7 @@ func NewMetricsServer(enabled bool, log log.Logger) MetricServer {
|
||||
ms := &metricServer{
|
||||
enabled: enabled,
|
||||
reqChan: make(chan interface{}),
|
||||
cacheChan: make(chan *MetricsInfo),
|
||||
cacheChan: make(chan MetricsCopy),
|
||||
cache: mi,
|
||||
bucketsF2S: bucketsFloat2String,
|
||||
log: log,
|
||||
@@ -215,16 +249,21 @@ func NewMetricsServer(enabled bool, log log.Logger) MetricServer {
|
||||
// contains a map with key=CounterName and value=CounterLabels.
|
||||
func GetCounters() map[string][]string {
|
||||
return map[string][]string{
|
||||
httpConnRequests: {"method", "code"},
|
||||
repoDownloads: {"repo"},
|
||||
repoUploads: {"repo"},
|
||||
httpConnRequests: {"method", "code"},
|
||||
repoDownloads: {"repo"},
|
||||
repoUploads: {"repo"},
|
||||
schedulerGenerators: {},
|
||||
}
|
||||
}
|
||||
|
||||
func GetGauges() map[string][]string {
|
||||
return map[string][]string{
|
||||
repoStorageBytes: {"repo"},
|
||||
serverInfo: {"commit", "binaryType", "goVersion", "version"},
|
||||
repoStorageBytes: {"repo"},
|
||||
serverInfo: {"commit", "binaryType", "goVersion", "version"},
|
||||
schedulerNumWorkers: {},
|
||||
schedulerGeneratorsStatus: {"priority", "state"},
|
||||
schedulerTasksQueue: {"priority"},
|
||||
schedulerWorkers: {"state"},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -238,6 +277,7 @@ func GetHistograms() map[string][]string {
|
||||
return map[string][]string{
|
||||
httpMethodLatencySeconds: {"method"},
|
||||
storageLockLatencySeconds: {"storageName", "lockType"},
|
||||
workersTasksDuration: {"name"},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -533,3 +573,66 @@ func GetBuckets(metricName string) []float64 {
|
||||
return GetDefaultBuckets()
|
||||
}
|
||||
}
|
||||
|
||||
func SetSchedulerNumWorkers(ms MetricServer, workers int) {
|
||||
numWorkers := GaugeValue{
|
||||
Name: schedulerNumWorkers,
|
||||
Value: float64(workers),
|
||||
}
|
||||
ms.ForceSendMetric(numWorkers)
|
||||
}
|
||||
|
||||
func IncSchedulerGenerators(ms MetricServer) {
|
||||
genCounter := CounterValue{
|
||||
Name: schedulerGenerators,
|
||||
}
|
||||
ms.ForceSendMetric(genCounter)
|
||||
}
|
||||
|
||||
func ObserveWorkersTasksDuration(ms MetricServer, taskName string, duration time.Duration) {
|
||||
h := HistogramValue{
|
||||
Name: workersTasksDuration,
|
||||
Sum: duration.Seconds(), // convenient temporary store for Histogram latency value
|
||||
LabelNames: []string{"name"},
|
||||
LabelValues: []string{taskName},
|
||||
}
|
||||
ms.SendMetric(h)
|
||||
}
|
||||
|
||||
func SetSchedulerGenerators(ms MetricServer, gen map[string]map[string]uint64) {
|
||||
for priority, states := range gen {
|
||||
for state, value := range states {
|
||||
generator := GaugeValue{
|
||||
Name: schedulerGeneratorsStatus,
|
||||
Value: float64(value),
|
||||
LabelNames: []string{"priority", "state"},
|
||||
LabelValues: []string{priority, state},
|
||||
}
|
||||
ms.SendMetric(generator)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func SetSchedulerTasksQueue(ms MetricServer, tq map[string]int) {
|
||||
for priority, value := range tq {
|
||||
tasks := GaugeValue{
|
||||
Name: schedulerTasksQueue,
|
||||
Value: float64(value),
|
||||
LabelNames: []string{"priority"},
|
||||
LabelValues: []string{priority},
|
||||
}
|
||||
ms.SendMetric(tasks)
|
||||
}
|
||||
}
|
||||
|
||||
func SetSchedulerWorkers(ms MetricServer, w map[string]int) {
|
||||
for state, value := range w {
|
||||
workers := GaugeValue{
|
||||
Name: schedulerWorkers,
|
||||
Value: float64(value),
|
||||
LabelNames: []string{"state"},
|
||||
LabelValues: []string{state},
|
||||
}
|
||||
ms.SendMetric(workers)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -461,7 +461,8 @@ func TestPopulateStorageMetrics(t *testing.T) {
|
||||
err = WriteImageToFileSystem(CreateDefaultImage(), "busybox", "0.0.1", srcStorageCtlr)
|
||||
So(err, ShouldBeNil)
|
||||
|
||||
sch := scheduler.NewScheduler(conf, ctlr.Log)
|
||||
metrics := monitoring.NewMetricsServer(true, ctlr.Log)
|
||||
sch := scheduler.NewScheduler(conf, metrics, ctlr.Log)
|
||||
ctx, cancel := context.WithCancel(context.Background())
|
||||
sch.RunScheduler(ctx)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user