feat(metrics): add scheduler related metrics (#2076)

Signed-off-by: Alexei Dodon <adodon@cisco.com>
This commit is contained in:
Alexei Dodon
2023-12-05 00:13:50 +02:00
committed by GitHub
parent 8bac653dd2
commit 2e733b3f4f
21 changed files with 566 additions and 69 deletions
+92 -1
View File
@@ -83,6 +83,53 @@ var (
},
[]string{"storageName", "lockType"},
)
// schedulerGenerators is a label-less counter exported as
// "scheduler_generators_total" under metricsNamespace. It is incremented by
// IncSchedulerGenerators.
schedulerGenerators = promauto.NewCounter( //nolint: gochecknoglobals
prometheus.CounterOpts{
Namespace: metricsNamespace,
Name: "scheduler_generators_total",
Help: "Total number of generators registered in scheduler",
},
)
// schedulerGeneratorsStatus is a gauge vector keyed by the "priority" and
// "state" labels. SetSchedulerGenerators writes one value per
// (priority, state) pair.
schedulerGeneratorsStatus = promauto.NewGaugeVec( //nolint: gochecknoglobals
prometheus.GaugeOpts{
Namespace: metricsNamespace,
Name: "scheduler_generators_status",
Help: "Scheduler generators by priority & state",
},
[]string{"priority", "state"},
)
// schedulerNumWorkers is a label-less gauge set by SetSchedulerNumWorkers.
// NOTE(review): the promlinter suppression below presumably exists because the
// "_total" suffix is conventionally reserved for counters — confirm.
schedulerNumWorkers = promauto.NewGauge( //nolint: gochecknoglobals
prometheus.GaugeOpts{ //nolint: promlinter
Namespace: metricsNamespace,
Name: "scheduler_workers_total",
Help: "Total number of available workers to perform scheduler tasks",
},
)
// schedulerWorkers is a gauge vector keyed by the "state" label.
// SetSchedulerWorkers writes one value per state.
schedulerWorkers = promauto.NewGaugeVec( //nolint: gochecknoglobals
prometheus.GaugeOpts{
Namespace: metricsNamespace,
Name: "scheduler_workers",
Help: "Scheduler workers state",
},
[]string{"state"},
)
// schedulerTasksQueue is a gauge vector keyed by the "priority" label.
// SetSchedulerTasksQueue writes one queue length per priority.
schedulerTasksQueue = promauto.NewGaugeVec( //nolint: gochecknoglobals
	prometheus.GaugeOpts{
		Namespace: metricsNamespace,
		Name:      "scheduler_tasksqueue_length",
		// Help is shown to users on the metrics endpoint; typo "pe" corrected to "be".
		Help: "Number of tasks waiting in the queue to be processed by scheduler workers",
	},
	[]string{"priority"},
)
// workersTasksDuration is a histogram vector keyed by the "name" label, with
// bucket boundaries taken from GetDefaultBuckets. ObserveWorkersTasksDuration
// records task durations into it, in seconds.
workersTasksDuration = promauto.NewHistogramVec( //nolint: gochecknoglobals
prometheus.HistogramOpts{
Namespace: metricsNamespace,
Name: "scheduler_workers_tasks_duration_seconds",
Help: "How long it takes for a worker to execute a task",
Buckets: GetDefaultBuckets(),
},
[]string{"name"},
)
)
type metricServer struct {
@@ -169,7 +216,7 @@ func IncDownloadCounter(ms MetricServer, repo string) {
}
func SetStorageUsage(ms MetricServer, rootDir, repo string) {
ms.SendMetric(func() {
ms.ForceSendMetric(func() {
dir := path.Join(rootDir, repo)
repoSize, err := GetDirSize(dir)
@@ -196,3 +243,47 @@ func ObserveStorageLockLatency(ms MetricServer, latency time.Duration, storageNa
storageLockLatency.WithLabelValues(storageName, lockType).Observe(latency.Seconds())
})
}
// IncSchedulerGenerators adds one to the schedulerGenerators counter.
// The increment is submitted through ms.ForceSendMetric.
func IncSchedulerGenerators(ms MetricServer) {
	increment := func() {
		schedulerGenerators.Inc()
	}

	ms.ForceSendMetric(increment)
}
// SetSchedulerGenerators updates the schedulerGeneratorsStatus gauge vector.
// gen is indexed first by priority and then by state; each leaf value is
// written to the gauge labelled with that (priority, state) pair.
// The update is submitted through ms.SendMetric.
func SetSchedulerGenerators(ms MetricServer, gen map[string]map[string]uint64) {
	publish := func() {
		for prio, byState := range gen {
			for st, count := range byState {
				gauge := schedulerGeneratorsStatus.WithLabelValues(prio, st)
				gauge.Set(float64(count))
			}
		}
	}

	ms.SendMetric(publish)
}
// SetSchedulerNumWorkers stores total in the schedulerNumWorkers gauge.
// The update is submitted through ms.SendMetric.
func SetSchedulerNumWorkers(ms MetricServer, total int) {
	publish := func() {
		count := float64(total)
		schedulerNumWorkers.Set(count)
	}

	ms.SendMetric(publish)
}
// SetSchedulerWorkers updates the schedulerWorkers gauge vector.
// w maps a state label to a count; each count is written to the gauge
// labelled with its state. The update is submitted through ms.SendMetric.
func SetSchedulerWorkers(ms MetricServer, w map[string]int) {
	publish := func() {
		for st, count := range w {
			gauge := schedulerWorkers.WithLabelValues(st)
			gauge.Set(float64(count))
		}
	}

	ms.SendMetric(publish)
}
// SetSchedulerTasksQueue updates the schedulerTasksQueue gauge vector.
// tq maps a priority label to a queue length; each length is written to the
// gauge labelled with its priority. The update is submitted through
// ms.SendMetric.
func SetSchedulerTasksQueue(ms MetricServer, tq map[string]int) {
	publish := func() {
		for prio, length := range tq {
			gauge := schedulerTasksQueue.WithLabelValues(prio)
			gauge.Set(float64(length))
		}
	}

	ms.SendMetric(publish)
}
// ObserveWorkersTasksDuration records duration, converted to seconds, in the
// workersTasksDuration histogram under the label taskName.
// The observation is submitted through ms.SendMetric.
func ObserveWorkersTasksDuration(ms MetricServer, taskName string, duration time.Duration) {
	record := func() {
		histogram := workersTasksDuration.WithLabelValues(taskName)
		histogram.Observe(duration.Seconds())
	}

	ms.SendMetric(record)
}
+117 -14
View File
@@ -18,17 +18,23 @@ import (
const (
metricsNamespace = "zot"
// Counters.
httpConnRequests = metricsNamespace + ".http.requests"
repoDownloads = metricsNamespace + ".repo.downloads"
repoUploads = metricsNamespace + ".repo.uploads"
httpConnRequests = metricsNamespace + ".http.requests"
repoDownloads = metricsNamespace + ".repo.downloads"
repoUploads = metricsNamespace + ".repo.uploads"
schedulerGenerators = metricsNamespace + ".scheduler.generators"
// Gauge.
repoStorageBytes = metricsNamespace + ".repo.storage.bytes"
serverInfo = metricsNamespace + ".info"
repoStorageBytes = metricsNamespace + ".repo.storage.bytes"
serverInfo = metricsNamespace + ".info"
schedulerNumWorkers = metricsNamespace + ".scheduler.workers.total"
schedulerWorkers = metricsNamespace + ".scheduler.workers"
schedulerGeneratorsStatus = metricsNamespace + ".scheduler.generators.status"
schedulerTasksQueue = metricsNamespace + ".scheduler.tasksqueue.length"
// Summary.
httpRepoLatencySeconds = metricsNamespace + ".http.repo.latency.seconds"
// Histogram.
httpMethodLatencySeconds = metricsNamespace + ".http.method.latency.seconds"
storageLockLatencySeconds = metricsNamespace + ".storage.lock.latency.seconds"
workersTasksDuration = metricsNamespace + ".scheduler.workers.tasks.duration.seconds"
metricsScrapeTimeout = 2 * time.Minute
metricsScrapeCheckInterval = 30 * time.Second
@@ -39,7 +45,7 @@ type metricServer struct {
lastCheck time.Time
reqChan chan interface{}
cache *MetricsInfo
cacheChan chan *MetricsInfo
cacheChan chan MetricsCopy
bucketsF2S map[float64]string // float64 to string conversion of buckets label
log log.Logger
lock *sync.RWMutex
@@ -51,6 +57,12 @@ type MetricsInfo struct {
Summaries []*SummaryValue
Histograms []*HistogramValue
}
// MetricsCopy groups the same four metric categories as MetricsInfo, but its
// slices hold the metric values themselves rather than pointers to them.
// It is the element type of metricServer.cacheChan.
type MetricsCopy struct {
Counters []CounterValue
Gauges []GaugeValue
Summaries []SummaryValue
Histograms []HistogramValue
}
// CounterValue stores info about a metric that is incremented over time,
// such as the number of requests to an HTTP endpoint.
@@ -118,7 +130,7 @@ func (ms *metricServer) ReceiveMetrics() interface{} {
ms.enabled = true
}
ms.lock.Unlock()
ms.cacheChan <- &MetricsInfo{}
ms.cacheChan <- MetricsCopy{}
return <-ms.cacheChan
}
@@ -145,7 +157,29 @@ func (ms *metricServer) Run() {
select {
case <-ms.cacheChan:
ms.lastCheck = time.Now()
ms.cacheChan <- ms.cache
// make a copy of cache values to prevent data race
metrics := MetricsCopy{
Counters: make([]CounterValue, len(ms.cache.Counters)),
Gauges: make([]GaugeValue, len(ms.cache.Gauges)),
Summaries: make([]SummaryValue, len(ms.cache.Summaries)),
Histograms: make([]HistogramValue, len(ms.cache.Histograms)),
}
for i, cv := range ms.cache.Counters {
metrics.Counters[i] = *cv
}
for i, gv := range ms.cache.Gauges {
metrics.Gauges[i] = *gv
}
for i, sv := range ms.cache.Summaries {
metrics.Summaries[i] = *sv
}
for i, hv := range ms.cache.Histograms {
metrics.Histograms[i] = *hv
}
ms.cacheChan <- metrics
case m := <-ms.reqChan:
switch v := m.(type) {
case CounterValue:
@@ -200,7 +234,7 @@ func NewMetricsServer(enabled bool, log log.Logger) MetricServer {
ms := &metricServer{
enabled: enabled,
reqChan: make(chan interface{}),
cacheChan: make(chan *MetricsInfo),
cacheChan: make(chan MetricsCopy),
cache: mi,
bucketsF2S: bucketsFloat2String,
log: log,
@@ -215,16 +249,21 @@ func NewMetricsServer(enabled bool, log log.Logger) MetricServer {
// contains a map with key=CounterName and value=CounterLabels.
func GetCounters() map[string][]string {
return map[string][]string{
httpConnRequests: {"method", "code"},
repoDownloads: {"repo"},
repoUploads: {"repo"},
httpConnRequests: {"method", "code"},
repoDownloads: {"repo"},
repoUploads: {"repo"},
schedulerGenerators: {},
}
}
func GetGauges() map[string][]string {
return map[string][]string{
repoStorageBytes: {"repo"},
serverInfo: {"commit", "binaryType", "goVersion", "version"},
repoStorageBytes: {"repo"},
serverInfo: {"commit", "binaryType", "goVersion", "version"},
schedulerNumWorkers: {},
schedulerGeneratorsStatus: {"priority", "state"},
schedulerTasksQueue: {"priority"},
schedulerWorkers: {"state"},
}
}
@@ -238,6 +277,7 @@ func GetHistograms() map[string][]string {
return map[string][]string{
httpMethodLatencySeconds: {"method"},
storageLockLatencySeconds: {"storageName", "lockType"},
workersTasksDuration: {"name"},
}
}
@@ -533,3 +573,66 @@ func GetBuckets(metricName string) []float64 {
return GetDefaultBuckets()
}
}
// SetSchedulerNumWorkers reports workers as the value of the
// schedulerNumWorkers gauge. The gauge carries no labels and is submitted
// through ms.ForceSendMetric.
func SetSchedulerNumWorkers(ms MetricServer, workers int) {
	ms.ForceSendMetric(GaugeValue{
		Name:  schedulerNumWorkers,
		Value: float64(workers),
	})
}
// IncSchedulerGenerators submits a CounterValue named schedulerGenerators
// through ms.ForceSendMetric. Only the Name field is populated.
func IncSchedulerGenerators(ms MetricServer) {
	ms.ForceSendMetric(CounterValue{Name: schedulerGenerators})
}
// ObserveWorkersTasksDuration submits a HistogramValue for the
// workersTasksDuration metric, labelled with taskName, through ms.SendMetric.
func ObserveWorkersTasksDuration(ms MetricServer, taskName string, duration time.Duration) {
	elapsed := duration.Seconds()

	ms.SendMetric(HistogramValue{
		Name:        workersTasksDuration,
		Sum:         elapsed, // Sum is used as a temporary carrier for the observed latency
		LabelNames:  []string{"name"},
		LabelValues: []string{taskName},
	})
}
// SetSchedulerGenerators submits one schedulerGeneratorsStatus GaugeValue per
// (priority, state) pair found in gen, each through its own ms.SendMetric call.
// gen is indexed first by priority and then by state.
func SetSchedulerGenerators(ms MetricServer, gen map[string]map[string]uint64) {
	for prio, byState := range gen {
		for st, count := range byState {
			var gauge GaugeValue

			gauge.Name = schedulerGeneratorsStatus
			gauge.Value = float64(count)
			gauge.LabelNames = []string{"priority", "state"}
			gauge.LabelValues = []string{prio, st}

			ms.SendMetric(gauge)
		}
	}
}
// SetSchedulerTasksQueue submits one schedulerTasksQueue GaugeValue per entry
// of tq, each through its own ms.SendMetric call. The map key becomes the
// "priority" label value and the map value becomes the gauge value.
func SetSchedulerTasksQueue(ms MetricServer, tq map[string]int) {
	for prio, length := range tq {
		var gauge GaugeValue

		gauge.Name = schedulerTasksQueue
		gauge.Value = float64(length)
		gauge.LabelNames = []string{"priority"}
		gauge.LabelValues = []string{prio}

		ms.SendMetric(gauge)
	}
}
// SetSchedulerWorkers submits one schedulerWorkers GaugeValue per entry of w,
// each through its own ms.SendMetric call. The map key becomes the "state"
// label value and the map value becomes the gauge value.
func SetSchedulerWorkers(ms MetricServer, w map[string]int) {
	for st, count := range w {
		var gauge GaugeValue

		gauge.Name = schedulerWorkers
		gauge.Value = float64(count)
		gauge.LabelNames = []string{"state"}
		gauge.LabelValues = []string{st}

		ms.SendMetric(gauge)
	}
}
+2 -1
View File
@@ -461,7 +461,8 @@ func TestPopulateStorageMetrics(t *testing.T) {
err = WriteImageToFileSystem(CreateDefaultImage(), "busybox", "0.0.1", srcStorageCtlr)
So(err, ShouldBeNil)
sch := scheduler.NewScheduler(conf, ctlr.Log)
metrics := monitoring.NewMetricsServer(true, ctlr.Log)
sch := scheduler.NewScheduler(conf, metrics, ctlr.Log)
ctx, cancel := context.WithCancel(context.Background())
sch.RunScheduler(ctx)