mirror of
https://github.com/project-zot/zot.git
synced 2026-06-18 05:28:07 +08:00
6c1f1bdd40
* feat(metrics): add Prometheus GC metrics Track garbage collection activity with three new metrics: - zot_gc_runs_total (counter, label: error) — GC run count - zot_gc_duration_seconds (summary) — GC run duration - zot_gc_deleted_total (counter, label: type) — items deleted by type: blob, manifest, upload MetricServer is added to GarbageCollect and wired through all callers (controller, verify-feature retention, tests). Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(test): add missing metrics var in GCS GC tests TestGCSGarbageCollectImageIndex and TestGCSGarbageCollectChainedImageIndexes were missing the metrics variable required by NewGarbageCollect after the MetricServer parameter was added. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(test): add defer metrics.Stop() in GC tests Prevent goroutine/port leaks by stopping MetricsServer in storage_test.go (3 functions) and gcs_test.go (also add missing metrics declaration in TestGCSGarbageCollectImageManifest). Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(test): cover `CleanRepo` error path Add test that exercises the error branch in `CleanRepo` where `cleanRepo` fails, covering the metrics calls and log lines flagged by Codecov. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test: Cover GC error paths for codecov Add three tests in gc_internal_test.go to cover previously untested error branches in `removeBlobUploads` and `removeUnreferencedBlobs`: `ListBlobUploads` failure, `addIndexBlobsToReferences` failure, and `PathNotFoundError` from `GetAllBlobs`. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test(gc): cover remaining error paths Cover `StatBlobUpload`, `digest.Validate()`, `isBlobOlderThan`, and `CleanupRepo` error branches in `removeBlobUploads` and `removeUnreferencedBlobs`. `removeUnreferencedBlobs` now at 100% coverage, `removeBlobUploads` from 78.3% to 91.3%. 
Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test: cover `sanityChecks` label name mismatch Try to avoid -0.09% coverage regression on `minimal.go` by exercising the uncovered branch in `sanityChecks` where label names have correct count but wrong values. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test(gc): exercise real GC path in metrics test TestGCMetrics was calling metric helpers directly instead of running actual garbage collection, so it couldn't catch wiring regressions where `CleanRepo` stops recording metrics. Now uploads an orphaned blob and runs `gc.CleanRepo` end-to-end, verifying metrics appear on the Prometheus endpoint. Suggestion from Copilot: https://github.com/project-zot/zot/pull/3863#discussion_r3129324719 Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(gc): skip deletion metrics when DryRun is enabled https://github.com/project-zot/zot/pull/3863#discussion_r3129324684 Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(test): stop leaked MetricsServer goroutines in GCS tests https://github.com/project-zot/zot/pull/3863#discussion_r3129324657 Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * refactor(test): drop unnecessary zlog import alias Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(monitoring): expose metric types outside build tag `MetricsCopy` and related types were only visible under `\!metrics`, causing a typecheck failure when golangci-lint runs with `-tags metrics`. Moving the type definitions to `common.go` makes them unconditionally available. 
Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(monitoring): remove extra blank line for gci Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test(gc): cover both dry-run and real deletion metrics And fix issue with build tag with metrics Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * Satisfy testpackage linter for gc metrics test The `testpackage` linter allows `package gc` only in files named `*_internal_test.go`; rename to follow that convention. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> --------- Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>
340 lines
8.9 KiB
Go
340 lines
8.9 KiB
Go
//go:build metrics
|
|
|
|
package monitoring
|
|
|
|
import (
|
|
"path"
|
|
"strconv"
|
|
"time"
|
|
|
|
"github.com/prometheus/client_golang/prometheus"
|
|
"github.com/prometheus/client_golang/prometheus/promauto"
|
|
|
|
"zotregistry.dev/zot/v2/errors"
|
|
"zotregistry.dev/zot/v2/pkg/log"
|
|
)
|
|
|
|
// metricsNamespace is the Namespace set on every Prometheus collector declared in this file.
const metricsNamespace = "zot"
|
|
|
|
var (
|
|
httpConnRequests = promauto.NewCounterVec( //nolint: gochecknoglobals
|
|
prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "http_requests_total",
|
|
Help: "Total number of http request in zot",
|
|
},
|
|
[]string{"method", "code"},
|
|
)
|
|
httpRepoLatency = promauto.NewSummaryVec( //nolint: gochecknoglobals
|
|
prometheus.SummaryOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "http_repo_latency_seconds",
|
|
Help: "Latency of serving HTTP requests",
|
|
},
|
|
[]string{"repo"},
|
|
)
|
|
httpMethodLatency = promauto.NewHistogramVec( //nolint: gochecknoglobals
|
|
prometheus.HistogramOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "http_method_latency_seconds",
|
|
Help: "Latency of serving HTTP requests",
|
|
Buckets: GetDefaultBuckets(),
|
|
},
|
|
[]string{"method"},
|
|
)
|
|
repoStorageBytes = promauto.NewGaugeVec( //nolint: gochecknoglobals
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "repo_storage_bytes",
|
|
Help: "Storage used per zot repo",
|
|
},
|
|
[]string{"repo"},
|
|
)
|
|
uploadCounter = promauto.NewCounterVec( //nolint: gochecknoglobals
|
|
prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "repo_uploads_total",
|
|
Help: "Total number times an image was uploaded",
|
|
},
|
|
[]string{"repo"},
|
|
)
|
|
downloadCounter = promauto.NewCounterVec( //nolint: gochecknoglobals
|
|
prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "repo_downloads_total",
|
|
Help: "Total number times an image was downloaded",
|
|
},
|
|
[]string{"repo"},
|
|
)
|
|
serverInfo = promauto.NewGaugeVec( //nolint: gochecknoglobals
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "info",
|
|
Help: "Server general information",
|
|
},
|
|
[]string{"commit", "binaryType", "goVersion", "version"},
|
|
)
|
|
storageLockLatency = promauto.NewHistogramVec( //nolint: gochecknoglobals
|
|
prometheus.HistogramOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "storage_lock_latency_seconds",
|
|
Help: "Latency of serving HTTP requests",
|
|
Buckets: GetStorageLatencyBuckets(),
|
|
},
|
|
[]string{"storageName", "lockType"},
|
|
)
|
|
schedulerGenerators = promauto.NewCounter( //nolint: gochecknoglobals
|
|
prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "scheduler_generators_total",
|
|
Help: "Total number of generators registered in scheduler",
|
|
},
|
|
)
|
|
schedulerGeneratorsStatus = promauto.NewGaugeVec( //nolint: gochecknoglobals
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "scheduler_generators_status",
|
|
Help: "Scheduler generators by priority & state",
|
|
},
|
|
[]string{"priority", "state"},
|
|
)
|
|
schedulerNumWorkers = promauto.NewGauge( //nolint: gochecknoglobals
|
|
prometheus.GaugeOpts{ //nolint: promlinter
|
|
Namespace: metricsNamespace,
|
|
Name: "scheduler_workers_total",
|
|
Help: "Total number of available workers to perform scheduler tasks",
|
|
},
|
|
)
|
|
schedulerWorkers = promauto.NewGaugeVec( //nolint: gochecknoglobals
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "scheduler_workers",
|
|
Help: "Scheduler workers state",
|
|
},
|
|
[]string{"state"},
|
|
)
|
|
schedulerTasksQueue = promauto.NewGaugeVec( //nolint: gochecknoglobals
|
|
prometheus.GaugeOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "scheduler_tasksqueue_length",
|
|
Help: "Number of tasks waiting in the queue to pe processed by scheduler workers",
|
|
},
|
|
[]string{"priority"},
|
|
)
|
|
workersTasksDuration = promauto.NewHistogramVec( //nolint: gochecknoglobals
|
|
prometheus.HistogramOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "scheduler_workers_tasks_duration_seconds",
|
|
Help: "How long it takes for a worker to execute a task",
|
|
Buckets: GetDefaultBuckets(),
|
|
},
|
|
[]string{"name"},
|
|
)
|
|
gcRuns = promauto.NewCounterVec( //nolint: gochecknoglobals
|
|
prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "gc_runs_total",
|
|
Help: "Total number of garbage collection runs",
|
|
},
|
|
[]string{"error"},
|
|
)
|
|
gcDuration = promauto.NewSummary( //nolint: gochecknoglobals
|
|
prometheus.SummaryOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "gc_duration_seconds",
|
|
Help: "Duration of garbage collection runs",
|
|
},
|
|
)
|
|
gcDeleted = promauto.NewCounterVec( //nolint: gochecknoglobals
|
|
prometheus.CounterOpts{
|
|
Namespace: metricsNamespace,
|
|
Name: "gc_deleted_total",
|
|
Help: "Total number of items deleted by garbage collection",
|
|
},
|
|
[]string{"type"},
|
|
)
|
|
)
|
|
|
|
type metricServer struct {
|
|
enabled bool
|
|
log log.Logger
|
|
}
|
|
|
|
// Stop gracefully shuts down the metrics server (no-op for this implementation).
|
|
func (ms *metricServer) Stop() {
|
|
// This is a no-op implementation for the disabled metrics server
|
|
}
|
|
|
|
// GetDefaultBuckets returns a freshly allocated slice with the histogram
// bucket upper bounds, in seconds, used by the HTTP method latency and
// scheduler task duration histograms.
func GetDefaultBuckets() []float64 {
	buckets := []float64{
		0.05,
		0.5,
		1,
		5,
		30,
		60,
		600,
	}

	return buckets
}
|
|
|
|
// GetStorageLatencyBuckets returns a freshly allocated slice with the
// histogram bucket upper bounds, in seconds, used by the storage lock latency
// histogram.
func GetStorageLatencyBuckets() []float64 {
	buckets := []float64{
		0.001,
		0.01,
		0.1,
		1,
		5,
		10,
		15,
		30,
		60,
	}

	return buckets
}
|
|
|
|
func NewMetricsServer(enabled bool, log log.Logger) MetricServer {
|
|
return &metricServer{
|
|
enabled: enabled,
|
|
log: log,
|
|
}
|
|
}
|
|
|
|
// SendMetric implements the MetricServer interface.
|
|
func (ms *metricServer) SendMetric(mfunc any) {
|
|
if ms.enabled {
|
|
mfn, ok := mfunc.(func())
|
|
if !ok {
|
|
ms.log.Error().Err(errors.ErrInvalidMetric).
|
|
Msgf("failed to cast type, expected '%T' but got '%T'", func() {}, mfunc)
|
|
|
|
return
|
|
}
|
|
|
|
mfn()
|
|
}
|
|
}
|
|
|
|
func (ms *metricServer) ForceSendMetric(mfunc any) {
|
|
mfn, ok := mfunc.(func())
|
|
if !ok {
|
|
ms.log.Error().Err(errors.ErrInvalidMetric).
|
|
Msgf("failed to cast type, expected '%T' but got '%T'", func() {}, mfunc)
|
|
|
|
return
|
|
}
|
|
|
|
mfn()
|
|
}
|
|
|
|
func (ms *metricServer) ReceiveMetrics() any {
|
|
return nil
|
|
}
|
|
|
|
func (ms *metricServer) IsEnabled() bool {
|
|
return ms.enabled
|
|
}
|
|
|
|
func IncHTTPConnRequests(ms MetricServer, lvalues ...string) {
|
|
ms.SendMetric(func() {
|
|
httpConnRequests.WithLabelValues(lvalues...).Inc()
|
|
})
|
|
}
|
|
|
|
func ObserveHTTPRepoLatency(ms MetricServer, path string, latency time.Duration) {
|
|
ms.SendMetric(func() {
|
|
match := re.FindStringSubmatch(path)
|
|
|
|
if len(match) > 1 {
|
|
httpRepoLatency.WithLabelValues(match[1]).Observe(latency.Seconds())
|
|
} else {
|
|
httpRepoLatency.WithLabelValues("N/A").Observe(latency.Seconds())
|
|
}
|
|
})
|
|
}
|
|
|
|
func ObserveHTTPMethodLatency(ms MetricServer, method string, latency time.Duration) {
|
|
ms.SendMetric(func() {
|
|
httpMethodLatency.WithLabelValues(method).Observe(latency.Seconds())
|
|
})
|
|
}
|
|
|
|
func IncDownloadCounter(ms MetricServer, repo string) {
|
|
ms.SendMetric(func() {
|
|
downloadCounter.WithLabelValues(repo).Inc()
|
|
})
|
|
}
|
|
|
|
func SetStorageUsage(ms MetricServer, rootDir, repo string) {
|
|
ms.ForceSendMetric(func() {
|
|
dir := path.Join(rootDir, repo)
|
|
|
|
repoSize, err := GetDirSize(dir)
|
|
if err == nil {
|
|
repoStorageBytes.WithLabelValues(repo).Set(float64(repoSize))
|
|
}
|
|
})
|
|
}
|
|
|
|
func IncUploadCounter(ms MetricServer, repo string) {
|
|
ms.SendMetric(func() {
|
|
uploadCounter.WithLabelValues(repo).Inc()
|
|
})
|
|
}
|
|
|
|
func SetServerInfo(ms MetricServer, lvalues ...string) {
|
|
ms.ForceSendMetric(func() {
|
|
serverInfo.WithLabelValues(lvalues...).Set(0)
|
|
})
|
|
}
|
|
|
|
func ObserveStorageLockLatency(ms MetricServer, latency time.Duration, storageName, lockType string) {
|
|
ms.SendMetric(func() {
|
|
storageLockLatency.WithLabelValues(storageName, lockType).Observe(latency.Seconds())
|
|
})
|
|
}
|
|
|
|
func IncSchedulerGenerators(ms MetricServer) {
|
|
ms.ForceSendMetric(func() {
|
|
schedulerGenerators.Inc()
|
|
})
|
|
}
|
|
|
|
func SetSchedulerGenerators(ms MetricServer, gen map[string]map[string]uint64) {
|
|
ms.SendMetric(func() {
|
|
for priority, states := range gen {
|
|
for state, value := range states {
|
|
schedulerGeneratorsStatus.WithLabelValues(priority, state).Set(float64(value))
|
|
}
|
|
}
|
|
})
|
|
}
|
|
|
|
func SetSchedulerNumWorkers(ms MetricServer, total int) {
|
|
ms.SendMetric(func() {
|
|
schedulerNumWorkers.Set(float64(total))
|
|
})
|
|
}
|
|
|
|
func SetSchedulerWorkers(ms MetricServer, w map[string]int) {
|
|
ms.SendMetric(func() {
|
|
for state, value := range w {
|
|
schedulerWorkers.WithLabelValues(state).Set(float64(value))
|
|
}
|
|
})
|
|
}
|
|
|
|
func SetSchedulerTasksQueue(ms MetricServer, tq map[string]int) {
|
|
ms.SendMetric(func() {
|
|
for priority, value := range tq {
|
|
schedulerTasksQueue.WithLabelValues(priority).Set(float64(value))
|
|
}
|
|
})
|
|
}
|
|
|
|
func ObserveWorkersTasksDuration(ms MetricServer, taskName string, duration time.Duration) {
|
|
ms.SendMetric(func() {
|
|
workersTasksDuration.WithLabelValues(taskName).Observe(duration.Seconds())
|
|
})
|
|
}
|
|
|
|
func IncGCRuns(ms MetricServer, hasError bool) {
|
|
ms.SendMetric(func() {
|
|
gcRuns.WithLabelValues(strconv.FormatBool(hasError)).Inc()
|
|
})
|
|
}
|
|
|
|
func ObserveGCDuration(ms MetricServer, latency time.Duration) {
|
|
ms.SendMetric(func() {
|
|
gcDuration.Observe(latency.Seconds())
|
|
})
|
|
}
|
|
|
|
func IncGCDeleted(ms MetricServer, artifactType string, count int) {
|
|
if count > 0 {
|
|
ms.SendMetric(func() {
|
|
gcDeleted.WithLabelValues(artifactType).Add(float64(count))
|
|
})
|
|
}
|
|
}
|