mirror of
https://github.com/project-zot/zot.git
synced 2026-06-17 21:17:58 +08:00
feat(metrics): add Prometheus GC metrics (#3863)
* feat(metrics): add Prometheus GC metrics Track garbage collection activity with three new metrics: - zot_gc_runs_total (counter, label: error) — GC run count - zot_gc_duration_seconds (summary) — GC run duration - zot_gc_deleted_total (counter, label: type) — items deleted by type: blob, manifest, upload MetricServer is added to GarbageCollect and wired through all callers (controller, verify-feature retention, tests). Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(test): add missing metrics var in GCS GC tests TestGCSGarbageCollectImageIndex and TestGCSGarbageCollectChainedImageIndexes were missing the metrics variable required by NewGarbageCollect after the MetricServer parameter was added. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(test): add defer metrics.Stop() in GC tests Prevent goroutine/port leaks by stopping MetricsServer in storage_test.go (3 functions) and gcs_test.go (also add missing metrics declaration in TestGCSGarbageCollectImageManifest). Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(test): cover `CleanRepo` error path Add test that exercises the error branch in `CleanRepo` where `cleanRepo` fails, covering the metrics calls and log lines flagged by Codecov. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test: Cover GC error paths for codecov Add three tests in gc_internal_test.go to cover previously untested error branches in `removeBlobUploads` and `removeUnreferencedBlobs`: `ListBlobUploads` failure, `addIndexBlobsToReferences` failure, and `PathNotFoundError` from `GetAllBlobs`. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test(gc): cover remaining error paths Cover `StatBlobUpload`, `digest.Validate()`, `isBlobOlderThan`, and `CleanupRepo` error branches in `removeBlobUploads` and `removeUnreferencedBlobs`. `removeUnreferencedBlobs` now at 100% coverage, `removeBlobUploads` from 78.3% to 91.3%. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test: cover `sanityChecks` label name mismatch Try to avoid -0.09% coverage regression on `minimal.go` by exercising the uncovered branch in `sanityChecks` where label names have correct count but wrong values. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test(gc): exercise real GC path in metrics test TestGCMetrics was calling metric helpers directly instead of running actual garbage collection, so it couldn't catch wiring regressions where `CleanRepo` stops recording metrics. Now uploads an orphaned blob and runs `gc.CleanRepo` end-to-end, verifying metrics appear on the Prometheus endpoint. Suggestion from Copilot: https://github.com/project-zot/zot/pull/3863#discussion_r3129324719 Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(gc): skip deletion metrics when DryRun is enabled https://github.com/project-zot/zot/pull/3863#discussion_r3129324684 Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(test): stop leaked MetricsServer goroutines in GCS tests https://github.com/project-zot/zot/pull/3863#discussion_r3129324657 Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * refactor(test): drop unnecessary zlog import alias Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(monitoring): expose metric types outside build tag `MetricsCopy` and related types were only visible under `\!metrics`, causing a typecheck failure when golangci-lint runs with `-tags metrics`. Moving the type definitions to `common.go` makes them unconditionally available. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(monitoring): remove extra blank line for gci Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test(gc): cover both dry-run and real deletion metrics And fix issue with build tag with metrics Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * Satisfy testpackage linter for gc metrics test The `testpackage` linter allows `package gc` only in files named `*_internal_test.go`; rename to follow that convention. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> --------- Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>
This commit is contained in:
@@ -18,6 +18,57 @@ type MetricServer interface {
|
||||
Stop()
|
||||
}
|
||||
|
||||
type MetricsInfo struct {
|
||||
Counters []*CounterValue
|
||||
Gauges []*GaugeValue
|
||||
Summaries []*SummaryValue
|
||||
Histograms []*HistogramValue
|
||||
}
|
||||
|
||||
type MetricsCopy struct {
|
||||
Counters []CounterValue
|
||||
Gauges []GaugeValue
|
||||
Summaries []SummaryValue
|
||||
Histograms []HistogramValue
|
||||
}
|
||||
|
||||
// CounterValue stores info about a metric that is incremented over time,
|
||||
// such as the number of requests to an HTTP endpoint.
|
||||
type CounterValue struct {
|
||||
Name string
|
||||
Count int
|
||||
LabelNames []string
|
||||
LabelValues []string
|
||||
}
|
||||
|
||||
// GaugeValue stores one value that is updated as time goes on, such as
|
||||
// the amount of memory allocated.
|
||||
type GaugeValue struct {
|
||||
Name string
|
||||
Value float64
|
||||
LabelNames []string
|
||||
LabelValues []string
|
||||
}
|
||||
|
||||
// SummaryValue stores info about a metric that is incremented over time,
|
||||
// such as the number of requests to an HTTP endpoint.
|
||||
type SummaryValue struct {
|
||||
Name string
|
||||
Count int
|
||||
Sum float64
|
||||
LabelNames []string
|
||||
LabelValues []string
|
||||
}
|
||||
|
||||
type HistogramValue struct {
|
||||
Name string
|
||||
Count int
|
||||
Sum float64
|
||||
Buckets map[string]int
|
||||
LabelNames []string
|
||||
LabelValues []string
|
||||
}
|
||||
|
||||
func GetDirSize(path string) (int64, error) {
|
||||
var size int64
|
||||
|
||||
|
||||
@@ -4,6 +4,7 @@ package monitoring
|
||||
|
||||
import (
|
||||
"path"
|
||||
"strconv"
|
||||
"time"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
@@ -129,6 +130,29 @@ var (
|
||||
},
|
||||
[]string{"name"},
|
||||
)
|
||||
gcRuns = promauto.NewCounterVec( //nolint: gochecknoglobals
|
||||
prometheus.CounterOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "gc_runs_total",
|
||||
Help: "Total number of garbage collection runs",
|
||||
},
|
||||
[]string{"error"},
|
||||
)
|
||||
gcDuration = promauto.NewSummary( //nolint: gochecknoglobals
|
||||
prometheus.SummaryOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "gc_duration_seconds",
|
||||
Help: "Duration of garbage collection runs",
|
||||
},
|
||||
)
|
||||
gcDeleted = promauto.NewCounterVec( //nolint: gochecknoglobals
|
||||
prometheus.CounterOpts{
|
||||
Namespace: metricsNamespace,
|
||||
Name: "gc_deleted_total",
|
||||
Help: "Total number of items deleted by garbage collection",
|
||||
},
|
||||
[]string{"type"},
|
||||
)
|
||||
)
|
||||
|
||||
type metricServer struct {
|
||||
@@ -293,3 +317,23 @@ func ObserveWorkersTasksDuration(ms MetricServer, taskName string, duration time
|
||||
workersTasksDuration.WithLabelValues(taskName).Observe(duration.Seconds())
|
||||
})
|
||||
}
|
||||
|
||||
func IncGCRuns(ms MetricServer, hasError bool) {
|
||||
ms.SendMetric(func() {
|
||||
gcRuns.WithLabelValues(strconv.FormatBool(hasError)).Inc()
|
||||
})
|
||||
}
|
||||
|
||||
func ObserveGCDuration(ms MetricServer, latency time.Duration) {
|
||||
ms.SendMetric(func() {
|
||||
gcDuration.Observe(latency.Seconds())
|
||||
})
|
||||
}
|
||||
|
||||
func IncGCDeleted(ms MetricServer, artifactType string, count int) {
|
||||
if count > 0 {
|
||||
ms.SendMetric(func() {
|
||||
gcDeleted.WithLabelValues(artifactType).Add(float64(count))
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,6 +35,10 @@ const (
|
||||
httpMethodLatencySeconds = metricsNamespace + ".http.method.latency.seconds"
|
||||
storageLockLatencySeconds = metricsNamespace + ".storage.lock.latency.seconds"
|
||||
workersTasksDuration = metricsNamespace + ".scheduler.workers.tasks.duration.seconds"
|
||||
// GC metrics.
|
||||
gcRuns = metricsNamespace + ".gc.runs"
|
||||
gcDuration = metricsNamespace + ".gc.duration.seconds"
|
||||
gcDeleted = metricsNamespace + ".gc.deleted"
|
||||
|
||||
metricsScrapeTimeout = 2 * time.Minute
|
||||
metricsScrapeCheckInterval = 30 * time.Second
|
||||
@@ -52,56 +56,6 @@ type metricServer struct {
|
||||
stopChan chan struct{} // Channel to signal shutdown
|
||||
}
|
||||
|
||||
type MetricsInfo struct {
|
||||
Counters []*CounterValue
|
||||
Gauges []*GaugeValue
|
||||
Summaries []*SummaryValue
|
||||
Histograms []*HistogramValue
|
||||
}
|
||||
type MetricsCopy struct {
|
||||
Counters []CounterValue
|
||||
Gauges []GaugeValue
|
||||
Summaries []SummaryValue
|
||||
Histograms []HistogramValue
|
||||
}
|
||||
|
||||
// CounterValue stores info about a metric that is incremented over time,
|
||||
// such as the number of requests to an HTTP endpoint.
|
||||
type CounterValue struct {
|
||||
Name string
|
||||
Count int
|
||||
LabelNames []string
|
||||
LabelValues []string
|
||||
}
|
||||
|
||||
// GaugeValue stores one value that is updated as time goes on, such as
|
||||
// the amount of memory allocated.
|
||||
type GaugeValue struct {
|
||||
Name string
|
||||
Value float64
|
||||
LabelNames []string
|
||||
LabelValues []string
|
||||
}
|
||||
|
||||
// SummaryValue stores info about a metric that is incremented over time,
|
||||
// such as the number of requests to an HTTP endpoint.
|
||||
type SummaryValue struct {
|
||||
Name string
|
||||
Count int
|
||||
Sum float64
|
||||
LabelNames []string
|
||||
LabelValues []string
|
||||
}
|
||||
|
||||
type HistogramValue struct {
|
||||
Name string
|
||||
Count int
|
||||
Sum float64
|
||||
Buckets map[string]int
|
||||
LabelNames []string
|
||||
LabelValues []string
|
||||
}
|
||||
|
||||
func GetDefaultBuckets() []float64 {
|
||||
return []float64{.05, .5, 1, 5, 30, 60, 600, math.MaxFloat64}
|
||||
}
|
||||
@@ -274,6 +228,8 @@ func GetCounters() map[string][]string {
|
||||
repoDownloads: {"repo"},
|
||||
repoUploads: {"repo"},
|
||||
schedulerGenerators: {},
|
||||
gcRuns: {"error"},
|
||||
gcDeleted: {"type"},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -291,6 +247,7 @@ func GetGauges() map[string][]string {
|
||||
func GetSummaries() map[string][]string {
|
||||
return map[string][]string{
|
||||
httpRepoLatencySeconds: {"repo"},
|
||||
gcDuration: {},
|
||||
}
|
||||
}
|
||||
|
||||
@@ -643,3 +600,31 @@ func SetSchedulerWorkers(ms MetricServer, w map[string]int) {
|
||||
ms.SendMetric(workers)
|
||||
}
|
||||
}
|
||||
|
||||
func IncGCRuns(ms MetricServer, hasError bool) {
|
||||
req := CounterValue{
|
||||
Name: gcRuns,
|
||||
LabelNames: []string{"error"},
|
||||
LabelValues: []string{strconv.FormatBool(hasError)},
|
||||
}
|
||||
ms.SendMetric(req)
|
||||
}
|
||||
|
||||
func ObserveGCDuration(ms MetricServer, latency time.Duration) {
|
||||
sv := SummaryValue{
|
||||
Name: gcDuration,
|
||||
Sum: latency.Seconds(),
|
||||
}
|
||||
ms.SendMetric(sv)
|
||||
}
|
||||
|
||||
func IncGCDeleted(ms MetricServer, artifactType string, count int) {
|
||||
for range count {
|
||||
req := CounterValue{
|
||||
Name: gcDeleted,
|
||||
LabelNames: []string{"type"},
|
||||
LabelValues: []string{artifactType},
|
||||
}
|
||||
ms.SendMetric(req)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -143,6 +143,16 @@ func TestNewMetricsClientFallbackKeepsTLSHardening(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestSanityChecksLabelNameMismatch(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
err := sanityChecks("test.metric", []string{"method", "code"}, true,
|
||||
[]string{"method", "wrong"}, []string{"GET", "200"})
|
||||
if err == nil {
|
||||
t.Fatal("expected error when label names don't match known labels")
|
||||
}
|
||||
}
|
||||
|
||||
func generateServerCertificateChain() ([]byte, []byte, []byte, error) {
|
||||
now := time.Now()
|
||||
|
||||
|
||||
@@ -3,6 +3,8 @@
|
||||
package monitoring_test
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"math/rand"
|
||||
@@ -12,6 +14,7 @@ import (
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
godigest "github.com/opencontainers/go-digest"
|
||||
. "github.com/smartystreets/goconvey/convey"
|
||||
"gopkg.in/resty.v1"
|
||||
|
||||
@@ -22,8 +25,10 @@ import (
|
||||
"zotregistry.dev/zot/v2/pkg/log"
|
||||
"zotregistry.dev/zot/v2/pkg/scheduler"
|
||||
common "zotregistry.dev/zot/v2/pkg/storage/common"
|
||||
"zotregistry.dev/zot/v2/pkg/storage/gc"
|
||||
test "zotregistry.dev/zot/v2/pkg/test/common"
|
||||
. "zotregistry.dev/zot/v2/pkg/test/image-utils"
|
||||
"zotregistry.dev/zot/v2/pkg/test/mocks"
|
||||
ociutils "zotregistry.dev/zot/v2/pkg/test/oci-utils"
|
||||
)
|
||||
|
||||
@@ -502,6 +507,57 @@ func TestPopulateStorageMetrics(t *testing.T) {
|
||||
})
|
||||
}
|
||||
|
||||
func TestGCMetrics(t *testing.T) {
|
||||
Convey("GC metrics should be emitted after garbage collection", t, func() {
|
||||
port := test.GetFreePort()
|
||||
baseURL := test.GetBaseURL(port)
|
||||
conf := config.New()
|
||||
conf.HTTP.Port = port
|
||||
rootDir := t.TempDir()
|
||||
conf.Storage.RootDirectory = rootDir
|
||||
conf.Storage.GC = false
|
||||
enabled := true
|
||||
conf.Extensions = &extconf.ExtensionConfig{}
|
||||
conf.Extensions.Metrics = &extconf.MetricsConfig{
|
||||
BaseConfig: extconf.BaseConfig{Enable: &enabled},
|
||||
Prometheus: &extconf.PrometheusConfig{Path: "/metrics"},
|
||||
}
|
||||
|
||||
ctlr := api.NewController(conf)
|
||||
|
||||
srcStorageCtlr := ociutils.GetDefaultStoreController(rootDir, ctlr.Log)
|
||||
err := WriteImageToFileSystem(CreateDefaultImage(), "gc-metrics-test", "0.0.1", srcStorageCtlr)
|
||||
So(err, ShouldBeNil)
|
||||
|
||||
cm := test.NewControllerManager(ctlr)
|
||||
cm.StartAndWait(port)
|
||||
defer cm.StopServer()
|
||||
|
||||
imgStore := ctlr.StoreController.DefaultStore
|
||||
|
||||
orphanBlob := []byte("orphaned-blob-content")
|
||||
_, _, err = imgStore.FullBlobUpload("gc-metrics-test", bytes.NewReader(orphanBlob), godigest.FromBytes(orphanBlob))
|
||||
So(err, ShouldBeNil)
|
||||
|
||||
audit := log.NewAuditLogger("debug", "/dev/null")
|
||||
gcObj := gc.NewGarbageCollect(imgStore, mocks.MetaDBMock{}, gc.Options{Delay: 0},
|
||||
audit, ctlr.Log, ctlr.Metrics)
|
||||
|
||||
err = gcObj.CleanRepo(context.Background(), "gc-metrics-test")
|
||||
So(err, ShouldBeNil)
|
||||
|
||||
resp, err := resty.R().Get(baseURL + "/metrics")
|
||||
So(err, ShouldBeNil)
|
||||
So(resp, ShouldNotBeNil)
|
||||
So(resp.StatusCode(), ShouldEqual, http.StatusOK)
|
||||
|
||||
respStr := string(resp.Body())
|
||||
So(respStr, ShouldContainSubstring, "zot_gc_runs_total")
|
||||
So(respStr, ShouldContainSubstring, "zot_gc_duration_seconds")
|
||||
So(respStr, ShouldContainSubstring, "zot_gc_deleted_total{type=\"blob\"}")
|
||||
})
|
||||
}
|
||||
|
||||
func generateRandomString() string {
|
||||
//nolint: gosec
|
||||
seededRand := rand.New(rand.NewSource(time.Now().UnixNano()))
|
||||
|
||||
Reference in New Issue
Block a user