Files
Benoit Tigeot 6c1f1bdd40 feat(metrics): add Prometheus GC metrics (#3863)
* feat(metrics): add Prometheus GC metrics

Track garbage collection activity with three new metrics:
- zot_gc_runs_total (counter, label: error) — GC run count
- zot_gc_duration_seconds (summary) — GC run duration
- zot_gc_deleted_total (counter, label: type) — items deleted
  by type: blob, manifest, upload

MetricServer is added to GarbageCollect and wired through
all callers (controller, verify-feature retention, tests).

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(test): add missing metrics var in GCS GC tests

TestGCSGarbageCollectImageIndex and
TestGCSGarbageCollectChainedImageIndexes were missing the
metrics variable required by NewGarbageCollect after the
MetricServer parameter was added.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(test): add defer metrics.Stop() in GC tests

Prevent goroutine/port leaks by stopping MetricsServer in
storage_test.go (3 functions) and gcs_test.go (also add
missing metrics declaration in TestGCSGarbageCollectImageManifest).

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(test): cover `CleanRepo` error path

Add test that exercises the error branch in
`CleanRepo` where `cleanRepo` fails, covering
the metrics calls and log lines flagged by Codecov.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* test: Cover GC error paths for codecov

Add three tests in gc_internal_test.go to cover previously
untested error branches in `removeBlobUploads` and
`removeUnreferencedBlobs`: `ListBlobUploads` failure,
`addIndexBlobsToReferences` failure, and `PathNotFoundError`
from `GetAllBlobs`.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* test(gc): cover remaining error paths

Cover `StatBlobUpload`, `digest.Validate()`,
`isBlobOlderThan`, and `CleanupRepo` error branches
in `removeBlobUploads` and `removeUnreferencedBlobs`.

`removeUnreferencedBlobs` now at 100% coverage,
`removeBlobUploads` from 78.3% to 91.3%.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* test: cover `sanityChecks` label name mismatch

Try to avoid -0.09% coverage regression on `minimal.go`
by exercising the uncovered branch in `sanityChecks`
where label names have correct count but wrong values.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* test(gc): exercise real GC path in metrics test

TestGCMetrics was calling metric helpers directly instead of
running actual garbage collection, so it couldn't catch wiring
regressions where `CleanRepo` stops recording metrics.

Now uploads an orphaned blob and runs `gc.CleanRepo` end-to-end,
verifying metrics appear on the Prometheus endpoint.

Suggestion from Copilot: https://github.com/project-zot/zot/pull/3863#discussion_r3129324719

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(gc): skip deletion metrics when DryRun is enabled

https://github.com/project-zot/zot/pull/3863#discussion_r3129324684

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(test): stop leaked MetricsServer goroutines in GCS tests

https://github.com/project-zot/zot/pull/3863#discussion_r3129324657

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* refactor(test): drop unnecessary zlog import alias

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(monitoring): expose metric types outside build tag

`MetricsCopy` and related types were only visible under `\!metrics`,
causing a typecheck failure when golangci-lint runs with `-tags metrics`.
Moving the type definitions to `common.go` makes them unconditionally available.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(monitoring): remove extra blank line for gci

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* test(gc): cover both dry-run and real deletion metrics

And fix issue with build tag with metrics

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* Satisfy testpackage linter for gc metrics test

The `testpackage` linter allows `package gc` only in files named
`*_internal_test.go`; rename to follow that convention.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

---------

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>
2026-05-16 23:03:36 -07:00

207 lines
5.7 KiB
Go

//go:build !metrics
//nolint:testpackage // Tests intentionally cover unexported client construction helpers.
package monitoring
import (
"context"
"crypto/rand"
"crypto/rsa"
"crypto/tls"
"crypto/x509"
"crypto/x509/pkix"
"encoding/pem"
"math/big"
"net"
"net/http"
"net/http/httptest"
"os"
"path/filepath"
"testing"
"time"
"zotregistry.dev/zot/v2/pkg/log"
)
func TestNewHTTPMetricsClientDefaultRootsAndTLSMinVersion(t *testing.T) {
t.Parallel()
client, err := newHTTPMetricsClient("")
if err != nil {
t.Fatalf("expected no error, got: %v", err)
}
transport, ok := client.Transport.(*http.Transport)
if !ok {
t.Fatalf("expected *http.Transport, got %T", client.Transport)
}
if transport.TLSClientConfig == nil {
t.Fatal("expected TLSClientConfig to be set")
}
if transport.TLSClientConfig.MinVersion != tls.VersionTLS12 {
t.Fatalf("expected MinVersion TLS1.2, got: %d", transport.TLSClientConfig.MinVersion)
}
if transport.TLSClientConfig.RootCAs != nil {
t.Fatal("expected RootCAs to be nil when no custom CA is provided")
}
}
func TestNewHTTPMetricsClientInvalidCACertPath(t *testing.T) {
t.Parallel()
_, err := newHTTPMetricsClient(filepath.Join(t.TempDir(), "missing-ca.pem"))
if err == nil {
t.Fatal("expected error for missing CA cert file")
}
}
func TestNewHTTPMetricsClientInvalidCACertPEM(t *testing.T) {
t.Parallel()
tmpDir := t.TempDir()
caPath := filepath.Join(tmpDir, "ca.pem")
if err := os.WriteFile(caPath, []byte("not-a-pem-cert"), 0o600); err != nil {
t.Fatalf("failed writing temp CA file: %v", err)
}
_, err := newHTTPMetricsClient(caPath)
if err == nil {
t.Fatal("expected error for invalid PEM CA cert file")
}
}
func TestNewHTTPMetricsClientCustomCAValidatesServer(t *testing.T) {
t.Parallel()
caPEM, serverCert, serverKey, err := generateServerCertificateChain()
if err != nil {
t.Fatalf("failed generating cert chain: %v", err)
}
tmpDir := t.TempDir()
caPath := filepath.Join(tmpDir, "ca.pem")
if err := os.WriteFile(caPath, caPEM, 0o600); err != nil {
t.Fatalf("failed writing CA PEM: %v", err)
}
tlsCert, err := tls.X509KeyPair(serverCert, serverKey)
if err != nil {
t.Fatalf("failed loading server key pair: %v", err)
}
srv := httptest.NewUnstartedServer(http.HandlerFunc(func(w http.ResponseWriter, _ *http.Request) {
w.WriteHeader(http.StatusOK)
_, _ = w.Write([]byte("ok"))
}))
srv.TLS = &tls.Config{Certificates: []tls.Certificate{tlsCert}, MinVersion: tls.VersionTLS12}
srv.StartTLS()
defer srv.Close()
client, err := newHTTPMetricsClient(caPath)
if err != nil {
t.Fatalf("expected no error creating client with CA cert, got: %v", err)
}
req, err := http.NewRequestWithContext(context.Background(), http.MethodGet, srv.URL, nil)
if err != nil {
t.Fatalf("failed to create request: %v", err)
}
resp, err := client.Do(req)
if err != nil {
t.Fatalf("expected TLS handshake to succeed with custom CA, got: %v", err)
}
defer resp.Body.Close()
if resp.StatusCode != http.StatusOK {
t.Fatalf("expected status %d, got %d", http.StatusOK, resp.StatusCode)
}
}
func TestNewMetricsClientFallbackKeepsTLSHardening(t *testing.T) {
t.Parallel()
cfg := &MetricsConfig{Address: "https://127.0.0.1:8443", CACert: filepath.Join(t.TempDir(), "missing-ca.pem")}
mc := NewMetricsClient(cfg, log.NewLogger("debug", ""))
transport, ok := mc.config.HTTPClient.Transport.(*http.Transport)
if !ok {
t.Fatalf("expected fallback transport to be *http.Transport, got %T", mc.config.HTTPClient.Transport)
}
if transport.TLSClientConfig == nil {
t.Fatal("expected TLSClientConfig to be present on fallback client")
}
if transport.TLSClientConfig.MinVersion != tls.VersionTLS12 {
t.Fatalf("expected fallback MinVersion TLS1.2, got: %d", transport.TLSClientConfig.MinVersion)
}
}
func TestSanityChecksLabelNameMismatch(t *testing.T) {
t.Parallel()
err := sanityChecks("test.metric", []string{"method", "code"}, true,
[]string{"method", "wrong"}, []string{"GET", "200"})
if err == nil {
t.Fatal("expected error when label names don't match known labels")
}
}
func generateServerCertificateChain() ([]byte, []byte, []byte, error) {
now := time.Now()
caTemplate := &x509.Certificate{
SerialNumber: big.NewInt(1),
Subject: pkix.Name{CommonName: "zot-test-ca"},
NotBefore: now.Add(-1 * time.Hour),
NotAfter: now.Add(24 * time.Hour),
KeyUsage: x509.KeyUsageCertSign | x509.KeyUsageDigitalSignature,
IsCA: true,
BasicConstraintsValid: true,
}
caKey, err := rsa.GenerateKey(rand.Reader, 2048)
if err != nil {
return nil, nil, nil, err
}
caDER, err := x509.CreateCertificate(rand.Reader, caTemplate, caTemplate, &caKey.PublicKey, caKey)
if err != nil {
return nil, nil, nil, err
}
caPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: caDER})
serverTemplate := &x509.Certificate{
SerialNumber: big.NewInt(2),
Subject: pkix.Name{CommonName: "localhost"},
NotBefore: now.Add(-1 * time.Hour),
NotAfter: now.Add(24 * time.Hour),
KeyUsage: x509.KeyUsageDigitalSignature,
ExtKeyUsage: []x509.ExtKeyUsage{x509.ExtKeyUsageServerAuth},
DNSNames: []string{"localhost"},
IPAddresses: []net.IP{net.ParseIP("127.0.0.1")},
}
serverKey, err := rsa.GenerateKey(rand.Reader, 2048)
if err != nil {
return nil, nil, nil, err
}
serverDER, err := x509.CreateCertificate(rand.Reader, serverTemplate, caTemplate, &serverKey.PublicKey, caKey)
if err != nil {
return nil, nil, nil, err
}
serverCertPEM := pem.EncodeToMemory(&pem.Block{Type: "CERTIFICATE", Bytes: serverDER})
serverKeyPEM := pem.EncodeToMemory(&pem.Block{Type: "RSA PRIVATE KEY", Bytes: x509.MarshalPKCS1PrivateKey(serverKey)})
return caPEM, serverCertPEM, serverKeyPEM, nil
}