Files
zot/pkg/cli/server/verify_retention.go
Benoit Tigeot 6c1f1bdd40 feat(metrics): add Prometheus GC metrics (#3863)
* feat(metrics): add Prometheus GC metrics

Track garbage collection activity with three new metrics:
- zot_gc_runs_total (counter, label: error) — GC run count
- zot_gc_duration_seconds (summary) — GC run duration
- zot_gc_deleted_total (counter, label: type) — items deleted
  by type: blob, manifest, upload

MetricServer is added to GarbageCollect and wired through
all callers (controller, verify-feature retention, tests).

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(test): add missing metrics var in GCS GC tests

TestGCSGarbageCollectImageIndex and
TestGCSGarbageCollectChainedImageIndexes were missing the
metrics variable required by NewGarbageCollect after the
MetricServer parameter was added.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(test): add defer metrics.Stop() in GC tests

Prevent goroutine/port leaks by stopping MetricsServer in
storage_test.go (3 functions) and gcs_test.go (also add
missing metrics declaration in TestGCSGarbageCollectImageManifest).

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(test): cover `CleanRepo` error path

Add test that exercises the error branch in
`CleanRepo` where `cleanRepo` fails, covering
the metrics calls and log lines flagged by Codecov.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* test: Cover GC error paths for codecov

Add three tests in gc_internal_test.go to cover previously
untested error branches in `removeBlobUploads` and
`removeUnreferencedBlobs`: `ListBlobUploads` failure,
`addIndexBlobsToReferences` failure, and `PathNotFoundError`
from `GetAllBlobs`.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* test(gc): cover remaining error paths

Cover `StatBlobUpload`, `digest.Validate()`,
`isBlobOlderThan`, and `CleanupRepo` error branches
in `removeBlobUploads` and `removeUnreferencedBlobs`.

`removeUnreferencedBlobs` now at 100% coverage,
`removeBlobUploads` from 78.3% to 91.3%.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* test: cover `sanityChecks` label name mismatch

Try to avoid -0.09% coverage regression on `minimal.go`
by exercising the uncovered branch in `sanityChecks`
where label names have correct count but wrong values.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* test(gc): exercise real GC path in metrics test

TestGCMetrics was calling metric helpers directly instead of
running actual garbage collection, so it couldn't catch wiring
regressions where `CleanRepo` stops recording metrics.

Now uploads an orphaned blob and runs `gc.CleanRepo` end-to-end,
verifying metrics appear on the Prometheus endpoint.

Suggestion from Copilot: https://github.com/project-zot/zot/pull/3863#discussion_r3129324719

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(gc): skip deletion metrics when DryRun is enabled

https://github.com/project-zot/zot/pull/3863#discussion_r3129324684

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(test): stop leaked MetricsServer goroutines in GCS tests

https://github.com/project-zot/zot/pull/3863#discussion_r3129324657

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* refactor(test): drop unnecessary zlog import alias

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(monitoring): expose metric types outside build tag

`MetricsCopy` and related types were only visible under `!metrics`,
causing a typecheck failure when golangci-lint runs with `-tags metrics`.
Moving the type definitions to `common.go` makes them unconditionally available.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* fix(monitoring): remove extra blank line for gci

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* test(gc): cover both dry-run and real deletion metrics

Also fix an issue with the `metrics` build tag.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

* Satisfy testpackage linter for gc metrics test

The `testpackage` linter allows `package gc` only in files named
`*_internal_test.go`; rename to follow that convention.

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>

---------

Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>
2026-05-16 23:03:36 -07:00

244 lines
7.7 KiB
Go

package server
import (
"context"
"fmt"
"net"
"os"
"os/signal"
"syscall"
"time"
"github.com/spf13/cobra"
zerr "zotregistry.dev/zot/v2/errors"
"zotregistry.dev/zot/v2/pkg/api"
"zotregistry.dev/zot/v2/pkg/api/config"
"zotregistry.dev/zot/v2/pkg/extensions/monitoring"
zlog "zotregistry.dev/zot/v2/pkg/log"
"zotregistry.dev/zot/v2/pkg/meta"
mTypes "zotregistry.dev/zot/v2/pkg/meta/types"
"zotregistry.dev/zot/v2/pkg/scheduler"
"zotregistry.dev/zot/v2/pkg/storage"
)
// newVerifyFeatureRetentionCmd builds the "verify-feature retention" subcommand.
//
// The command passes its single argument to LoadConfiguration, requires garbage
// collection to be enabled in the resulting config, applies the command-line
// overrides, submits the GC/retention tasks to a scheduler and then blocks until
// the optional timeout expires, the command context is done, or SIGINT/SIGTERM
// is received. When retention policies are configured they are forced into
// dry-run mode before the tasks are submitted.
//
// conf is mutated in place by the overrides applied here.
func newVerifyFeatureRetentionCmd(conf *config.Config) *cobra.Command {
	// "verify-feature retention"
	retentionCheckCmd := &cobra.Command{
		Use:   "retention <config>",
		Short: "`verify-feature retention` runs garbage collection and retention tasks",
		Long: "`verify-feature retention` runs garbage collection and retention tasks " +
			"based on the provided configuration.\n\n" +
			"WARNING: If retention settings are enabled in the config, the server metadata database needs to be accessed, " +
			"which means the zot server must be stopped before running this command.",
		Args: cobra.ExactArgs(1),
		RunE: func(cmd *cobra.Command, args []string) error {
			// The flag value is handed to the logger as-is; it defaults to the empty string.
			logFile, err := cmd.PersistentFlags().GetString("log-file")
			if err != nil {
				return fmt.Errorf("failed to get log-file flag: %w", err)
			}

			logger := zlog.NewLogger("info", logFile)

			// Guard kept for callers that invoke RunE without going through cobra's Args validation.
			if len(args) > 0 {
				if err := LoadConfiguration(conf, args[0]); err != nil {
					return err
				}
			}

			// Do not show usage on errors which are not related to command line arguments
			cmd.SilenceUsage = true

			// Check if GC is enabled in config
			if !conf.Storage.GC {
				logger.Error().Msg("failed to run verify-feature retention, garbage collection is disabled in config")

				return fmt.Errorf("%w: %s", zerr.ErrBadConfig, "verify-feature retention requires GC to be enabled")
			}

			// Override GC interval if specified
			gcInterval, err := cmd.PersistentFlags().GetDuration("gc-interval")
			if err != nil {
				return fmt.Errorf("failed to get gc-interval flag: %w", err)
			}

			applyRetentionVerifyGCOverrides(conf, gcInterval)

			// Log entire configuration after all overrides
			logger.Info().Interface("params", conf.Sanitize()).
				Msg("configuration settings (after applying overrides)")

			// Check if server is running BEFORE initializing storage (to avoid database lock)
			if !isRemoteCacheEnabled(conf) {
				logger.Warn().Msg("local storage detected - the zot server must be stopped to access the storage database")

				if err := checkServerRunning(conf, logger); err != nil {
					return err
				}
			}

			// Initialize metrics server
			// NOTE(review): the metrics server is never stopped by this command; confirm
			// whether it should be shut down when RunE returns.
			metricsServer := monitoring.NewMetricsServer(false, logger)

			// Initialize store controller
			storeController, err := storage.New(conf, nil, metricsServer, logger, nil)
			if err != nil {
				msg := "failed to initialize store controller"
				logger.Error().Err(err).Msg(msg)

				return fmt.Errorf("%s: %w", msg, err)
			}

			// Initialize MetaDB only if retention policies are configured; it stays nil otherwise.
			var metaDB mTypes.MetaDB

			if conf.IsRetentionEnabled() {
				// Enable retention dry-run mode only when retention is enabled
				conf.Storage.Retention.DryRun = true

				// Ranging over a nil map is a no-op, so no nil check is needed.
				for route, storageConfig := range conf.Storage.SubPaths {
					storageConfig.Retention.DryRun = true
					conf.Storage.SubPaths[route] = storageConfig
				}

				driver, err := meta.New(conf.Storage.StorageConfig, logger)
				if err != nil {
					msg := "failed to initialize metadata database"
					logger.Error().Err(err).Msg(msg)

					return fmt.Errorf("%s: %w", msg, err)
				}

				if err := meta.ParseStorage(driver, storeController, logger); err != nil {
					msg := "failed to parse storage"
					logger.Error().Err(err).Msg(msg)

					return fmt.Errorf("%s: %w", msg, err)
				}

				metaDB = driver

				logger.Info().Msg("retention policies are configured - retention rules will be applied")
			} else {
				logger.Info().Msg("no retention policies are configured - garbage collection will run with default settings")
			}

			// Read the timeout before any task is submitted so a flag error cannot
			// interrupt work that is already in flight.
			timeout, err := cmd.PersistentFlags().GetDuration("timeout")
			if err != nil {
				return fmt.Errorf("failed to get timeout flag: %w", err)
			}

			// Initialize scheduler
			taskScheduler := scheduler.NewScheduler(conf, metricsServer, logger)
			taskScheduler.RunScheduler()

			defer taskScheduler.Shutdown()

			logger.Info().Msg("garbage collection and retention tasks will be submitted to the scheduler")

			// Run GC and retention tasks
			api.RunGCTasks(conf, storeController, metaDB, taskScheduler, logger, nil, metricsServer)

			waitForRetentionVerifyTasks(cmd.Context(), timeout, logger)

			return nil
		},
	}

	retentionCheckCmd.PersistentFlags().StringP("log-file", "l", "", "log file location (default: stdout)")
	retentionCheckCmd.PersistentFlags().DurationP("gc-interval", "i", 0,
		"override GC interval (default: use config value)")
	retentionCheckCmd.PersistentFlags().DurationP("timeout", "t", 0,
		"timeout for waiting for tasks to complete (default: wait indefinitely)")

	return retentionCheckCmd
}

// applyRetentionVerifyGCOverrides sets a short GC scheduler delay on the default
// storage config and on every subpath, and replaces their GC interval when
// gcInterval is positive.
func applyRetentionVerifyGCOverrides(conf *config.Config, gcInterval time.Duration) {
	// Short delay used for the verify-feature retention command.
	const schedulerDelay = 5 * time.Millisecond

	conf.Storage.GCMaxSchedulerDelay = schedulerDelay

	if gcInterval > 0 {
		conf.Storage.GCInterval = gcInterval
	}

	for route, storageConfig := range conf.Storage.SubPaths {
		storageConfig.GCMaxSchedulerDelay = schedulerDelay

		if gcInterval > 0 {
			storageConfig.GCInterval = gcInterval
		}

		// Write the modified entry back into the map so the override is stored.
		conf.Storage.SubPaths[route] = storageConfig
	}
}

// waitForRetentionVerifyTasks blocks until the wait context is done or a
// SIGINT/SIGTERM is delivered. A positive timeout bounds the wait; otherwise the
// wait lasts until parent is done or a signal arrives. A nil parent is treated
// as context.Background().
//
// The function only waits on the context and the signal channel; it does not
// observe whether the submitted tasks have actually finished.
func waitForRetentionVerifyTasks(parent context.Context, timeout time.Duration, logger zlog.Logger) {
	// Deriving a context from a nil parent panics, so fall back to a background context.
	if parent == nil {
		parent = context.Background()
	}

	var (
		waitCtx context.Context
		cancel  context.CancelFunc
	)

	if timeout > 0 {
		logger.Info().Dur("timeout", timeout).Msg("waiting for garbage collection tasks to complete...")

		// Derive from parent so cancelling the command context also ends a timed wait.
		waitCtx, cancel = context.WithTimeout(parent, timeout)
	} else {
		logger.Info().Msg("waiting for garbage collection tasks to complete indefinitely " +
			"(can be interrupted by SIGINT/SIGTERM)...")

		waitCtx, cancel = context.WithCancel(parent)
	}

	defer cancel()

	// Set up signal handling for graceful shutdown
	sigChan := make(chan os.Signal, 1)
	signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)

	// Unregister the channel on return so the signal registration does not outlive the command.
	defer signal.Stop(sigChan)

	// Wait for either context cancellation or signal
	select {
	case <-waitCtx.Done():
		logger.Info().Msg("retention check completed successfully")
	case sig := <-sigChan:
		logger.Info().Str("signal", sig.String()).Msg("received interrupt signal, stopping retention check")
		logger.Info().Msg("retention check stopped gracefully")
	}
}
// checkServerRunning probes the configured HTTP address/port by trying to bind a
// TCP listener to it.
//
// It returns an error when the address cannot be resolved or when the bind fails
// (which may mean a server is running or the address is unavailable). When the
// bind succeeds the probe listener is closed and the result of that Close is
// returned.
func checkServerRunning(conf *config.Config, logger zlog.Logger) error {
	hostPort := net.JoinHostPort(conf.HTTP.Address, conf.HTTP.Port)

	tcpAddr, resolveErr := net.ResolveTCPAddr("tcp", hostPort)
	if resolveErr != nil {
		const msg = "failed to resolve TCP address"

		logger.Error().Err(resolveErr).Msg(msg)

		return fmt.Errorf("%s: %w", msg, resolveErr)
	}

	probe, listenErr := net.ListenTCP("tcp", tcpAddr)
	if listenErr != nil {
		msg := "failed to bind to " + tcpAddr.String() + " (server may be running or address unavailable)"

		logger.Error().Err(listenErr).Msg(msg)

		return fmt.Errorf("%s: %w", msg, listenErr)
	}

	// The bind worked, so nothing else is listening there; release the port again.
	return probe.Close()
}
// isRemoteCacheEnabled reports whether the remote cache is enabled for the
// global storage config and for every subpath storage config.
//
// It returns false for a nil conf, when the global config has RemoteCache
// disabled, or when any single subpath has it disabled.
func isRemoteCacheEnabled(conf *config.Config) bool {
	if conf == nil {
		return false
	}

	if !conf.Storage.RemoteCache {
		return false
	}

	for route := range conf.Storage.SubPaths {
		if conf.Storage.SubPaths[route].RemoteCache {
			continue
		}

		// One subpath without a remote cache is enough to answer false.
		return false
	}

	return true
}