mirror of
https://github.com/project-zot/zot.git
synced 2026-06-18 05:28:07 +08:00
6c1f1bdd40
* feat(metrics): add Prometheus GC metrics Track garbage collection activity with three new metrics: - zot_gc_runs_total (counter, label: error) — GC run count - zot_gc_duration_seconds (summary) — GC run duration - zot_gc_deleted_total (counter, label: type) — items deleted by type: blob, manifest, upload MetricServer is added to GarbageCollect and wired through all callers (controller, verify-feature retention, tests). Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(test): add missing metrics var in GCS GC tests TestGCSGarbageCollectImageIndex and TestGCSGarbageCollectChainedImageIndexes were missing the metrics variable required by NewGarbageCollect after the MetricServer parameter was added. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(test): add defer metrics.Stop() in GC tests Prevent goroutine/port leaks by stopping MetricsServer in storage_test.go (3 functions) and gcs_test.go (also add missing metrics declaration in TestGCSGarbageCollectImageManifest). Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(test): cover `CleanRepo` error path Add test that exercises the error branch in `CleanRepo` where `cleanRepo` fails, covering the metrics calls and log lines flagged by Codecov. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test: Cover GC error paths for codecov Add three tests in gc_internal_test.go to cover previously untested error branches in `removeBlobUploads` and `removeUnreferencedBlobs`: `ListBlobUploads` failure, `addIndexBlobsToReferences` failure, and `PathNotFoundError` from `GetAllBlobs`. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test(gc): cover remaining error paths Cover `StatBlobUpload`, `digest.Validate()`, `isBlobOlderThan`, and `CleanupRepo` error branches in `removeBlobUploads` and `removeUnreferencedBlobs`. `removeUnreferencedBlobs` now at 100% coverage, `removeBlobUploads` from 78.3% to 91.3%. 
Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test: cover `sanityChecks` label name mismatch Try to avoid -0.09% coverage regression on `minimal.go` by exercising the uncovered branch in `sanityChecks` where label names have correct count but wrong values. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test(gc): exercise real GC path in metrics test TestGCMetrics was calling metric helpers directly instead of running actual garbage collection, so it couldn't catch wiring regressions where `CleanRepo` stops recording metrics. Now uploads an orphaned blob and runs `gc.CleanRepo` end-to-end, verifying metrics appear on the Prometheus endpoint. Suggestion from Copilot: https://github.com/project-zot/zot/pull/3863#discussion_r3129324719 Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(gc): skip deletion metrics when DryRun is enabled https://github.com/project-zot/zot/pull/3863#discussion_r3129324684 Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(test): stop leaked MetricsServer goroutines in GCS tests https://github.com/project-zot/zot/pull/3863#discussion_r3129324657 Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * refactor(test): drop unnecessary zlog import alias Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(monitoring): expose metric types outside build tag `MetricsCopy` and related types were only visible under `!metrics`, causing a typecheck failure when golangci-lint runs with `-tags metrics`. Moving the type definitions to `common.go` makes them unconditionally available. 
Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * fix(monitoring): remove extra blank line for gci Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * test(gc): cover both dry-run and real deletion metrics And fix issue with build tag with metrics Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> * Satisfy testpackage linter for gc metrics test The `testpackage` linter allows `package gc` only in files named `*_internal_test.go`; rename to follow that convention. Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr> --------- Signed-off-by: Benoit Tigeot <benoit.tigeot@lifen.fr>
244 lines
7.7 KiB
Go
244 lines
7.7 KiB
Go
package server
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"net"
|
|
"os"
|
|
"os/signal"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/spf13/cobra"
|
|
|
|
zerr "zotregistry.dev/zot/v2/errors"
|
|
"zotregistry.dev/zot/v2/pkg/api"
|
|
"zotregistry.dev/zot/v2/pkg/api/config"
|
|
"zotregistry.dev/zot/v2/pkg/extensions/monitoring"
|
|
zlog "zotregistry.dev/zot/v2/pkg/log"
|
|
"zotregistry.dev/zot/v2/pkg/meta"
|
|
mTypes "zotregistry.dev/zot/v2/pkg/meta/types"
|
|
"zotregistry.dev/zot/v2/pkg/scheduler"
|
|
"zotregistry.dev/zot/v2/pkg/storage"
|
|
)
|
|
|
|
func newVerifyFeatureRetentionCmd(conf *config.Config) *cobra.Command {
|
|
// "verify-feature retention"
|
|
retentionCheckCmd := &cobra.Command{
|
|
Use: "retention <config>",
|
|
Short: "`verify-feature retention` runs garbage collection and retention tasks",
|
|
Long: "`verify-feature retention` runs garbage collection and retention tasks " +
|
|
"based on the provided configuration.\n\n" +
|
|
"WARNING: If retention settings are enabled in the config, the server metadata database needs to be accessed, " +
|
|
"which means the zot server must be stopped before running this command.",
|
|
Args: cobra.ExactArgs(1),
|
|
RunE: func(cmd *cobra.Command, args []string) error {
|
|
// Use stdout by default, or the specified log file
|
|
logFile, err := cmd.PersistentFlags().GetString("log-file")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get log-file flag: %w", err)
|
|
}
|
|
|
|
logOutput := ""
|
|
if logFile != "" {
|
|
logOutput = logFile
|
|
}
|
|
logger := zlog.NewLogger("info", logOutput)
|
|
|
|
if len(args) > 0 {
|
|
if err := LoadConfiguration(conf, args[0]); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Do not show usage on errors which are not related to command line arguments
|
|
cmd.SilenceUsage = true
|
|
|
|
// Check if GC is enabled in config
|
|
if !conf.Storage.GC {
|
|
logger.Error().Msgf("failed to run verify-feature retention, garbage collection is disabled in config")
|
|
|
|
return fmt.Errorf("%w: %s", zerr.ErrBadConfig, "verify-feature retention requires GC to be enabled")
|
|
}
|
|
|
|
// Set short delay for verify-feature retention command
|
|
conf.Storage.GCMaxSchedulerDelay = 5 * time.Millisecond
|
|
|
|
// Override GC interval if specified
|
|
gcInterval, err := cmd.PersistentFlags().GetDuration("gc-interval")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get gc-interval flag: %w", err)
|
|
}
|
|
|
|
if gcInterval > 0 {
|
|
conf.Storage.GCInterval = gcInterval
|
|
}
|
|
|
|
// Process subpaths for GC interval override
|
|
if conf.Storage.SubPaths != nil {
|
|
for route, storageConfig := range conf.Storage.SubPaths {
|
|
storageConfig.GCMaxSchedulerDelay = 5 * time.Millisecond
|
|
if gcInterval > 0 {
|
|
storageConfig.GCInterval = gcInterval
|
|
}
|
|
conf.Storage.SubPaths[route] = storageConfig
|
|
}
|
|
}
|
|
|
|
// Log entire configuration after all overrides
|
|
logger.Info().Interface("params", conf.Sanitize()).
|
|
Msg("configuration settings (after applying overrides)")
|
|
|
|
// Check if server is running BEFORE initializing storage (to avoid database lock)
|
|
if !isRemoteCacheEnabled(conf) {
|
|
logger.Warn().Msg("local storage detected - the zot server must be stopped to access the storage database")
|
|
|
|
if err := checkServerRunning(conf, logger); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Initialize metrics server
|
|
metricsServer := monitoring.NewMetricsServer(false, logger)
|
|
|
|
// Initialize store controller
|
|
storeController, err := storage.New(conf, nil, metricsServer, logger, nil)
|
|
if err != nil {
|
|
msg := "failed to initialize store controller"
|
|
logger.Error().Err(err).Msg(msg)
|
|
|
|
return fmt.Errorf("%s: %w", msg, err)
|
|
}
|
|
|
|
// Initialize MetaDB only if retention policies are configured
|
|
var metaDB mTypes.MetaDB
|
|
|
|
if conf.IsRetentionEnabled() {
|
|
// Enable retention dry-run mode only when retention is enabled
|
|
conf.Storage.Retention.DryRun = true
|
|
|
|
// Process subpaths for retention dry-run
|
|
if conf.Storage.SubPaths != nil {
|
|
for route, storageConfig := range conf.Storage.SubPaths {
|
|
storageConfig.Retention.DryRun = true
|
|
conf.Storage.SubPaths[route] = storageConfig
|
|
}
|
|
}
|
|
|
|
driver, err := meta.New(conf.Storage.StorageConfig, logger)
|
|
if err != nil {
|
|
msg := "failed to initialize metadata database"
|
|
logger.Error().Err(err).Msg(msg)
|
|
|
|
return fmt.Errorf("%s: %w", msg, err)
|
|
}
|
|
|
|
err = meta.ParseStorage(driver, storeController, logger)
|
|
if err != nil {
|
|
msg := "failed to parse storage"
|
|
logger.Error().Err(err).Msg(msg)
|
|
|
|
return fmt.Errorf("%s: %w", msg, err)
|
|
}
|
|
|
|
metaDB = driver
|
|
logger.Info().Msg("retention policies are configured - retention rules will be applied")
|
|
} else {
|
|
metaDB = nil
|
|
logger.Info().Msg("no retention policies are configured - garbage collection will run with default settings")
|
|
}
|
|
|
|
// Initialize scheduler
|
|
taskScheduler := scheduler.NewScheduler(conf, metricsServer, logger)
|
|
taskScheduler.RunScheduler()
|
|
defer taskScheduler.Shutdown()
|
|
|
|
logger.Info().Msg("garbage collection and retention tasks will be submitted to the scheduler")
|
|
|
|
// Run GC and retention tasks
|
|
api.RunGCTasks(conf, storeController, metaDB, taskScheduler, logger, nil, metricsServer)
|
|
|
|
// Wait for tasks to complete with optional timeout
|
|
timeout, err := cmd.PersistentFlags().GetDuration("timeout")
|
|
if err != nil {
|
|
return fmt.Errorf("failed to get timeout flag: %w", err)
|
|
}
|
|
|
|
var (
|
|
waitCtx context.Context
|
|
cancel context.CancelFunc
|
|
)
|
|
|
|
if timeout > 0 {
|
|
logger.Info().Dur("timeout", timeout).Msg("waiting for garbage collection tasks to complete...")
|
|
waitCtx, cancel = context.WithTimeout(context.Background(), timeout)
|
|
} else {
|
|
logger.Info().Msg("waiting for garbage collection tasks to complete indefinitely " +
|
|
"(can be interrupted by SIGINT/SIGTERM)...")
|
|
waitCtx, cancel = context.WithCancel(cmd.Context())
|
|
}
|
|
defer cancel()
|
|
|
|
// Set up signal handling for graceful shutdown
|
|
sigChan := make(chan os.Signal, 1)
|
|
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
|
|
|
// Wait for either context cancellation or signal
|
|
select {
|
|
case <-waitCtx.Done():
|
|
logger.Info().Msg("retention check completed successfully")
|
|
case sig := <-sigChan:
|
|
logger.Info().Str("signal", sig.String()).Msg("received interrupt signal, stopping retention check")
|
|
logger.Info().Msg("retention check stopped gracefully")
|
|
}
|
|
|
|
return nil
|
|
},
|
|
}
|
|
|
|
retentionCheckCmd.PersistentFlags().StringP("log-file", "l", "", "log file location (default: stdout)")
|
|
retentionCheckCmd.PersistentFlags().DurationP("gc-interval", "i", 0,
|
|
"override GC interval (default: use config value)")
|
|
retentionCheckCmd.PersistentFlags().DurationP("timeout", "t", 0,
|
|
"timeout for waiting for tasks to complete (default: wait indefinitely)")
|
|
|
|
return retentionCheckCmd
|
|
}
|
|
|
|
// checkServerRunning checks if a Zot server is already running on the configured address/port.
|
|
func checkServerRunning(conf *config.Config, logger zlog.Logger) error {
|
|
addr, err := net.ResolveTCPAddr("tcp", net.JoinHostPort(conf.HTTP.Address, conf.HTTP.Port))
|
|
if err != nil {
|
|
msg := "failed to resolve TCP address"
|
|
logger.Error().Err(err).Msg(msg)
|
|
|
|
return fmt.Errorf("%s: %w", msg, err)
|
|
}
|
|
|
|
listener, err := net.ListenTCP("tcp", addr)
|
|
if err != nil {
|
|
msg := fmt.Sprintf("failed to bind to %s (server may be running or address unavailable)", addr.String())
|
|
logger.Error().Err(err).Msg(msg)
|
|
|
|
return fmt.Errorf("%s: %w", msg, err)
|
|
}
|
|
|
|
// Binding succeeded, server is not running
|
|
return listener.Close()
|
|
}
|
|
|
|
// isRemoteCacheEnabled checks if the remote cache is enabled for the global and subpaths storage configs.
|
|
func isRemoteCacheEnabled(conf *config.Config) bool {
|
|
if conf == nil || !conf.Storage.RemoteCache {
|
|
return false
|
|
}
|
|
|
|
for _, subStorageConfig := range conf.Storage.SubPaths {
|
|
if !subStorageConfig.RemoteCache {
|
|
return false
|
|
}
|
|
}
|
|
|
|
return true
|
|
}
|