From 983dc7f8d51a9372ab24410e7e0def9b569349b5 Mon Sep 17 00:00:00 2001 From: Andrei Aaron Date: Sat, 1 Mar 2025 01:04:09 +0200 Subject: [PATCH] Cumulative improvements for CI troubleshooting (#2996) * feat: show more error information in zb output Signed-off-by: Andrei Aaron * chore(ci): gc stress tests to save logs as artifacts Signed-off-by: Andrei Aaron * chore: add benchmark results to job summaries Signed-off-by: Andrei Aaron * fix: count and show zb errors Signed-off-by: Andrei Aaron * ci: fix the flaky coverage of the redis logger Signed-off-by: Andrei Aaron --------- Signed-off-by: Andrei Aaron --- .github/workflows/benchmark.yaml | 2 ++ .github/workflows/cluster.yaml | 4 +++ .github/workflows/gc-stress-test.yaml | 34 ++++++++++++++++++++++ cmd/zb/helper.go | 42 ++++++++++++++++----------- cmd/zb/perf.go | 24 +++++++++++---- pkg/api/config/redis/redis.go | 10 +++---- pkg/api/config/redis/redis_test.go | 21 ++++++++++++++ 7 files changed, 110 insertions(+), 27 deletions(-) diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml index c6bb9479..00870512 100644 --- a/.github/workflows/benchmark.yaml +++ b/.github/workflows/benchmark.yaml @@ -40,4 +40,6 @@ jobs: external-data-json-path: ./cache/benchmark-data.json # Workflow will fail when an alert happens fail-on-alert: true + # Show data in the job summary + summary-always: true # Upload the updated cache file for the next job by actions/cache diff --git a/.github/workflows/cluster.yaml b/.github/workflows/cluster.yaml index 5f233f2a..07357f6c 100644 --- a/.github/workflows/cluster.yaml +++ b/.github/workflows/cluster.yaml @@ -254,6 +254,8 @@ jobs: external-data-json-path: ./cache/benchmark-data.json # Workflow will fail when an alert happens fail-on-alert: true + # Show data in the job summary + summary-always: true # Upload the updated cache file for the next job by actions/cache minio-redis: @@ -507,4 +509,6 @@ jobs: external-data-json-path: ./cache/benchmark-data.json # Workflow will fail when an alert happens fail-on-alert: true + # Show data in the job summary + summary-always: true # Upload the updated cache file for the next job by actions/cache diff --git a/.github/workflows/gc-stress-test.yaml b/.github/workflows/gc-stress-test.yaml index 782a7050..3e6069e9 100644 --- a/.github/workflows/gc-stress-test.yaml +++ b/.github/workflows/gc-stress-test.yaml @@ -37,6 +37,14 @@ jobs: sudo rm -rf /tmp/zot continue-on-error: true + - name: Upload zot logs + uses: actions/upload-artifact@v4 + if: always() + with: + name: gc-referrers-bench-local + path: /tmp/gc-referrers-bench-local.log + if-no-files-found: error + - name: Check on failures if: steps.bench.outcome != 'success' run: | @@ -68,6 +76,14 @@ jobs: sudo rm -rf /tmp/zot continue-on-error: true + - name: Upload zot logs + uses: actions/upload-artifact@v4 + if: always() + with: + name: gc-bench-local + path: /tmp/gc-bench-local.log + if-no-files-found: error + - name: Check on failures if: steps.bench.outcome != 'success' run: | @@ -142,11 +158,20 @@ jobs: AWS_SECRET_ACCESS_KEY: fake continue-on-error: true + - name: Upload zot logs + uses: actions/upload-artifact@v4 + if: always() + with: + name: gc-referrers-bench-s3 + path: /tmp/gc-referrers-bench-s3.log + if-no-files-found: error + - name: Check on failures if: steps.bench.outcome != 'success' run: | cat /tmp/gc-referrers-bench-s3.log exit 1 + - uses: ./.github/actions/teardown-localstack gc-stress-s3: @@ -217,9 +242,18 @@ jobs: AWS_SECRET_ACCESS_KEY: fake continue-on-error: true + - name: Upload zot logs + uses: actions/upload-artifact@v4 + if: always() + with: + name: gc-bench-s3 + path: /tmp/gc-bench-s3.log + if-no-files-found: error + - name: Check on failures if: steps.bench.outcome != 'success' run: | cat /tmp/gc-bench-s3.log exit 1 + - uses: ./.github/actions/teardown-localstack diff --git a/cmd/zb/helper.go b/cmd/zb/helper.go index e4dc27e8..d6b1747c 100644 --- a/cmd/zb/helper.go +++ b/cmd/zb/helper.go @@ -119,11 +119,12 @@ func pullAndCollect(url string, repos []string, manifestItem manifestStruct, func() { start := time.Now() - var isConnFail, isErr bool - - var statusCode int - - var latency time.Duration + var ( + isConnFail, isErr bool + statusCode int + latency time.Duration + err error + ) defer func() { // send a stats record @@ -132,6 +133,7 @@ func pullAndCollect(url string, repos []string, manifestItem manifestStruct, statusCode: statusCode, isConnFail: isConnFail, isErr: isErr, + err: err, } }() @@ -144,8 +146,10 @@ func pullAndCollect(url string, repos []string, manifestItem manifestStruct, for repo, manifestTag := range manifestHash { manifestLoc := fmt.Sprintf("%s/v2/%s/manifests/%s", url, repo, manifestTag) + var resp *resty.Response + // check manifest - resp, err := client.R(). + resp, err = client.R(). SetHeader("Content-Type", "application/vnd.oci.image.manifest.v1+json"). Head(manifestLoc) @@ -261,7 +265,7 @@ func pullAndCollect(url string, repos []string, manifestItem manifestStruct, blobLoc := fmt.Sprintf("%s/v2/%s/blobs/%s", url, repo, blobDigest) // check blob - resp, err := client.R().Head(blobLoc) + resp, err = client.R().Head(blobLoc) latency = time.Since(start) @@ -476,11 +480,12 @@ func pushMonolithAndCollect(workdir, url, trepo string, count int, func() { start := time.Now() - var isConnFail, isErr bool - - var statusCode int - - var latency time.Duration + var ( + isConnFail, isErr bool + statusCode int + latency time.Duration + err error + ) defer func() { // send a stats record @@ -489,6 +494,7 @@ func pushMonolithAndCollect(workdir, url, trepo string, count int, statusCode: statusCode, isConnFail: isConnFail, isErr: isErr, + err: err, } }() @@ -680,11 +686,12 @@ func pushChunkAndCollect(workdir, url, trepo string, count int, func() { start := time.Now() - var isConnFail, isErr bool - - var statusCode int - - var latency time.Duration + var ( + isConnFail, isErr bool + statusCode int + latency time.Duration + err error + ) defer func() { // send a stats record @@ -693,6 +700,7 @@ func pushChunkAndCollect(workdir, url, trepo string, count int, statusCode: statusCode, isConnFail: isConnFail, isErr: isErr, + err: err, } }() diff --git a/cmd/zb/perf.go b/cmd/zb/perf.go index 206f875a..4d9c0fa3 100644 --- a/cmd/zb/perf.go +++ b/cmd/zb/perf.go @@ -126,7 +126,8 @@ type statsSummary struct { statusHist map[string]int rps float32 mixedSize, mixedType bool - errors int + errorCount int + errors map[string]int } func newStatsSummary(name string) statsSummary { @@ -147,11 +148,16 @@ type statsRecord struct { statusCode int isConnFail bool isErr bool + err error } func updateStats(summary *statsSummary, record statsRecord) { if record.isConnFail || record.isErr { - summary.errors++ + summary.errorCount++ + } + + if record.err != nil { + summary.errors[record.err.Error()] += 1 } if summary.min < 0 || record.latency < summary.min { @@ -208,9 +214,14 @@ func printStats(requests int, summary *statsSummary, outFmt string) { log.Printf("============\n") log.Printf("Test name:\t%s", summary.name) log.Printf("Time taken for tests:\t%v", summary.total) - log.Printf("Complete requests:\t%v", requests-summary.errors) - log.Printf("Failed requests:\t%v", summary.errors) log.Printf("Requests per second:\t%v", summary.rps) + log.Printf("Complete requests:\t%v", requests-summary.errorCount) + log.Printf("Failed requests:\t%v", summary.errorCount) + + for errStr, count := range summary.errors { + log.Printf("Error %s count:\t%d", errStr, count) + } + log.Printf("\n") if summary.mixedSize { @@ -306,6 +317,8 @@ func GetCatalog( var latency time.Duration + var err error + defer func() { // send a stats record statsCh <- statsRecord{ @@ -313,6 +326,7 @@ func GetCatalog( statusCode: statusCode, isConnFail: isConnFail, isErr: isErr, + err: err, } }() @@ -749,7 +763,7 @@ func Perf( printStats(requests, &summary, outFmt) - if summary.errors != 0 && !zbError { + if summary.errorCount != 0 && !zbError { zbError = true } } diff --git a/pkg/api/config/redis/redis.go b/pkg/api/config/redis/redis.go index 7dc76cd0..4539b5c6 100644 --- a/pkg/api/config/redis/redis.go +++ b/pkg/api/config/redis/redis.go @@ -16,16 +16,16 @@ import ( var once sync.Once //nolint: gochecknoglobals // redis.SetLogger modifies an unprotected global variable -type redisLogger struct { - log log.Logger +type RedisLogger struct { + Log log.Logger } -func (r redisLogger) Printf(ctx context.Context, format string, v ...interface{}) { - r.log.Debug().Msgf(format, v...) +func (r RedisLogger) Printf(ctx context.Context, format string, v ...interface{}) { + r.Log.Debug().Msgf(format, v...) } func GetRedisClient(redisConfig map[string]interface{}, log log.Logger) (redis.UniversalClient, error) { - once.Do(func() { redis.SetLogger(redisLogger{log}) }) // call redis.SetLogger only once + once.Do(func() { redis.SetLogger(RedisLogger{log}) }) // call redis.SetLogger only once // go-redis supports connecting via the redis uri specification (more convenient than parameter parsing) // Note failover/Sentinel cannot be configured via URL parsing at the moment diff --git a/pkg/api/config/redis/redis_test.go b/pkg/api/config/redis/redis_test.go index ffb7e294..87c0ac04 100644 --- a/pkg/api/config/redis/redis_test.go +++ b/pkg/api/config/redis/redis_test.go @@ -1,6 +1,8 @@ package rediscfg_test import ( + "context" + "io" "os" "path" "testing" @@ -15,6 +17,25 @@ import ( "zotregistry.dev/zot/pkg/log" ) +func TestRedisLogger(t *testing.T) { + Convey("Print using Redis logger", t, func() { + logFile, err := os.CreateTemp(t.TempDir(), "zot-log*.txt") + So(err, ShouldBeNil) + + logger := log.NewLogger("debug", logFile.Name()) + writers := io.MultiWriter(os.Stdout, logFile) + logger.Logger = logger.Output(writers) + + rlog := rediscfg.RedisLogger{logger} + rlog.Printf(context.Background(), "this is a rest string") + + content, err := os.ReadFile(logFile.Name()) + So(err, ShouldBeNil) + + So(string(content), ShouldContainSubstring, "this is a rest string") + }) +} + func TestRedisOptions(t *testing.T) { Convey("Test redis initialization", t, func() { log := log.NewLogger("debug", "")