From 983dc7f8d51a9372ab24410e7e0def9b569349b5 Mon Sep 17 00:00:00 2001
From: Andrei Aaron <aaaron@luxoft.com>
Date: Sat, 1 Mar 2025 01:04:09 +0200
Subject: [PATCH] Cumulative improvements for CI troubleshooting (#2996)

* feat: show more error information in zb output

Signed-off-by: Andrei Aaron <aaaron@luxoft.com>

* chore(ci): gc stress tests to save logs as artifacts

Signed-off-by: Andrei Aaron <aaaron@luxoft.com>

* chore: add benchmark results to job summaries

Signed-off-by: Andrei Aaron <aaaron@luxoft.com>

* fix: count and show zb errors

Signed-off-by: Andrei Aaron <aaaron@luxoft.com>

* ci: fix the flaky coverage of the redis logger

Signed-off-by: Andrei Aaron <aaaron@luxoft.com>

---------

Signed-off-by: Andrei Aaron <aaaron@luxoft.com>
---
 .github/workflows/benchmark.yaml      |  2 ++
 .github/workflows/cluster.yaml        |  4 +++
 .github/workflows/gc-stress-test.yaml | 34 ++++++++++++++++++++++
 cmd/zb/helper.go                      | 42 ++++++++++++++++-----------
 cmd/zb/perf.go                        | 24 +++++++++++----
 pkg/api/config/redis/redis.go         | 10 +++----
 pkg/api/config/redis/redis_test.go    | 21 ++++++++++++++
 7 files changed, 110 insertions(+), 27 deletions(-)

diff --git a/.github/workflows/benchmark.yaml b/.github/workflows/benchmark.yaml
index c6bb9479..00870512 100644
--- a/.github/workflows/benchmark.yaml
+++ b/.github/workflows/benchmark.yaml
@@ -40,4 +40,6 @@ jobs:
           external-data-json-path: ./cache/benchmark-data.json
           # Workflow will fail when an alert happens
           fail-on-alert: true
+          # Show data in the job summary
+          summary-always: true
       # Upload the updated cache file for the next job by actions/cache
diff --git a/.github/workflows/cluster.yaml b/.github/workflows/cluster.yaml
index 5f233f2a..07357f6c 100644
--- a/.github/workflows/cluster.yaml
+++ b/.github/workflows/cluster.yaml
@@ -254,6 +254,8 @@ jobs:
           external-data-json-path: ./cache/benchmark-data.json
           # Workflow will fail when an alert happens
           fail-on-alert: true
+          # Show data in the job summary
+          summary-always: true
       # Upload the updated cache file for the next job by actions/cache
 
   minio-redis:
@@ -507,4 +509,6 @@ jobs:
           external-data-json-path: ./cache/benchmark-data.json
           # Workflow will fail when an alert happens
           fail-on-alert: true
+          # Show data in the job summary
+          summary-always: true
       # Upload the updated cache file for the next job by actions/cache
diff --git a/.github/workflows/gc-stress-test.yaml b/.github/workflows/gc-stress-test.yaml
index 782a7050..3e6069e9 100644
--- a/.github/workflows/gc-stress-test.yaml
+++ b/.github/workflows/gc-stress-test.yaml
@@ -37,6 +37,14 @@ jobs:
             sudo rm -rf /tmp/zot
         continue-on-error: true
 
+      - name: Upload zot logs
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: gc-referrers-bench-local
+          path: /tmp/gc-referrers-bench-local.log
+          if-no-files-found: error
+
       - name: Check on failures
         if: steps.bench.outcome != 'success'
         run: |
@@ -68,6 +76,14 @@ jobs:
             sudo rm -rf /tmp/zot
         continue-on-error: true
 
+      - name: Upload zot logs
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: gc-bench-local
+          path: /tmp/gc-bench-local.log
+          if-no-files-found: error
+
       - name: Check on failures
         if: steps.bench.outcome != 'success'
         run: |
@@ -142,11 +158,20 @@ jobs:
           AWS_SECRET_ACCESS_KEY: fake
         continue-on-error: true
 
+      - name: Upload zot logs
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: gc-referrers-bench-s3
+          path: /tmp/gc-referrers-bench-s3.log
+          if-no-files-found: error
+
       - name: Check on failures
         if: steps.bench.outcome != 'success'
         run: |
           cat /tmp/gc-referrers-bench-s3.log
           exit 1
+
       - uses: ./.github/actions/teardown-localstack
 
   gc-stress-s3:
@@ -217,9 +242,18 @@ jobs:
           AWS_SECRET_ACCESS_KEY: fake
         continue-on-error: true
 
+      - name: Upload zot logs
+        uses: actions/upload-artifact@v4
+        if: always()
+        with:
+          name: gc-bench-s3
+          path: /tmp/gc-bench-s3.log
+          if-no-files-found: error
+
       - name: Check on failures
         if: steps.bench.outcome != 'success'
         run: |
           cat /tmp/gc-bench-s3.log
           exit 1
+
       - uses: ./.github/actions/teardown-localstack
diff --git a/cmd/zb/helper.go b/cmd/zb/helper.go
index e4dc27e8..d6b1747c 100644
--- a/cmd/zb/helper.go
+++ b/cmd/zb/helper.go
@@ -119,11 +119,12 @@ func pullAndCollect(url string, repos []string, manifestItem manifestStruct,
 	func() {
 		start := time.Now()
 
-		var isConnFail, isErr bool
-
-		var statusCode int
-
-		var latency time.Duration
+		var (
+			isConnFail, isErr bool
+			statusCode        int
+			latency           time.Duration
+			err               error
+		)
 
 		defer func() {
 			// send a stats record
@@ -132,6 +133,7 @@ func pullAndCollect(url string, repos []string, manifestItem manifestStruct,
 				statusCode: statusCode,
 				isConnFail: isConnFail,
 				isErr:      isErr,
+				err:        err,
 			}
 		}()
 
@@ -144,8 +146,10 @@ func pullAndCollect(url string, repos []string, manifestItem manifestStruct,
 		for repo, manifestTag := range manifestHash {
 			manifestLoc := fmt.Sprintf("%s/v2/%s/manifests/%s", url, repo, manifestTag)
 
+			var resp *resty.Response
+
 			// check manifest
-			resp, err := client.R().
+			resp, err = client.R().
 				SetHeader("Content-Type", "application/vnd.oci.image.manifest.v1+json").
 				Head(manifestLoc)
 
@@ -261,7 +265,7 @@ func pullAndCollect(url string, repos []string, manifestItem manifestStruct,
 				blobLoc := fmt.Sprintf("%s/v2/%s/blobs/%s", url, repo, blobDigest)
 
 				// check blob
-				resp, err := client.R().Head(blobLoc)
+				resp, err = client.R().Head(blobLoc)
 
 				latency = time.Since(start)
 
@@ -476,11 +480,12 @@ func pushMonolithAndCollect(workdir, url, trepo string, count int,
 	func() {
 		start := time.Now()
 
-		var isConnFail, isErr bool
-
-		var statusCode int
-
-		var latency time.Duration
+		var (
+			isConnFail, isErr bool
+			statusCode        int
+			latency           time.Duration
+			err               error
+		)
 
 		defer func() {
 			// send a stats record
@@ -489,6 +494,7 @@ func pushMonolithAndCollect(workdir, url, trepo string, count int,
 				statusCode: statusCode,
 				isConnFail: isConnFail,
 				isErr:      isErr,
+				err:        err,
 			}
 		}()
 
@@ -680,11 +686,12 @@ func pushChunkAndCollect(workdir, url, trepo string, count int,
 	func() {
 		start := time.Now()
 
-		var isConnFail, isErr bool
-
-		var statusCode int
-
-		var latency time.Duration
+		var (
+			isConnFail, isErr bool
+			statusCode        int
+			latency           time.Duration
+			err               error
+		)
 
 		defer func() {
 			// send a stats record
@@ -693,6 +700,7 @@ func pushChunkAndCollect(workdir, url, trepo string, count int,
 				statusCode: statusCode,
 				isConnFail: isConnFail,
 				isErr:      isErr,
+				err:        err,
 			}
 		}()
 
diff --git a/cmd/zb/perf.go b/cmd/zb/perf.go
index 206f875a..4d9c0fa3 100644
--- a/cmd/zb/perf.go
+++ b/cmd/zb/perf.go
@@ -126,7 +126,8 @@ type statsSummary struct {
 	statusHist           map[string]int
 	rps                  float32
 	mixedSize, mixedType bool
-	errors               int
+	errorCount           int
+	errors               map[string]int
 }
 
 func newStatsSummary(name string) statsSummary {
@@ -147,11 +148,16 @@ type statsRecord struct {
 	statusCode int
 	isConnFail bool
 	isErr      bool
+	err        error
 }
 
 func updateStats(summary *statsSummary, record statsRecord) {
 	if record.isConnFail || record.isErr {
-		summary.errors++
+		summary.errorCount++
+	}
+
+	if record.err != nil {
+		summary.errors[record.err.Error()] += 1
 	}
 
 	if summary.min < 0 || record.latency < summary.min {
@@ -208,9 +214,14 @@ func printStats(requests int, summary *statsSummary, outFmt string) {
 	log.Printf("============\n")
 	log.Printf("Test name:\t%s", summary.name)
 	log.Printf("Time taken for tests:\t%v", summary.total)
-	log.Printf("Complete requests:\t%v", requests-summary.errors)
-	log.Printf("Failed requests:\t%v", summary.errors)
 	log.Printf("Requests per second:\t%v", summary.rps)
+	log.Printf("Complete requests:\t%v", requests-summary.errorCount)
+	log.Printf("Failed requests:\t%v", summary.errorCount)
+
+	for errStr, count := range summary.errors {
+		log.Printf("Error %s count:\t%d", errStr, count)
+	}
+
 	log.Printf("\n")
 
 	if summary.mixedSize {
@@ -306,6 +317,8 @@ func GetCatalog(
 
 			var latency time.Duration
 
+			var err error
+
 			defer func() {
 				// send a stats record
 				statsCh <- statsRecord{
@@ -313,6 +326,7 @@ func GetCatalog(
 					statusCode: statusCode,
 					isConnFail: isConnFail,
 					isErr:      isErr,
+					err:        err,
 				}
 			}()
 
@@ -749,7 +763,7 @@ func Perf(
 
 		printStats(requests, &summary, outFmt)
 
-		if summary.errors != 0 && !zbError {
+		if summary.errorCount != 0 && !zbError {
 			zbError = true
 		}
 	}
diff --git a/pkg/api/config/redis/redis.go b/pkg/api/config/redis/redis.go
index 7dc76cd0..4539b5c6 100644
--- a/pkg/api/config/redis/redis.go
+++ b/pkg/api/config/redis/redis.go
@@ -16,16 +16,16 @@ import (
 
 var once sync.Once //nolint: gochecknoglobals // redis.SetLogger modifies an unprotected global variable
 
-type redisLogger struct {
-	log log.Logger
+type RedisLogger struct {
+	Log log.Logger
 }
 
-func (r redisLogger) Printf(ctx context.Context, format string, v ...interface{}) {
-	r.log.Debug().Msgf(format, v...)
+func (r RedisLogger) Printf(ctx context.Context, format string, v ...interface{}) {
+	r.Log.Debug().Msgf(format, v...)
 }
 
 func GetRedisClient(redisConfig map[string]interface{}, log log.Logger) (redis.UniversalClient, error) {
-	once.Do(func() { redis.SetLogger(redisLogger{log}) }) // call redis.SetLogger only once
+	once.Do(func() { redis.SetLogger(RedisLogger{log}) }) // call redis.SetLogger only once
 
 	// go-redis supports connecting via the redis uri specification (more convenient than parameter parsing)
 	// Note failover/Sentinel cannot be configured via URL parsing at the moment
diff --git a/pkg/api/config/redis/redis_test.go b/pkg/api/config/redis/redis_test.go
index ffb7e294..87c0ac04 100644
--- a/pkg/api/config/redis/redis_test.go
+++ b/pkg/api/config/redis/redis_test.go
@@ -1,6 +1,8 @@
 package rediscfg_test
 
 import (
+	"context"
+	"io"
 	"os"
 	"path"
 	"testing"
@@ -15,6 +17,25 @@ import (
 	"zotregistry.dev/zot/pkg/log"
 )
 
+func TestRedisLogger(t *testing.T) {
+	Convey("Print using Redis logger", t, func() {
+		logFile, err := os.CreateTemp(t.TempDir(), "zot-log*.txt")
+		So(err, ShouldBeNil)
+
+		logger := log.NewLogger("debug", logFile.Name())
+		writers := io.MultiWriter(os.Stdout, logFile)
+		logger.Logger = logger.Output(writers)
+
+		rlog := rediscfg.RedisLogger{logger}
+		rlog.Printf(context.Background(), "this is a rest string")
+
+		content, err := os.ReadFile(logFile.Name())
+		So(err, ShouldBeNil)
+
+		So(string(content), ShouldContainSubstring, "this is a rest string")
+	})
+}
+
 func TestRedisOptions(t *testing.T) {
 	Convey("Test redis initialization", t, func() {
 		log := log.NewLogger("debug", "")