fix(scheduler): fix data race (#2085)

* fix(scheduler): data race when pushing new tasks

the problem here is that the scheduler can be closed in two ways:
- canceling the context given as argument to scheduler.RunScheduler()
- running scheduler.Shutdown()

because of this, shutdown can trigger a data race between calling scheduler.inShutdown()
and actually pushing tasks into the pool workers

solved that by keeping a quit channel, listening on both the quit channel and ctx.Done(),
and closing the worker chan and scheduler afterwards.

Signed-off-by: Petu Eusebiu <peusebiu@cisco.com>

* refactor(scheduler): refactor into a single shutdown

before this, we could stop the scheduler either by canceling the context
provided to RunScheduler(ctx) or by running Shutdown().

simplify things by getting rid of the external context in RunScheduler().
keep an internal context in the scheduler itself and pass it down to all tasks.

Signed-off-by: Petu Eusebiu <peusebiu@cisco.com>

---------

Signed-off-by: Petu Eusebiu <peusebiu@cisco.com>
This commit is contained in:
peusebiu
2023-12-11 20:00:34 +02:00
committed by GitHub
parent d71a1f494e
commit 7642e5af98
31 changed files with 494 additions and 326 deletions
+22 -16
View File
@@ -62,16 +62,15 @@ var DeleteReferrers = config.ImageRetention{ //nolint: gochecknoglobals
var errCache = errors.New("new cache error")
func runAndGetScheduler() (*scheduler.Scheduler, context.CancelFunc) {
func runAndGetScheduler() *scheduler.Scheduler {
log := zlog.Logger{}
metrics := monitoring.NewMetricsServer(true, log)
taskScheduler := scheduler.NewScheduler(config.New(), metrics, log)
taskScheduler.RateLimit = 50 * time.Millisecond
ctx, cancel := context.WithCancel(context.Background())
taskScheduler.RunScheduler(ctx)
taskScheduler.RunScheduler()
return taskScheduler, cancel
return taskScheduler
}
func TestStorageFSAPIs(t *testing.T) {
@@ -1195,14 +1194,15 @@ func TestDedupeLinks(t *testing.T) {
// run on empty image store
// switch dedupe to true from false
taskScheduler, cancel := runAndGetScheduler()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
// rebuild with dedupe true
imgStore.RunDedupeBlobs(time.Duration(0), taskScheduler)
// wait until rebuild finishes
time.Sleep(1 * time.Second)
cancel()
taskScheduler.Shutdown()
// manifest1
upload, err := imgStore.NewBlobUpload("dedupe1")
@@ -1367,7 +1367,9 @@ func TestDedupeLinks(t *testing.T) {
Convey("Intrerrupt rebuilding and restart, checking idempotency", func() {
for i := 0; i < 10; i++ {
taskScheduler, cancel := runAndGetScheduler()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
// rebuild with dedupe true
imgStore := local.NewImageStore(dir, true, true, log, metrics, nil, cacheDriver)
@@ -1375,10 +1377,11 @@ func TestDedupeLinks(t *testing.T) {
sleepValue := i * 5
time.Sleep(time.Duration(sleepValue) * time.Millisecond)
cancel()
taskScheduler.Shutdown()
}
taskScheduler, cancel := runAndGetScheduler()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
// rebuild with dedupe true
imgStore := local.NewImageStore(dir, true, true, log, metrics, nil, cacheDriver)
@@ -1387,7 +1390,7 @@ func TestDedupeLinks(t *testing.T) {
// wait until rebuild finishes
time.Sleep(10 * time.Second)
cancel()
taskScheduler.Shutdown()
fi1, err := os.Stat(path.Join(dir, "dedupe1", "blobs", "sha256", blobDigest1))
So(err, ShouldBeNil)
@@ -1398,7 +1401,8 @@ func TestDedupeLinks(t *testing.T) {
Convey("rebuild dedupe index error cache nil", func() {
// switch dedupe to true from false
taskScheduler, cancel := runAndGetScheduler()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
imgStore := local.NewImageStore(dir, true, true, log, metrics, nil, nil)
@@ -1408,7 +1412,7 @@ func TestDedupeLinks(t *testing.T) {
time.Sleep(3 * time.Second)
cancel()
taskScheduler.Shutdown()
fi1, err := os.Stat(path.Join(dir, "dedupe1", "blobs", "sha256", blobDigest1))
So(err, ShouldBeNil)
@@ -1420,7 +1424,8 @@ func TestDedupeLinks(t *testing.T) {
Convey("rebuild dedupe index cache error on original blob", func() {
// switch dedupe to true from false
taskScheduler, cancel := runAndGetScheduler()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
imgStore := local.NewImageStore(dir, true, true, log, metrics, nil, &mocks.CacheMock{
HasBlobFn: func(digest godigest.Digest, path string) bool {
@@ -1436,7 +1441,7 @@ func TestDedupeLinks(t *testing.T) {
time.Sleep(10 * time.Second)
cancel()
taskScheduler.Shutdown()
fi1, err := os.Stat(path.Join(dir, "dedupe1", "blobs", "sha256", blobDigest1))
So(err, ShouldBeNil)
@@ -1448,7 +1453,8 @@ func TestDedupeLinks(t *testing.T) {
Convey("rebuild dedupe index cache error on duplicate blob", func() {
// switch dedupe to true from false
taskScheduler, cancel := runAndGetScheduler()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
imgStore := local.NewImageStore(dir, true, true, log, metrics, nil, &mocks.CacheMock{
HasBlobFn: func(digest godigest.Digest, path string) bool {
@@ -1468,7 +1474,7 @@ func TestDedupeLinks(t *testing.T) {
time.Sleep(15 * time.Second)
cancel()
taskScheduler.Shutdown()
fi1, err := os.Stat(path.Join(dir, "dedupe1", "blobs", "sha256", blobDigest1))
So(err, ShouldBeNil)
+44 -40
View File
@@ -186,16 +186,15 @@ func createObjectsStoreDynamo(rootDir string, cacheDir string, dedupe bool, tabl
return store, il, err
}
func runAndGetScheduler() (*scheduler.Scheduler, context.CancelFunc) {
func runAndGetScheduler() *scheduler.Scheduler {
logger := log.Logger{}
metrics := monitoring.NewMetricsServer(false, logger)
taskScheduler := scheduler.NewScheduler(config.New(), metrics, logger)
taskScheduler.RateLimit = 50 * time.Millisecond
ctx, cancel := context.WithCancel(context.Background())
taskScheduler.RunScheduler(ctx)
taskScheduler.RunScheduler()
return taskScheduler, cancel
return taskScheduler
}
type FileInfoMock struct {
@@ -1587,7 +1586,8 @@ func TestS3Dedupe(t *testing.T) {
})
Convey("rebuild s3 dedupe index from true to false", func() { //nolint: dupl
taskScheduler, cancel := runAndGetScheduler()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
storeDriver, imgStore, _ := createObjectsStore(testDir, t.TempDir(), false)
defer cleanupStorage(storeDriver, testDir)
@@ -1598,7 +1598,7 @@ func TestS3Dedupe(t *testing.T) {
time.Sleep(10 * time.Second)
cancel()
taskScheduler.Shutdown()
fi1, err := storeDriver.Stat(context.Background(), path.Join(testDir, "dedupe1", "blobs", "sha256",
blobDigest1.Encoded()))
@@ -1615,9 +1615,8 @@ func TestS3Dedupe(t *testing.T) {
So(len(blobContent), ShouldEqual, fi1.Size())
Convey("rebuild s3 dedupe index from false to true", func() {
taskScheduler, cancel := runAndGetScheduler()
defer cancel()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
storeDriver, imgStore, _ := createObjectsStore(testDir, t.TempDir(), true)
defer cleanupStorage(storeDriver, testDir)
@@ -1628,6 +1627,8 @@ func TestS3Dedupe(t *testing.T) {
time.Sleep(10 * time.Second)
taskScheduler.Shutdown()
fi2, err := storeDriver.Stat(context.Background(), path.Join(testDir, "dedupe2", "blobs", "sha256",
blobDigest2.Encoded()))
So(err, ShouldBeNil)
@@ -1816,7 +1817,8 @@ func TestS3Dedupe(t *testing.T) {
})
Convey("rebuild s3 dedupe index from true to false", func() { //nolint: dupl
taskScheduler, cancel := runAndGetScheduler()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
storeDriver, imgStore, _ := createObjectsStore(testDir, t.TempDir(), false)
defer cleanupStorage(storeDriver, testDir)
@@ -1827,7 +1829,7 @@ func TestS3Dedupe(t *testing.T) {
time.Sleep(10 * time.Second)
cancel()
taskScheduler.Shutdown()
fi1, err := storeDriver.Stat(context.Background(), path.Join(testDir, "dedupe1", "blobs", "sha256",
blobDigest1.Encoded()))
@@ -1861,7 +1863,8 @@ func TestS3Dedupe(t *testing.T) {
})
Convey("rebuild s3 dedupe index from false to true", func() {
taskScheduler, cancel := runAndGetScheduler()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
storeDriver, imgStore, _ := createObjectsStore(testDir, t.TempDir(), true)
defer cleanupStorage(storeDriver, testDir)
@@ -1872,7 +1875,7 @@ func TestS3Dedupe(t *testing.T) {
time.Sleep(10 * time.Second)
cancel()
taskScheduler.Shutdown()
fi2, err := storeDriver.Stat(context.Background(), path.Join(testDir, "dedupe2", "blobs", "sha256",
blobDigest2.Encoded()))
@@ -2055,9 +2058,8 @@ func TestRebuildDedupeIndex(t *testing.T) {
taskScheduler := scheduler.NewScheduler(config.New(), metrics, logger)
taskScheduler.RateLimit = 1 * time.Millisecond
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
taskScheduler.RunScheduler(ctx)
taskScheduler.RunScheduler()
defer taskScheduler.Shutdown()
storeDriver, imgStore, _ = createObjectsStore(testDir, t.TempDir(), false)
defer cleanupStorage(storeDriver, testDir)
@@ -2067,17 +2069,18 @@ func TestRebuildDedupeIndex(t *testing.T) {
sleepValue := i * 5
time.Sleep(time.Duration(sleepValue) * time.Millisecond)
cancel()
taskScheduler.Shutdown()
}
taskScheduler, cancel := runAndGetScheduler()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
imgStore.RunDedupeBlobs(time.Duration(0), taskScheduler)
// wait until rebuild finishes
time.Sleep(10 * time.Second)
cancel()
taskScheduler.Shutdown()
fi2, err := storeDriver.Stat(context.Background(), path.Join(testDir, "dedupe2", "blobs", "sha256",
blobDigest2.Encoded()))
@@ -2096,10 +2099,8 @@ func TestRebuildDedupeIndex(t *testing.T) {
taskScheduler := scheduler.NewScheduler(config.New(), metrics, logger)
taskScheduler.RateLimit = 1 * time.Millisecond
ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
defer cancel()
taskScheduler.RunScheduler(ctx)
taskScheduler.RunScheduler()
defer taskScheduler.Shutdown()
storeDriver, imgStore, _ = createObjectsStore(testDir, t.TempDir(), true)
defer cleanupStorage(storeDriver, testDir)
@@ -2110,10 +2111,11 @@ func TestRebuildDedupeIndex(t *testing.T) {
sleepValue := i * 5
time.Sleep(time.Duration(sleepValue) * time.Millisecond)
cancel()
taskScheduler.Shutdown()
}
taskScheduler, cancel = runAndGetScheduler()
taskScheduler = runAndGetScheduler()
defer taskScheduler.Shutdown()
// rebuild with dedupe false, should have all blobs with content
imgStore.RunDedupeBlobs(time.Duration(0), taskScheduler)
@@ -2121,7 +2123,7 @@ func TestRebuildDedupeIndex(t *testing.T) {
// wait until rebuild finishes
time.Sleep(10 * time.Second)
cancel()
taskScheduler.Shutdown()
fi2, err = storeDriver.Stat(context.Background(), path.Join(testDir, "dedupe2", "blobs", "sha256",
blobDigest2.Encoded()))
@@ -2140,8 +2142,8 @@ func TestRebuildDedupeIndex(t *testing.T) {
storeDriver, imgStore, _ := createObjectsStore(testDir, tdir, true)
defer cleanupStorage(storeDriver, testDir)
taskScheduler, cancel := runAndGetScheduler()
defer cancel()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
imgStore.RunDedupeBlobs(time.Duration(0), taskScheduler)
@@ -2150,8 +2152,8 @@ func TestRebuildDedupeIndex(t *testing.T) {
})
Convey("Rebuild dedupe index already rebuilt", func() {
taskScheduler, cancel := runAndGetScheduler()
defer cancel()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
storeDriver, imgStore, _ := createObjectsStore(testDir, t.TempDir(), true)
defer cleanupStorage(storeDriver, testDir)
@@ -2171,8 +2173,8 @@ func TestRebuildDedupeIndex(t *testing.T) {
err := storeDriver.PutContent(context.Background(), fi1.Path(), []byte{})
So(err, ShouldBeNil)
taskScheduler, cancel := runAndGetScheduler()
defer cancel()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
imgStore.RunDedupeBlobs(time.Duration(0), taskScheduler)
@@ -2185,8 +2187,8 @@ func TestRebuildDedupeIndex(t *testing.T) {
err := storeDriver.Delete(context.Background(), fi1.Path())
So(err, ShouldBeNil)
taskScheduler, cancel := runAndGetScheduler()
defer cancel()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
storeDriver, imgStore, _ := createObjectsStore(testDir, t.TempDir(), true)
defer cleanupStorage(storeDriver, testDir)
@@ -2202,8 +2204,8 @@ func TestRebuildDedupeIndex(t *testing.T) {
err := storeDriver.PutContent(context.Background(), fi1.Path(), []byte{})
So(err, ShouldBeNil)
taskScheduler, cancel := runAndGetScheduler()
defer cancel()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
storeDriver, imgStore, _ := createObjectsStore(testDir, t.TempDir(), true)
defer cleanupStorage(storeDriver, testDir)
@@ -2224,8 +2226,8 @@ func TestRebuildDedupeIndex(t *testing.T) {
err := storeDriver.Delete(context.Background(), imgStore.RootDir())
So(err, ShouldBeNil)
taskScheduler, cancel := runAndGetScheduler()
defer cancel()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
// rebuild with dedupe false, should have all blobs with content
imgStore.RunDedupeBlobs(time.Duration(0), taskScheduler)
@@ -2235,8 +2237,8 @@ func TestRebuildDedupeIndex(t *testing.T) {
})
Convey("Rebuild from true to false", func() {
taskScheduler, cancel := runAndGetScheduler()
defer cancel()
taskScheduler := runAndGetScheduler()
defer taskScheduler.Shutdown()
storeDriver, imgStore, _ := createObjectsStore(testDir, t.TempDir(), false)
defer cleanupStorage(storeDriver, testDir)
@@ -2247,6 +2249,8 @@ func TestRebuildDedupeIndex(t *testing.T) {
// wait until rebuild finishes
time.Sleep(10 * time.Second)
taskScheduler.Shutdown()
fi2, err := storeDriver.Stat(context.Background(), path.Join(testDir, "dedupe2", "blobs", "sha256",
blobDigest2.Encoded()))
So(err, ShouldBeNil)