From 1dbf34f3aab290cb5a9246f54d66d84d59a27cb6 Mon Sep 17 00:00:00 2001 From: Drew Erny Date: Thu, 26 Mar 2020 11:04:58 -0500 Subject: [PATCH] Bump swarmkit to ebe39a32e3ed4c3a3783a02c11cccf388818694c Bumps swarmkit vendoring. Includes docker/swarmkit#2938, which fixes tasks.db growing out of control on worker nodes. Signed-off-by: Drew Erny --- vendor.conf | 2 +- .../docker/swarmkit/agent/storage.go | 4 ++- .../docker/swarmkit/agent/worker.go | 26 ++++++++++++++++--- 3 files changed, 27 insertions(+), 5 deletions(-) diff --git a/vendor.conf b/vendor.conf index 8d06fb9f2e..91ee6e918e 100644 --- a/vendor.conf +++ b/vendor.conf @@ -130,7 +130,7 @@ github.com/containerd/ttrpc 0be804eadb152bc3b3c20c5edc31 github.com/gogo/googleapis 01e0f9cca9b92166042241267ee2a5cdf5cff46c # v1.3.2 # cluster -github.com/docker/swarmkit 49e35619b18200845c9365c1e953440c28868002 +github.com/docker/swarmkit ebe39a32e3ed4c3a3783a02c11cccf388818694c github.com/gogo/protobuf 5628607bb4c51c3157aacc3a50f0ab707582b805 # v1.3.1 github.com/golang/protobuf d23c5127dc24889085f8ccea5c9d560a57a879d8 # v1.3.3 github.com/cloudflare/cfssl 5d63dbd981b5c408effbb58c442d54761ff94fbd # 1.3.2 diff --git a/vendor/github.com/docker/swarmkit/agent/storage.go b/vendor/github.com/docker/swarmkit/agent/storage.go index 519880197f..8d32ebf1ba 100644 --- a/vendor/github.com/docker/swarmkit/agent/storage.go +++ b/vendor/github.com/docker/swarmkit/agent/storage.go @@ -131,7 +131,9 @@ func PutTask(tx *bolt.Tx, task *api.Task) error { // PutTaskStatus updates the status for the task with id. func PutTaskStatus(tx *bolt.Tx, id string, status *api.TaskStatus) error { - return withCreateTaskBucketIfNotExists(tx, id, func(bkt *bolt.Bucket) error { + // this used to be withCreateTaskBucketIfNotExists, but that could lead + // to weird race conditions, and was not necessary. 
+ return withTaskBucket(tx, id, func(bkt *bolt.Bucket) error { p, err := proto.Marshal(status) if err != nil { return err diff --git a/vendor/github.com/docker/swarmkit/agent/worker.go b/vendor/github.com/docker/swarmkit/agent/worker.go index efe538afa7..53d88dfe1f 100644 --- a/vendor/github.com/docker/swarmkit/agent/worker.go +++ b/vendor/github.com/docker/swarmkit/agent/worker.go @@ -278,10 +278,15 @@ func reconcileTaskState(ctx context.Context, w *worker, assignments []*api.Assig removeTaskAssignment := func(taskID string) error { ctx := log.WithLogger(ctx, log.G(ctx).WithField("task.id", taskID)) - if err := SetTaskAssignment(tx, taskID, false); err != nil { - log.G(ctx).WithError(err).Error("error setting task assignment in database") + // if a task is no longer assigned, then we do not have to keep track + // of it. a task will only be unassigned when it is deleted on the + // manager. instead of SetTaskAssignment to false, we'll just remove the + // task now. + if err := DeleteTask(tx, taskID); err != nil { + log.G(ctx).WithError(err).Error("error removing de-assigned task") + return err } - return err + return nil } // If this was a complete set of assignments, we're going to remove all the remaining @@ -500,6 +505,21 @@ func (w *worker) newTaskManager(ctx context.Context, tx *bolt.Tx, task *api.Task // updateTaskStatus reports statuses to listeners, read lock must be held. func (w *worker) updateTaskStatus(ctx context.Context, tx *bolt.Tx, taskID string, status *api.TaskStatus) error { if err := PutTaskStatus(tx, taskID, status); err != nil { + // we shouldn't fail to put a task status. however, there exists the + // possibility of a race in which we try to put a task status after the + // task has been deleted. because this whole contraption is a careful + // dance of too-tightly-coupled concurrent parts, fixing that race is + // fraught with hazards. instead, we'll recognize that it can occur, + // log the error, and then ignore it.
+ if err == errTaskUnknown { + // log at info level. debug logging in docker is already really + // verbose, so many people disable it. the race that causes this + // behavior should be very rare, but if it occurs, we should know + // about it, because if there is some case where it is _not_ rare, + // then knowing about it will go a long way toward debugging. + log.G(ctx).Info("attempted to update status for a task that has been removed") + return nil + } log.G(ctx).WithError(err).Error("failed writing status to disk") return err }