From 13cff6d5839cca995d889b47d656888e8e2345ca Mon Sep 17 00:00:00 2001 From: Sebastiaan van Stijn Date: Fri, 22 Oct 2021 15:33:22 +0200 Subject: [PATCH] Fix race in TestCreateServiceSecretFileMode, TestCreateServiceConfigFileMode Looks like this test was broken from the start, and fully relied on a race condition. (Test was added in 65ee7fff02111bf696bc2fec442d07c2957f4151) The problem is in the service's command: `ls -l /etc/config || /bin/top`, which will either: - exit immediately if the secret is mounted correctly at `/etc/config` (which it should) - keep running with `/bin/top` if the above failed After the service is created, the test enters a race-condition, checking for 1 task to be running (which it ocassionally is), after which it proceeds, and looks up the list of tasks of the service, to get the log output of `ls -l /etc/config`. This is another race: first of all, the original filter for that task lookup did not filter by `running`, so it would pick "any" task of the service (either failed, running, or "completed" (successfully exited) tasks). In the meantime though, SwarmKit kept reconciling the service, and creating new tasks, so even if the test was able to get the ID of the correct task, that task may already have been exited, and removed (task-limit is 5 by default), so only if the test was "lucky", it would be able to get the logs, but of course, chances were likely that it would be "too late", and the task already gone. The problem can be easily reproduced when running the steps manually: echo 'CONFIG' | docker config create myconfig - docker service create --config source=myconfig,target=/etc/config,mode=0777 --name myservice busybox sh -c 'ls -l /etc/config || /bin/top' The above creates the service, but it keeps retrying, because each task exits immediately (followed by SwarmKit reconciling and starting a new task); mjntpfkkyuuc1dpay4h00c4oo overall progress: 0 out of 1 tasks 1/1: ready [======================================> ] verify: Detected task failure ^COperation continuing in background. Use `docker service ps mjntpfkkyuuc1dpay4h00c4oo` to check progress. And checking the tasks for the service reveals that tasks exit cleanly (no error), but _do exit_, so swarm just keeps up reconciling, and spinning up new tasks; docker service ps myservice --no-trunc ID NAME IMAGE NODE DESIRED STATE CURRENT STATE ERROR PORTS 2wmcuv4vffnet8nybg3he4v9n myservice.1 busybox:latest@sha256:f7ca5a32c10d51aeda3b4d01c61c6061f497893d7f6628b92f822f7117182a57 docker-desktop Ready Ready less than a second ago 5p8b006uec125iq2892lxay64 \_ myservice.1 busybox:latest@sha256:f7ca5a32c10d51aeda3b4d01c61c6061f497893d7f6628b92f822f7117182a57 docker-desktop Shutdown Complete less than a second ago k8lpsvlak4b3nil0zfkexw61p \_ myservice.1 busybox:latest@sha256:f7ca5a32c10d51aeda3b4d01c61c6061f497893d7f6628b92f822f7117182a57 docker-desktop Shutdown Complete 6 seconds ago vsunl5pi7e2n9ol3p89kvj6pn \_ myservice.1 busybox:latest@sha256:f7ca5a32c10d51aeda3b4d01c61c6061f497893d7f6628b92f822f7117182a57 docker-desktop Shutdown Complete 11 seconds ago orxl8b6kt2l6dfznzzd4lij4s \_ myservice.1 busybox:latest@sha256:f7ca5a32c10d51aeda3b4d01c61c6061f497893d7f6628b92f822f7117182a57 docker-desktop Shutdown Complete 17 seconds ago This patch changes the service's command to `sleep`, so that a successful task (after successfully performing `ls -l /etc/config`) continues to be running until the service is deleted. With that change, the service should (usually) reconcile immediately, which removes the race condition, and should also make it faster :) This patch changes the tests to use client.ServiceLogs() instead of using the service's tasklist to directly access container logs. This should also fix some failures that happened if some tasks failed to start before reconciling, in which case client.TaskList() (with the current filters), could return more tasks than anticipated (as it also contained the exited tasks); === RUN TestCreateServiceSecretFileMode create_test.go:291: assertion failed: 2 (int) != 1 (int) --- FAIL: TestCreateServiceSecretFileMode (7.88s) === RUN TestCreateServiceConfigFileMode create_test.go:355: assertion failed: 2 (int) != 1 (int) --- FAIL: TestCreateServiceConfigFileMode (7.87s) Signed-off-by: Sebastiaan van Stijn --- integration/service/create_test.go | 26 ++++++-------------------- 1 file changed, 6 insertions(+), 20 deletions(-) diff --git a/integration/service/create_test.go b/integration/service/create_test.go index 2a3d66b9fa..8335f894b8 100644 --- a/integration/service/create_test.go +++ b/integration/service/create_test.go @@ -267,7 +267,7 @@ func TestCreateServiceSecretFileMode(t *testing.T) { serviceID := swarm.CreateService(t, d, swarm.ServiceWithReplicas(instances), swarm.ServiceWithName(serviceName), - swarm.ServiceWithCommand([]string{"/bin/sh", "-c", "ls -l /etc/secret || /bin/top"}), + swarm.ServiceWithCommand([]string{"/bin/sh", "-c", "ls -l /etc/secret && sleep inf"}), swarm.ServiceWithSecret(&swarmtypes.SecretReference{ File: &swarmtypes.SecretReferenceFileTarget{ Name: "/etc/secret", @@ -282,15 +282,8 @@ func TestCreateServiceSecretFileMode(t *testing.T) { poll.WaitOn(t, swarm.RunningTasksCount(client, serviceID, instances), swarm.ServicePoll) - filter := filters.NewArgs() - filter.Add("service", serviceID) - tasks, err := client.TaskList(ctx, types.TaskListOptions{ - Filters: filter, - }) - assert.NilError(t, err) - assert.Check(t, is.Equal(len(tasks), 1)) - - body, err := client.ContainerLogs(ctx, tasks[0].Status.ContainerStatus.ContainerID, types.ContainerLogsOptions{ + body, err := client.ServiceLogs(ctx, serviceID, types.ContainerLogsOptions{ + Tail: "1", ShowStdout: true, }) assert.NilError(t, err) @@ -330,7 +323,7 @@ func TestCreateServiceConfigFileMode(t *testing.T) { serviceName := "TestService_" + t.Name() serviceID := swarm.CreateService(t, d, swarm.ServiceWithName(serviceName), - swarm.ServiceWithCommand([]string{"/bin/sh", "-c", "ls -l /etc/config || /bin/top"}), + swarm.ServiceWithCommand([]string{"/bin/sh", "-c", "ls -l /etc/config && sleep inf"}), swarm.ServiceWithReplicas(instances), swarm.ServiceWithConfig(&swarmtypes.ConfigReference{ File: &swarmtypes.ConfigReferenceFileTarget{ @@ -346,15 +339,8 @@ func TestCreateServiceConfigFileMode(t *testing.T) { poll.WaitOn(t, swarm.RunningTasksCount(client, serviceID, instances)) - filter := filters.NewArgs() - filter.Add("service", serviceID) - tasks, err := client.TaskList(ctx, types.TaskListOptions{ - Filters: filter, - }) - assert.NilError(t, err) - assert.Check(t, is.Equal(len(tasks), 1)) - - body, err := client.ContainerLogs(ctx, tasks[0].Status.ContainerStatus.ContainerID, types.ContainerLogsOptions{ + body, err := client.ServiceLogs(ctx, serviceID, types.ContainerLogsOptions{ + Tail: "1", ShowStdout: true, }) assert.NilError(t, err)