diff --git a/container/container.go b/container/container.go index dea443df8a..d0f70e139e 100644 --- a/container/container.go +++ b/container/container.go @@ -81,17 +81,18 @@ type Container struct { Driver string OS string // MountLabel contains the options for the 'mount' command - MountLabel string - ProcessLabel string - RestartCount int - HasBeenStartedBefore bool - HasBeenManuallyStopped bool // used for unless-stopped restart policy - MountPoints map[string]*volumemounts.MountPoint - HostConfig *containertypes.HostConfig `json:"-"` // do not serialize the host config in the json, otherwise we'll make the container unportable - ExecCommands *exec.Store `json:"-"` - DependencyStore agentexec.DependencyGetter `json:"-"` - SecretReferences []*swarmtypes.SecretReference - ConfigReferences []*swarmtypes.ConfigReference + MountLabel string + ProcessLabel string + RestartCount int + HasBeenStartedBefore bool + HasBeenManuallyStopped bool // used for unless-stopped restart policy + HasBeenManuallyRestarted bool `json:"-"` // used to distinguish restart caused by restart policy from the manual one + MountPoints map[string]*volumemounts.MountPoint + HostConfig *containertypes.HostConfig `json:"-"` // do not serialize the host config in the json, otherwise we'll make the container unportable + ExecCommands *exec.Store `json:"-"` + DependencyStore agentexec.DependencyGetter `json:"-"` + SecretReferences []*swarmtypes.SecretReference + ConfigReferences []*swarmtypes.ConfigReference // logDriver for closing LogDriver logger.Logger `json:"-"` LogCopier *logger.Copier `json:"-"` diff --git a/container/state.go b/container/state.go index fe2b060d72..1267c8694a 100644 --- a/container/state.go +++ b/container/state.go @@ -32,9 +32,10 @@ type State struct { StartedAt time.Time FinishedAt time.Time Health *Health + Removed bool `json:"-"` - waitStop chan struct{} - waitRemove chan struct{} + stopWaiters []chan<- StateStatus + removeOnlyWaiters []chan<- StateStatus } // StateStatus is used to return container wait results. @@ -57,12 +58,9 @@ func (s StateStatus) Err() error { return s.err } -// NewState creates a default state object with a fresh channel for state changes. +// NewState creates a default state object. func NewState() *State { - return &State{ - waitStop: make(chan struct{}), - waitRemove: make(chan struct{}), - } + return &State{} } // String returns a human-readable description of the state @@ -182,11 +180,10 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateS s.Lock() defer s.Unlock() - if condition == WaitConditionNotRunning && !s.Running { - // Buffer so we can put it in the channel now. - resultC := make(chan StateStatus, 1) + // Buffer so we can put status and finish even nobody receives it. + resultC := make(chan StateStatus, 1) - // Send the current status. + if s.conditionAlreadyMet(condition) { resultC <- StateStatus{ exitCode: s.ExitCode(), err: s.Err(), @@ -195,20 +192,17 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateS return resultC } - // If we are waiting only for removal, the waitStop channel should - // remain nil and block forever. - var waitStop chan struct{} - if condition < WaitConditionRemoved { - waitStop = s.waitStop + waitC := make(chan StateStatus, 1) + + // Removal wakes up both removeOnlyWaiters and stopWaiters + // Container could be removed while still in "created" state + // in which case it is never actually stopped + if condition == WaitConditionRemoved { + s.removeOnlyWaiters = append(s.removeOnlyWaiters, waitC) + } else { + s.stopWaiters = append(s.stopWaiters, waitC) } - // Always wait for removal, just in case the container gets removed - // while it is still in a "created" state, in which case it is never - // actually stopped. - waitRemove := s.waitRemove - - resultC := make(chan StateStatus, 1) - go func() { select { case <-ctx.Done(): @@ -218,23 +212,25 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateS err: ctx.Err(), } return - case <-waitStop: - case <-waitRemove: + case status := <-waitC: + resultC <- status } - - s.Lock() - result := StateStatus{ - exitCode: s.ExitCode(), - err: s.Err(), - } - s.Unlock() - - resultC <- result }() return resultC } +func (s *State) conditionAlreadyMet(condition WaitCondition) bool { + switch condition { + case WaitConditionNotRunning: + return !s.Running + case WaitConditionRemoved: + return s.Removed + } + + return false +} + // IsRunning returns whether the running flag is set. Used by Container to check whether a container is running. func (s *State) IsRunning() bool { s.Lock() @@ -292,8 +288,8 @@ func (s *State) SetStopped(exitStatus *ExitStatus) { } s.ExitCodeValue = exitStatus.ExitCode s.OOMKilled = exitStatus.OOMKilled - close(s.waitStop) // fire waiters for stop - s.waitStop = make(chan struct{}) + + s.notifyAndClear(&s.stopWaiters) } // SetRestarting sets the container state to "restarting" without locking. @@ -308,8 +304,8 @@ func (s *State) SetRestarting(exitStatus *ExitStatus) { s.FinishedAt = time.Now().UTC() s.ExitCodeValue = exitStatus.ExitCode s.OOMKilled = exitStatus.OOMKilled - close(s.waitStop) // fire waiters for stop - s.waitStop = make(chan struct{}) + + s.notifyAndClear(&s.stopWaiters) } // SetError sets the container's error state. This is useful when we want to @@ -374,22 +370,19 @@ func (s *State) IsDead() bool { return res } -// SetRemoved assumes this container is already in the "dead" state and -// closes the internal waitRemove channel to unblock callers waiting for a -// container to be removed. +// SetRemoved assumes this container is already in the "dead" state and notifies all waiters. func (s *State) SetRemoved() { s.SetRemovalError(nil) } // SetRemovalError is to be called in case a container remove failed. -// It sets an error and closes the internal waitRemove channel to unblock -// callers waiting for the container to be removed. +// It sets an error and notifies all waiters. func (s *State) SetRemovalError(err error) { s.SetError(err) s.Lock() - close(s.waitRemove) // Unblock those waiting on remove. - // Recreate the channel so next ContainerWait will work - s.waitRemove = make(chan struct{}) + s.Removed = true + s.notifyAndClear(&s.removeOnlyWaiters) + s.notifyAndClear(&s.stopWaiters) s.Unlock() } @@ -400,3 +393,15 @@ func (s *State) Err() error { } return nil } + +func (s *State) notifyAndClear(waiters *[]chan<- StateStatus) { + result := StateStatus{ + exitCode: s.ExitCodeValue, + err: s.Err(), + } + + for _, c := range *waiters { + c <- result + } + *waiters = nil +} diff --git a/container/state_test.go b/container/state_test.go index bf114ea1aa..09dfb56089 100644 --- a/container/state_test.go +++ b/container/state_test.go @@ -169,6 +169,31 @@ func TestStateTimeoutWait(t *testing.T) { } } +// Related issue: #39352 +func TestCorrectStateWaitResultAfterRestart(t *testing.T) { + s := NewState() + + s.Lock() + s.SetRunning(0, true) + s.Unlock() + + waitC := s.Wait(context.Background(), WaitConditionNotRunning) + want := ExitStatus{ExitCode: 10, ExitedAt: time.Now()} + + s.Lock() + s.SetRestarting(&want) + s.Unlock() + + s.Lock() + s.SetRunning(0, true) + s.Unlock() + + got := <-waitC + if got.exitCode != want.ExitCode { + t.Fatalf("expected exit code %v, got %v", want.ExitCode, got.exitCode) + } +} + func TestIsValidStateString(t *testing.T) { states := []struct { state string diff --git a/daemon/monitor.go b/daemon/monitor.go index 36b82f17a7..9a087283c8 100644 --- a/daemon/monitor.go +++ b/daemon/monitor.go @@ -52,7 +52,20 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontaine } } - restart, wait, err := c.RestartManager().ShouldRestart(ec, daemon.IsShuttingDown() || c.HasBeenManuallyStopped, time.Since(c.StartedAt)) + daemonShutdown := daemon.IsShuttingDown() + execDuration := time.Since(c.StartedAt) + restart, wait, err := c.RestartManager().ShouldRestart(ec, daemonShutdown || c.HasBeenManuallyStopped, execDuration) + if err != nil { + logrus.WithError(err). + WithField("container", c.ID). + WithField("restartCount", c.RestartCount). + WithField("exitStatus", exitStatus). + WithField("daemonShuttingDown", daemonShutdown). + WithField("hasBeenManuallyStopped", c.HasBeenManuallyStopped). + WithField("execDuration", execDuration). + Warn("ShouldRestart failed, container will not be restarted") + restart = false + } // cancel healthcheck here, they will be automatically // restarted if/when the container is started again @@ -62,12 +75,19 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontaine } daemon.Cleanup(c) - if err == nil && restart { + if restart { c.RestartCount++ + logrus.WithField("container", c.ID). + WithField("restartCount", c.RestartCount). + WithField("exitStatus", exitStatus). + WithField("manualRestart", c.HasBeenManuallyRestarted). + Debug("Restarting container") c.SetRestarting(&exitStatus) } else { c.SetStopped(&exitStatus) - defer daemon.autoRemove(c) + if !c.HasBeenManuallyRestarted { + defer daemon.autoRemove(c) + } } defer c.Unlock() // needs to be called before autoRemove @@ -76,7 +96,7 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontaine daemon.LogContainerEventWithAttributes(c, "die", attributes) - if err == nil && restart { + if restart { go func() { err := <-wait if err == nil { diff --git a/daemon/restart.go b/daemon/restart.go index c20e8cbb10..a6c8ddbb24 100644 --- a/daemon/restart.go +++ b/daemon/restart.go @@ -6,7 +6,6 @@ import ( containertypes "github.com/docker/docker/api/types/container" "github.com/docker/docker/container" - "github.com/sirupsen/logrus" ) // ContainerRestart stops and starts a container. It attempts to @@ -52,19 +51,11 @@ func (daemon *Daemon) containerRestart(ctx context.Context, container *container } if container.IsRunning() { - // set AutoRemove flag to false before stop so the container won't be - // removed during restart process - autoRemove := container.HostConfig.AutoRemove + container.Lock() + container.HasBeenManuallyRestarted = true + container.Unlock() - container.HostConfig.AutoRemove = false err := daemon.containerStop(ctx, container, options) - // restore AutoRemove irrespective of whether the stop worked or not - container.HostConfig.AutoRemove = autoRemove - // containerStop will write HostConfig to disk, we shall restore AutoRemove - // in disk too - if toDiskErr := daemon.checkpointAndSave(container); toDiskErr != nil { - logrus.Errorf("Write container to disk error: %v", toDiskErr) - } if err != nil { return err diff --git a/daemon/start.go b/daemon/start.go index fc5aa5b55f..ecc0f8b8af 100644 --- a/daemon/start.go +++ b/daemon/start.go @@ -206,6 +206,7 @@ func (daemon *Daemon) containerStart(container *container.Container, checkpoint return translateContainerdStartErr(container.Path, container.SetExitCode, err) } + container.HasBeenManuallyRestarted = false container.SetRunning(pid, true) container.HasBeenStartedBefore = true daemon.setStateCounter(container) diff --git a/integration/container/restart_test.go b/integration/container/restart_test.go index 0c29812e8d..363b550b46 100644 --- a/integration/container/restart_test.go +++ b/integration/container/restart_test.go @@ -9,6 +9,7 @@ import ( "github.com/docker/docker/api/types" "github.com/docker/docker/api/types/container" "github.com/docker/docker/client" + testContainer "github.com/docker/docker/integration/internal/container" "github.com/docker/docker/testutil/daemon" "gotest.tools/v3/assert" "gotest.tools/v3/poll" @@ -153,3 +154,60 @@ func pollForNewHealthCheck(ctx context.Context, client *client.Client, startTime return poll.Continue("waiting for a new container healthcheck") } } + +// Container started with --rm should be able to be restarted. +// It should be removed only if killed or stopped +func TestContainerWithAutoRemoveCanBeRestarted(t *testing.T) { + defer setupTest(t)() + cli := testEnv.APIClient() + ctx := context.Background() + + noWaitTimeout := 0 + + for _, tc := range []struct { + desc string + doSth func(ctx context.Context, containerID string) error + }{ + { + desc: "kill", + doSth: func(ctx context.Context, containerID string) error { + return cli.ContainerKill(ctx, containerID, "SIGKILL") + }, + }, + { + desc: "stop", + doSth: func(ctx context.Context, containerID string) error { + return cli.ContainerStop(ctx, containerID, container.StopOptions{Timeout: &noWaitTimeout}) + }, + }, + } { + tc := tc + t.Run(tc.desc, func(t *testing.T) { + cID := testContainer.Run(ctx, t, cli, + testContainer.WithName("autoremove-restart-and-"+tc.desc), + testContainer.WithAutoRemove, + ) + defer func() { + err := cli.ContainerRemove(ctx, cID, types.ContainerRemoveOptions{Force: true}) + if t.Failed() && err != nil { + t.Logf("Cleaning up test container failed with error: %v", err) + } + }() + + err := cli.ContainerRestart(ctx, cID, container.StopOptions{Timeout: &noWaitTimeout}) + assert.NilError(t, err) + + inspect, err := cli.ContainerInspect(ctx, cID) + assert.NilError(t, err) + assert.Assert(t, inspect.State.Status != "removing", "Container should not be removing yet") + + poll.WaitOn(t, testContainer.IsInState(ctx, cli, cID, "running")) + + err = tc.doSth(ctx, cID) + assert.NilError(t, err) + + poll.WaitOn(t, testContainer.IsRemoved(ctx, cli, cID)) + }) + } + +} diff --git a/integration/container/wait_test.go b/integration/container/wait_test.go index 6ac6b21299..12a9313caa 100644 --- a/integration/container/wait_test.go +++ b/integration/container/wait_test.go @@ -5,6 +5,7 @@ import ( "testing" "time" + "github.com/docker/docker/api/types" containertypes "github.com/docker/docker/api/types/container" "github.com/docker/docker/integration/internal/container" "github.com/docker/docker/testutil/request" @@ -156,3 +157,69 @@ func TestWaitConditions(t *testing.T) { }) } } + +func TestWaitRestartedContainer(t *testing.T) { + defer setupTest(t)() + cli := request.NewAPIClient(t) + + testCases := []struct { + doc string + waitCond containertypes.WaitCondition + }{ + { + doc: "default", + }, + { + doc: "not-running", + waitCond: containertypes.WaitConditionNotRunning, + }, + { + doc: "next-exit", + waitCond: containertypes.WaitConditionNextExit, + }, + } + + // We can't catch the SIGTERM in the Windows based busybox image + isWindowDaemon := testEnv.DaemonInfo.OSType == "windows" + + for _, tc := range testCases { + tc := tc + t.Run(tc.doc, func(t *testing.T) { + t.Parallel() + ctx := context.Background() + containerID := container.Run(ctx, t, cli, + container.WithCmd("sh", "-c", "trap 'exit 5' SIGTERM; while true; do sleep 0.1; done"), + ) + defer cli.ContainerRemove(ctx, containerID, types.ContainerRemoveOptions{Force: true}) + + poll.WaitOn(t, container.IsInState(ctx, cli, containerID, "running"), poll.WithTimeout(30*time.Second), poll.WithDelay(100*time.Millisecond)) + + // Container is running now, wait for exit + waitResC, errC := cli.ContainerWait(ctx, containerID, tc.waitCond) + + timeout := 5 + // On Windows it will always timeout, because our process won't receive SIGTERM + // Skip to force killing immediately + if isWindowDaemon { + timeout = 0 + } + + err := cli.ContainerRestart(ctx, containerID, containertypes.StopOptions{Timeout: &timeout, Signal: "SIGTERM"}) + assert.NilError(t, err) + + select { + case err := <-errC: + t.Fatalf("Unexpected error: %v", err) + case <-time.After(time.Second * 3): + t.Fatalf("Wait should end after restart") + case waitRes := <-waitResC: + expectedCode := int64(5) + + if !isWindowDaemon { + assert.Check(t, is.Equal(expectedCode, waitRes.StatusCode)) + } + } + }) + } + +} diff --git a/integration/internal/container/states.go b/integration/internal/container/states.go index fea8e3f598..0671b679e8 100644 --- a/integration/internal/container/states.go +++ b/integration/internal/container/states.go @@ -57,3 +57,17 @@ func IsSuccessful(ctx context.Context, client client.APIClient, containerID stri return poll.Continue("waiting for container to be \"exited\", currently %s", inspect.State.Status) } } + +// IsRemoved verifies the container has been removed +func IsRemoved(ctx context.Context, cli client.APIClient, containerID string) func(log poll.LogT) poll.Result { + return func(log poll.LogT) poll.Result { + inspect, err := cli.ContainerInspect(ctx, containerID) + if err != nil { + if client.IsErrNotFound(err) { + return poll.Success() + } + return poll.Error(err) + } + return poll.Continue("waiting for container to be removed, currently %s", inspect.State.Status) + } +}