Merge pull request #43751 from vvoland/fix-exitcode-wait

state/Wait: Fix race when reading exit status
commit f34567bf41
Author: Sebastiaan van Stijn
Date:   2022-07-29 16:01:39 +02:00 (committed by GitHub)
9 changed files with 255 additions and 73 deletions
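
In short: State previously signalled "stopped" and "removed" by closing shared waitStop/waitRemove channels and immediately recreating them, and each Wait goroutine re-read the exit code only after it woke up. A restart in that window could overwrite the exit code before the waiter read it, so ContainerWait could return the wrong status (TestCorrectStateWaitResultAfterRestart below, added for issue #39352, reproduces this). State now keeps explicit lists of per-waiter buffered channels and hands each waiter a snapshot of the exit status while the state lock is still held; a sketch of the pattern follows the container/state.go diff. The PR also stops auto-removing --rm containers on a manual docker restart: a new in-memory HasBeenManuallyRestarted flag replaces the old trick of temporarily flipping HostConfig.AutoRemove and re-writing the config to disk.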

diff --git a/container/container.go b/container/container.go

@@ -81,17 +81,18 @@ type Container struct {
 	Driver string
 	OS     string
 	// MountLabel contains the options for the 'mount' command
-	MountLabel             string
-	ProcessLabel           string
-	RestartCount           int
-	HasBeenStartedBefore   bool
-	HasBeenManuallyStopped bool // used for unless-stopped restart policy
-	MountPoints            map[string]*volumemounts.MountPoint
-	HostConfig             *containertypes.HostConfig `json:"-"` // do not serialize the host config in the json, otherwise we'll make the container unportable
-	ExecCommands           *exec.Store                `json:"-"`
-	DependencyStore        agentexec.DependencyGetter `json:"-"`
-	SecretReferences       []*swarmtypes.SecretReference
-	ConfigReferences       []*swarmtypes.ConfigReference
+	MountLabel               string
+	ProcessLabel             string
+	RestartCount             int
+	HasBeenStartedBefore     bool
+	HasBeenManuallyStopped   bool // used for unless-stopped restart policy
+	HasBeenManuallyRestarted bool `json:"-"` // used to distinguish restart caused by restart policy from the manual one
+	MountPoints              map[string]*volumemounts.MountPoint
+	HostConfig               *containertypes.HostConfig `json:"-"` // do not serialize the host config in the json, otherwise we'll make the container unportable
+	ExecCommands             *exec.Store                `json:"-"`
+	DependencyStore          agentexec.DependencyGetter `json:"-"`
+	SecretReferences         []*swarmtypes.SecretReference
+	ConfigReferences         []*swarmtypes.ConfigReference
 	// logDriver for closing
 	LogDriver logger.Logger  `json:"-"`
 	LogCopier *logger.Copier `json:"-"`

diff --git a/container/state.go b/container/state.go

@@ -32,9 +32,10 @@ type State struct {
 	StartedAt  time.Time
 	FinishedAt time.Time
 	Health     *Health
+	Removed    bool `json:"-"`
 
-	waitStop   chan struct{}
-	waitRemove chan struct{}
+	stopWaiters       []chan<- StateStatus
+	removeOnlyWaiters []chan<- StateStatus
 }
 
 // StateStatus is used to return container wait results.
@@ -57,12 +58,9 @@ func (s StateStatus) Err() error {
 	return s.err
 }
 
-// NewState creates a default state object with a fresh channel for state changes.
+// NewState creates a default state object.
 func NewState() *State {
-	return &State{
-		waitStop:   make(chan struct{}),
-		waitRemove: make(chan struct{}),
-	}
+	return &State{}
 }
 
 // String returns a human-readable description of the state
@@ -182,11 +180,10 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateStatus {
 	s.Lock()
 	defer s.Unlock()
 
-	if condition == WaitConditionNotRunning && !s.Running {
-		// Buffer so we can put it in the channel now.
-		resultC := make(chan StateStatus, 1)
+	// Buffer so we can put status and finish even nobody receives it.
+	resultC := make(chan StateStatus, 1)
 
-		// Send the current status.
+	if s.conditionAlreadyMet(condition) {
 		resultC <- StateStatus{
 			exitCode: s.ExitCode(),
 			err:      s.Err(),
@@ -195,20 +192,17 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateStatus {
 		return resultC
 	}
 
-	// If we are waiting only for removal, the waitStop channel should
-	// remain nil and block forever.
-	var waitStop chan struct{}
-	if condition < WaitConditionRemoved {
-		waitStop = s.waitStop
+	waitC := make(chan StateStatus, 1)
+
+	// Removal wakes up both removeOnlyWaiters and stopWaiters
+	// Container could be removed while still in "created" state
+	// in which case it is never actually stopped
+	if condition == WaitConditionRemoved {
+		s.removeOnlyWaiters = append(s.removeOnlyWaiters, waitC)
+	} else {
+		s.stopWaiters = append(s.stopWaiters, waitC)
 	}
 
-	// Always wait for removal, just in case the container gets removed
-	// while it is still in a "created" state, in which case it is never
-	// actually stopped.
-	waitRemove := s.waitRemove
-
-	resultC := make(chan StateStatus, 1)
-
 	go func() {
 		select {
 		case <-ctx.Done():
@@ -218,23 +212,25 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateStatus {
 				err:      ctx.Err(),
 			}
 			return
-		case <-waitStop:
-		case <-waitRemove:
+		case status := <-waitC:
+			resultC <- status
 		}
-
-		s.Lock()
-		result := StateStatus{
-			exitCode: s.ExitCode(),
-			err:      s.Err(),
-		}
-		s.Unlock()
-
-		resultC <- result
 	}()
 
 	return resultC
 }
 
+func (s *State) conditionAlreadyMet(condition WaitCondition) bool {
+	switch condition {
+	case WaitConditionNotRunning:
+		return !s.Running
+	case WaitConditionRemoved:
+		return s.Removed
+	}
+
+	return false
+}
+
 // IsRunning returns whether the running flag is set. Used by Container to check whether a container is running.
 func (s *State) IsRunning() bool {
 	s.Lock()
@@ -292,8 +288,8 @@ func (s *State) SetStopped(exitStatus *ExitStatus) {
 	}
 	s.ExitCodeValue = exitStatus.ExitCode
 	s.OOMKilled = exitStatus.OOMKilled
-	close(s.waitStop) // fire waiters for stop
-	s.waitStop = make(chan struct{})
+	s.notifyAndClear(&s.stopWaiters)
 }
 
 // SetRestarting sets the container state to "restarting" without locking.
@@ -308,8 +304,8 @@ func (s *State) SetRestarting(exitStatus *ExitStatus) {
 	s.FinishedAt = time.Now().UTC()
 	s.ExitCodeValue = exitStatus.ExitCode
 	s.OOMKilled = exitStatus.OOMKilled
-	close(s.waitStop) // fire waiters for stop
-	s.waitStop = make(chan struct{})
+	s.notifyAndClear(&s.stopWaiters)
 }
 
 // SetError sets the container's error state. This is useful when we want to
@@ -374,22 +370,19 @@ func (s *State) IsDead() bool {
 	return res
 }
 
-// SetRemoved assumes this container is already in the "dead" state and
-// closes the internal waitRemove channel to unblock callers waiting for a
-// container to be removed.
+// SetRemoved assumes this container is already in the "dead" state and notifies all waiters.
 func (s *State) SetRemoved() {
 	s.SetRemovalError(nil)
 }
 
 // SetRemovalError is to be called in case a container remove failed.
-// It sets an error and closes the internal waitRemove channel to unblock
-// callers waiting for the container to be removed.
+// It sets an error and notifies all waiters.
 func (s *State) SetRemovalError(err error) {
 	s.SetError(err)
 	s.Lock()
-	close(s.waitRemove) // Unblock those waiting on remove.
-	// Recreate the channel so next ContainerWait will work
-	s.waitRemove = make(chan struct{})
+	s.Removed = true
+	s.notifyAndClear(&s.removeOnlyWaiters)
+	s.notifyAndClear(&s.stopWaiters)
 	s.Unlock()
 }
@@ -400,3 +393,15 @@ func (s *State) Err() error {
 	}
 	return nil
 }
+
+func (s *State) notifyAndClear(waiters *[]chan<- StateStatus) {
+	result := StateStatus{
+		exitCode: s.ExitCodeValue,
+		err:      s.Err(),
+	}
+
+	for _, c := range *waiters {
+		c <- result
+	}
+	*waiters = nil
+}
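
The race is easiest to see in the two Wait hunks above: the old goroutine woke up on a closed channel and only then took the lock to read ExitCode(), leaving a window in which SetRestarting or a new SetRunning could overwrite the value. The new code snapshots the status in notifyAndClear while the same lock that performs the state transition is still held. A minimal, self-contained Go sketch of the pattern (hypothetical names, not code from this commit):

	package main

	import (
		"fmt"
		"sync"
	)

	// status mirrors StateStatus: an immutable snapshot handed to waiters.
	type status struct{ exitCode int }

	// state mirrors the fixed State: waiters register buffered channels and
	// receive a snapshot taken while the state lock is still held.
	type state struct {
		mu       sync.Mutex
		exitCode int
		waiters  []chan<- status
	}

	// wait registers a buffered channel (capacity 1) so the notifier can
	// send without blocking even if the caller never reads the result.
	func (s *state) wait() <-chan status {
		s.mu.Lock()
		defer s.mu.Unlock()
		c := make(chan status, 1)
		s.waiters = append(s.waiters, c)
		return c
	}

	// setStopped transitions the state and notifies waiters atomically; the
	// exit code each waiter sees is the one this transition recorded.
	func (s *state) setStopped(code int) {
		s.mu.Lock()
		defer s.mu.Unlock()
		s.exitCode = code
		for _, c := range s.waiters {
			c <- status{exitCode: s.exitCode}
		}
		s.waiters = nil
	}

	func main() {
		s := &state{}
		done := s.wait()
		s.setStopped(5)
		// A later restart can no longer change what the waiter observes.
		s.mu.Lock()
		s.exitCode = 0
		s.mu.Unlock()
		fmt.Println((<-done).exitCode) // always prints 5
	}

The capacity-1 buffer is what lets notifyAndClear send while holding the State lock without blocking on an abandoned waiter; the old close() could wake waiters but cannot carry a value, which is why the exit code previously had to be re-read after the fact.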

diff --git a/container/state_test.go b/container/state_test.go

@@ -169,6 +169,31 @@ func TestStateTimeoutWait(t *testing.T) {
 	}
 }
 
+// Related issue: #39352
+func TestCorrectStateWaitResultAfterRestart(t *testing.T) {
+	s := NewState()
+
+	s.Lock()
+	s.SetRunning(0, true)
+	s.Unlock()
+
+	waitC := s.Wait(context.Background(), WaitConditionNotRunning)
+	want := ExitStatus{ExitCode: 10, ExitedAt: time.Now()}
+
+	s.Lock()
+	s.SetRestarting(&want)
+	s.Unlock()
+
+	s.Lock()
+	s.SetRunning(0, true)
+	s.Unlock()
+
+	got := <-waitC
+	if got.exitCode != want.ExitCode {
+		t.Fatalf("expected exit code %v, got %v", want.ExitCode, got.exitCode)
+	}
+}
+
 func TestIsValidStateString(t *testing.T) {
 	states := []struct {
 		state string
diff --git a/daemon/monitor.go b/daemon/monitor.go

@@ -52,7 +52,20 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontainerdtypes.EventInfo) error {
 		}
 	}
 
-	restart, wait, err := c.RestartManager().ShouldRestart(ec, daemon.IsShuttingDown() || c.HasBeenManuallyStopped, time.Since(c.StartedAt))
+	daemonShutdown := daemon.IsShuttingDown()
+	execDuration := time.Since(c.StartedAt)
+	restart, wait, err := c.RestartManager().ShouldRestart(ec, daemonShutdown || c.HasBeenManuallyStopped, execDuration)
+	if err != nil {
+		logrus.WithError(err).
+			WithField("container", c.ID).
+			WithField("restartCount", c.RestartCount).
+			WithField("exitStatus", exitStatus).
+			WithField("daemonShuttingDown", daemonShutdown).
+			WithField("hasBeenManuallyStopped", c.HasBeenManuallyStopped).
+			WithField("execDuration", execDuration).
+			Warn("ShouldRestart failed, container will not be restarted")
+		restart = false
+	}
 
 	// cancel healthcheck here, they will be automatically
 	// restarted if/when the container is started again
@@ -62,12 +75,19 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontainerdtypes.EventInfo) error {
 	}
 
 	daemon.Cleanup(c)
 
-	if err == nil && restart {
+	if restart {
 		c.RestartCount++
+		logrus.WithField("container", c.ID).
+			WithField("restartCount", c.RestartCount).
+			WithField("exitStatus", exitStatus).
+			WithField("manualRestart", c.HasBeenManuallyRestarted).
+			Debug("Restarting container")
 		c.SetRestarting(&exitStatus)
 	} else {
 		c.SetStopped(&exitStatus)
-		defer daemon.autoRemove(c)
+		if !c.HasBeenManuallyRestarted {
+			defer daemon.autoRemove(c)
+		}
 	}
 
 	defer c.Unlock() // needs to be called before autoRemove
@@ -76,7 +96,7 @@ func (daemon *Daemon) handleContainerExit(c *container.Container, e *libcontainerdtypes.EventInfo) error {
 	daemon.LogContainerEventWithAttributes(c, "die", attributes)
 
-	if err == nil && restart {
+	if restart {
 		go func() {
 			err := <-wait
 			if err == nil {
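
Two behavioral changes meet in this file: an error from ShouldRestart, which previously left err non-nil so the container silently fell through the old "err == nil && restart" guards, is now logged once with its full decision context and explicitly downgrades restart to false; and auto-removal is skipped when the stop was triggered by a manual restart, which is what keeps a --rm container alive across docker restart.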

diff --git a/daemon/restart.go b/daemon/restart.go

@@ -6,7 +6,6 @@ import (
 
 	containertypes "github.com/docker/docker/api/types/container"
 	"github.com/docker/docker/container"
-	"github.com/sirupsen/logrus"
 )
 
 // ContainerRestart stops and starts a container. It attempts to
@@ -52,19 +51,11 @@ func (daemon *Daemon) containerRestart(ctx context.Context, container *container.Container, options containertypes.StopOptions) error {
 	}
 
 	if container.IsRunning() {
-		// set AutoRemove flag to false before stop so the container won't be
-		// removed during restart process
-		autoRemove := container.HostConfig.AutoRemove
+		container.Lock()
+		container.HasBeenManuallyRestarted = true
+		container.Unlock()
 
-		container.HostConfig.AutoRemove = false
 		err := daemon.containerStop(ctx, container, options)
-		// restore AutoRemove irrespective of whether the stop worked or not
-		container.HostConfig.AutoRemove = autoRemove
-		// containerStop will write HostConfig to disk, we shall restore AutoRemove
-		// in disk too
-		if toDiskErr := daemon.checkpointAndSave(container); toDiskErr != nil {
-			logrus.Errorf("Write container to disk error: %v", toDiskErr)
-		}
 
 		if err != nil {
 			return err
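
This replaces the old workaround of flipping HostConfig.AutoRemove off for the duration of the stop and then re-persisting the config with checkpointAndSave. The HasBeenManuallyRestarted flag is in-memory only (tagged json:"-" in the Container struct above), is consumed by handleContainerExit to suppress autoRemove, and is cleared again in containerStart, so no restart-related state is ever written to disk.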

diff --git a/daemon/start.go b/daemon/start.go

@@ -206,6 +206,7 @@ func (daemon *Daemon) containerStart(container *container.Container, checkpoint string, checkpointDir string, resetRestartManager bool) (err error) {
 		return translateContainerdStartErr(container.Path, container.SetExitCode, err)
 	}
 
+	container.HasBeenManuallyRestarted = false
 	container.SetRunning(pid, true)
 	container.HasBeenStartedBefore = true
 	daemon.setStateCounter(container)

diff --git a/integration/container/restart_test.go b/integration/container/restart_test.go

@@ -9,6 +9,7 @@ import (
 	"github.com/docker/docker/api/types"
 	"github.com/docker/docker/api/types/container"
 	"github.com/docker/docker/client"
+	testContainer "github.com/docker/docker/integration/internal/container"
 	"github.com/docker/docker/testutil/daemon"
 	"gotest.tools/v3/assert"
 	"gotest.tools/v3/poll"
@@ -153,3 +154,60 @@ func pollForNewHealthCheck(ctx context.Context, client *client.Client, startTime time.Time, containerID string) func(log poll.LogT) poll.Result {
 		return poll.Continue("waiting for a new container healthcheck")
 	}
 }
+
+// Container started with --rm should be able to be restarted.
+// It should be removed only if killed or stopped
+func TestContainerWithAutoRemoveCanBeRestarted(t *testing.T) {
+	defer setupTest(t)()
+	cli := testEnv.APIClient()
+	ctx := context.Background()
+
+	noWaitTimeout := 0
+
+	for _, tc := range []struct {
+		desc  string
+		doSth func(ctx context.Context, containerID string) error
+	}{
+		{
+			desc: "kill",
+			doSth: func(ctx context.Context, containerID string) error {
+				return cli.ContainerKill(ctx, containerID, "SIGKILL")
+			},
+		},
+		{
+			desc: "stop",
+			doSth: func(ctx context.Context, containerID string) error {
+				return cli.ContainerStop(ctx, containerID, container.StopOptions{Timeout: &noWaitTimeout})
+			},
+		},
+	} {
+		tc := tc
+		t.Run(tc.desc, func(t *testing.T) {
+			cID := testContainer.Run(ctx, t, cli,
+				testContainer.WithName("autoremove-restart-and-"+tc.desc),
+				testContainer.WithAutoRemove,
+			)
+			defer func() {
+				err := cli.ContainerRemove(ctx, cID, types.ContainerRemoveOptions{Force: true})
+				if t.Failed() && err != nil {
+					t.Logf("Cleaning up test container failed with error: %v", err)
+				}
+			}()
+
+			err := cli.ContainerRestart(ctx, cID, container.StopOptions{Timeout: &noWaitTimeout})
+			assert.NilError(t, err)
+
+			inspect, err := cli.ContainerInspect(ctx, cID)
+			assert.NilError(t, err)
+			assert.Assert(t, inspect.State.Status != "removing", "Container should not be removing yet")
+
+			poll.WaitOn(t, testContainer.IsInState(ctx, cli, cID, "running"))
+
+			err = tc.doSth(ctx, cID)
+			assert.NilError(t, err)
+
+			poll.WaitOn(t, testContainer.IsRemoved(ctx, cli, cID))
+		})
+	}
+}

diff --git a/integration/container/wait_test.go b/integration/container/wait_test.go

@@ -5,6 +5,7 @@ import (
 	"testing"
 	"time"
 
 	"github.com/docker/docker/api/types"
+	containertypes "github.com/docker/docker/api/types/container"
 	"github.com/docker/docker/integration/internal/container"
 	"github.com/docker/docker/testutil/request"
@@ -156,3 +157,69 @@ func TestWaitConditions(t *testing.T) {
 		})
 	}
 }
+
+func TestWaitRestartedContainer(t *testing.T) {
+	defer setupTest(t)()
+	cli := request.NewAPIClient(t)
+
+	testCases := []struct {
+		doc      string
+		waitCond containertypes.WaitCondition
+	}{
+		{
+			doc: "default",
+		},
+		{
+			doc:      "not-running",
+			waitCond: containertypes.WaitConditionNotRunning,
+		},
+		{
+			doc:      "next-exit",
+			waitCond: containertypes.WaitConditionNextExit,
+		},
+	}
+
+	// We can't catch the SIGTERM in the Windows based busybox image
+	isWindowDaemon := testEnv.DaemonInfo.OSType == "windows"
+
+	for _, tc := range testCases {
+		tc := tc
+		t.Run(tc.doc, func(t *testing.T) {
+			t.Parallel()
+			ctx := context.Background()
+			containerID := container.Run(ctx, t, cli,
+				container.WithCmd("sh", "-c", "trap 'exit 5' SIGTERM; while true; do sleep 0.1; done"),
+			)
+			defer cli.ContainerRemove(ctx, containerID, types.ContainerRemoveOptions{Force: true})
+
+			poll.WaitOn(t, container.IsInState(ctx, cli, containerID, "running"), poll.WithTimeout(30*time.Second), poll.WithDelay(100*time.Millisecond))
+
+			// Container is running now, wait for exit
+			waitResC, errC := cli.ContainerWait(ctx, containerID, tc.waitCond)
+
+			timeout := 5
+			// On Windows it will always timeout, because our process won't receive SIGTERM
+			// Skip to force killing immediately
+			if isWindowDaemon {
+				timeout = 0
+			}
+			err := cli.ContainerRestart(ctx, containerID, containertypes.StopOptions{Timeout: &timeout, Signal: "SIGTERM"})
+			assert.NilError(t, err)
+
+			select {
+			case err := <-errC:
+				t.Fatalf("Unexpected error: %v", err)
+			case <-time.After(time.Second * 3):
+				t.Fatalf("Wait should end after restart")
+			case waitRes := <-waitResC:
+				expectedCode := int64(5)
+
+				if !isWindowDaemon {
+					assert.Check(t, is.Equal(expectedCode, waitRes.StatusCode))
+				}
+			}
+		})
+	}
+}

diff --git a/integration/internal/container/states.go b/integration/internal/container/states.go

@@ -57,3 +57,17 @@ func IsSuccessful(ctx context.Context, client client.APIClient, containerID string) func(log poll.LogT) poll.Result {
 		return poll.Continue("waiting for container to be \"exited\", currently %s", inspect.State.Status)
 	}
 }
+
+// IsRemoved verifies the container has been removed
+func IsRemoved(ctx context.Context, cli client.APIClient, containerID string) func(log poll.LogT) poll.Result {
+	return func(log poll.LogT) poll.Result {
+		inspect, err := cli.ContainerInspect(ctx, containerID)
+		if err != nil {
+			if client.IsErrNotFound(err) {
+				return poll.Success()
+			}
+			return poll.Error(err)
+		}
+		return poll.Continue("waiting for container to be removed, currently %s", inspect.State.Status)
+	}
+}
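
IsRemoved mirrors the existing IsInState and IsSuccessful poll helpers: a "not found" error from ContainerInspect counts as success, any other error aborts the poll, and anything else keeps waiting. TestContainerWithAutoRemoveCanBeRestarted above consumes it as poll.WaitOn(t, testContainer.IsRemoved(ctx, cli, cID)); a deadline can be layered on with poll.WithTimeout where a test needs one.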