state/Wait: Fix race when reading exit status

Before this change there was a race condition: State.Wait read the exit
code from State, but the State could be changed again immediately after
the transition that ended the wait, before the read happened.

Now, each State.Wait has its own channel which is used to transmit the
desired StateStatus at the time the state transitions to the awaited
one. Wait no longer reads the status by itself so there is no race.

The issue caused `docker run --restart=always ...` to sometimes exit
with 0 exit code, because the process was already restarted by the time
State.Wait got the chance to read the exit code.

Test run
--------
Before:
```
$ go test -count 1 -run TestCorrectStateWaitResultAfterRestart .
--- FAIL: TestCorrectStateWaitResultAfterRestart (0.00s)
    state_test.go:198: expected exit code 10, got 0
FAIL
FAIL    github.com/docker/docker/container      0.011s
FAIL

```

After:
```
$ go test -count 1 -run TestCorrectStateWaitResultAfterRestart .
ok      github.com/docker/docker/container      0.011s
```

Signed-off-by: Paweł Gronowski <pawel.gronowski@docker.com>
This commit is contained in:
Paweł Gronowski 2022-07-19 12:59:46 +02:00
parent 826003ecae
commit 85b9568d0e
3 changed files with 88 additions and 57 deletions

View File

@ -81,17 +81,18 @@ type Container struct {
Driver string Driver string
OS string OS string
// MountLabel contains the options for the 'mount' command // MountLabel contains the options for the 'mount' command
MountLabel string MountLabel string
ProcessLabel string ProcessLabel string
RestartCount int RestartCount int
HasBeenStartedBefore bool HasBeenStartedBefore bool
HasBeenManuallyStopped bool // used for unless-stopped restart policy HasBeenManuallyStopped bool // used for unless-stopped restart policy
MountPoints map[string]*volumemounts.MountPoint HasBeenManuallyRestarted bool `json:"-"` // used to distinguish restart caused by restart policy from the manual one
HostConfig *containertypes.HostConfig `json:"-"` // do not serialize the host config in the json, otherwise we'll make the container unportable MountPoints map[string]*volumemounts.MountPoint
ExecCommands *exec.Store `json:"-"` HostConfig *containertypes.HostConfig `json:"-"` // do not serialize the host config in the json, otherwise we'll make the container unportable
DependencyStore agentexec.DependencyGetter `json:"-"` ExecCommands *exec.Store `json:"-"`
SecretReferences []*swarmtypes.SecretReference DependencyStore agentexec.DependencyGetter `json:"-"`
ConfigReferences []*swarmtypes.ConfigReference SecretReferences []*swarmtypes.SecretReference
ConfigReferences []*swarmtypes.ConfigReference
// logDriver for closing // logDriver for closing
LogDriver logger.Logger `json:"-"` LogDriver logger.Logger `json:"-"`
LogCopier *logger.Copier `json:"-"` LogCopier *logger.Copier `json:"-"`

View File

@ -32,9 +32,10 @@ type State struct {
StartedAt time.Time StartedAt time.Time
FinishedAt time.Time FinishedAt time.Time
Health *Health Health *Health
Removed bool `json:"-"`
waitStop chan struct{} stopWaiters []chan<- StateStatus
waitRemove chan struct{} removeOnlyWaiters []chan<- StateStatus
} }
// StateStatus is used to return container wait results. // StateStatus is used to return container wait results.
@ -57,12 +58,9 @@ func (s StateStatus) Err() error {
return s.err return s.err
} }
// NewState creates a default state object with a fresh channel for state changes. // NewState creates a default state object.
func NewState() *State { func NewState() *State {
return &State{ return &State{}
waitStop: make(chan struct{}),
waitRemove: make(chan struct{}),
}
} }
// String returns a human-readable description of the state // String returns a human-readable description of the state
@ -182,11 +180,10 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateS
s.Lock() s.Lock()
defer s.Unlock() defer s.Unlock()
if condition == WaitConditionNotRunning && !s.Running { // Buffer so we can put status and finish even nobody receives it.
// Buffer so we can put it in the channel now. resultC := make(chan StateStatus, 1)
resultC := make(chan StateStatus, 1)
// Send the current status. if s.conditionAlreadyMet(condition) {
resultC <- StateStatus{ resultC <- StateStatus{
exitCode: s.ExitCode(), exitCode: s.ExitCode(),
err: s.Err(), err: s.Err(),
@ -195,20 +192,17 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateS
return resultC return resultC
} }
// If we are waiting only for removal, the waitStop channel should waitC := make(chan StateStatus, 1)
// remain nil and block forever.
var waitStop chan struct{} // Removal wakes up both removeOnlyWaiters and stopWaiters
if condition < WaitConditionRemoved { // Container could be removed while still in "created" state
waitStop = s.waitStop // in which case it is never actually stopped
if condition == WaitConditionRemoved {
s.removeOnlyWaiters = append(s.removeOnlyWaiters, waitC)
} else {
s.stopWaiters = append(s.stopWaiters, waitC)
} }
// Always wait for removal, just in case the container gets removed
// while it is still in a "created" state, in which case it is never
// actually stopped.
waitRemove := s.waitRemove
resultC := make(chan StateStatus, 1)
go func() { go func() {
select { select {
case <-ctx.Done(): case <-ctx.Done():
@ -218,23 +212,25 @@ func (s *State) Wait(ctx context.Context, condition WaitCondition) <-chan StateS
err: ctx.Err(), err: ctx.Err(),
} }
return return
case <-waitStop: case status := <-waitC:
case <-waitRemove: resultC <- status
} }
s.Lock()
result := StateStatus{
exitCode: s.ExitCode(),
err: s.Err(),
}
s.Unlock()
resultC <- result
}() }()
return resultC return resultC
} }
// conditionAlreadyMet reports whether the given wait condition is already
// satisfied by the current state, so a waiter can be answered immediately
// instead of being registered. Callers must hold the state lock.
func (s *State) conditionAlreadyMet(condition WaitCondition) bool {
	if condition == WaitConditionNotRunning {
		return !s.Running
	}
	if condition == WaitConditionRemoved {
		return s.Removed
	}
	// All other conditions (e.g. waiting for the next exit) are never
	// considered already met.
	return false
}
// IsRunning returns whether the running flag is set. Used by Container to check whether a container is running. // IsRunning returns whether the running flag is set. Used by Container to check whether a container is running.
func (s *State) IsRunning() bool { func (s *State) IsRunning() bool {
s.Lock() s.Lock()
@ -292,8 +288,8 @@ func (s *State) SetStopped(exitStatus *ExitStatus) {
} }
s.ExitCodeValue = exitStatus.ExitCode s.ExitCodeValue = exitStatus.ExitCode
s.OOMKilled = exitStatus.OOMKilled s.OOMKilled = exitStatus.OOMKilled
close(s.waitStop) // fire waiters for stop
s.waitStop = make(chan struct{}) s.notifyAndClear(&s.stopWaiters)
} }
// SetRestarting sets the container state to "restarting" without locking. // SetRestarting sets the container state to "restarting" without locking.
@ -308,8 +304,8 @@ func (s *State) SetRestarting(exitStatus *ExitStatus) {
s.FinishedAt = time.Now().UTC() s.FinishedAt = time.Now().UTC()
s.ExitCodeValue = exitStatus.ExitCode s.ExitCodeValue = exitStatus.ExitCode
s.OOMKilled = exitStatus.OOMKilled s.OOMKilled = exitStatus.OOMKilled
close(s.waitStop) // fire waiters for stop
s.waitStop = make(chan struct{}) s.notifyAndClear(&s.stopWaiters)
} }
// SetError sets the container's error state. This is useful when we want to // SetError sets the container's error state. This is useful when we want to
@ -374,22 +370,19 @@ func (s *State) IsDead() bool {
return res return res
} }
// SetRemoved assumes this container is already in the "dead" state and // SetRemoved assumes this container is already in the "dead" state and notifies all waiters.
// closes the internal waitRemove channel to unblock callers waiting for a
// container to be removed.
func (s *State) SetRemoved() { func (s *State) SetRemoved() {
s.SetRemovalError(nil) s.SetRemovalError(nil)
} }
// SetRemovalError is to be called in case a container remove failed. // SetRemovalError is to be called in case a container remove failed.
// It sets an error and closes the internal waitRemove channel to unblock // It sets an error and notifies all waiters.
// callers waiting for the container to be removed.
func (s *State) SetRemovalError(err error) { func (s *State) SetRemovalError(err error) {
s.SetError(err) s.SetError(err)
s.Lock() s.Lock()
close(s.waitRemove) // Unblock those waiting on remove. s.Removed = true
// Recreate the channel so next ContainerWait will work s.notifyAndClear(&s.removeOnlyWaiters)
s.waitRemove = make(chan struct{}) s.notifyAndClear(&s.stopWaiters)
s.Unlock() s.Unlock()
} }
@ -400,3 +393,15 @@ func (s *State) Err() error {
} }
return nil return nil
} }
// notifyAndClear snapshots the current exit status, delivers it to every
// waiter channel in the given list, and then empties the list so the next
// transition starts with no stale waiters. Each waiter channel is buffered
// (capacity 1), so the sends below cannot block. Callers must hold the
// state lock.
func (s *State) notifyAndClear(waiters *[]chan<- StateStatus) {
	status := StateStatus{
		exitCode: s.ExitCodeValue,
		err:      s.Err(),
	}
	for i := range *waiters {
		(*waiters)[i] <- status
	}
	*waiters = nil
}

View File

@ -169,6 +169,31 @@ func TestStateTimeoutWait(t *testing.T) {
} }
} }
// TestCorrectStateWaitResultAfterRestart verifies that a waiter registered
// before a restart observes the exit code of the stop that triggered the
// restart, not the state after the container was started again.
// Related issue: #39352
func TestCorrectStateWaitResultAfterRestart(t *testing.T) {
	st := NewState()

	// Put the container into the "running" state.
	st.Lock()
	st.SetRunning(0, true)
	st.Unlock()

	// Register a waiter before any transition happens.
	waitC := st.Wait(context.Background(), WaitConditionNotRunning)

	// Simulate a restart: stop with exit code 10, then run again.
	exitStatus := ExitStatus{ExitCode: 10, ExitedAt: time.Now()}
	st.Lock()
	st.SetRestarting(&exitStatus)
	st.Unlock()

	st.Lock()
	st.SetRunning(0, true)
	st.Unlock()

	// The waiter must still see the exit code captured at restart time.
	res := <-waitC
	if res.exitCode != exitStatus.ExitCode {
		t.Fatalf("expected exit code %v, got %v", exitStatus.ExitCode, res.exitCode)
	}
}
func TestIsValidStateString(t *testing.T) { func TestIsValidStateString(t *testing.T) {
states := []struct { states := []struct {
state string state string