Merge pull request #41586 from sparrc/stop-refactor

Fix hung docker stop if stop signal and daemon.Kill both fail
This commit is contained in:
Brian Goff 2021-04-14 13:33:14 -07:00 committed by GitHub
commit 6110ba3d7c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed files with 52 additions and 40 deletions

View File

@ -38,52 +38,64 @@ func (daemon *Daemon) ContainerStop(name string, timeout *int) error {
// containerStop sends a stop signal, waits, sends a kill signal.
func (daemon *Daemon) containerStop(container *containerpkg.Container, seconds int) error {
// TODO propagate a context down to this function
ctx := context.TODO()
if !container.IsRunning() {
return nil
}
stopSignal := container.StopSignal()
// 1. Send a stop signal
if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {
// While normally we might "return err" here we're not going to
// because if we can't stop the container by this point then
// it's probably because it's already stopped. Meaning, between
// the time of the IsRunning() call above and now it stopped.
// Also, since the err return will be environment specific we can't
// look for any particular (common) error that would indicate
// that the process is already dead vs something else going wrong.
// So, instead we'll give it up to 2 more seconds to complete and if
// by that time the container is still running, then the error
// we got is probably valid and so we force kill it.
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
logrus.Infof("Container failed to stop after sending signal %d to the process, force killing", stopSignal)
if err := daemon.killPossiblyDeadProcess(container, 9); err != nil {
return err
}
}
}
// 2. Wait for the process to exit on its own
ctx := context.Background()
var wait time.Duration
if seconds >= 0 {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, time.Duration(seconds)*time.Second)
wait = time.Duration(seconds) * time.Second
}
success := func() error {
daemon.LogContainerEvent(container, "stop")
return nil
}
stopSignal := container.StopSignal()
// 1. Send a stop signal
err := daemon.killPossiblyDeadProcess(container, stopSignal)
if err != nil {
wait = 2 * time.Second
}
var subCtx context.Context
var cancel context.CancelFunc
if seconds >= 0 {
subCtx, cancel = context.WithTimeout(ctx, wait)
} else {
subCtx, cancel = context.WithCancel(ctx)
}
defer cancel()
if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
// container did exit, so ignore any previous errors and return
return success()
}
if err != nil {
// the container has still not exited, and the kill function errored, so log the error here:
logrus.WithError(err).WithField("container", container.ID).Errorf("Error sending stop (signal %d) to container", stopSignal)
}
if seconds < 0 {
// if the client requested that we never kill / wait forever, but container.Wait was still
// interrupted (parent context cancelled, for example), we should propagate the signal failure
return err
}
logrus.WithField("container", container.ID).Infof("Container failed to exit within %d seconds of signal %d - using the force", seconds, stopSignal)
// Stop either failed or container didnt exit, so fallback to kill.
if err := daemon.Kill(container); err != nil {
// got a kill error, but give container 2 more seconds to exit just in case
subCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
defer cancel()
}
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
logrus.Infof("Container %v failed to exit within %d seconds of signal %d - using the force", container.ID, seconds, stopSignal)
// 3. If it doesn't, then send SIGKILL
if err := daemon.Kill(container); err != nil {
// Wait without a timeout, ignore result.
<-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning)
logrus.Warn(err) // Don't return error because we only care that container is stopped, not what function stopped it
if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
// container did exit, so ignore error and return
return success()
}
logrus.WithError(err).WithField("container", container.ID).Error("Error killing the container")
return err
}
daemon.LogContainerEvent(container, "stop")
return nil
return success()
}