From 8e362b75cb8cfbc539169cbb5d77c52802f65eb1 Mon Sep 17 00:00:00 2001 From: Cam Date: Sat, 24 Oct 2020 13:15:58 -0700 Subject: [PATCH] docker daemon container stop refactor this refactors the Stop command to fix a few issues and behaviors that dont seem completely correct: 1. first it fixes a situation where stop could hang forever (#41579) 2. fixes a behavior where if sending the stop signal failed, then the code directly sends a -9 signal. If that fails, it returns without waiting for the process to exit or going through the full docker kill codepath. 3. fixes a behavior where if sending the stop signal failed, then the code sends a -9 signal. If that succeeds, then we still go through the same stop waiting process, and may even go through the docker kill path again, even though we've already sent a -9. 4. fixes a behavior where the code would wait the full 30 seconds after sending a stop signal, even if we already know the stop signal failed. fixes #41579 Signed-off-by: Cam --- daemon/stop.go | 92 ++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 40 deletions(-) diff --git a/daemon/stop.go b/daemon/stop.go index c3ac09056a..ce0ff08e7d 100644 --- a/daemon/stop.go +++ b/daemon/stop.go @@ -38,52 +38,64 @@ func (daemon *Daemon) ContainerStop(name string, timeout *int) error { // containerStop sends a stop signal, waits, sends a kill signal. func (daemon *Daemon) containerStop(container *containerpkg.Container, seconds int) error { + // TODO propagate a context down to this function + ctx := context.TODO() if !container.IsRunning() { return nil } - - stopSignal := container.StopSignal() - // 1. Send a stop signal - if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil { - // While normally we might "return err" here we're not going to - // because if we can't stop the container by this point then - // it's probably because it's already stopped. Meaning, between - // the time of the IsRunning() call above and now it stopped. - // Also, since the err return will be environment specific we can't - // look for any particular (common) error that would indicate - // that the process is already dead vs something else going wrong. - // So, instead we'll give it up to 2 more seconds to complete and if - // by that time the container is still running, then the error - // we got is probably valid and so we force kill it. - ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) - defer cancel() - - if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil { - logrus.Infof("Container failed to stop after sending signal %d to the process, force killing", stopSignal) - if err := daemon.killPossiblyDeadProcess(container, 9); err != nil { - return err - } - } - } - - // 2. Wait for the process to exit on its own - ctx := context.Background() + var wait time.Duration if seconds >= 0 { - var cancel context.CancelFunc - ctx, cancel = context.WithTimeout(ctx, time.Duration(seconds)*time.Second) + wait = time.Duration(seconds) * time.Second + } + success := func() error { + daemon.LogContainerEvent(container, "stop") + return nil + } + stopSignal := container.StopSignal() + + // 1. Send a stop signal + err := daemon.killPossiblyDeadProcess(container, stopSignal) + if err != nil { + wait = 2 * time.Second + } + + var subCtx context.Context + var cancel context.CancelFunc + if seconds >= 0 { + subCtx, cancel = context.WithTimeout(ctx, wait) + } else { + subCtx, cancel = context.WithCancel(ctx) + } + defer cancel() + + if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil { + // container did exit, so ignore any previous errors and return + return success() + } + + if err != nil { + // the container has still not exited, and the kill function errored, so log the error here: + logrus.WithError(err).WithField("container", container.ID).Errorf("Error sending stop (signal %d) to container", stopSignal) + } + if seconds < 0 { + // if the client requested that we never kill / wait forever, but container.Wait was still + // interrupted (parent context cancelled, for example), we should propagate the signal failure + return err + } + + logrus.WithField("container", container.ID).Infof("Container failed to exit within %d seconds of signal %d - using the force", seconds, stopSignal) + // Stop either failed or container didnt exit, so fallback to kill. + if err := daemon.Kill(container); err != nil { + // got a kill error, but give container 2 more seconds to exit just in case + subCtx, cancel := context.WithTimeout(ctx, 2*time.Second) defer cancel() - } - - if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil { - logrus.Infof("Container %v failed to exit within %d seconds of signal %d - using the force", container.ID, seconds, stopSignal) - // 3. If it doesn't, then send SIGKILL - if err := daemon.Kill(container); err != nil { - // Wait without a timeout, ignore result. - <-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning) - logrus.Warn(err) // Don't return error because we only care that container is stopped, not what function stopped it + if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil { + // container did exit, so ignore error and return + return success() } + logrus.WithError(err).WithField("container", container.ID).Error("Error killing the container") + return err } - daemon.LogContainerEvent(container, "stop") - return nil + return success() }