1
0
Fork 0
mirror of https://github.com/moby/moby.git synced 2022-11-09 12:21:53 -05:00

docker daemon container stop refactor

this refactors the Stop command to fix a few issues and behaviors that
dont seem completely correct:

1. first it fixes a situation where stop could hang forever (#41579)
2. fixes a behavior where if sending the
stop signal failed, then the code directly sends a -9 signal. If that
fails, it returns without waiting for the process to exit or going
through the full docker kill codepath.
3. fixes a behavior where if sending the stop signal failed, then the
code sends a -9 signal. If that succeeds, then we still go through the
same stop waiting process, and may even go through the docker kill path
again, even though we've already sent a -9.
4. fixes a behavior where the code would wait the full 30 seconds after
sending a stop signal, even if we already know the stop signal failed.

fixes #41579

Signed-off-by: Cam <gh@sparr.email>
This commit is contained in:
Cam 2020-10-24 13:15:58 -07:00
parent f76958f612
commit 8e362b75cb
No known key found for this signature in database
GPG key ID: EE812B08DF32F58C

View file

@ -38,52 +38,64 @@ func (daemon *Daemon) ContainerStop(name string, timeout *int) error {
// containerStop sends a stop signal, waits, sends a kill signal.
func (daemon *Daemon) containerStop(container *containerpkg.Container, seconds int) error {
// TODO propagate a context down to this function
ctx := context.TODO()
if !container.IsRunning() {
return nil
}
stopSignal := container.StopSignal()
// 1. Send a stop signal
if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {
// While normally we might "return err" here we're not going to
// because if we can't stop the container by this point then
// it's probably because it's already stopped. Meaning, between
// the time of the IsRunning() call above and now it stopped.
// Also, since the err return will be environment specific we can't
// look for any particular (common) error that would indicate
// that the process is already dead vs something else going wrong.
// So, instead we'll give it up to 2 more seconds to complete and if
// by that time the container is still running, then the error
// we got is probably valid and so we force kill it.
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
logrus.Infof("Container failed to stop after sending signal %d to the process, force killing", stopSignal)
if err := daemon.killPossiblyDeadProcess(container, 9); err != nil {
return err
}
}
}
// 2. Wait for the process to exit on its own
ctx := context.Background()
var wait time.Duration
if seconds >= 0 {
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, time.Duration(seconds)*time.Second)
defer cancel()
wait = time.Duration(seconds) * time.Second
}
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
logrus.Infof("Container %v failed to exit within %d seconds of signal %d - using the force", container.ID, seconds, stopSignal)
// 3. If it doesn't, then send SIGKILL
if err := daemon.Kill(container); err != nil {
// Wait without a timeout, ignore result.
<-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning)
logrus.Warn(err) // Don't return error because we only care that container is stopped, not what function stopped it
}
}
success := func() error {
daemon.LogContainerEvent(container, "stop")
return nil
}
stopSignal := container.StopSignal()
// 1. Send a stop signal
err := daemon.killPossiblyDeadProcess(container, stopSignal)
if err != nil {
wait = 2 * time.Second
}
var subCtx context.Context
var cancel context.CancelFunc
if seconds >= 0 {
subCtx, cancel = context.WithTimeout(ctx, wait)
} else {
subCtx, cancel = context.WithCancel(ctx)
}
defer cancel()
if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
// container did exit, so ignore any previous errors and return
return success()
}
if err != nil {
// the container has still not exited, and the kill function errored, so log the error here:
logrus.WithError(err).WithField("container", container.ID).Errorf("Error sending stop (signal %d) to container", stopSignal)
}
if seconds < 0 {
// if the client requested that we never kill / wait forever, but container.Wait was still
// interrupted (parent context cancelled, for example), we should propagate the signal failure
return err
}
logrus.WithField("container", container.ID).Infof("Container failed to exit within %d seconds of signal %d - using the force", seconds, stopSignal)
// Stop either failed or container didnt exit, so fallback to kill.
if err := daemon.Kill(container); err != nil {
// got a kill error, but give container 2 more seconds to exit just in case
subCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
defer cancel()
if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
// container did exit, so ignore error and return
return success()
}
logrus.WithError(err).WithField("container", container.ID).Error("Error killing the container")
return err
}
return success()
}