mirror of
https://github.com/moby/moby.git
synced 2022-11-09 12:21:53 -05:00
docker daemon container stop refactor
this refactors the Stop command to fix a few issues and behaviors that dont seem completely correct: 1. first it fixes a situation where stop could hang forever (#41579) 2. fixes a behavior where if sending the stop signal failed, then the code directly sends a -9 signal. If that fails, it returns without waiting for the process to exit or going through the full docker kill codepath. 3. fixes a behavior where if sending the stop signal failed, then the code sends a -9 signal. If that succeeds, then we still go through the same stop waiting process, and may even go through the docker kill path again, even though we've already sent a -9. 4. fixes a behavior where the code would wait the full 30 seconds after sending a stop signal, even if we already know the stop signal failed. fixes #41579 Signed-off-by: Cam <gh@sparr.email>
This commit is contained in:
parent
f76958f612
commit
8e362b75cb
1 changed files with 52 additions and 40 deletions
|
@ -38,52 +38,64 @@ func (daemon *Daemon) ContainerStop(name string, timeout *int) error {
|
||||||
|
|
||||||
// containerStop sends a stop signal, waits, sends a kill signal.
|
// containerStop sends a stop signal, waits, sends a kill signal.
|
||||||
func (daemon *Daemon) containerStop(container *containerpkg.Container, seconds int) error {
|
func (daemon *Daemon) containerStop(container *containerpkg.Container, seconds int) error {
|
||||||
|
// TODO propagate a context down to this function
|
||||||
|
ctx := context.TODO()
|
||||||
if !container.IsRunning() {
|
if !container.IsRunning() {
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
var wait time.Duration
|
||||||
stopSignal := container.StopSignal()
|
|
||||||
// 1. Send a stop signal
|
|
||||||
if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {
|
|
||||||
// While normally we might "return err" here we're not going to
|
|
||||||
// because if we can't stop the container by this point then
|
|
||||||
// it's probably because it's already stopped. Meaning, between
|
|
||||||
// the time of the IsRunning() call above and now it stopped.
|
|
||||||
// Also, since the err return will be environment specific we can't
|
|
||||||
// look for any particular (common) error that would indicate
|
|
||||||
// that the process is already dead vs something else going wrong.
|
|
||||||
// So, instead we'll give it up to 2 more seconds to complete and if
|
|
||||||
// by that time the container is still running, then the error
|
|
||||||
// we got is probably valid and so we force kill it.
|
|
||||||
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
|
|
||||||
defer cancel()
|
|
||||||
|
|
||||||
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
|
|
||||||
logrus.Infof("Container failed to stop after sending signal %d to the process, force killing", stopSignal)
|
|
||||||
if err := daemon.killPossiblyDeadProcess(container, 9); err != nil {
|
|
||||||
return err
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2. Wait for the process to exit on its own
|
|
||||||
ctx := context.Background()
|
|
||||||
if seconds >= 0 {
|
if seconds >= 0 {
|
||||||
var cancel context.CancelFunc
|
wait = time.Duration(seconds) * time.Second
|
||||||
ctx, cancel = context.WithTimeout(ctx, time.Duration(seconds)*time.Second)
|
}
|
||||||
|
success := func() error {
|
||||||
|
daemon.LogContainerEvent(container, "stop")
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
stopSignal := container.StopSignal()
|
||||||
|
|
||||||
|
// 1. Send a stop signal
|
||||||
|
err := daemon.killPossiblyDeadProcess(container, stopSignal)
|
||||||
|
if err != nil {
|
||||||
|
wait = 2 * time.Second
|
||||||
|
}
|
||||||
|
|
||||||
|
var subCtx context.Context
|
||||||
|
var cancel context.CancelFunc
|
||||||
|
if seconds >= 0 {
|
||||||
|
subCtx, cancel = context.WithTimeout(ctx, wait)
|
||||||
|
} else {
|
||||||
|
subCtx, cancel = context.WithCancel(ctx)
|
||||||
|
}
|
||||||
|
defer cancel()
|
||||||
|
|
||||||
|
if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
|
||||||
|
// container did exit, so ignore any previous errors and return
|
||||||
|
return success()
|
||||||
|
}
|
||||||
|
|
||||||
|
if err != nil {
|
||||||
|
// the container has still not exited, and the kill function errored, so log the error here:
|
||||||
|
logrus.WithError(err).WithField("container", container.ID).Errorf("Error sending stop (signal %d) to container", stopSignal)
|
||||||
|
}
|
||||||
|
if seconds < 0 {
|
||||||
|
// if the client requested that we never kill / wait forever, but container.Wait was still
|
||||||
|
// interrupted (parent context cancelled, for example), we should propagate the signal failure
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
|
||||||
|
logrus.WithField("container", container.ID).Infof("Container failed to exit within %d seconds of signal %d - using the force", seconds, stopSignal)
|
||||||
|
// Stop either failed or container didnt exit, so fallback to kill.
|
||||||
|
if err := daemon.Kill(container); err != nil {
|
||||||
|
// got a kill error, but give container 2 more seconds to exit just in case
|
||||||
|
subCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
|
||||||
defer cancel()
|
defer cancel()
|
||||||
}
|
if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
|
||||||
|
// container did exit, so ignore error and return
|
||||||
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
|
return success()
|
||||||
logrus.Infof("Container %v failed to exit within %d seconds of signal %d - using the force", container.ID, seconds, stopSignal)
|
|
||||||
// 3. If it doesn't, then send SIGKILL
|
|
||||||
if err := daemon.Kill(container); err != nil {
|
|
||||||
// Wait without a timeout, ignore result.
|
|
||||||
<-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning)
|
|
||||||
logrus.Warn(err) // Don't return error because we only care that container is stopped, not what function stopped it
|
|
||||||
}
|
}
|
||||||
|
logrus.WithError(err).WithField("container", container.ID).Error("Error killing the container")
|
||||||
|
return err
|
||||||
}
|
}
|
||||||
|
|
||||||
daemon.LogContainerEvent(container, "stop")
|
return success()
|
||||||
return nil
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Add table
Reference in a new issue