From 8e362b75cb8cfbc539169cbb5d77c52802f65eb1 Mon Sep 17 00:00:00 2001
From: Cam <gh@sparr.email>
Date: Sat, 24 Oct 2020 13:15:58 -0700
Subject: [PATCH] docker daemon container stop refactor

this refactors the Stop command to fix a few issues and behaviors that
dont seem completely correct:

1. first it fixes a situation where stop could hang forever (#41579)
2. fixes a behavior where if sending the
stop signal failed, then the code directly sends a -9 signal. If that
fails, it returns without waiting for the process to exit or going
through the full docker kill codepath.
3. fixes a behavior where if sending the stop signal failed, then the
code sends a -9 signal. If that succeeds, then we still go through the
same stop waiting process, and may even go through the docker kill path
again, even though we've already sent a -9.
4. fixes a behavior where the code would wait the full 30 seconds after
sending a stop signal, even if we already know the stop signal failed.

fixes #41579

Signed-off-by: Cam <gh@sparr.email>
---
 daemon/stop.go | 92 ++++++++++++++++++++++++++++----------------------
 1 file changed, 52 insertions(+), 40 deletions(-)

diff --git a/daemon/stop.go b/daemon/stop.go
index c3ac09056a..ce0ff08e7d 100644
--- a/daemon/stop.go
+++ b/daemon/stop.go
@@ -38,52 +38,64 @@ func (daemon *Daemon) ContainerStop(name string, timeout *int) error {
 
 // containerStop sends a stop signal, waits, sends a kill signal.
 func (daemon *Daemon) containerStop(container *containerpkg.Container, seconds int) error {
+	// TODO propagate a context down to this function
+	ctx := context.TODO()
 	if !container.IsRunning() {
 		return nil
 	}
-
-	stopSignal := container.StopSignal()
-	// 1. Send a stop signal
-	if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {
-		// While normally we might "return err" here we're not going to
-		// because if we can't stop the container by this point then
-		// it's probably because it's already stopped. Meaning, between
-		// the time of the IsRunning() call above and now it stopped.
-		// Also, since the err return will be environment specific we can't
-		// look for any particular (common) error that would indicate
-		// that the process is already dead vs something else going wrong.
-		// So, instead we'll give it up to 2 more seconds to complete and if
-		// by that time the container is still running, then the error
-		// we got is probably valid and so we force kill it.
-		ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
-		defer cancel()
-
-		if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
-			logrus.Infof("Container failed to stop after sending signal %d to the process, force killing", stopSignal)
-			if err := daemon.killPossiblyDeadProcess(container, 9); err != nil {
-				return err
-			}
-		}
-	}
-
-	// 2. Wait for the process to exit on its own
-	ctx := context.Background()
+	var wait time.Duration
 	if seconds >= 0 {
-		var cancel context.CancelFunc
-		ctx, cancel = context.WithTimeout(ctx, time.Duration(seconds)*time.Second)
+		wait = time.Duration(seconds) * time.Second
+	}
+	success := func() error {
+		daemon.LogContainerEvent(container, "stop")
+		return nil
+	}
+	stopSignal := container.StopSignal()
+
+	// 1. Send a stop signal
+	err := daemon.killPossiblyDeadProcess(container, stopSignal)
+	if err != nil {
+		wait = 2 * time.Second
+	}
+
+	var subCtx context.Context
+	var cancel context.CancelFunc
+	if seconds >= 0 {
+		subCtx, cancel = context.WithTimeout(ctx, wait)
+	} else {
+		subCtx, cancel = context.WithCancel(ctx)
+	}
+	defer cancel()
+
+	if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
+		// container did exit, so ignore any previous errors and return
+		return success()
+	}
+
+	if err != nil {
+		// the container has still not exited, and the kill function errored, so log the error here:
+		logrus.WithError(err).WithField("container", container.ID).Errorf("Error sending stop (signal %d) to container", stopSignal)
+	}
+	if seconds < 0 {
+		// if the client requested that we never kill / wait forever, but container.Wait was still
+		// interrupted (parent context cancelled, for example), we should propagate the signal failure
+		return err
+	}
+
+	logrus.WithField("container", container.ID).Infof("Container failed to exit within %d seconds of signal %d - using the force", seconds, stopSignal)
+	// Stop either failed or container didnt exit, so fallback to kill.
+	if err := daemon.Kill(container); err != nil {
+		// got a kill error, but give container 2 more seconds to exit just in case
+		subCtx, cancel := context.WithTimeout(ctx, 2*time.Second)
 		defer cancel()
-	}
-
-	if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
-		logrus.Infof("Container %v failed to exit within %d seconds of signal %d - using the force", container.ID, seconds, stopSignal)
-		// 3. If it doesn't, then send SIGKILL
-		if err := daemon.Kill(container); err != nil {
-			// Wait without a timeout, ignore result.
-			<-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning)
-			logrus.Warn(err) // Don't return error because we only care that container is stopped, not what function stopped it
+		if status := <-container.Wait(subCtx, containerpkg.WaitConditionNotRunning); status.Err() == nil {
+			// container did exit, so ignore error and return
+			return success()
 		}
+		logrus.WithError(err).WithField("container", container.ID).Error("Error killing the container")
+		return err
 	}
 
-	daemon.LogContainerEvent(container, "stop")
-	return nil
+	return success()
 }