docker kill: fix bug where failed kills didnt fallback to unix kill

1. fixes #41587
2. removes potential infinite Wait and goroutine leak at end of kill
function

fixes #41587

Signed-off-by: Cam <gh@sparr.email>
This commit is contained in:
Cam 2020-10-24 10:56:21 -07:00
parent 6110ba3d7c
commit e57a365ab1
No known key found for this signature in database
GPG Key ID: EE812B08DF32F58C
2 changed files with 41 additions and 53 deletions

View File

@ -3,13 +3,11 @@
package daemon // import "github.com/docker/docker/daemon" package daemon // import "github.com/docker/docker/daemon"
import ( import (
"context"
"fmt" "fmt"
"io/ioutil" "io/ioutil"
"os" "os"
"path/filepath" "path/filepath"
"strconv" "strconv"
"time"
"github.com/docker/docker/container" "github.com/docker/docker/container"
"github.com/docker/docker/daemon/links" "github.com/docker/docker/daemon/links"
@ -336,38 +334,32 @@ func (daemon *Daemon) cleanupSecretDir(c *container.Container) {
} }
} }
func killProcessDirectly(cntr *container.Container) error { func killProcessDirectly(container *container.Container) error {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) pid := container.GetPID()
defer cancel() // Ensure that we don't kill ourselves
if pid == 0 {
return nil
}
// Block until the container to stops or timeout. if err := unix.Kill(pid, 9); err != nil {
status := <-cntr.Wait(ctx, container.WaitConditionNotRunning) if err != unix.ESRCH {
if status.Err() != nil { return err
// Ensure that we don't kill ourselves }
if pid := cntr.GetPID(); pid != 0 { e := errNoSuchProcess{pid, 9}
logrus.Infof("Container %s failed to exit within 10 seconds of kill - trying direct SIGKILL", stringid.TruncateID(cntr.ID)) logrus.WithError(e).WithField("container", container.ID).Debug("no such process")
if err := unix.Kill(pid, 9); err != nil { return e
if err != unix.ESRCH { }
return err
}
e := errNoSuchProcess{pid, 9}
logrus.Debug(e)
return e
}
// In case there were some exceptions(e.g., state of zombie and D) // In case there were some exceptions(e.g., state of zombie and D)
if system.IsProcessAlive(pid) { if system.IsProcessAlive(pid) {
// Since we can not kill a zombie pid, add zombie check here
// Since we can not kill a zombie pid, add zombie check here isZombie, err := system.IsProcessZombie(pid)
isZombie, err := system.IsProcessZombie(pid) if err != nil {
if err != nil { logrus.WithError(err).WithField("container", container.ID).Warn("Container state is invalid")
logrus.Warnf("Container %s state is invalid", stringid.TruncateID(cntr.ID)) return err
return err }
} if isZombie {
if isZombie { return errdefs.System(errors.Errorf("container %s PID %d is zombie and can not be killed. Use the --init option when creating containers to run an init inside the container that forwards signals and reaps processes", stringid.TruncateID(container.ID), pid))
return errdefs.System(errors.Errorf("container %s PID %d is zombie and can not be killed. Use the --init option when creating containers to run an init inside the container that forwards signals and reaps processes", stringid.TruncateID(cntr.ID), pid))
}
}
} }
} }
return nil return nil

View File

@ -139,29 +139,22 @@ func (daemon *Daemon) Kill(container *containerpkg.Container) error {
// 1. Send SIGKILL // 1. Send SIGKILL
if err := daemon.killPossiblyDeadProcess(container, int(syscall.SIGKILL)); err != nil { if err := daemon.killPossiblyDeadProcess(container, int(syscall.SIGKILL)); err != nil {
// While normally we might "return err" here we're not going to // kill failed, check if process is no longer running.
// because if we can't stop the container by this point then
// it's probably because it's already stopped. Meaning, between
// the time of the IsRunning() call above and now it stopped.
// Also, since the err return will be environment specific we can't
// look for any particular (common) error that would indicate
// that the process is already dead vs something else going wrong.
// So, instead we'll give it up to 2 more seconds to complete and if
// by that time the container is still running, then the error
// we got is probably valid and so we return it to the caller.
if isErrNoSuchProcess(err) { if isErrNoSuchProcess(err) {
return nil return nil
} }
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()
if status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning); status.Err() != nil {
return err
}
} }
// 2. Wait for the process to die, in last resort, try to kill the process directly ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
status := <-container.Wait(ctx, containerpkg.WaitConditionNotRunning)
if status.Err() == nil {
return nil
}
logrus.WithError(status.Err()).WithField("container", container.ID).Error("Container failed to exit within 10 seconds of kill - trying direct SIGKILL")
if err := killProcessDirectly(container); err != nil { if err := killProcessDirectly(container); err != nil {
if isErrNoSuchProcess(err) { if isErrNoSuchProcess(err) {
return nil return nil
@ -169,10 +162,13 @@ func (daemon *Daemon) Kill(container *containerpkg.Container) error {
return err return err
} }
// Wait for exit with no timeout. // wait for container to exit one last time, if it doesn't then kill didnt work, so return error
// Ignore returned status. ctx2, cancel2 := context.WithTimeout(context.Background(), 2*time.Second)
<-container.Wait(context.Background(), containerpkg.WaitConditionNotRunning) defer cancel2()
if status := <-container.Wait(ctx2, containerpkg.WaitConditionNotRunning); status.Err() != nil {
return errors.New("tried to kill container, but did not receive an exit event")
}
return nil return nil
} }