Fix ShouldRestart for on-failure handle

Currently if you restart docker daemon, all the containers with restart policy `on-failure` regardless of its `RestartCount` will be started, this will make daemon cost more extra time for restart. This commit will stop these containers to do unnecessary start on daemon's restart. Signed-off-by: Zhang Wei <zhangwei555@huawei.com>
2022-11-09 12:21:53 -05:00 · 2016-03-22 23:46:40 +08:00 · 2016-03-22 23:46:40 +08:00 · 51e42e6ee0
commit 51e42e6ee0
parent b4664e3f86
6 changed files with 50 additions and 20 deletions
--- a/container/container.go
+++ b/container/container.go
@ -519,9 +519,8 @@ func copyEscapable(dst io.Writer, src io.ReadCloser, keys []byte) (written int64
 // ShouldRestart decides whether the daemon should restart the container or not.
 // This is based on the container's restart policy.
 func (container *Container) ShouldRestart() bool {
-	return container.HostConfig.RestartPolicy.Name == "always" ||
-		(container.HostConfig.RestartPolicy.Name == "unless-stopped" && !container.HasBeenManuallyStopped) ||
-		(container.HostConfig.RestartPolicy.Name == "on-failure" && container.ExitCode != 0)
+	shouldRestart, _, _ := container.restartManager.ShouldRestart(uint32(container.ExitCode), container.HasBeenManuallyStopped)
+	return shouldRestart
 }

 // AddBindMountPoint adds a new bind mount point configuration to the container.
@ -912,8 +911,9 @@ func (container *Container) RestartManager(reset bool) restartmanager.RestartMan
 		container.restartManager = nil
 	}
 	if container.restartManager == nil {
-		container.restartManager = restartmanager.New(container.HostConfig.RestartPolicy)
+		container.restartManager = restartmanager.New(container.HostConfig.RestartPolicy, container.RestartCount)
 	}
+
 	return container.restartManager
 }

--- a/daemon/daemon.go
+++ b/daemon/daemon.go
@ -291,13 +291,14 @@ func (daemon *Daemon) restore() error {
 		wg.Add(1)
 		go func(c *container.Container) {
 			defer wg.Done()
+			rm := c.RestartManager(false)
 			if c.IsRunning() || c.IsPaused() {
 				// Fix activityCount such that graph mounts can be unmounted later
 				if err := daemon.layerStore.ReinitRWLayer(c.RWLayer); err != nil {
 					logrus.Errorf("Failed to ReinitRWLayer for %s due to %s", c.ID, err)
 					return
 				}
-				if err := daemon.containerd.Restore(c.ID, libcontainerd.WithRestartManager(c.RestartManager(true))); err != nil {
+				if err := daemon.containerd.Restore(c.ID, libcontainerd.WithRestartManager(rm)); err != nil {
 					logrus.Errorf("Failed to restore with containerd: %q", err)
 					return
 				}
--- a/integration-cli/docker_cli_daemon_test.go
+++ b/integration-cli/docker_cli_daemon_test.go
@ -150,6 +150,37 @@ func (s *DockerDaemonSuite) TestDaemonRestartUnlessStopped(c *check.C) {

 }

+func (s *DockerDaemonSuite) TestDaemonRestartOnFailure(c *check.C) {
+	err := s.d.StartWithBusybox()
+	c.Assert(err, check.IsNil)
+
+	out, err := s.d.Cmd("run", "-d", "--name", "test1", "--restart", "on-failure:3", "busybox:latest", "false")
+	c.Assert(err, check.IsNil, check.Commentf("run top1: %v", out))
+
+	// wait test1 to stop
+	hostArgs := []string{"--host", s.d.sock()}
+	err = waitInspectWithArgs("test1", "{{.State.Running}} {{.State.Restarting}}", "false false", 10*time.Second, hostArgs...)
+	c.Assert(err, checker.IsNil, check.Commentf("test1 should exit but not"))
+
+	// record last start time
+	out, err = s.d.Cmd("inspect", "-f={{.State.StartedAt}}", "test1")
+	c.Assert(err, checker.IsNil, check.Commentf("out: %v", out))
+	lastStartTime := out
+
+	err = s.d.Restart()
+	c.Assert(err, check.IsNil)
+
+	// test1 shouldn't restart at all
+	err = waitInspectWithArgs("test1", "{{.State.Running}} {{.State.Restarting}}", "false false", 0, hostArgs...)
+	c.Assert(err, checker.IsNil, check.Commentf("test1 should exit but not"))
+
+	// make sure test1 isn't restarted when daemon restart
+	// if "StartAt" time updates, means test1 was once restarted.
+	out, err = s.d.Cmd("inspect", "-f={{.State.StartedAt}}", "test1")
+	c.Assert(err, checker.IsNil, check.Commentf("out: %v", out))
+	c.Assert(out, checker.Equals, lastStartTime, check.Commentf("test1 shouldn't start after daemon restarts"))
+}
+
 func (s *DockerDaemonSuite) TestDaemonStartIptablesFalse(c *check.C) {
 	if err := s.d.Start("--iptables=false"); err != nil {
 		c.Fatalf("we should have been able to start the daemon with passing iptables=false: %v", err)
--- a/libcontainerd/container_linux.go
+++ b/libcontainerd/container_linux.go
@ -118,7 +118,7 @@ func (ctr *container) handleEvent(e *containerd.Event) error {
 			st.State = StateExitProcess
 		}
 		if st.State == StateExit && ctr.restartManager != nil {
-			restart, wait, err := ctr.restartManager.ShouldRestart(e.Status)
+			restart, wait, err := ctr.restartManager.ShouldRestart(e.Status, false)
 			if err != nil {
 				logrus.Error(err)
 			} else if restart {
--- a/libcontainerd/container_windows.go
+++ b/libcontainerd/container_windows.go
@ -173,7 +173,7 @@ func (ctr *container) waitExit(pid uint32, processFriendlyName string, isFirstPr
 		defer ctr.client.unlock(ctr.containerID)

 		if si.State == StateExit && ctr.restartManager != nil {
-			restart, wait, err := ctr.restartManager.ShouldRestart(uint32(exitCode))
+			restart, wait, err := ctr.restartManager.ShouldRestart(uint32(exitCode), false)
 			if err != nil {
 				logrus.Error(err)
 			} else if restart {
--- a/restartmanager/restartmanager.go
+++ b/restartmanager/restartmanager.go
@ -16,14 +16,14 @@ const (
 // RestartManager defines object that controls container restarting rules.
 type RestartManager interface {
 	Cancel() error
-	ShouldRestart(exitCode uint32) (bool, chan error, error)
+	ShouldRestart(exitCode uint32, hasBeenManuallyStopped bool) (bool, chan error, error)
 }

 type restartManager struct {
 	sync.Mutex
 	sync.Once
 	policy       container.RestartPolicy
-	failureCount int
+	restartCount int
 	timeout      time.Duration
 	active       bool
 	cancel       chan struct{}
@ -31,8 +31,8 @@ type restartManager struct {
 }

 // New returns a new restartmanager based on a policy.
-func New(policy container.RestartPolicy) RestartManager {
-	return &restartManager{policy: policy, cancel: make(chan struct{})}
+func New(policy container.RestartPolicy, restartCount int) RestartManager {
+	return &restartManager{policy: policy, restartCount: restartCount, cancel: make(chan struct{})}
 }

 func (rm *restartManager) SetPolicy(policy container.RestartPolicy) {
@ -41,7 +41,7 @@ func (rm *restartManager) SetPolicy(policy container.RestartPolicy) {
 	rm.Unlock()
 }

-func (rm *restartManager) ShouldRestart(exitCode uint32) (bool, chan error, error) {
+func (rm *restartManager) ShouldRestart(exitCode uint32, hasBeenManuallyStopped bool) (bool, chan error, error) {
 	rm.Lock()
 	unlockOnExit := true
 	defer func() {
@ -58,12 +58,6 @@ func (rm *restartManager) ShouldRestart(exitCode uint32) (bool, chan error, erro
 		return false, nil, fmt.Errorf("invalid call on active restartmanager")
 	}

-	if exitCode != 0 {
-		rm.failureCount++
-	} else {
-		rm.failureCount = 0
-	}
-
 	if rm.timeout == 0 {
 		rm.timeout = defaultTimeout
 	} else {
@ -72,11 +66,13 @@ func (rm *restartManager) ShouldRestart(exitCode uint32) (bool, chan error, erro

 	var restart bool
 	switch {
-	case rm.policy.IsAlways(), rm.policy.IsUnlessStopped():
+	case rm.policy.IsAlways():
+		restart = true
+	case rm.policy.IsUnlessStopped() && !hasBeenManuallyStopped:
 		restart = true
 	case rm.policy.IsOnFailure():
 		// the default value of 0 for MaximumRetryCount means that we will not enforce a maximum count
-		if max := rm.policy.MaximumRetryCount; max == 0 || rm.failureCount <= max {
+		if max := rm.policy.MaximumRetryCount; max == 0 || rm.restartCount < max {
 			restart = exitCode != 0
 		}
 	}
@ -86,6 +82,8 @@ func (rm *restartManager) ShouldRestart(exitCode uint32) (bool, chan error, erro
 		return false, nil, nil
 	}

+	rm.restartCount++
+
 	unlockOnExit = false
 	rm.active = true
 	rm.Unlock()