Merge pull request #38522 from cpuguy83/fix_timers

Make sure timers are stopped after use.
2022-11-09 12:21:53 -05:00 · 2019-06-07 13:16:46 +02:00 · 2019-06-07 13:16:46 +02:00 · c85fe2d224
commit c85fe2d224
parent 1d5748d975 eaad3ee3cf
15 changed files with 120 additions and 40 deletions
--- a/api/server/router/system/system_routes.go
+++ b/api/server/router/system/system_routes.go
@ -174,7 +174,9 @@ func (s *systemRouter) getEvents(ctx context.Context, w http.ResponseWriter, r *

 		if !onlyPastEvents {
 			dur := until.Sub(now)
-			timeout = time.After(dur)
+			timer := time.NewTimer(dur)
+			defer timer.Stop()
+			timeout = timer.C
 		}
 	}

--- a/cmd/dockerd/daemon.go
+++ b/cmd/dockerd/daemon.go
@ -400,10 +400,14 @@ func shutdownDaemon(d *daemon.Daemon) {
 		logrus.Debug("Clean shutdown succeeded")
 		return
 	}
+
+	timeout := time.NewTimer(time.Duration(shutdownTimeout) * time.Second)
+	defer timeout.Stop()
+
 	select {
 	case <-ch:
 		logrus.Debug("Clean shutdown succeeded")
-	case <-time.After(time.Duration(shutdownTimeout) * time.Second):
+	case <-timeout.C:
 		logrus.Error("Force shutdown daemon")
 	}
 }
--- a/container/monitor.go
+++ b/container/monitor.go
@ -33,8 +33,11 @@ func (container *Container) Reset(lock bool) {
 				container.LogCopier.Wait()
 				close(exit)
 			}()
+
+			timer := time.NewTimer(loggerCloseTimeout)
+			defer timer.Stop()
 			select {
-			case <-time.After(loggerCloseTimeout):
+			case <-timer.C:
 				logrus.Warn("Logger didn't exit in time: logs may be truncated")
 			case <-exit:
 			}
--- a/daemon/cluster/cluster.go
+++ b/daemon/cluster/cluster.go
@ -186,8 +186,11 @@ func (c *Cluster) Start() error {
 	}
 	c.nr = nr

+	timer := time.NewTimer(swarmConnectTimeout)
+	defer timer.Stop()
+
 	select {
-	case <-time.After(swarmConnectTimeout):
+	case <-timer.C:
 		logrus.Error("swarm component could not be started before timeout was reached")
 	case err := <-nr.Ready():
 		if err != nil {
--- a/daemon/cluster/swarm.go
+++ b/daemon/cluster/swarm.go
@ -194,8 +194,11 @@ func (c *Cluster) Join(req types.JoinRequest) error {
 	c.nr = nr
 	c.mu.Unlock()

+	timeout := time.NewTimer(swarmConnectTimeout)
+	defer timeout.Stop()
+
 	select {
-	case <-time.After(swarmConnectTimeout):
+	case <-timeout.C:
 		return errSwarmJoinTimeoutReached
 	case err := <-nr.Ready():
 		if err != nil {
--- a/daemon/daemon.go
+++ b/daemon/daemon.go
@ -486,12 +486,14 @@ func (daemon *Daemon) restore() error {
 			// ignore errors here as this is a best effort to wait for children to be
 			//   running before we try to start the container
 			children := daemon.children(c)
-			timeout := time.After(5 * time.Second)
+			timeout := time.NewTimer(5 * time.Second)
+			defer timeout.Stop()
+
 			for _, child := range children {
 				if notifier, exists := restartContainers[child]; exists {
 					select {
 					case <-notifier:
-					case <-timeout:
+					case <-timeout.C:
 					}
 				}
 			}
@ -609,6 +611,7 @@ func (daemon *Daemon) waitForNetworks(c *container.Container) {
 	if daemon.discoveryWatcher == nil {
 		return
 	}
+
 	// If the container has a network that requires discovery, make sure the discovery service is available before starting
 	for netName := range c.NetworkSettings.Networks {
 		// If we get `ErrNoSuchNetwork` here, we can assume that it is due to discovery not being ready
@ -617,13 +620,19 @@ func (daemon *Daemon) waitForNetworks(c *container.Container) {
 			if _, ok := err.(libnetwork.ErrNoSuchNetwork); !ok {
 				continue
 			}
+
 			// use a longish timeout here due to some slowdowns in libnetwork if the k/v store is on anything other than --net=host
 			// FIXME: why is this slow???
+			dur := 60 * time.Second
+			timer := time.NewTimer(dur)
+
 			logrus.Debugf("Container %s waiting for network to be ready", c.Name)
 			select {
 			case <-daemon.discoveryWatcher.ReadyCh():
-			case <-time.After(60 * time.Second):
+			case <-timer.C:
 			}
+			timer.Stop()
+
 			return
 		}
 	}
@ -673,10 +682,14 @@ func (daemon *Daemon) DaemonLeavesCluster() {
 	// This is called also on graceful daemon shutdown. We need to
 	// wait, because the ingress release has to happen before the
 	// network controller is stopped.
+
 	if done, err := daemon.ReleaseIngress(); err == nil {
+		timeout := time.NewTimer(5 * time.Second)
+		defer timeout.Stop()
+
 		select {
 		case <-done:
-		case <-time.After(5 * time.Second):
+		case <-timeout.C:
 			logrus.Warn("timeout while waiting for ingress network removal")
 		}
 	} else {
--- a/daemon/discovery/discovery.go
+++ b/daemon/discovery/discovery.go
@ -148,12 +148,14 @@ func (d *daemonDiscoveryReloader) initHeartbeat(address string) error {
 	// Setup a short ticker until the first heartbeat has succeeded
 	t := time.NewTicker(500 * time.Millisecond)
 	defer t.Stop()
+
 	// timeout makes sure that after a period of time we stop being so aggressive trying to reach the discovery service
-	timeout := time.After(60 * time.Second)
+	timeout := time.NewTimer(60 * time.Second)
+	defer timeout.Stop()

 	for {
 		select {
-		case <-timeout:
+		case <-timeout.C:
 			return errors.New("timeout waiting for initial discovery")
 		case <-d.term:
 			return errors.New("terminated")
--- a/daemon/exec.go
+++ b/daemon/exec.go
@ -23,7 +23,7 @@ import (
 )

 // How long to wait after sending TERM before trying KILL
-const termProcessTimeout = 10
+const termProcessTimeout = 10 * time.Second

 func (d *Daemon) registerExecCommand(container *container.Container, config *exec.Config) {
 	// Storing execs in container in order to kill them gracefully whenever the container is stopped or removed.
@ -277,9 +277,13 @@ func (d *Daemon) ContainerExecStart(ctx context.Context, name string, stdin io.R
 	case <-ctx.Done():
 		logrus.Debugf("Sending TERM signal to process %v in container %v", name, c.ID)
 		d.containerd.SignalProcess(ctx, c.ID, name, int(signal.SignalMap["TERM"]))
+
+		timeout := time.NewTimer(termProcessTimeout)
+		defer timeout.Stop()
+
 		select {
-		case <-time.After(termProcessTimeout * time.Second):
-			logrus.Infof("Container %v, process %v failed to exit within %d seconds of signal TERM - using the force", c.ID, name, termProcessTimeout)
+		case <-timeout.C:
+			logrus.Infof("Container %v, process %v failed to exit within %v of signal TERM - using the force", c.ID, name, termProcessTimeout)
 			d.containerd.SignalProcess(ctx, c.ID, name, int(signal.SignalMap["KILL"]))
 		case <-attachErr:
 			// TERM signal worked
--- a/daemon/health.go
+++ b/daemon/health.go
@ -187,12 +187,18 @@ func handleProbeResult(d *Daemon, c *container.Container, result *types.Healthch
 func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
 	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
 	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
+
+	intervalTimer := time.NewTimer(probeInterval)
+	defer intervalTimer.Stop()
+
 	for {
+		intervalTimer.Reset(probeInterval)
+
 		select {
 		case <-stop:
 			logrus.Debugf("Stop healthcheck monitoring for container %s (received while idle)", c.ID)
 			return
-		case <-time.After(probeInterval):
+		case <-intervalTimer.C:
 			logrus.Debugf("Running health check for container %s ...", c.ID)
 			startTime := time.Now()
 			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
--- a/daemon/resize.go
+++ b/daemon/resize.go
@ -38,13 +38,16 @@ func (daemon *Daemon) ContainerExecResize(name string, height, width int) error
 	if err != nil {
 		return err
 	}
+
 	// TODO: the timeout is hardcoded here, it would be more flexible to make it
 	// a parameter in resize request context, which would need API changes.
-	timeout := 10 * time.Second
+	timeout := time.NewTimer(10 * time.Second)
+	defer timeout.Stop()
+
 	select {
 	case <-ec.Started:
 		return daemon.containerd.ResizeTerminal(context.Background(), ec.ContainerID, ec.ID, width, height)
-	case <-time.After(timeout):
+	case <-timeout.C:
 		return fmt.Errorf("timeout waiting for exec session ready")
 	}
 }
--- a/libcontainerd/supervisor/remote_daemon.go
+++ b/libcontainerd/supervisor/remote_daemon.go
@ -89,8 +89,11 @@ func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Da

 	go r.monitorDaemon(ctx)

+	timeout := time.NewTimer(startupTimeout)
+	defer timeout.Stop()
+
 	select {
-	case <-time.After(startupTimeout):
+	case <-timeout.C:
 		return nil, errors.New("timeout waiting for containerd to start")
 	case err := <-r.daemonStartCh:
 		if err != nil {
@ -101,8 +104,11 @@ func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Da
 	return r, nil
 }
 func (r *remote) WaitTimeout(d time.Duration) error {
+	timeout := time.NewTimer(d)
+	defer timeout.Stop()
+
 	select {
-	case <-time.After(d):
+	case <-timeout.C:
 		return errors.New("timeout waiting for containerd to stop")
 	case <-r.daemonStopCh:
 	}
@ -230,7 +236,8 @@ func (r *remote) monitorDaemon(ctx context.Context) {
 		transientFailureCount = 0
 		client                *containerd.Client
 		err                   error
-		delay                 <-chan time.Time
+		delay                 time.Duration
+		timer                 = time.NewTimer(0)
 		started               bool
 	)

@ -245,10 +252,17 @@ func (r *remote) monitorDaemon(ctx context.Context) {
 		r.platformCleanup()

 		close(r.daemonStopCh)
+		timer.Stop()
 	}()

+	// Drain timer.C in case the zero-duration timer already fired, so the first Reset below starts from a stopped timer with an empty channel.
+	if !timer.Stop() {
+		<-timer.C
+	}
+
 	for {
-		if delay != nil {
+		timer.Reset(delay)
+
 		select {
 		case <-ctx.Done():
 			r.logger.Info("stopping healthcheck following graceful shutdown")
@ -256,8 +270,7 @@ func (r *remote) monitorDaemon(ctx context.Context) {
 				client.Close()
 			}
 			return
-			case <-delay:
-			}
+		case <-timer.C:
 		}

 		if r.daemonPid == -1 {
@ -277,14 +290,14 @@ func (r *remote) monitorDaemon(ctx context.Context) {
 					return
 				}
 				r.logger.WithError(err).Error("failed restarting containerd")
-				delay = time.After(50 * time.Millisecond)
+				delay = 50 * time.Millisecond
 				continue
 			}

 			client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second))
 			if err != nil {
 				r.logger.WithError(err).Error("failed connecting to containerd")
-				delay = time.After(100 * time.Millisecond)
+				delay = 100 * time.Millisecond
 				continue
 			}
 		}
@ -300,7 +313,7 @@ func (r *remote) monitorDaemon(ctx context.Context) {
 				}

 				transientFailureCount = 0
-				delay = time.After(500 * time.Millisecond)
+				delay = 500 * time.Millisecond
 				continue
 			}

@ -308,7 +321,7 @@ func (r *remote) monitorDaemon(ctx context.Context) {

 			transientFailureCount++
 			if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) {
-				delay = time.After(time.Duration(transientFailureCount) * 200 * time.Millisecond)
+				delay = time.Duration(transientFailureCount) * 200 * time.Millisecond
 				continue
 			}
 			client.Close()
@ -321,7 +334,7 @@ func (r *remote) monitorDaemon(ctx context.Context) {
 		}

 		r.daemonPid = -1
-		delay = nil
+		delay = 0
 		transientFailureCount = 0
 	}
 }
--- a/pkg/filenotify/poller.go
+++ b/pkg/filenotify/poller.go
@ -146,9 +146,18 @@ func (w *filePoller) sendErr(e error, chClose <-chan struct{}) error {
 // upon finding changes to a file or errors, sendEvent/sendErr is called
 func (w *filePoller) watch(f *os.File, lastFi os.FileInfo, chClose chan struct{}) {
 	defer f.Close()
+
+	timer := time.NewTimer(watchWaitTime)
+	if !timer.Stop() {
+		<-timer.C
+	}
+	defer timer.Stop()
+
 	for {
+		timer.Reset(watchWaitTime)
+
 		select {
-		case <-time.After(watchWaitTime):
+		case <-timer.C:
 		case <-chClose:
 			logrus.Debugf("watch for %s closed", f.Name())
 			return
--- a/pkg/pubsub/publisher.go
+++ b/pkg/pubsub/publisher.go
@ -107,9 +107,12 @@ func (p *Publisher) sendTopic(sub subscriber, topic topicFunc, v interface{}, wg

 	// send under a select so as not to block if the receiver is unavailable
 	if p.timeout > 0 {
+		timeout := time.NewTimer(p.timeout)
+		defer timeout.Stop()
+
 		select {
 		case sub <- v:
-		case <-time.After(p.timeout):
+		case <-timeout.C:
 		}
 		return
 	}
--- a/plugin/manager_linux.go
+++ b/plugin/manager_linux.go
@ -146,6 +146,8 @@ func (pm *Manager) restore(p *v2.Plugin, c *controller) error {
 	return nil
 }

+const shutdownTimeout = 10 * time.Second
+
 func shutdownPlugin(p *v2.Plugin, ec chan bool, executor Executor) {
 	pluginID := p.GetID()

@ -153,19 +155,26 @@ func shutdownPlugin(p *v2.Plugin, ec chan bool, executor Executor) {
 	if err != nil {
 		logrus.Errorf("Sending SIGTERM to plugin failed with error: %v", err)
 	} else {
+
+		timeout := time.NewTimer(shutdownTimeout)
+		defer timeout.Stop()
+
 		select {
 		case <-ec:
 			logrus.Debug("Clean shutdown of plugin")
-		case <-time.After(time.Second * 10):
+		case <-timeout.C:
 			logrus.Debug("Force shutdown plugin")
 			if err := executor.Signal(pluginID, int(unix.SIGKILL)); err != nil {
 				logrus.Errorf("Sending SIGKILL to plugin failed with error: %v", err)
 			}
+
+			timeout.Reset(shutdownTimeout)
+
 			select {
 			case <-ec:
 				logrus.Debug("SIGKILL plugin shutdown")
-			case <-time.After(time.Second * 10):
-				logrus.Debug("Force shutdown plugin FAILED")
+			case <-timeout.C:
+				logrus.WithField("plugin", p.Name).Warn("Force shutdown plugin FAILED")
 			}
 		}
 	}
--- a/restartmanager/restartmanager.go
+++ b/restartmanager/restartmanager.go
@ -107,11 +107,14 @@ func (rm *restartManager) ShouldRestart(exitCode uint32, hasBeenManuallyStopped

 	ch := make(chan error)
 	go func() {
+		timeout := time.NewTimer(rm.timeout)
+		defer timeout.Stop()
+
 		select {
 		case <-rm.cancel:
 			ch <- ErrRestartCanceled
 			close(ch)
-		case <-time.After(rm.timeout):
+		case <-timeout.C:
 			rm.Lock()
 			close(ch)
 			rm.active = false