2018-05-23 15:15:21 -04:00
|
|
|
package supervisor // import "github.com/docker/docker/libcontainerd/supervisor"
|
2017-09-22 09:52:41 -04:00
|
|
|
|
|
|
|
import (
|
|
|
|
"context"
|
|
|
|
"io"
|
|
|
|
"os"
|
|
|
|
"os/exec"
|
|
|
|
"path/filepath"
|
2022-09-28 13:33:53 -04:00
|
|
|
"runtime"
|
2017-09-22 09:52:41 -04:00
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
"time"
|
|
|
|
|
|
|
|
"github.com/containerd/containerd"
|
2018-10-18 15:37:23 -04:00
|
|
|
"github.com/containerd/containerd/services/server/config"
|
libcontainerd/supervisor: make supervisor adjust OOM score for containerd
Containerd, like dockerd, has an OOMScore configuration option to adjust its own
OOM score. In dockerd, this option was added when default installations were not
yet running the daemon as a systemd unit, which made it more complicated to set
the score, and adding a daemon option was convenient.
A binary adjusting its own score has been frowned upon, as it's more logical to
make that the responsibility of the process manager _starting_ the daemon, which
is what we did for dockerd in 21578530d7291f2e7bc0b90ace2f058df753a443.
There have been discussions on deprecating the daemon flag for dockerd, and
similar discussions have been happening for containerd.
This patch changes how we set the OOM score for the containerd child process,
and to have dockerd (supervisor) set the OOM score, as it's acting as process
manager in this case (performing a role similar to systemd otherwise).
With this patch, the score is still adjusted as usual, but not written to the
containerd configuration file;
dockerd --oom-score-adjust=-123
cat /proc/$(pidof containerd)/oom_score_adj
-123
As a follow-up, we may consider adjusting the containerd OOM score based on the
daemon's own score instead of on the `cli.OOMScoreAdjust` configuration so that
we will also adjust the score in situations where dockerd's OOM score was set
through other ways (systemd or manually adjusting the cgroup). A TODO was added
for this.
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2022-08-11 06:09:13 -04:00
|
|
|
"github.com/containerd/containerd/sys"
|
2017-09-22 09:52:41 -04:00
|
|
|
"github.com/docker/docker/pkg/system"
|
2021-04-02 11:22:22 -04:00
|
|
|
"github.com/pelletier/go-toml"
|
2017-09-22 09:52:41 -04:00
|
|
|
"github.com/pkg/errors"
|
|
|
|
"github.com/sirupsen/logrus"
|
|
|
|
)
|
|
|
|
|
|
|
|
// Tunables and well-known file names used by the containerd supervisor.
const (
	// maxConnectionRetryCount is the number of consecutive failed health
	// checks after which monitorDaemon gives up on the current process,
	// provided that process is no longer alive.
	maxConnectionRetryCount = 3
	// healthCheckTimeout bounds each IsServing health-check call made by
	// monitorDaemon.
	healthCheckTimeout = 3 * time.Second
	// shutdownTimeout is not referenced in this part of the file; presumably
	// the time allowed for containerd to stop — confirm against the
	// platform-specific code.
	shutdownTimeout = 15 * time.Second
	// startupTimeout is how long Start waits for the monitor to report the
	// outcome of the first containerd start.
	startupTimeout = 15 * time.Second
	// configFile is the name of the generated containerd configuration file;
	// Start joins it onto the state directory.
	configFile = "containerd.toml"
	// binaryName is the executable launched by startContainerd and the name
	// used in log messages.
	binaryName = "containerd"
	// pidFile is the name of the file holding the containerd PID; Start joins
	// it onto the state directory.
	pidFile = "containerd.pid"
)
|
|
|
|
|
|
|
|
// remote holds the state of a containerd daemon managed by this package. A
// *remote is the concrete value Start returns as a Daemon.
type remote struct {
	// Config is the embedded containerd server configuration. Start fills in
	// Version, Root and State; getContainerdConfig passes the remote to the
	// TOML encoder to produce the configuration file.
	config.Config

	// configFile is the location where the generated containerd configuration
	// file is saved.
	configFile string

	// daemonPid is the PID of the containerd process being tracked, or -1 when
	// there is none (the initial value set by Start, and the value
	// monitorDaemon assigns before a restart).
	daemonPid int
	// pidFile is the path of the file the containerd PID is written to and
	// read back from.
	pidFile string
	// logger is the log entry used by the supervisor; Start tags it with
	// module=libcontainerd.
	logger *logrus.Entry

	// daemonWaitCh is created for each process launched by startContainerd and
	// closed once that process has been waited on. It stays nil until a
	// process has been launched.
	daemonWaitCh chan struct{}
	// daemonStartCh reports the outcome of the first start to Start:
	// monitorDaemon sends the error if the first start fails, and closes the
	// channel after the first successful health check.
	daemonStartCh chan error
	// daemonStopCh is closed when monitorDaemon returns; WaitTimeout waits on
	// it.
	daemonStopCh chan struct{}

	// stateDir is the state directory passed to Start.
	stateDir string

	// oomScore adjusts the OOM score for the containerd process.
	oomScore int

	// logLevel overrides the containerd logging-level through the --log-level
	// command-line option.
	logLevel string
}
|
|
|
|
|
2018-05-23 15:15:21 -04:00
|
|
|
// Daemon represents a running containerd daemon.
type Daemon interface {
	// WaitTimeout waits for the daemon to stop and returns an error if the
	// given duration elapses first.
	WaitTimeout(time.Duration) error
	// Address returns the GRPC address the daemon is configured with.
	Address() string
}
|
|
|
|
|
|
|
|
// DaemonOpt configures parameters of a containerd daemon. Start applies each
// option to the remote, in the order given, before defaults are set; the first
// option that returns a non-nil error makes Start fail with that error.
type DaemonOpt func(c *remote) error
|
2017-09-22 09:52:41 -04:00
|
|
|
|
2018-05-23 15:15:21 -04:00
|
|
|
// Start starts a containerd daemon and monitors it
|
|
|
|
func Start(ctx context.Context, rootDir, stateDir string, opts ...DaemonOpt) (Daemon, error) {
|
2017-09-22 09:52:41 -04:00
|
|
|
r := &remote{
|
|
|
|
stateDir: stateDir,
|
2018-10-18 15:37:23 -04:00
|
|
|
Config: config.Config{
|
2022-07-27 10:33:00 -04:00
|
|
|
Version: 2,
|
|
|
|
Root: filepath.Join(rootDir, "daemon"),
|
|
|
|
State: filepath.Join(stateDir, "daemon"),
|
2017-09-22 09:52:41 -04:00
|
|
|
},
|
2022-08-11 07:00:01 -04:00
|
|
|
configFile: filepath.Join(stateDir, configFile),
|
2018-05-23 15:15:21 -04:00
|
|
|
daemonPid: -1,
|
2022-08-11 06:38:30 -04:00
|
|
|
pidFile: filepath.Join(stateDir, pidFile),
|
2018-05-23 15:15:21 -04:00
|
|
|
logger: logrus.WithField("module", "libcontainerd"),
|
2018-09-04 15:04:35 -04:00
|
|
|
daemonStartCh: make(chan error, 1),
|
2018-05-23 15:15:21 -04:00
|
|
|
daemonStopCh: make(chan struct{}),
|
2017-09-22 09:52:41 -04:00
|
|
|
}
|
|
|
|
|
2018-05-23 15:15:21 -04:00
|
|
|
for _, opt := range opts {
|
|
|
|
if err := opt(r); err != nil {
|
|
|
|
return nil, err
|
2017-09-22 09:52:41 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
r.setDefaults()
|
|
|
|
|
2019-08-08 05:51:00 -04:00
|
|
|
if err := system.MkdirAll(stateDir, 0700); err != nil {
|
2018-05-23 15:15:21 -04:00
|
|
|
return nil, err
|
2017-09-22 09:52:41 -04:00
|
|
|
}
|
|
|
|
|
2018-05-23 15:15:21 -04:00
|
|
|
go r.monitorDaemon(ctx)
|
2017-09-22 09:52:41 -04:00
|
|
|
|
2019-01-09 13:24:03 -05:00
|
|
|
timeout := time.NewTimer(startupTimeout)
|
|
|
|
defer timeout.Stop()
|
|
|
|
|
2018-05-23 15:15:21 -04:00
|
|
|
select {
|
2019-01-09 13:24:03 -05:00
|
|
|
case <-timeout.C:
|
2018-05-23 15:15:21 -04:00
|
|
|
return nil, errors.New("timeout waiting for containerd to start")
|
2018-09-04 15:04:35 -04:00
|
|
|
case err := <-r.daemonStartCh:
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2017-09-22 09:52:41 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
return r, nil
|
|
|
|
}
|
2018-05-23 15:15:21 -04:00
|
|
|
func (r *remote) WaitTimeout(d time.Duration) error {
|
2019-01-09 13:24:03 -05:00
|
|
|
timeout := time.NewTimer(d)
|
|
|
|
defer timeout.Stop()
|
|
|
|
|
2018-05-23 15:15:21 -04:00
|
|
|
select {
|
2019-01-09 13:24:03 -05:00
|
|
|
case <-timeout.C:
|
2018-05-23 15:15:21 -04:00
|
|
|
return errors.New("timeout waiting for containerd to stop")
|
|
|
|
case <-r.daemonStopCh:
|
2017-09-22 09:52:41 -04:00
|
|
|
}
|
|
|
|
|
2018-05-23 15:15:21 -04:00
|
|
|
return nil
|
2017-09-22 09:52:41 -04:00
|
|
|
}
|
|
|
|
|
2018-05-23 15:15:21 -04:00
|
|
|
func (r *remote) Address() string {
|
|
|
|
return r.GRPC.Address
|
2017-09-22 09:52:41 -04:00
|
|
|
}
|
|
|
|
func (r *remote) getContainerdPid() (int, error) {
|
2022-08-11 06:38:30 -04:00
|
|
|
f, err := os.OpenFile(r.pidFile, os.O_RDWR, 0600)
|
2017-09-22 09:52:41 -04:00
|
|
|
if err != nil {
|
|
|
|
if os.IsNotExist(err) {
|
|
|
|
return -1, nil
|
|
|
|
}
|
|
|
|
return -1, err
|
|
|
|
}
|
|
|
|
defer f.Close()
|
|
|
|
|
|
|
|
b := make([]byte, 8)
|
|
|
|
n, err := f.Read(b)
|
|
|
|
if err != nil && err != io.EOF {
|
|
|
|
return -1, err
|
|
|
|
}
|
|
|
|
|
|
|
|
if n > 0 {
|
|
|
|
pid, err := strconv.ParseUint(string(b[:n]), 10, 64)
|
|
|
|
if err != nil {
|
|
|
|
return -1, err
|
|
|
|
}
|
|
|
|
if system.IsProcessAlive(int(pid)) {
|
|
|
|
return int(pid), nil
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return -1, nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (r *remote) getContainerdConfig() (string, error) {
|
2022-08-11 07:00:01 -04:00
|
|
|
f, err := os.OpenFile(r.configFile, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0600)
|
2017-09-22 09:52:41 -04:00
|
|
|
if err != nil {
|
2022-08-11 07:00:01 -04:00
|
|
|
return "", errors.Wrapf(err, "failed to open containerd config file (%s)", r.configFile)
|
2017-09-22 09:52:41 -04:00
|
|
|
}
|
|
|
|
defer f.Close()
|
|
|
|
|
2021-04-02 11:22:22 -04:00
|
|
|
if err := toml.NewEncoder(f).Encode(r); err != nil {
|
2022-08-11 07:00:01 -04:00
|
|
|
return "", errors.Wrapf(err, "failed to write containerd config file (%s)", r.configFile)
|
2017-09-22 09:52:41 -04:00
|
|
|
}
|
2022-08-11 07:00:01 -04:00
|
|
|
return r.configFile, nil
|
2017-09-22 09:52:41 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (r *remote) startContainerd() error {
|
|
|
|
pid, err := r.getContainerdPid()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
if pid != -1 {
|
|
|
|
r.daemonPid = pid
|
2022-08-11 04:41:31 -04:00
|
|
|
r.logger.WithField("pid", pid).Infof("%s is still running", binaryName)
|
2017-09-22 09:52:41 -04:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
configFile, err := r.getContainerdConfig()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
args := []string{"--config", configFile}
|
2018-07-09 08:16:35 -04:00
|
|
|
|
2022-08-10 12:27:07 -04:00
|
|
|
if r.logLevel != "" {
|
|
|
|
args = append(args, "--log-level", r.logLevel)
|
2018-07-09 08:16:35 -04:00
|
|
|
}
|
|
|
|
|
2017-09-22 09:52:41 -04:00
|
|
|
cmd := exec.Command(binaryName, args...)
|
|
|
|
// redirect containerd logs to docker logs
|
|
|
|
cmd.Stdout = os.Stdout
|
|
|
|
cmd.Stderr = os.Stderr
|
|
|
|
cmd.SysProcAttr = containerdSysProcAttr()
|
|
|
|
// clear the NOTIFY_SOCKET from the env when starting containerd
|
|
|
|
cmd.Env = nil
|
|
|
|
for _, e := range os.Environ() {
|
|
|
|
if !strings.HasPrefix(e, "NOTIFY_SOCKET") {
|
|
|
|
cmd.Env = append(cmd.Env, e)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2022-09-28 13:33:53 -04:00
|
|
|
startedCh := make(chan error)
|
2017-09-22 09:52:41 -04:00
|
|
|
go func() {
|
2022-09-28 13:33:53 -04:00
|
|
|
// On Linux, when cmd.SysProcAttr.Pdeathsig is set,
|
|
|
|
// the signal is sent to the subprocess when the creating thread
|
|
|
|
// terminates. The runtime terminates a thread if a goroutine
|
|
|
|
// exits while locked to it. Prevent the containerd process
|
|
|
|
// from getting killed prematurely by ensuring that the thread
|
|
|
|
// used to start it remains alive until it or the daemon process
|
|
|
|
// exits. See https://go.dev/issue/27505 for more details.
|
|
|
|
runtime.LockOSThread()
|
|
|
|
defer runtime.UnlockOSThread()
|
|
|
|
err := cmd.Start()
|
|
|
|
startedCh <- err
|
|
|
|
if err != nil {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
|
|
|
r.daemonWaitCh = make(chan struct{})
|
2017-09-22 09:52:41 -04:00
|
|
|
// Reap our child when needed
|
|
|
|
if err := cmd.Wait(); err != nil {
|
|
|
|
r.logger.WithError(err).Errorf("containerd did not exit successfully")
|
|
|
|
}
|
|
|
|
close(r.daemonWaitCh)
|
|
|
|
}()
|
2022-09-28 13:33:53 -04:00
|
|
|
if err := <-startedCh; err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-09-22 09:52:41 -04:00
|
|
|
|
|
|
|
r.daemonPid = cmd.Process.Pid
|
|
|
|
|
libcontainerd/supervisor: make supervisor adjust OOM score for containerd
Containerd, like dockerd has a OOMScore configuration option to adjust its own
OOM score. In dockerd, this option was added when default installations were not
yet running the daemon as a systemd unit, which made it more complicated to set
the score, and adding a daemon option was convenient.
A binary adjusting its own score has been frowned upon, as it's more logical to
make that the responsibility of the process manager _starting_ the daemon, which
is what we did for dockerd in 21578530d7291f2e7bc0b90ace2f058df753a443.
There have been discussions on deprecating the daemon flag for dockerd, and
similar discussions have been happening for containerd.
This patch changes how we set the OOM score for the containerd child process,
and to have dockerd (supervisor) set the OOM score, as it's acting as process
manager in this case (performing a role similar to systemd otherwise).
With this patch, the score is still adjusted as usual, but not written to the
containerd configuration file;
dockerd --oom-score-adjust=-123
cat /proc/$(pidof containerd)/oom_score_adj
-123
As a follow-up, we may consider to adjust the containerd OOM score based on the
daemon's own score instead of on the `cli.OOMScoreAdjust` configuration so that
we will also adjust the score in situations where dockerd's OOM score was set
through other ways (systemd or manually adjusting the cgroup). A TODO was added
for this.
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2022-08-11 06:09:13 -04:00
|
|
|
if err := r.adjustOOMScore(); err != nil {
|
|
|
|
r.logger.WithError(err).Warn("failed to adjust OOM score")
|
|
|
|
}
|
|
|
|
|
2022-08-11 06:38:30 -04:00
|
|
|
err = os.WriteFile(r.pidFile, []byte(strconv.Itoa(r.daemonPid)), 0660)
|
2017-09-22 09:52:41 -04:00
|
|
|
if err != nil {
|
|
|
|
system.KillProcess(r.daemonPid)
|
|
|
|
return errors.Wrap(err, "libcontainerd: failed to save daemon pid to disk")
|
|
|
|
}
|
|
|
|
|
2022-08-09 15:50:10 -04:00
|
|
|
r.logger.WithField("pid", r.daemonPid).WithField("address", r.Address()).Infof("started new %s process", binaryName)
|
2017-09-22 09:52:41 -04:00
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
libcontainerd/supervisor: make supervisor adjust OOM score for containerd
Containerd, like dockerd, has an OOMScore configuration option to adjust its own
OOM score. In dockerd, this option was added when default installations were not
yet running the daemon as a systemd unit, which made it more complicated to set
the score, and adding a daemon option was convenient.
A binary adjusting its own score has been frowned upon, as it's more logical to
make that the responsibility of the process manager _starting_ the daemon, which
is what we did for dockerd in 21578530d7291f2e7bc0b90ace2f058df753a443.
There have been discussions on deprecating the daemon flag for dockerd, and
similar discussions have been happening for containerd.
This patch changes how we set the OOM score for the containerd child process,
and to have dockerd (supervisor) set the OOM score, as it's acting as process
manager in this case (performing a role similar to systemd otherwise).
With this patch, the score is still adjusted as usual, but not written to the
containerd configuration file;
dockerd --oom-score-adjust=-123
cat /proc/$(pidof containerd)/oom_score_adj
-123
As a follow-up, we may consider adjusting the containerd OOM score based on the
daemon's own score instead of on the `cli.OOMScoreAdjust` configuration so that
we will also adjust the score in situations where dockerd's OOM score was set
through other ways (systemd or manually adjusting the cgroup). A TODO was added
for this.
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2022-08-11 06:09:13 -04:00
|
|
|
func (r *remote) adjustOOMScore() error {
|
|
|
|
if r.oomScore == 0 || r.daemonPid <= 1 {
|
|
|
|
// no score configured, or daemonPid contains an invalid PID (we don't
|
|
|
|
// expect containerd to be running as PID 1 :)).
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
if err := sys.SetOOMScore(r.daemonPid, r.oomScore); err != nil {
|
|
|
|
return errors.Wrap(err, "failed to adjust OOM score for containerd process")
|
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2018-05-23 15:15:21 -04:00
|
|
|
// monitorDaemon starts containerd when no process is tracked, health-checks
// it, and restarts it when it stops responding and is no longer alive. It
// runs until ctx is cancelled, or until the very first start attempt fails.
//
// The outcome of the first start is reported on r.daemonStartCh: the error is
// sent if startContainerd fails, and the channel is closed after the first
// successful health check. When the function returns it stops the daemon (if
// one is tracked), removes the PID file, runs the platform cleanup and closes
// r.daemonStopCh.
func (r *remote) monitorDaemon(ctx context.Context) {
	var (
		// transientFailureCount counts consecutive failed health checks.
		transientFailureCount = 0
		client                *containerd.Client
		err                   error
		// delay is how long to wait before the next loop iteration.
		delay time.Duration
		timer = time.NewTimer(0)
		// started records that a health check has succeeded at least once.
		started bool
	)

	defer func() {
		if r.daemonPid != -1 {
			r.stopDaemon()
		}

		// cleanup some files
		_ = os.Remove(r.pidFile)

		r.platformCleanup()

		// Signals WaitTimeout that the daemon has stopped.
		close(r.daemonStopCh)
		timer.Stop()
	}()

	// ensure no races on sending to timer.C even though there is a 0 duration.
	if !timer.Stop() {
		<-timer.C
	}

	for {
		timer.Reset(delay)

		// Wait out the delay, unless shutdown is requested first.
		select {
		case <-ctx.Done():
			r.logger.Info("stopping healthcheck following graceful shutdown")
			if client != nil {
				client.Close()
			}
			return
		case <-timer.C:
		}

		if r.daemonPid == -1 {
			// No process is tracked. If an earlier process was launched by
			// startContainerd, wait until it has been reaped before starting
			// a new one.
			if r.daemonWaitCh != nil {
				select {
				case <-ctx.Done():
					r.logger.Info("stopping containerd startup following graceful shutdown")
					return
				case <-r.daemonWaitCh:
				}
			}

			// Remove whatever is left at the GRPC address before starting.
			os.RemoveAll(r.GRPC.Address)
			if err := r.startContainerd(); err != nil {
				if !started {
					// The very first start failed: report it to Start and
					// stop monitoring.
					r.daemonStartCh <- err
					return
				}
				r.logger.WithError(err).Error("failed restarting containerd")
				delay = 50 * time.Millisecond
				continue
			}

			client, err = containerd.New(r.GRPC.Address, containerd.WithTimeout(60*time.Second))
			if err != nil {
				r.logger.WithError(err).Error("failed connecting to containerd")
				delay = 100 * time.Millisecond
				continue
			}
			r.logger.WithField("address", r.GRPC.Address).Debug("created containerd monitoring client")
		}

		if client != nil {
			tctx, cancel := context.WithTimeout(ctx, healthCheckTimeout)
			_, err := client.IsServing(tctx)
			cancel()
			if err == nil {
				if !started {
					// First successful health check: unblock Start.
					close(r.daemonStartCh)
					started = true
				}

				transientFailureCount = 0

				// Healthy: block until the process has been reaped or
				// shutdown is requested. A nil r.daemonWaitCh never becomes
				// ready, so in that case only ctx ends the wait.
				select {
				case <-r.daemonWaitCh:
				case <-ctx.Done():
				}

				// Set a small delay in case there is a recurring failure (or bug in this code)
				// to ensure we don't end up in a super tight loop.
				delay = 500 * time.Millisecond
				continue
			}

			r.logger.WithError(err).WithField("binary", binaryName).Debug("daemon is not responding")

			transientFailureCount++
			// Keep retrying, with a growing delay, while the retry budget is
			// not used up or the process is still alive.
			if transientFailureCount < maxConnectionRetryCount || system.IsProcessAlive(r.daemonPid) {
				delay = time.Duration(transientFailureCount) * 200 * time.Millisecond
				continue
			}
			client.Close()
			client = nil
		}

		if system.IsProcessAlive(r.daemonPid) {
			r.logger.WithField("pid", r.daemonPid).Info("killing and restarting containerd")
			r.killDaemon()
		}

		// Forget the process so the next iteration starts a new one right
		// away.
		r.daemonPid = -1
		delay = 0
		transientFailureCount = 0
	}
}
|