Mirror of https://github.com/moby/moby.git
health: Start probe timeout after exec starts
Starting an exec can take a significant amount of time while under heavy
container operation load. In extreme cases the time to start the process
can take upwards of a second, which is a significant fraction of the
default health probe timeout (30s). With a shorter timeout, the exec
start delay could make the difference between a successful probe and a
probe timeout!

Mitigate the impact of excessive exec start latencies by only starting
the probe timeout timer after the exec'ed process has started.

Add a metric to sample the latency of starting health-check exec probes.

Signed-off-by: Cory Snider <csnider@mirantis.com>
Parent: 3fd01fc3c6
Commit: bdc6473d2d
2 changed files with 48 additions and 18 deletions
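The core of the change is a two-phase timeout: a fixed allowance for getting the exec started, then the user-configured probe timeout armed only once the probe process is actually running. Before reading the diff, here is a minimal, self-contained sketch of that pattern; the startExec callback and its started/done channels are hypothetical stand-ins for d.ContainerExecStart and execConfig.Started, and the 30s start allowance matches the constant used below.

package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// probe races the exec-start phase against a fixed 30s allowance, and only
// once the process has started does it arm the (possibly much shorter)
// probe timeout.
func probe(ctx context.Context, probeTimeout time.Duration,
	startExec func(context.Context) (started <-chan struct{}, done <-chan error)) error {
	started, done := startExec(ctx)

	// Phase 1: a generous fixed timeout for starting the exec itself. This
	// latency is not the probe's fault, so it must not eat into probeTimeout.
	tm := time.NewTimer(30 * time.Second)
	defer tm.Stop()
	select {
	case <-tm.C:
		return errors.New("timed out starting health check")
	case err := <-done:
		return err // exec ended before reporting that it started
	case <-started:
		// The probe process is running; only now start its clock.
	}

	// Phase 2: the user-configured probe timeout, measured from exec start.
	if !tm.Stop() {
		<-tm.C
	}
	tm.Reset(probeTimeout)
	select {
	case <-tm.C:
		return fmt.Errorf("health check exceeded timeout (%v)", probeTimeout)
	case err := <-done:
		return err
	}
}

func main() {
	err := probe(context.Background(), 2*time.Second,
		func(ctx context.Context) (<-chan struct{}, <-chan error) {
			started := make(chan struct{})
			done := make(chan error, 1)
			go func() {
				time.Sleep(800 * time.Millisecond) // simulated slow exec start
				close(started)
				time.Sleep(time.Second) // simulated probe runtime
				done <- nil
			}()
			return started, done
		})
	fmt.Println("probe result:", err) // succeeds: 800ms of start latency is not charged to the probe
}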
daemon/health.go
@@ -62,6 +62,7 @@ type cmdProbe struct {
 // exec the healthcheck command in the container.
 // Returns the exit code and probe output (if any)
 func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container) (*types.HealthcheckResult, error) {
+	startTime := time.Now()
 	cmdSlice := strslice.StrSlice(cntr.Config.Healthcheck.Test)[1:]
 	if p.shell {
 		cmdSlice = append(getShell(cntr), cmdSlice...)
@@ -93,10 +94,52 @@ func (p *cmdProbe) run(ctx context.Context, d *Daemon, cntr *container.Container
 	d.LogContainerEventWithAttributes(cntr, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "), attributes)
 
 	output := &limitedBuffer{}
-	err = d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
-	if err != nil {
-		return nil, err
+	probeCtx, cancelProbe := context.WithCancel(ctx)
+	defer cancelProbe()
+	execErr := make(chan error, 1)
+	go func() { execErr <- d.ContainerExecStart(probeCtx, execConfig.ID, nil, output, output) }()
+
+	// Starting an exec can take a significant amount of time: on the order
+	// of 1s in extreme cases. The time it takes dockerd and containerd to
+	// start the exec is time that the probe process is not running, and so
+	// should not count towards the health check's timeout. Apply a separate
+	// timeout to abort if the exec request is wedged.
+	tm := time.NewTimer(30 * time.Second)
+	defer tm.Stop()
+	select {
+	case <-tm.C:
+		return nil, fmt.Errorf("timed out starting health check for container %s", cntr.ID)
+	case err := <-execErr:
+		if err != nil {
+			return nil, err
+		}
+	case <-execConfig.Started:
+		healthCheckStartDuration.UpdateSince(startTime)
 	}
+
+	if !tm.Stop() {
+		<-tm.C
+	}
+	probeTimeout := timeoutWithDefault(cntr.Config.Healthcheck.Timeout, defaultProbeTimeout)
+	tm.Reset(probeTimeout)
+	select {
+	case <-tm.C:
+		cancelProbe()
+		logrus.WithContext(ctx).Debugf("Health check for container %s taking too long", cntr.ID)
+		// Wait for probe to exit (it might take a while to respond to the TERM
+		// signal and we don't want dying probes to pile up).
+		<-execErr
+		return &types.HealthcheckResult{
+			ExitCode: -1,
+			Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
+			End:      time.Now(),
+		}, nil
+	case err := <-execErr:
+		if err != nil {
+			return nil, err
+		}
+	}
+
 	info, err := d.getExecConfig(execConfig.ID)
 	if err != nil {
 		return nil, err
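One subtlety in the hunk above is the timer reuse between the two phases. Under the time.Timer contract (as documented when this commit was written), Reset should only be called on a timer that is stopped and whose channel has been drained; otherwise a stale tick from the start phase could be mistaken for a probe timeout. A small standalone demonstration of the drain-then-reset idiom:

package main

import (
	"fmt"
	"time"
)

func main() {
	tm := time.NewTimer(10 * time.Millisecond)
	time.Sleep(20 * time.Millisecond) // let the first phase's timer fire

	// Stop returns false when the timer has already expired; in that case a
	// tick is sitting in the buffered channel and must be drained, or the
	// next select would be satisfied immediately by the stale tick.
	if !tm.Stop() {
		<-tm.C
	}
	tm.Reset(50 * time.Millisecond)

	start := time.Now()
	<-tm.C
	fmt.Printf("second phase waited %v, not 0\n",
		time.Since(start).Round(10*time.Millisecond))
}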
@@ -184,7 +227,6 @@ func handleProbeResult(d *Daemon, c *container.Container, result *types.Healthch
 // Run the container's monitoring thread until notified via "stop".
 // There is never more than one monitor thread running per container at a time.
 func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
-	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
 	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
 
 	intervalTimer := time.NewTimer(probeInterval)
@@ -200,7 +242,7 @@ func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe)
 		case <-intervalTimer.C:
 			logrus.Debugf("Running health check for container %s ...", c.ID)
 			startTime := time.Now()
-			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
+			ctx, cancelProbe := context.WithCancel(context.Background())
 			results := make(chan *types.HealthcheckResult, 1)
 			go func() {
 				healthChecksCounter.Inc()
@@ -231,20 +273,7 @@ func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe)
 				return
 			case result := <-results:
 				handleProbeResult(d, c, result, stop)
-				// Stop timeout
 				cancelProbe()
-			case <-ctx.Done():
-				logrus.Debugf("Health check for container %s taking too long", c.ID)
-				handleProbeResult(d, c, &types.HealthcheckResult{
-					ExitCode: -1,
-					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
-					Start:    startTime,
-					End:      time.Now(),
-				}, stop)
-				cancelProbe()
-				// Wait for probe to exit (it might take a while to respond to the TERM
-				// signal and we don't want dying probes to pile up).
-				<-results
 			}
 		}
 	}
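With the probe now responsible for enforcing its own timeout, the monitor loop only needs a cancellable context and the entire ctx.Done() branch disappears. A rough, compilable sketch of the resulting control flow, where result, handle, and the run callback are hypothetical stand-ins for the daemon's types:

package main

import (
	"context"
	"fmt"
	"time"
)

type result struct{ exitCode int }

func handle(r result) { fmt.Println("probe exit code:", r.exitCode) }

// monitorLoop mirrors the shape of monitor() after this commit: the probe
// (run) is expected to time itself out and report that as a result, so the
// loop uses context.WithCancel rather than context.WithTimeout.
func monitorLoop(stop chan struct{}, interval time.Duration, run func(context.Context) result) {
	intervalTimer := time.NewTimer(interval)
	defer intervalTimer.Stop()
	for {
		select {
		case <-stop:
			return
		case <-intervalTimer.C:
			ctx, cancelProbe := context.WithCancel(context.Background())
			results := make(chan result, 1)
			go func() { results <- run(ctx) }()
			select {
			case <-stop:
				cancelProbe()
				<-results // wait for the probe so dying probes don't pile up
				return
			case r := <-results:
				handle(r)
				cancelProbe()
			}
			intervalTimer.Reset(interval)
		}
	}
}

func main() {
	stop := make(chan struct{})
	go func() { time.Sleep(250 * time.Millisecond); close(stop) }()
	monitorLoop(stop, 100*time.Millisecond, func(ctx context.Context) result {
		return result{exitCode: 0}
	})
}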
daemon/metrics.go
@@ -37,6 +37,7 @@ var (
 
 	healthChecksCounter       = metricsNS.NewCounter("health_checks", "The total number of health checks")
 	healthChecksFailedCounter = metricsNS.NewCounter("health_checks_failed", "The total number of failed health checks")
+	healthCheckStartDuration  = metricsNS.NewTimer("health_check_start_duration", "The number of seconds it takes to prepare to run health checks")
 
 	stateCtr = newStateCounter(metricsNS, metricsNS.NewDesc("container_states", "The count of containers in various states", metrics.Unit("containers"), "state"))
 )
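The new health_check_start_duration metric is a github.com/docker/go-metrics Timer, whose UpdateSince records the elapsed time since a given start instant. An illustrative sketch of how such a timer is registered and sampled; the namespace name here is made up for the example, since the daemon defines its own metricsNS:

package main

import (
	"net/http"
	"time"

	metrics "github.com/docker/go-metrics"
)

func main() {
	// Illustrative namespace; the real daemon builds metricsNS itself.
	ns := metrics.NewNamespace("engine", "daemon", nil)
	healthCheckStartDuration := ns.NewTimer("health_check_start_duration",
		"The number of seconds it takes to prepare to run health checks")
	metrics.Register(ns)

	// Sample the metric the same way the probe code above does.
	start := time.Now()
	time.Sleep(50 * time.Millisecond) // stand-in for the exec start path
	healthCheckStartDuration.UpdateSince(start)

	// Expose the metric for Prometheus to scrape.
	http.Handle("/metrics", metrics.Handler())
	_ = http.ListenAndServe(":9090", nil)
}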