moby--moby/daemon/monitor.go
package daemon // import "github.com/docker/docker/daemon"

import (
	"context"
	"errors"
	"fmt"
	"runtime"
	"strconv"
	"time"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/container"
	"github.com/docker/docker/libcontainerd"
	"github.com/docker/docker/restartmanager"
	"github.com/sirupsen/logrus"
)
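
// setStateCounter updates the daemon-wide state counter (stateCtr) for the
// container, bucketing every state other than "paused" and "running" as
// "stopped".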
func (daemon *Daemon) setStateCounter(c *container.Container) {
	switch c.StateString() {
	case "paused":
		stateCtr.set(c.ID, "paused")
	case "running":
		stateCtr.set(c.ID, "running")
	default:
		stateCtr.set(c.ID, "stopped")
	}
}

// ProcessEvent is called by libcontainerd whenever an event occurs
func (daemon *Daemon) ProcessEvent(id string, e libcontainerd.EventType, ei libcontainerd.EventInfo) error {
	c, err := daemon.GetContainer(id)
	if c == nil || err != nil {
		return fmt.Errorf("no such container: %s", id)
	}

	switch e {
	case libcontainerd.EventOOM:
		// StateOOM is Linux specific and should never be hit on Windows
		if runtime.GOOS == "windows" {
			return errors.New("received StateOOM from libcontainerd on Windows. This should never happen")
		}

		c.Lock()
		defer c.Unlock()
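		// Refresh the health monitor, persist the container state, and emit
		// an "oom" event for API clients.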
		daemon.updateHealthMonitor(c)
		if err := c.CheckpointTo(daemon.containersReplica); err != nil {
			return err
		}

		daemon.LogContainerEvent(c, "oom")
	case libcontainerd.EventExit:
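		// An exit event whose PID matches the container's init process means
		// the container itself exited.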
		if int(ei.Pid) == c.Pid {
			c.Lock()
			_, _, err := daemon.containerd.DeleteTask(context.Background(), c.ID)
			if err != nil {
				logrus.WithError(err).Warnf("failed to delete container %s from containerd", c.ID)
			}

			c.StreamConfig.Wait()
			c.Reset(false)

			exitStatus := container.ExitStatus{
				ExitCode:  int(ei.ExitCode),
				ExitedAt:  ei.ExitedAt,
				OOMKilled: ei.OOMKilled,
			}
			restart, wait, err := c.RestartManager().ShouldRestart(ei.ExitCode, daemon.IsShuttingDown() || c.HasBeenManuallyStopped, time.Since(c.StartedAt))
			if err == nil && restart {
				c.RestartCount++
				c.SetRestarting(&exitStatus)
			} else {
				c.SetStopped(&exitStatus)
				defer daemon.autoRemove(c)
			}
			defer c.Unlock() // needs to be called before autoRemove

			// cancel healthchecks here; they will be automatically
			// restarted if/when the container is started again
			daemon.stopHealthchecks(c)
			attributes := map[string]string{
				"exitCode": strconv.Itoa(int(ei.ExitCode)),
			}
			daemon.LogContainerEventWithAttributes(c, "die", attributes)
			daemon.Cleanup(c)

			if err == nil && restart {
				go func() {
					err := <-wait
					if err == nil {
						// daemon.netController is initialized while the daemon is restoring
						// containers, but containerStart needs daemon.netController, so wait
						// here until the restore is done to avoid a panic during startup.
						daemon.waitForStartupDone()
						if err = daemon.containerStart(c, "", "", false); err != nil {
							logrus.Debugf("failed to restart container: %+v", err)
						}
					}
					if err != nil {
						c.Lock()
						c.SetStopped(&exitStatus)
						c.Unlock()
						defer daemon.autoRemove(c)
						if err != restartmanager.ErrRestartCanceled {
							logrus.Errorf("restartmanager wait error: %+v", err)
						}
					}
				}()
			}
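
			// Update the state counter and persist the container state before
			// running post-run processing for the exited task.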
			daemon.setStateCounter(c)
			if err := c.CheckpointTo(daemon.containersReplica); err != nil {
				return err
			}
			return daemon.postRunProcessing(c, ei)
		}
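		// Otherwise the exit belongs to an exec'd process; record its exit
		// code and clean up its streams.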
		if execConfig := c.ExecCommands.Get(ei.ProcessID); execConfig != nil {
			ec := int(ei.ExitCode)
			execConfig.Lock()
			defer execConfig.Unlock()
			execConfig.ExitCode = &ec
			execConfig.Running = false
			execConfig.StreamConfig.Wait()
			if err := execConfig.CloseStreams(); err != nil {
				logrus.Errorf("failed to cleanup exec %s streams: %s", c.ID, err)
			}

			// remove the exec command from the container's store only and not the
			// daemon's store so that the exec command can be inspected.
			c.ExecCommands.Delete(execConfig.ID, execConfig.Pid)

			attributes := map[string]string{
				"execID":   execConfig.ID,
				"exitCode": strconv.Itoa(ec),
			}
			daemon.LogContainerEventWithAttributes(c, "exec_die", attributes)
		} else {
			logrus.WithFields(logrus.Fields{
				"container": c.ID,
				"exec-id":   ei.ProcessID,
				"exec-pid":  ei.Pid,
			}).Warnf("Ignoring Exit Event, no such exec command found")
		}
	case libcontainerd.EventStart:
		c.Lock()
		defer c.Unlock()

		// This is here to handle start events not generated by docker
		if !c.Running {
			c.SetRunning(int(ei.Pid), false)
			c.HasBeenManuallyStopped = false
			c.HasBeenStartedBefore = true
			daemon.setStateCounter(c)

			daemon.initHealthMonitor(c)

			if err := c.CheckpointTo(daemon.containersReplica); err != nil {
				return err
			}
			daemon.LogContainerEvent(c, "start")
		}
	case libcontainerd.EventPaused:
		c.Lock()
		defer c.Unlock()

		if !c.Paused {
			c.Paused = true
			daemon.setStateCounter(c)
			daemon.updateHealthMonitor(c)
			if err := c.CheckpointTo(daemon.containersReplica); err != nil {
				return err
			}
			daemon.LogContainerEvent(c, "pause")
		}
	case libcontainerd.EventResumed:
		c.Lock()
		defer c.Unlock()

		if c.Paused {
			c.Paused = false
			daemon.setStateCounter(c)
			daemon.updateHealthMonitor(c)

			if err := c.CheckpointTo(daemon.containersReplica); err != nil {
				return err
			}
			daemon.LogContainerEvent(c, "unpause")
		}
	}
	return nil
}
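
// autoRemove force-removes the container and its volumes when its HostConfig
// has AutoRemove set, logging the error if removal fails while the container
// still exists.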
func (daemon *Daemon) autoRemove(c *container.Container) {
	c.Lock()
	ar := c.HostConfig.AutoRemove
	c.Unlock()
	if !ar {
		return
	}

	var err error
	if err = daemon.ContainerRm(c.ID, &types.ContainerRmConfig{ForceRemove: true, RemoveVolume: true}); err == nil {
		return
	}
	if c := daemon.containers.Get(c.ID); c == nil {
		return
	}

	if err != nil {
		logrus.WithError(err).WithField("container", c.ID).Error("error removing container")
	}
}