1
0
Fork 0
mirror of https://github.com/moby/moby.git synced 2022-11-09 12:21:53 -05:00
moby--moby/daemon/monitor.go
Thomas Leonard b6c7becbfe
Add support for user-defined healthchecks
This PR adds support for user-defined health-check probes for Docker
containers. It adds a `HEALTHCHECK` instruction to the Dockerfile syntax plus
some corresponding "docker run" options. It can be used with a restart policy
to automatically restart a container if the check fails.

The `HEALTHCHECK` instruction has two forms:

* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)

The `HEALTHCHECK` instruction tells Docker how to test a container to check that
it is still working. This can detect cases such as a web server that is stuck in
an infinite loop and unable to handle new connections, even though the server
process is still running.

When a container has a healthcheck specified, it has a _health status_ in
addition to its normal status. This status is initially `starting`. Whenever a
health check passes, it becomes `healthy` (whatever state it was previously in).
After a certain number of consecutive failures, it becomes `unhealthy`.

The options that can appear before `CMD` are:

* `--interval=DURATION` (default: `30s`)
* `--timeout=DURATION` (default: `30s`)
* `--retries=N` (default: `1`)

The health check will first run **interval** seconds after the container is
started, and then again **interval** seconds after each previous check completes.

If a single run of the check takes longer than **timeout** seconds then the check
is considered to have failed.

It takes **retries** consecutive failures of the health check for the container
to be considered `unhealthy`.

There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list
more than one then only the last `HEALTHCHECK` will take effect.

The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK
CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands;
see e.g. `ENTRYPOINT` for details).

The command's exit status indicates the health status of the container.
The possible values are:

- 0: success - the container is healthy and ready for use
- 1: unhealthy - the container is not working correctly
- 2: starting - the container is not ready for use yet, but is working correctly

If the probe returns 2 ("starting") when the container has already moved out of the
"starting" state then it is treated as "unhealthy" instead.

For example, to check every five minutes or so that a web-server is able to
serve the site's main page within three seconds:

    HEALTHCHECK --interval=5m --timeout=3s \
      CMD curl -f http://localhost/ || exit 1

To help debug failing probes, any output text (UTF-8 encoded) that the command writes
on stdout or stderr will be stored in the health status and can be queried with
`docker inspect`. Such output should be kept short (only the first 4096 bytes
are stored currently).

When the health status of a container changes, a `health_status` event is
generated with the new status. The health status is also displayed in the
`docker ps` output.

Signed-off-by: Thomas Leonard <thomas.leonard@docker.com>
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2016-06-02 23:58:34 +02:00

156 lines
4.1 KiB
Go

package daemon
import (
"errors"
"fmt"
"io"
"runtime"
"strconv"
"github.com/Sirupsen/logrus"
"github.com/docker/docker/libcontainerd"
"github.com/docker/docker/runconfig"
)
// StateChanged applies a state transition reported by libcontainerd to the
// container identified by id.
//
// It returns an error when no container with that id is known, when an OOM
// notification arrives on Windows, or when persisting the container to disk
// (or the post-run processing after an exit) fails. States that are not
// handled by the switch below are ignored and nil is returned.
func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
	ctr := daemon.containers.Get(id)
	if ctr == nil {
		return fmt.Errorf("no such container: %s", id)
	}

	switch e.State {
	case libcontainerd.StateOOM:
		// StateOOM is Linux specific and should never be hit on Windows
		if runtime.GOOS == "windows" {
			return errors.New("Received StateOOM from libcontainerd on Windows. This should never happen.")
		}
		daemon.updateHealthMonitor(ctr)
		daemon.LogContainerEvent(ctr, "oom")

	case libcontainerd.StateExit:
		// The lock is held until StateChanged returns, i.e. across the
		// cleanup, the write to disk and the post-run processing.
		ctr.Lock()
		defer ctr.Unlock()

		ctr.Wait()
		ctr.Reset(false)
		ctr.SetStopped(platformConstructExitStatus(e))

		daemon.updateHealthMonitor(ctr)
		daemon.LogContainerEventWithAttributes(ctr, "die", map[string]string{
			"exitCode": strconv.Itoa(int(e.ExitCode)),
		})
		daemon.Cleanup(ctr)

		// FIXME: here is race condition between two RUN instructions in Dockerfile
		// because they share same runconfig and change image. Must be fixed
		// in builder/builder.go
		if err := ctr.ToDisk(); err != nil {
			return err
		}
		return daemon.postRunProcessing(ctr, e)

	case libcontainerd.StateRestart:
		ctr.Lock()
		defer ctr.Unlock()

		ctr.Reset(false)
		ctr.RestartCount++
		ctr.SetRestarting(platformConstructExitStatus(e))

		// A restart is reported as a "die" event carrying the exit code.
		daemon.LogContainerEventWithAttributes(ctr, "die", map[string]string{
			"exitCode": strconv.Itoa(int(e.ExitCode)),
		})
		daemon.updateHealthMonitor(ctr)
		return ctr.ToDisk()

	case libcontainerd.StateExitProcess:
		ctr.Lock()
		defer ctr.Unlock()

		execConfig := ctr.ExecCommands.Get(e.ProcessID)
		if execConfig == nil {
			logrus.Warnf("Ignoring StateExitProcess for %v but no exec command found", e)
			break
		}

		// Record the exit code and mark the exec as finished before
		// waiting on it and closing its streams.
		code := int(e.ExitCode)
		execConfig.ExitCode = &code
		execConfig.Running = false
		execConfig.Wait()
		if err := execConfig.CloseStreams(); err != nil {
			logrus.Errorf("%s: %s", ctr.ID, err)
		}

		// remove the exec command from the container's store only and not the
		// daemon's store so that the exec command can be inspected.
		ctr.ExecCommands.Delete(execConfig.ID)

	case libcontainerd.StateStart, libcontainerd.StateRestore:
		// Container is already locked in this case
		isStart := e.State == libcontainerd.StateStart
		ctr.SetRunning(int(e.Pid), isStart)
		ctr.HasBeenManuallyStopped = false
		if err := ctr.ToDisk(); err != nil {
			// The running state could not be persisted; reset the container
			// and report the failure.
			ctr.Reset(false)
			return err
		}
		daemon.initHealthMonitor(ctr)
		daemon.LogContainerEvent(ctr, "start")

	case libcontainerd.StatePause:
		// Container is already locked in this case
		ctr.Paused = true
		daemon.updateHealthMonitor(ctr)
		daemon.LogContainerEvent(ctr, "pause")

	case libcontainerd.StateResume:
		// Container is already locked in this case
		ctr.Paused = false
		daemon.updateHealthMonitor(ctr)
		daemon.LogContainerEvent(ctr, "unpause")
	}

	return nil
}
// AttachStreams is called by libcontainerd to connect the stdio.
//
// id is looked up as a container first and, failing that, as an exec
// command; an error is returned when neither exists. For a container,
// logging is started before any stream is wired up, and the container is
// reset if that fails.
//
// Stdout and stderr of iop are copied into the stream config in background
// goroutines registered through the stream config's Add/Done pair.
func (daemon *Daemon) AttachStreams(id string, iop libcontainerd.IOPipe) error {
	var streams *runconfig.StreamConfig

	ctr := daemon.containers.Get(id)
	if ctr != nil {
		streams = ctr.StreamConfig
		if err := daemon.StartLogging(ctr); err != nil {
			ctr.Reset(false)
			return err
		}
	} else {
		// Not a container: treat id as an exec command.
		execCfg, err := daemon.getExecConfig(id)
		if err != nil {
			return fmt.Errorf("no such exec/container: %s", id)
		}
		streams = execCfg.StreamConfig
	}

	stdin := streams.Stdin()
	switch {
	case stdin != nil:
		// Forward stdin to containerd and close the pipe once the copy ends.
		if iop.Stdin != nil {
			go func() {
				io.Copy(iop.Stdin, stdin)
				iop.Stdin.Close()
			}()
		}
	case ctr != nil && !ctr.Config.Tty && iop.Stdin != nil:
		// No stdin stream is attached and the container has no tty, so
		// close containerd's iopipe stdin right away. With a tty enabled
		// the pipe is left open.
		iop.Stdin.Close()
	}

	// pump copies src into dst in its own goroutine; Add is called before
	// the goroutine starts and Done after the copy finishes.
	pump := func(dst io.Writer, src io.Reader) {
		streams.Add(1)
		go func() {
			_, err := io.Copy(dst, src)
			if err != nil {
				logrus.Errorf("%v stream copy error: %v", id, err)
			}
			streams.Done()
		}()
	}

	if iop.Stdout != nil {
		pump(streams.Stdout(), iop.Stdout)
	}
	if iop.Stderr != nil {
		pump(streams.Stderr(), iop.Stderr)
	}

	return nil
}