From b6c7becbfe1d76b1250f6d8e991e645e13808a9c Mon Sep 17 00:00:00 2001
From: Thomas Leonard
Date: Mon, 18 Apr 2016 10:48:13 +0100
Subject: [PATCH] Add support for user-defined healthchecks

This PR adds support for user-defined health-check probes for Docker
containers. It adds a `HEALTHCHECK` instruction to the Dockerfile syntax plus
some corresponding "docker run" options. It can be used with a restart policy
to automatically restart a container if the check fails.

The `HEALTHCHECK` instruction has two forms:

* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a
  command inside the container)
* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)

The `HEALTHCHECK` instruction tells Docker how to test a container to check
that it is still working. This can detect cases such as a web server that is
stuck in an infinite loop and unable to handle new connections, even though
the server process is still running.

When a container has a healthcheck specified, it has a _health status_ in
addition to its normal status. This status is initially `starting`. Whenever
a health check passes, it becomes `healthy` (whatever state it was previously
in). After a certain number of consecutive failures, it becomes `unhealthy`.

The options that can appear before `CMD` are:

* `--interval=DURATION` (default: `30s`)
* `--timeout=DURATION` (default: `30s`)
* `--retries=N` (default: `1`)

The health check will first run **interval** seconds after the container is
started, and then again **interval** seconds after each previous check
completes. If a single run of the check takes longer than **timeout** seconds
then the check is considered to have failed. It takes **retries** consecutive
failures of the health check for the container to be considered `unhealthy`.

There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list
more than one, then only the last `HEALTHCHECK` will take effect.

The command after the `CMD` keyword can be either a shell command (e.g.
`HEALTHCHECK CMD /bin/check-running`) or an _exec_ array (as with other
Dockerfile commands; see e.g. `ENTRYPOINT` for details).

The command's exit status indicates the health status of the container. The
possible values are:

- 0: success - the container is healthy and ready for use
- 1: unhealthy - the container is not working correctly
- 2: starting - the container is not ready for use yet, but is working
  correctly

If the probe returns 2 ("starting") when the container has already moved out
of the "starting" state then it is treated as "unhealthy" instead.

For example, to check every five minutes or so that a web server is able to
serve the site's main page within three seconds:

    HEALTHCHECK --interval=5m --timeout=3s \
      CMD curl -f http://localhost/ || exit 1

To help debug failing probes, any output text (UTF-8 encoded) that the command
writes on stdout or stderr will be stored in the health status and can be
queried with `docker inspect`. Such output should be kept short (only the
first 4096 bytes are stored currently).

When the health status of a container changes, a `health_status` event is
generated with the new status. The health status is also displayed in the
`docker ps` output.
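As a quick sketch of the intended CLI workflow (the `web` container name and
the `nginx` image here are only illustrative), the new "docker run" flags
mirror the Dockerfile options, and the resulting status can be read back with
`docker inspect`:

    $ docker run -d --name web \
        --health-cmd='curl -f http://localhost/ || exit 1' \
        --health-interval=5m --health-timeout=3s nginx
    $ docker inspect --format='{{.State.Health.Status}}' web
    starting
    $ # once the first probe has passed:
    $ docker inspect --format='{{.State.Health.Status}}' web
    healthy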
Signed-off-by: Thomas Leonard Signed-off-by: Sebastiaan van Stijn --- api/server/router/container/backend.go | 2 +- api/server/router/container/exec.go | 3 +- builder/dockerfile/builder.go | 19 +- builder/dockerfile/command/command.go | 66 ++-- builder/dockerfile/dispatchers.go | 107 ++++++ builder/dockerfile/evaluator.go | 33 +- builder/dockerfile/parser/line_parsers.go | 29 ++ builder/dockerfile/parser/parser.go | 33 +- .../parser/testfiles/health/Dockerfile | 10 + .../dockerfile/parser/testfiles/health/result | 9 + container/health.go | 49 +++ container/state.go | 4 + daemon/commit.go | 19 ++ daemon/exec.go | 28 +- daemon/health.go | 314 ++++++++++++++++++ daemon/health_test.go | 112 +++++++ daemon/inspect.go | 10 + daemon/monitor.go | 9 + daemon/stop.go | 2 + docs/reference/builder.md | 67 ++++ docs/reference/run.md | 60 ++++ integration-cli/docker_cli_health_test.go | 154 +++++++++ libcontainerd/client_linux.go | 11 + libcontainerd/client_windows.go | 19 ++ libcontainerd/types.go | 1 + runconfig/opts/parse.go | 40 +++ runconfig/opts/parse_test.go | 40 +++ 27 files changed, 1170 insertions(+), 80 deletions(-) create mode 100644 builder/dockerfile/parser/testfiles/health/Dockerfile create mode 100644 builder/dockerfile/parser/testfiles/health/result create mode 100644 container/health.go create mode 100644 daemon/health.go create mode 100644 daemon/health_test.go create mode 100644 integration-cli/docker_cli_health_test.go diff --git a/api/server/router/container/backend.go b/api/server/router/container/backend.go index c7eafa770f..b3cc625ff2 100644 --- a/api/server/router/container/backend.go +++ b/api/server/router/container/backend.go @@ -17,7 +17,7 @@ type execBackend interface { ContainerExecCreate(name string, config *types.ExecConfig) (string, error) ContainerExecInspect(id string) (*backend.ExecInspect, error) ContainerExecResize(name string, height, width int) error - ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error + ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error ExecExists(name string) (bool, error) } diff --git a/api/server/router/container/exec.go b/api/server/router/container/exec.go index fb88ac824a..21f5dc8300 100644 --- a/api/server/router/container/exec.go +++ b/api/server/router/container/exec.go @@ -106,7 +106,8 @@ func (s *containerRouter) postContainerExecStart(ctx context.Context, w http.Res } // Now run the user process in container. - if err := s.backend.ContainerExecStart(execName, stdin, stdout, stderr); err != nil { + // Maybe we should pass ctx here if we're not detaching?
+ if err := s.backend.ContainerExecStart(context.Background(), execName, stdin, stdout, stderr); err != nil { if execStartCheck.Detach { return err } diff --git a/builder/dockerfile/builder.go b/builder/dockerfile/builder.go index 6ac76e877e..a7f96c6f13 100644 --- a/builder/dockerfile/builder.go +++ b/builder/dockerfile/builder.go @@ -22,15 +22,16 @@ import ( ) var validCommitCommands = map[string]bool{ - "cmd": true, - "entrypoint": true, - "env": true, - "expose": true, - "label": true, - "onbuild": true, - "user": true, - "volume": true, - "workdir": true, + "cmd": true, + "entrypoint": true, + "healthcheck": true, + "env": true, + "expose": true, + "label": true, + "onbuild": true, + "user": true, + "volume": true, + "workdir": true, } // BuiltinAllowedBuildArgs is list of built-in allowed build args diff --git a/builder/dockerfile/command/command.go b/builder/dockerfile/command/command.go index 9e1b799dcf..3e087e422e 100644 --- a/builder/dockerfile/command/command.go +++ b/builder/dockerfile/command/command.go @@ -3,40 +3,42 @@ package command // Define constants for the command strings const ( - Env = "env" - Label = "label" - Maintainer = "maintainer" - Add = "add" - Copy = "copy" - From = "from" - Onbuild = "onbuild" - Workdir = "workdir" - Run = "run" - Cmd = "cmd" - Entrypoint = "entrypoint" - Expose = "expose" - Volume = "volume" - User = "user" - StopSignal = "stopsignal" - Arg = "arg" + Env = "env" + Label = "label" + Maintainer = "maintainer" + Add = "add" + Copy = "copy" + From = "from" + Onbuild = "onbuild" + Workdir = "workdir" + Run = "run" + Cmd = "cmd" + Entrypoint = "entrypoint" + Expose = "expose" + Volume = "volume" + User = "user" + StopSignal = "stopsignal" + Arg = "arg" + Healthcheck = "healthcheck" ) // Commands is list of all Dockerfile commands var Commands = map[string]struct{}{ - Env: {}, - Label: {}, - Maintainer: {}, - Add: {}, - Copy: {}, - From: {}, - Onbuild: {}, - Workdir: {}, - Run: {}, - Cmd: {}, - Entrypoint: {}, - Expose: {}, - Volume: {}, - User: {}, - StopSignal: {}, - Arg: {}, + Env: {}, + Label: {}, + Maintainer: {}, + Add: {}, + Copy: {}, + From: {}, + Onbuild: {}, + Workdir: {}, + Run: {}, + Cmd: {}, + Entrypoint: {}, + Expose: {}, + Volume: {}, + User: {}, + StopSignal: {}, + Arg: {}, + Healthcheck: {}, } diff --git a/builder/dockerfile/dispatchers.go b/builder/dockerfile/dispatchers.go index 1de7dc2465..2f3b56cfd2 100644 --- a/builder/dockerfile/dispatchers.go +++ b/builder/dockerfile/dispatchers.go @@ -12,7 +12,9 @@ import ( "regexp" "runtime" "sort" + "strconv" "strings" + "time" "github.com/Sirupsen/logrus" "github.com/docker/docker/api" @@ -426,6 +428,111 @@ func cmd(b *Builder, args []string, attributes map[string]bool, original string) return nil } +// parseOptInterval(flag) is the duration of flag.Value, or 0 if +// empty. An error is reported if the value is given and is not positive. +func parseOptInterval(f *Flag) (time.Duration, error) { + s := f.Value + if s == "" { + return 0, nil + } + d, err := time.ParseDuration(s) + if err != nil { + return 0, err + } + if d <= 0 { + return 0, fmt.Errorf("Interval %#v must be positive", f.name) + } + return d, nil +} + +// HEALTHCHECK foo +// +// Set the default healthcheck command to run in the container (which may be empty). +// Argument handling is the same as RUN. 
+// +func healthcheck(b *Builder, args []string, attributes map[string]bool, original string) error { + if len(args) == 0 { + return fmt.Errorf("HEALTHCHECK requires an argument") + } + typ := strings.ToUpper(args[0]) + args = args[1:] + if typ == "NONE" { + if len(args) != 0 { + return fmt.Errorf("HEALTHCHECK NONE takes no arguments") + } + test := strslice.StrSlice{typ} + b.runConfig.Healthcheck = &container.HealthConfig{ + Test: test, + } + } else { + if b.runConfig.Healthcheck != nil { + oldCmd := b.runConfig.Healthcheck.Test + if len(oldCmd) > 0 && oldCmd[0] != "NONE" { + fmt.Fprintf(b.Stdout, "Note: overriding previous HEALTHCHECK: %v\n", oldCmd) + } + } + + healthcheck := container.HealthConfig{} + + flInterval := b.flags.AddString("interval", "") + flTimeout := b.flags.AddString("timeout", "") + flRetries := b.flags.AddString("retries", "") + + if err := b.flags.Parse(); err != nil { + return err + } + + switch typ { + case "CMD": + cmdSlice := handleJSONArgs(args, attributes) + if len(cmdSlice) == 0 { + return fmt.Errorf("Missing command after HEALTHCHECK CMD") + } + + if !attributes["json"] { + typ = "CMD-SHELL" + } + + healthcheck.Test = strslice.StrSlice(append([]string{typ}, cmdSlice...)) + default: + return fmt.Errorf("Unknown type %#v in HEALTHCHECK (try CMD)", typ) + } + + interval, err := parseOptInterval(flInterval) + if err != nil { + return err + } + healthcheck.Interval = interval + + timeout, err := parseOptInterval(flTimeout) + if err != nil { + return err + } + healthcheck.Timeout = timeout + + if flRetries.Value != "" { + retries, err := strconv.ParseInt(flRetries.Value, 10, 32) + if err != nil { + return err + } + if retries < 1 { + return fmt.Errorf("--retries must be at least 1 (not %d)", retries) + } + healthcheck.Retries = int(retries) + } else { + healthcheck.Retries = 0 + } + + b.runConfig.Healthcheck = &healthcheck + } + + if err := b.commit("", b.runConfig.Cmd, fmt.Sprintf("HEALTHCHECK %q", b.runConfig.Healthcheck)); err != nil { + return err + } + + return nil +} + // ENTRYPOINT /usr/sbin/nginx // // Set the entrypoint (which defaults to sh -c on linux, or cmd /S /C on Windows) to diff --git a/builder/dockerfile/evaluator.go b/builder/dockerfile/evaluator.go index 905675d0e0..52786371df 100644 --- a/builder/dockerfile/evaluator.go +++ b/builder/dockerfile/evaluator.go @@ -58,22 +58,23 @@ var evaluateTable map[string]func(*Builder, []string, map[string]bool, string) e func init() { evaluateTable = map[string]func(*Builder, []string, map[string]bool, string) error{ - command.Env: env, - command.Label: label, - command.Maintainer: maintainer, - command.Add: add, - command.Copy: dispatchCopy, // copy() is a go builtin - command.From: from, - command.Onbuild: onbuild, - command.Workdir: workdir, - command.Run: run, - command.Cmd: cmd, - command.Entrypoint: entrypoint, - command.Expose: expose, - command.Volume: volume, - command.User: user, - command.StopSignal: stopSignal, - command.Arg: arg, + command.Env: env, + command.Label: label, + command.Maintainer: maintainer, + command.Add: add, + command.Copy: dispatchCopy, // copy() is a go builtin + command.From: from, + command.Onbuild: onbuild, + command.Workdir: workdir, + command.Run: run, + command.Cmd: cmd, + command.Entrypoint: entrypoint, + command.Expose: expose, + command.Volume: volume, + command.User: user, + command.StopSignal: stopSignal, + command.Arg: arg, + command.Healthcheck: healthcheck, } } diff --git a/builder/dockerfile/parser/line_parsers.go b/builder/dockerfile/parser/line_parsers.go 
index adf15ed5a5..ddd92dd416 100644 --- a/builder/dockerfile/parser/line_parsers.go +++ b/builder/dockerfile/parser/line_parsers.go @@ -329,3 +329,32 @@ func parseMaybeJSONToList(rest string) (*Node, map[string]bool, error) { return parseStringsWhitespaceDelimited(rest) } + +// The HEALTHCHECK command is like parseMaybeJSON, but has an extra type argument. +func parseHealthConfig(rest string) (*Node, map[string]bool, error) { + // Find end of first argument + var sep int + for ; sep < len(rest); sep++ { + if unicode.IsSpace(rune(rest[sep])) { + break + } + } + next := sep + for ; next < len(rest); next++ { + if !unicode.IsSpace(rune(rest[next])) { + break + } + } + + if sep == 0 { + return nil, nil, nil + } + + typ := rest[:sep] + cmd, attrs, err := parseMaybeJSON(rest[next:]) + if err != nil { + return nil, nil, err + } + + return &Node{Value: typ, Next: cmd, Attributes: attrs}, nil, err +} diff --git a/builder/dockerfile/parser/parser.go b/builder/dockerfile/parser/parser.go index e42904fef8..683f30f68a 100644 --- a/builder/dockerfile/parser/parser.go +++ b/builder/dockerfile/parser/parser.go @@ -66,22 +66,23 @@ func init() { // functions. Errors are propagated up by Parse() and the resulting AST can // be incorporated directly into the existing AST as a next. dispatch = map[string]func(string) (*Node, map[string]bool, error){ - command.User: parseString, - command.Onbuild: parseSubCommand, - command.Workdir: parseString, - command.Env: parseEnv, - command.Label: parseLabel, - command.Maintainer: parseString, - command.From: parseString, - command.Add: parseMaybeJSONToList, - command.Copy: parseMaybeJSONToList, - command.Run: parseMaybeJSON, - command.Cmd: parseMaybeJSON, - command.Entrypoint: parseMaybeJSON, - command.Expose: parseStringsWhitespaceDelimited, - command.Volume: parseMaybeJSONToList, - command.StopSignal: parseString, - command.Arg: parseNameOrNameVal, + command.User: parseString, + command.Onbuild: parseSubCommand, + command.Workdir: parseString, + command.Env: parseEnv, + command.Label: parseLabel, + command.Maintainer: parseString, + command.From: parseString, + command.Add: parseMaybeJSONToList, + command.Copy: parseMaybeJSONToList, + command.Run: parseMaybeJSON, + command.Cmd: parseMaybeJSON, + command.Entrypoint: parseMaybeJSON, + command.Expose: parseStringsWhitespaceDelimited, + command.Volume: parseMaybeJSONToList, + command.StopSignal: parseString, + command.Arg: parseNameOrNameVal, + command.Healthcheck: parseHealthConfig, } } diff --git a/builder/dockerfile/parser/testfiles/health/Dockerfile b/builder/dockerfile/parser/testfiles/health/Dockerfile new file mode 100644 index 0000000000..6534ce17d4 --- /dev/null +++ b/builder/dockerfile/parser/testfiles/health/Dockerfile @@ -0,0 +1,10 @@ +FROM debian +ADD check.sh main.sh /app/ +CMD /app/main.sh +HEALTHCHECK +HEALTHCHECK --interval=5s --timeout=3s --retries=1 \ + CMD /app/check.sh --quiet +HEALTHCHECK CMD +HEALTHCHECK CMD a b +HEALTHCHECK --timeout=3s CMD ["foo"] +HEALTHCHECK CONNECT TCP 7000 diff --git a/builder/dockerfile/parser/testfiles/health/result b/builder/dockerfile/parser/testfiles/health/result new file mode 100644 index 0000000000..bfa846c917 --- /dev/null +++ b/builder/dockerfile/parser/testfiles/health/result @@ -0,0 +1,9 @@ +(from "debian") +(add "check.sh" "main.sh" "/app/") +(cmd "/app/main.sh") +(healthcheck) +(healthcheck ["--interval=5s" "--timeout=3s" "--retries=1"] "CMD" "/app/check.sh --quiet") +(healthcheck "CMD") +(healthcheck "CMD" "a b") +(healthcheck ["--timeout=3s"] "CMD" "foo") 
+(healthcheck "CONNECT" "TCP 7000") diff --git a/container/health.go b/container/health.go new file mode 100644 index 0000000000..36f01debc6 --- /dev/null +++ b/container/health.go @@ -0,0 +1,49 @@ +package container + +import ( + "github.com/Sirupsen/logrus" + "github.com/docker/engine-api/types" +) + +// Health holds the current container health-check state +type Health struct { + types.Health + stop chan struct{} // Write struct{} to stop the monitor +} + +// String returns a human-readable description of the health-check state +func (s *Health) String() string { + if s.stop == nil { + return "no healthcheck" + } + switch s.Status { + case types.Starting: + return "health: starting" + default: // Healthy and Unhealthy are clear on their own + return s.Status + } +} + +// OpenMonitorChannel creates and returns a new monitor channel. If there already is one, +// it returns nil. +func (s *Health) OpenMonitorChannel() chan struct{} { + if s.stop == nil { + logrus.Debugf("OpenMonitorChannel") + s.stop = make(chan struct{}) + return s.stop + } + return nil +} + +// CloseMonitorChannel closes any existing monitor channel. +func (s *Health) CloseMonitorChannel() { + if s.stop != nil { + logrus.Debugf("CloseMonitorChannel: waiting for probe to stop") + // This channel does not buffer. Once the write succeeds, the monitor + // has read the stop request and will not make any further updates + // to c.State.Health. + s.stop <- struct{}{} + s.stop = nil + logrus.Debugf("CloseMonitorChannel done") + } +} diff --git a/container/state.go b/container/state.go index e0ede8a33a..852ca1d0e5 100644 --- a/container/state.go +++ b/container/state.go @@ -27,6 +27,7 @@ type State struct { StartedAt time.Time FinishedAt time.Time waitChan chan struct{} + Health *Health } // NewState creates a default state object with a fresh channel for state changes. 
@@ -46,6 +47,9 @@ func (s *State) String() string { return fmt.Sprintf("Restarting (%d) %s ago", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt))) } + if h := s.Health; h != nil { + return fmt.Sprintf("Up %s (%s)", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)), h.String()) + } return fmt.Sprintf("Up %s", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt))) } diff --git a/daemon/commit.go b/daemon/commit.go index bb82c85e54..24c7a46701 100644 --- a/daemon/commit.go +++ b/daemon/commit.go @@ -80,6 +80,25 @@ func merge(userConf, imageConf *containertypes.Config) error { userConf.Entrypoint = imageConf.Entrypoint } } + if imageConf.Healthcheck != nil { + if userConf.Healthcheck == nil { + userConf.Healthcheck = imageConf.Healthcheck + } else { + if len(userConf.Healthcheck.Test) == 0 { + userConf.Healthcheck.Test = imageConf.Healthcheck.Test + } + if userConf.Healthcheck.Interval == 0 { + userConf.Healthcheck.Interval = imageConf.Healthcheck.Interval + } + if userConf.Healthcheck.Timeout == 0 { + userConf.Healthcheck.Timeout = imageConf.Healthcheck.Timeout + } + if userConf.Healthcheck.Retries == 0 { + userConf.Healthcheck.Retries = imageConf.Healthcheck.Retries + } + } + } + if userConf.WorkingDir == "" { userConf.WorkingDir = imageConf.WorkingDir } diff --git a/daemon/exec.go b/daemon/exec.go index e58205361e..fd09fd784d 100644 --- a/daemon/exec.go +++ b/daemon/exec.go @@ -14,11 +14,15 @@ import ( "github.com/docker/docker/errors" "github.com/docker/docker/libcontainerd" "github.com/docker/docker/pkg/pools" + "github.com/docker/docker/pkg/signal" "github.com/docker/docker/pkg/term" "github.com/docker/engine-api/types" "github.com/docker/engine-api/types/strslice" ) +// Seconds to wait after sending TERM before trying KILL +const termProcessTimeout = 10 + func (d *Daemon) registerExecCommand(container *container.Container, config *exec.Config) { // Storing execs in container in order to kill them gracefully whenever the container is stopped or removed. container.ExecCommands.Add(config.ID, config) @@ -130,7 +134,8 @@ func (d *Daemon) ContainerExecCreate(name string, config *types.ExecConfig) (str // ContainerExecStart starts a previously set up exec instance. The // std streams are set up. -func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) { +// If ctx is cancelled, the process is terminated. +func (d *Daemon) ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) { var ( cStdin io.ReadCloser cStdout, cStderr io.Writer @@ -197,15 +202,28 @@ func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io. 
return nil } - attachErr := container.AttachStreams(context.Background(), ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys) + attachErr := container.AttachStreams(ctx, ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys) if err := d.containerd.AddProcess(c.ID, name, p); err != nil { return err } - err = <-attachErr - if err != nil { - return fmt.Errorf("attach failed with error: %v", err) + select { + case <-ctx.Done(): + logrus.Debugf("Sending TERM signal to process %v in container %v", name, c.ID) + d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["TERM"])) + select { + case <-time.After(termProcessTimeout * time.Second): + logrus.Infof("Container %v, process %v failed to exit within %d seconds of signal TERM - using the force", c.ID, name, termProcessTimeout) + d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["KILL"])) + case <-attachErr: + // TERM signal worked + } + return fmt.Errorf("context cancelled") + case err := <-attachErr: + if err != nil { + return fmt.Errorf("attach failed with error: %v", err) + } } return nil } diff --git a/daemon/health.go b/daemon/health.go new file mode 100644 index 0000000000..ec9843561a --- /dev/null +++ b/daemon/health.go @@ -0,0 +1,314 @@ +package daemon + +import ( + "bytes" + "fmt" + "runtime" + "strings" + "time" + + "golang.org/x/net/context" + + "github.com/Sirupsen/logrus" + "github.com/docker/docker/container" + "github.com/docker/docker/daemon/exec" + "github.com/docker/engine-api/types" + "github.com/docker/engine-api/types/strslice" +) + +const ( + // Longest healthcheck probe output message to store. Longer messages will be truncated. + maxOutputLen = 4096 + + // Default interval between probe runs (from the end of the first to the start of the second). + // Also the time before the first probe. + defaultProbeInterval = 30 * time.Second + + // The maximum length of time a single probe run should take. If the probe takes longer + // than this, the check is considered to have failed. + defaultProbeTimeout = 30 * time.Second + + // Shut down a container if it becomes Unhealthy. + defaultExitOnUnhealthy = true + + // Maximum number of entries to record + maxLogEntries = 5 +) + +const ( + // Exit status codes that can be returned by the probe command. + + exitStatusHealthy = 0 // Container is healthy + exitStatusUnhealthy = 1 // Container is unhealthy + exitStatusStarting = 2 // Container needs more time to start +) + +// probe implementations know how to run a particular type of probe. +type probe interface { + // Perform one run of the check. Returns the exit code and an optional + // short diagnostic string. + run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error) +} + +// cmdProbe implements the "CMD" probe type. +type cmdProbe struct { + // Run the command with the system's default shell instead of execing it directly. + shell bool +} + +// exec the healthcheck command in the container. +// Returns the exit code and probe output (if any) +func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) { + cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:] + if p.shell { + if runtime.GOOS != "windows" { + cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...) + } else { + cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...) 
+ } } + entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice) + execConfig := exec.NewConfig() + execConfig.OpenStdin = false + execConfig.OpenStdout = true + execConfig.OpenStderr = true + execConfig.ContainerID = container.ID + execConfig.DetachKeys = []byte{} + execConfig.Entrypoint = entrypoint + execConfig.Args = args + execConfig.Tty = false + execConfig.Privileged = false + execConfig.User = container.Config.User + + d.registerExecCommand(container, execConfig) + d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " ")) + + output := &limitedBuffer{} + err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output) + if err != nil { + return nil, err + } + info, err := d.getExecConfig(execConfig.ID) + if err != nil { + return nil, err + } + if info.ExitCode == nil { + return nil, fmt.Errorf("Healthcheck has no exit code!") + } + // Note: Go's json package will handle invalid UTF-8 for us + out := output.String() + return &types.HealthcheckResult{ + End: time.Now(), + ExitCode: *info.ExitCode, + Output: out, + }, nil +} + +// Update the container's State.Health struct based on the latest probe's result. +func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) { + c.Lock() + defer c.Unlock() + + retries := c.Config.Healthcheck.Retries + if retries <= 0 { + retries = 1 // Default if unset or set to an invalid value + } + + h := c.State.Health + oldStatus := h.Status + + if len(h.Log) >= maxLogEntries { + h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result) + } else { + h.Log = append(h.Log, result) + } + + if result.ExitCode == exitStatusHealthy { + h.FailingStreak = 0 + h.Status = types.Healthy + } else if result.ExitCode == exitStatusStarting && c.State.Health.Status == types.Starting { + // The container is not ready yet. Remain in the starting state. + } else { + // Failure (including invalid exit code) + h.FailingStreak++ + if c.State.Health.FailingStreak >= retries { + h.Status = types.Unhealthy + } + // Else we're starting or healthy. Stay in that state. + } + + if oldStatus != h.Status { + d.LogContainerEvent(c, "health_status: "+h.Status) + } +} + +// Run the container's monitoring thread until notified via "stop". +// There is never more than one monitor thread running per container at a time. +func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) { + probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout) + probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval) + for { + select { + case <-stop: + logrus.Debugf("Stop healthcheck monitoring (received while idle)") + return + case <-time.After(probeInterval): + logrus.Debugf("Running health check...") + startTime := time.Now() + ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout) + results := make(chan *types.HealthcheckResult) + go func() { + result, err := probe.run(ctx, d, c) + if err != nil { + logrus.Warnf("Health check error: %v", err) + results <- &types.HealthcheckResult{ + ExitCode: -1, + Output: err.Error(), + Start: startTime, + End: time.Now(), + } + } else { + result.Start = startTime + logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode) + results <- result + } + close(results) + }() + select { + case <-stop: + logrus.Debugf("Stop healthcheck monitoring (received while probing)") + // Stop timeout and kill probe, but don't wait for probe to exit.
+ cancelProbe() + return + case result := <-results: + handleProbeResult(d, c, result) + // Stop timeout + cancelProbe() + case <-ctx.Done(): + logrus.Debugf("Health check taking too long") + handleProbeResult(d, c, &types.HealthcheckResult{ + ExitCode: -1, + Output: fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout), + Start: startTime, + End: time.Now(), + }) + cancelProbe() + // Wait for probe to exit (it might take a while to respond to the TERM + // signal and we don't want dying probes to pile up). + <-results + } + } + } +} + +// Get a suitable probe implementation for the container's healthcheck configuration. +func getProbe(c *container.Container) probe { + config := c.Config.Healthcheck + if config == nil || len(config.Test) == 0 { + return nil + } + switch config.Test[0] { + case "CMD": + return &cmdProbe{shell: false} + case "CMD-SHELL": + return &cmdProbe{shell: true} + default: + logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0]) + return nil + } +} + +// Ensure the health-check monitor is running or not, depending on the current +// state of the container. +// Called from monitor.go, with c locked. +func (d *Daemon) updateHealthMonitor(c *container.Container) { + h := c.State.Health + if h == nil { + return // No healthcheck configured + } + + probe := getProbe(c) + wantRunning := c.Running && !c.Paused && probe != nil + if wantRunning { + if stop := h.OpenMonitorChannel(); stop != nil { + go monitor(d, c, stop, probe) + } + } else { + h.CloseMonitorChannel() + } +} + +// Reset the health state for a newly-started, restarted or restored container. +// initHealthMonitor is called from monitor.go and we should never be running +// two instances at once. +// Called with c locked. +func (d *Daemon) initHealthMonitor(c *container.Container) { + if c.Config.Healthcheck == nil { + return + } + + // This is needed in case we're auto-restarting + d.stopHealthchecks(c) + + if c.State.Health == nil { + h := &container.Health{} + h.Status = types.Starting + h.FailingStreak = 0 + c.State.Health = h + } + + d.updateHealthMonitor(c) +} + +// Called when the container is being stopped (whether because the health check is +// failing or for any other reason). +func (d *Daemon) stopHealthchecks(c *container.Container) { + h := c.State.Health + if h != nil { + h.CloseMonitorChannel() + } +} + +// Buffer up to maxOutputLen bytes. Further data is discarded. +type limitedBuffer struct { + buf bytes.Buffer + truncated bool // indicates that data has been lost +} + +// Append to limitedBuffer while there is room. +func (b *limitedBuffer) Write(data []byte) (int, error) { + bufLen := b.buf.Len() + dataLen := len(data) + keep := min(maxOutputLen-bufLen, dataLen) + if keep > 0 { + b.buf.Write(data[:keep]) + } + if keep < dataLen { + b.truncated = true + } + return dataLen, nil +} + +// The contents of the buffer, with "..." appended if it overflowed. +func (b *limitedBuffer) String() string { + out := b.buf.String() + if b.truncated { + out = out + "..." + } + return out +} + +// If configuredValue is zero, use defaultValue instead. 
+func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration { + if configuredValue == 0 { + return defaultValue + } + return configuredValue +} + +func min(x, y int) int { + if x < y { + return x + } + return y +} diff --git a/daemon/health_test.go b/daemon/health_test.go new file mode 100644 index 0000000000..f53c32f4f2 --- /dev/null +++ b/daemon/health_test.go @@ -0,0 +1,112 @@ +package daemon + +import ( + "testing" + "time" + + "github.com/docker/docker/container" + "github.com/docker/docker/daemon/events" + "github.com/docker/engine-api/types" + containertypes "github.com/docker/engine-api/types/container" + eventtypes "github.com/docker/engine-api/types/events" +) + +func reset(c *container.Container) { + c.State = &container.State{} + c.State.Health = &container.Health{} + c.State.Health.Status = types.Starting +} + +func TestHealthStates(t *testing.T) { + e := events.New() + _, l, _ := e.Subscribe() + defer e.Evict(l) + + expect := func(expected string) { + select { + case event := <-l: + ev := event.(eventtypes.Message) + if ev.Status != expected { + t.Errorf("Expecting event %#v, but got %#v\n", expected, ev.Status) + } + case <-time.After(1 * time.Second): + t.Errorf("Expecting event %#v, but got nothing\n", expected) + } + } + + c := &container.Container{ + CommonContainer: container.CommonContainer{ + ID: "container_id", + Name: "container_name", + Config: &containertypes.Config{ + Image: "image_name", + }, + }, + } + daemon := &Daemon{ + EventsService: e, + } + + c.Config.Healthcheck = &containertypes.HealthConfig{ + Retries: 1, + } + + reset(c) + + handleResult := func(startTime time.Time, exitCode int) { + handleProbeResult(daemon, c, &types.HealthcheckResult{ + Start: startTime, + End: startTime, + ExitCode: exitCode, + }) + } + + // starting -> failed -> success -> failed + + handleResult(c.State.StartedAt.Add(1*time.Second), 1) + expect("health_status: unhealthy") + + handleResult(c.State.StartedAt.Add(2*time.Second), 0) + expect("health_status: healthy") + + handleResult(c.State.StartedAt.Add(3*time.Second), 1) + expect("health_status: unhealthy") + + // starting -> starting -> starting -> + // healthy -> starting (invalid transition) + + reset(c) + + handleResult(c.State.StartedAt.Add(20*time.Second), 2) + handleResult(c.State.StartedAt.Add(40*time.Second), 2) + if c.State.Health.Status != types.Starting { + t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status) + } + + handleResult(c.State.StartedAt.Add(50*time.Second), 0) + expect("health_status: healthy") + handleResult(c.State.StartedAt.Add(60*time.Second), 2) + expect("health_status: unhealthy") + + // Test retries + + reset(c) + c.Config.Healthcheck.Retries = 3 + + handleResult(c.State.StartedAt.Add(20*time.Second), 1) + handleResult(c.State.StartedAt.Add(40*time.Second), 1) + if c.State.Health.Status != types.Starting { + t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status) + } + if c.State.Health.FailingStreak != 2 { + t.Errorf("Expecting FailingStreak=2, but got %d\n", c.State.Health.FailingStreak) + } + handleResult(c.State.StartedAt.Add(60*time.Second), 1) + expect("health_status: unhealthy") + + handleResult(c.State.StartedAt.Add(80*time.Second), 0) + expect("health_status: healthy") + if c.State.Health.FailingStreak != 0 { + t.Errorf("Expecting FailingStreak=0, but got %d\n", c.State.Health.FailingStreak) + } +} diff --git a/daemon/inspect.go b/daemon/inspect.go index db475537f1..e10402203f 100644 --- a/daemon/inspect.go +++ 
b/daemon/inspect.go @@ -108,6 +108,15 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool) hostConfig.Links = append(hostConfig.Links, fmt.Sprintf("%s:%s", child.Name, linkAlias)) } + var containerHealth *types.Health + if container.State.Health != nil { + containerHealth = &types.Health{ + Status: container.State.Health.Status, + FailingStreak: container.State.Health.FailingStreak, + Log: append([]*types.HealthcheckResult{}, container.State.Health.Log...), + } + } + containerState := &types.ContainerState{ Status: container.State.StateString(), Running: container.State.Running, @@ -120,6 +129,7 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool) Error: container.State.Error, StartedAt: container.State.StartedAt.Format(time.RFC3339Nano), FinishedAt: container.State.FinishedAt.Format(time.RFC3339Nano), + Health: containerHealth, } contJSONBase := &types.ContainerJSONBase{ diff --git a/daemon/monitor.go b/daemon/monitor.go index cb334cf29d..30d36836f8 100644 --- a/daemon/monitor.go +++ b/daemon/monitor.go @@ -25,6 +25,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error { if runtime.GOOS == "windows" { return errors.New("Received StateOOM from libcontainerd on Windows. This should never happen.") } + daemon.updateHealthMonitor(c) daemon.LogContainerEvent(c, "oom") case libcontainerd.StateExit: c.Lock() @@ -35,6 +36,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error { attributes := map[string]string{ "exitCode": strconv.Itoa(int(e.ExitCode)), } + daemon.updateHealthMonitor(c) daemon.LogContainerEventWithAttributes(c, "die", attributes) daemon.Cleanup(c) // FIXME: here is race condition between two RUN instructions in Dockerfile @@ -54,6 +56,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error { "exitCode": strconv.Itoa(int(e.ExitCode)), } daemon.LogContainerEventWithAttributes(c, "die", attributes) + daemon.updateHealthMonitor(c) return c.ToDisk() case libcontainerd.StateExitProcess: c.Lock() @@ -74,18 +77,24 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error { logrus.Warnf("Ignoring StateExitProcess for %v but no exec command found", e) } case libcontainerd.StateStart, libcontainerd.StateRestore: + // Container is already locked in this case c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart) c.HasBeenManuallyStopped = false if err := c.ToDisk(); err != nil { c.Reset(false) return err } + daemon.initHealthMonitor(c) daemon.LogContainerEvent(c, "start") case libcontainerd.StatePause: + // Container is already locked in this case c.Paused = true + daemon.updateHealthMonitor(c) daemon.LogContainerEvent(c, "pause") case libcontainerd.StateResume: + // Container is already locked in this case c.Paused = false + daemon.updateHealthMonitor(c) daemon.LogContainerEvent(c, "unpause") } diff --git a/daemon/stop.go b/daemon/stop.go index 701743008a..4bbdbbd74c 100644 --- a/daemon/stop.go +++ b/daemon/stop.go @@ -41,6 +41,8 @@ func (daemon *Daemon) containerStop(container *container.Container, seconds int) return nil } + daemon.stopHealthchecks(container) + stopSignal := container.StopSignal() // 1. 
Send a stop signal if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil { diff --git a/docs/reference/builder.md b/docs/reference/builder.md index 863dcd36c7..f460338b22 100644 --- a/docs/reference/builder.md +++ b/docs/reference/builder.md @@ -1470,6 +1470,73 @@ The `STOPSIGNAL` instruction sets the system call signal that will be sent to th This signal can be a valid unsigned number that matches a position in the kernel's syscall table, for instance 9, or a signal name in the format SIGNAME, for instance SIGKILL. +## HEALTHCHECK + +The `HEALTHCHECK` instruction has two forms: + +* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container) +* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image) + +The `HEALTHCHECK` instruction tells Docker how to test a container to check that +it is still working. This can detect cases such as a web server that is stuck in +an infinite loop and unable to handle new connections, even though the server +process is still running. + +When a container has a healthcheck specified, it has a _health status_ in +addition to its normal status. This status is initially `starting`. Whenever a +health check passes, it becomes `healthy` (whatever state it was previously in). +After a certain number of consecutive failures, it becomes `unhealthy`. + +The options that can appear before `CMD` are: + +* `--interval=DURATION` (default: `30s`) +* `--timeout=DURATION` (default: `30s`) +* `--retries=N` (default: `1`) + +The health check will first run **interval** seconds after the container is +started, and then again **interval** seconds after each previous check completes. + +If a single run of the check takes longer than **timeout** seconds then the check +is considered to have failed. + +It takes **retries** consecutive failures of the health check for the container +to be considered `unhealthy`. + +There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list +more than one then only the last `HEALTHCHECK` will take effect. + +The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK +CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands; +see e.g. `ENTRYPOINT` for details). + +The command's exit status indicates the health status of the container. +The possible values are: + +- 0: success - the container is healthy and ready for use +- 1: unhealthy - the container is not working correctly +- 2: starting - the container is not ready for use yet, but is working correctly + +If the probe returns 2 ("starting") when the container has already moved out of the +"starting" state then it is treated as "unhealthy" instead. + +For example, to check every five minutes or so that a web-server is able to +serve the site's main page within three seconds: + + HEALTHCHECK --interval=5m --timeout=3s \ + CMD curl -f http://localhost/ || exit 1 + +To help debug failing probes, any output text (UTF-8 encoded) that the command writes +on stdout or stderr will be stored in the health status and can be queried with +`docker inspect`. Such output should be kept short (only the first 4096 bytes +are stored currently). + +When the health status of a container changes, a `health_status` event is +generated with the new status. + +The `HEALTHCHECK` feature was added in Docker 1.12. + + + ## Dockerfile examples Below you can see some examples of Dockerfile syntax. 
If you're interested in diff --git a/docs/reference/run.md b/docs/reference/run.md index 567d42b207..a7c5267f45 100644 --- a/docs/reference/run.md +++ b/docs/reference/run.md @@ -1250,6 +1250,7 @@ Dockerfile instruction and how the operator can override that setting. #entrypoint-default-command-to-execute-at-runtime) - [EXPOSE (Incoming Ports)](#expose-incoming-ports) - [ENV (Environment Variables)](#env-environment-variables) + - [HEALTHCHECK](#healthcheck) - [VOLUME (Shared Filesystems)](#volume-shared-filesystems) - [USER](#user) - [WORKDIR](#workdir) @@ -1398,6 +1399,65 @@ above, or already defined by the developer with a Dockerfile `ENV`: Similarly the operator can set the **hostname** with `-h`. +### HEALTHCHECK + +``` + --health-cmd Command to run to check health + --health-interval Time between running the check + --health-retries Consecutive failures needed to report unhealthy + --health-timeout Maximum time to allow one check to run + --no-healthcheck Disable any container-specified HEALTHCHECK +``` + +Example: + + $ docker run --name=test -d \ + --health-cmd='stat /etc/passwd || exit 1' \ + --health-interval=2s \ + busybox sleep 1d + $ sleep 2; docker inspect --format='{{.State.Health.Status}}' test + healthy + $ docker exec test rm /etc/passwd + $ sleep 2; docker inspect --format='{{json .State.Health}}' test + { + "Status": "unhealthy", + "FailingStreak": 3, + "Log": [ + { + "Start": "2016-05-25T17:22:04.635478668Z", + "End": "2016-05-25T17:22:04.7272552Z", + "ExitCode": 0, + "Output": " File: /etc/passwd\n Size: 334 \tBlocks: 8 IO Block: 4096 regular file\nDevice: 32h/50d\tInode: 12 Links: 1\nAccess: (0664/-rw-rw-r--) Uid: ( 0/ root) Gid: ( 0/ root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..." + }, + { + "Start": "2016-05-25T17:22:06.732900633Z", + "End": "2016-05-25T17:22:06.822168935Z", + "ExitCode": 0, + "Output": " File: /etc/passwd\n Size: 334 \tBlocks: 8 IO Block: 4096 regular file\nDevice: 32h/50d\tInode: 12 Links: 1\nAccess: (0664/-rw-rw-r--) Uid: ( 0/ root) Gid: ( 0/ root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..." + }, + { + "Start": "2016-05-25T17:22:08.823956535Z", + "End": "2016-05-25T17:22:08.897359124Z", + "ExitCode": 1, + "Output": "stat: can't stat '/etc/passwd': No such file or directory\n" + }, + { + "Start": "2016-05-25T17:22:10.898802931Z", + "End": "2016-05-25T17:22:10.969631866Z", + "ExitCode": 1, + "Output": "stat: can't stat '/etc/passwd': No such file or directory\n" + }, + { + "Start": "2016-05-25T17:22:12.971033523Z", + "End": "2016-05-25T17:22:13.082015516Z", + "ExitCode": 1, + "Output": "stat: can't stat '/etc/passwd': No such file or directory\n" + } + ] + } + +The health status is also displayed in the `docker ps` output. 
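+As a rough illustration (the container ID, image, and timings below are made
+up), a container with a passing check shows `(healthy)` in its `STATUS`
+column, and a failing one shows `(unhealthy)`:
+
+    $ docker ps
+    CONTAINER ID  IMAGE    COMMAND     CREATED        STATUS                  NAMES
+    6e09f47a60e5  busybox  "sleep 1d"  2 minutes ago  Up 2 minutes (healthy)  test
+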
+ ### TMPFS (mount tmpfs filesystems) ```bash diff --git a/integration-cli/docker_cli_health_test.go b/integration-cli/docker_cli_health_test.go new file mode 100644 index 0000000000..b374dba357 --- /dev/null +++ b/integration-cli/docker_cli_health_test.go @@ -0,0 +1,154 @@ +package main + +import ( + "encoding/json" + "github.com/docker/docker/pkg/integration/checker" + "github.com/docker/engine-api/types" + "github.com/go-check/check" + "strconv" + "strings" + "time" +) + +func waitForStatus(c *check.C, name string, prev string, expected string) { + prev = prev + "\n" + expected = expected + "\n" + for { + out, _ := dockerCmd(c, "inspect", "--format={{.State.Status}}", name) + if out == expected { + return + } + c.Check(out, checker.Equals, prev) + if out != prev { + return + } + time.Sleep(100 * time.Millisecond) + } +} + +func waitForHealthStatus(c *check.C, name string, prev string, expected string) { + prev = prev + "\n" + expected = expected + "\n" + for { + out, _ := dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name) + if out == expected { + return + } + c.Check(out, checker.Equals, prev) + if out != prev { + return + } + time.Sleep(100 * time.Millisecond) + } +} + +func getHealth(c *check.C, name string) *types.Health { + out, _ := dockerCmd(c, "inspect", "--format={{json .State.Health}}", name) + var health types.Health + err := json.Unmarshal([]byte(out), &health) + c.Check(err, checker.Equals, nil) + return &health +} + +func (s *DockerSuite) TestHealth(c *check.C) { + testRequires(c, DaemonIsLinux) // busybox doesn't work on Windows + + imageName := "testhealth" + _, err := buildImage(imageName, + `FROM busybox + RUN echo OK > /status + CMD ["/bin/sleep", "120"] + STOPSIGNAL SIGKILL + HEALTHCHECK --interval=1s --timeout=30s \ + CMD cat /status`, + true) + + c.Check(err, check.IsNil) + + // No health status before starting + name := "test_health" + dockerCmd(c, "create", "--name", name, imageName) + out, _ := dockerCmd(c, "ps", "-a", "--format={{.Status}}") + c.Check(out, checker.Equals, "Created\n") + + // Inspect the options + out, _ = dockerCmd(c, "inspect", + "--format='timeout={{.Config.Healthcheck.Timeout}} "+ + "interval={{.Config.Healthcheck.Interval}} "+ + "retries={{.Config.Healthcheck.Retries}} "+ + "test={{.Config.Healthcheck.Test}}'", name) + c.Check(out, checker.Equals, "timeout=30s interval=1s retries=0 test=[CMD-SHELL cat /status]\n") + + // Start + dockerCmd(c, "start", name) + waitForHealthStatus(c, name, "starting", "healthy") + + // Make it fail + dockerCmd(c, "exec", name, "rm", "/status") + waitForHealthStatus(c, name, "healthy", "unhealthy") + + // Inspect the status + out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name) + c.Check(out, checker.Equals, "unhealthy\n") + + // Make it healthy again + dockerCmd(c, "exec", name, "touch", "/status") + waitForHealthStatus(c, name, "unhealthy", "healthy") + + // Remove container + dockerCmd(c, "rm", "-f", name) + + // Disable the check from the CLI + out, _ = dockerCmd(c, "create", "--name=noh", "--no-healthcheck", imageName) + out, _ = dockerCmd(c, "inspect", "--format={{.Config.Healthcheck.Test}}", "noh") + c.Check(out, checker.Equals, "[NONE]\n") + dockerCmd(c, "rm", "noh") + + // Disable the check with a new build + _, err = buildImage("no_healthcheck", + `FROM testhealth + HEALTHCHECK NONE`, true) + c.Check(err, check.IsNil) + + out, _ = dockerCmd(c, "inspect", "--format={{.ContainerConfig.Healthcheck.Test}}", "no_healthcheck") + c.Check(out, checker.Equals, "[NONE]\n") 
+ + // Enable the checks from the CLI + _, _ = dockerCmd(c, "run", "-d", "--name=fatal_healthcheck", + "--health-interval=0.5s", + "--health-retries=3", + "--health-cmd=cat /status", + "no_healthcheck") + waitForHealthStatus(c, "fatal_healthcheck", "starting", "healthy") + health := getHealth(c, "fatal_healthcheck") + c.Check(health.Status, checker.Equals, "healthy") + c.Check(health.FailingStreak, checker.Equals, 0) + last := health.Log[len(health.Log)-1] + c.Check(last.ExitCode, checker.Equals, 0) + c.Check(last.Output, checker.Equals, "OK\n") + + // Fail the check, which should now make it exit + dockerCmd(c, "exec", "fatal_healthcheck", "rm", "/status") + waitForStatus(c, "fatal_healthcheck", "running", "exited") + + out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", "fatal_healthcheck") + c.Check(out, checker.Equals, "unhealthy\n") + failsStr, _ := dockerCmd(c, "inspect", "--format={{.State.Health.FailingStreak}}", "fatal_healthcheck") + fails, err := strconv.Atoi(strings.TrimSpace(failsStr)) + c.Check(err, check.IsNil) + c.Check(fails >= 3, checker.Equals, true) + dockerCmd(c, "rm", "-f", "fatal_healthcheck") + + // Check timeout + // Note: if the interval is too small, it seems that Docker spends all its time running health + // checks and never gets around to killing it. + _, _ = dockerCmd(c, "run", "-d", "--name=test", + "--health-interval=1s", "--health-cmd=sleep 5m", "--health-timeout=1ms", imageName) + waitForHealthStatus(c, "test", "starting", "unhealthy") + health = getHealth(c, "test") + last = health.Log[len(health.Log)-1] + c.Check(health.Status, checker.Equals, "unhealthy") + c.Check(last.ExitCode, checker.Equals, -1) + c.Check(last.Output, checker.Equals, "Health check exceeded timeout (1ms)") + dockerCmd(c, "rm", "-f", "test") +} diff --git a/libcontainerd/client_linux.go b/libcontainerd/client_linux.go index 165597b9a6..10c377154e 100644 --- a/libcontainerd/client_linux.go +++ b/libcontainerd/client_linux.go @@ -190,6 +190,17 @@ func (clnt *client) Signal(containerID string, sig int) error { return err } +func (clnt *client) SignalProcess(containerID string, pid string, sig int) error { + clnt.lock(containerID) + defer clnt.unlock(containerID) + _, err := clnt.remote.apiClient.Signal(context.Background(), &containerd.SignalRequest{ + Id: containerID, + Pid: pid, + Signal: uint32(sig), + }) + return err +} + func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error { clnt.lock(containerID) defer clnt.unlock(containerID) diff --git a/libcontainerd/client_windows.go b/libcontainerd/client_windows.go index 1f4046507a..78b3b992f1 100644 --- a/libcontainerd/client_windows.go +++ b/libcontainerd/client_windows.go @@ -304,6 +304,25 @@ func (clnt *client) Signal(containerID string, sig int) error { return nil } +// While Linux has support for the full range of signals, signals aren't really implemented on Windows. +// We try to terminate the specified process whatever signal is requested. 
+func (clnt *client) SignalProcess(containerID string, processFriendlyName string, sig int) error { + clnt.lock(containerID) + defer clnt.unlock(containerID) + cont, err := clnt.getContainer(containerID) + if err != nil { + return err + } + + for _, p := range cont.processes { + if p.friendlyName == processFriendlyName { + return hcsshim.TerminateProcessInComputeSystem(containerID, p.systemPid) + } + } + + return fmt.Errorf("SignalProcess could not find process %s in %s", processFriendlyName, containerID) +} + // Resize handles a CLI event to resize an interactive docker run or docker exec // window. func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error { diff --git a/libcontainerd/types.go b/libcontainerd/types.go index 15d0fc33f8..357ca1bd4d 100644 --- a/libcontainerd/types.go +++ b/libcontainerd/types.go @@ -34,6 +34,7 @@ type Backend interface { type Client interface { Create(containerID string, spec Spec, options ...CreateOption) error Signal(containerID string, sig int) error + SignalProcess(containerID string, processFriendlyName string, sig int) error AddProcess(containerID, processFriendlyName string, process Process) error Resize(containerID, processFriendlyName string, width, height int) error Pause(containerID string) error diff --git a/runconfig/opts/parse.go b/runconfig/opts/parse.go index 8f9371fd39..c2e009eaa6 100644 --- a/runconfig/opts/parse.go +++ b/runconfig/opts/parse.go @@ -100,6 +100,12 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host flStopSignal = cmd.String([]string{"-stop-signal"}, signal.DefaultStopSignal, fmt.Sprintf("Signal to stop a container, %v by default", signal.DefaultStopSignal)) flIsolation = cmd.String([]string{"-isolation"}, "", "Container isolation technology") flShmSize = cmd.String([]string{"-shm-size"}, "", "Size of /dev/shm, default value is 64MB") + // Healthcheck + flNoHealthcheck = cmd.Bool([]string{"-no-healthcheck"}, false, "Disable any container-specified HEALTHCHECK") + flHealthCmd = cmd.String([]string{"-health-cmd"}, "", "Command to run to check health") + flHealthInterval = cmd.Duration([]string{"-health-interval"}, 0, "Time between running the check") + flHealthTimeout = cmd.Duration([]string{"-health-timeout"}, 0, "Maximum time to allow one check to run") + flHealthRetries = cmd.Int([]string{"-health-retries"}, 0, "Consecutive failures needed to report unhealthy") ) cmd.Var(&flAttach, []string{"a", "-attach"}, "Attach to STDIN, STDOUT or STDERR") @@ -351,6 +357,39 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host return nil, nil, nil, cmd, err } + // Healthcheck + var healthConfig *container.HealthConfig + haveHealthSettings := *flHealthCmd != "" || + *flHealthInterval != 0 || + *flHealthTimeout != 0 || + *flHealthRetries != 0 + if *flNoHealthcheck { + if haveHealthSettings { + return nil, nil, nil, cmd, fmt.Errorf("--no-healthcheck conflicts with --health-* options") + } + test := strslice.StrSlice{"NONE"} + healthConfig = &container.HealthConfig{Test: test} + } else if haveHealthSettings { + var probe strslice.StrSlice + if *flHealthCmd != "" { + args := []string{"CMD-SHELL", *flHealthCmd} + probe = strslice.StrSlice(args) + } + if *flHealthInterval < 0 { + return nil, nil, nil, cmd, fmt.Errorf("--health-interval cannot be negative") + } + if *flHealthTimeout < 0 { + return nil, nil, nil, cmd, fmt.Errorf("--health-timeout cannot be negative") + } + + healthConfig = &container.HealthConfig{ + Test: probe, + Interval: 
*flHealthInterval, + Timeout: *flHealthTimeout, + Retries: *flHealthRetries, + } + } + resources := container.Resources{ CgroupParent: *flCgroupParent, Memory: flMemory, @@ -399,6 +438,7 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host Entrypoint: entrypoint, WorkingDir: *flWorkingDir, Labels: ConvertKVStringsToMap(labels), + Healthcheck: healthConfig, } if cmd.IsSet("-stop-signal") { config.StopSignal = *flStopSignal } diff --git a/runconfig/opts/parse_test.go b/runconfig/opts/parse_test.go index 30f755c792..e3a00a93b4 100644 --- a/runconfig/opts/parse_test.go +++ b/runconfig/opts/parse_test.go @@ -9,6 +9,7 @@ import ( "runtime" "strings" "testing" + "time" flag "github.com/docker/docker/pkg/mflag" "github.com/docker/docker/runconfig" @@ -584,6 +585,45 @@ func TestParseRestartPolicy(t *testing.T) { } } +func TestParseHealth(t *testing.T) { + checkOk := func(args ...string) *container.HealthConfig { + config, _, _, _, err := parseRun(args) + if err != nil { + t.Fatalf("%#v: %v", args, err) + } + return config.Healthcheck + } + checkError := func(expected string, args ...string) { + config, _, _, _, err := parseRun(args) + if err == nil { + t.Fatalf("Expected error, but got %#v", config) + } + if err.Error() != expected { + t.Fatalf("Expected %#v, got %#v", expected, err) + } + } + health := checkOk("--no-healthcheck", "img", "cmd") + if health == nil || len(health.Test) != 1 || health.Test[0] != "NONE" { + t.Fatalf("--no-healthcheck failed: %#v", health) + } + + health = checkOk("--health-cmd=/check.sh -q", "img", "cmd") + if len(health.Test) != 2 || health.Test[0] != "CMD-SHELL" || health.Test[1] != "/check.sh -q" { + t.Fatalf("--health-cmd: got %#v", health.Test) + } + if health.Timeout != 0 { + t.Fatalf("--health-cmd: timeout = %v", health.Timeout) + } + + checkError("--no-healthcheck conflicts with --health-* options", + "--no-healthcheck", "--health-cmd=/check.sh -q", "img", "cmd") + + health = checkOk("--health-timeout=2s", "--health-retries=3", "--health-interval=4.5s", "img", "cmd") + if health.Timeout != 2*time.Second || health.Retries != 3 || health.Interval != 4500*time.Millisecond { + t.Fatalf("--health-*: got %#v", health) + } +} + func TestParseLoggingOpts(t *testing.T) { // logging opts ko if _, _, _, _, err := parseRun([]string{"--log-driver=none", "--log-opt=anything", "img", "cmd"}); err == nil || err.Error() != "invalid logging opts for driver none" {