Add support for user-defined healthchecks
This PR adds support for user-defined health-check probes for Docker containers. It adds a `HEALTHCHECK` instruction to the Dockerfile syntax plus some corresponding "docker run" options. It can be used with a restart policy to automatically restart a container if the check fails.

The `HEALTHCHECK` instruction has two forms:

* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)

The `HEALTHCHECK` instruction tells Docker how to test a container to check that it is still working. This can detect cases such as a web server that is stuck in an infinite loop and unable to handle new connections, even though the server process is still running.

When a container has a healthcheck specified, it has a _health status_ in addition to its normal status. This status is initially `starting`. Whenever a health check passes, it becomes `healthy` (whatever state it was previously in). After a certain number of consecutive failures, it becomes `unhealthy`.

The options that can appear before `CMD` are:

* `--interval=DURATION` (default: `30s`)
* `--timeout=DURATION` (default: `30s`)
* `--retries=N` (default: `1`)

The health check will first run **interval** seconds after the container is started, and then again **interval** seconds after each previous check completes. If a single run of the check takes longer than **timeout** seconds then the check is considered to have failed. It takes **retries** consecutive failures of the health check for the container to be considered `unhealthy`.

There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list more than one then only the last `HEALTHCHECK` will take effect.

The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands; see e.g. `ENTRYPOINT` for details).

The command's exit status indicates the health status of the container. The possible values are:

- 0: success - the container is healthy and ready for use
- 1: unhealthy - the container is not working correctly
- 2: starting - the container is not ready for use yet, but is working correctly

If the probe returns 2 ("starting") when the container has already moved out of the "starting" state then it is treated as "unhealthy" instead.

For example, to check every five minutes or so that a web-server is able to serve the site's main page within three seconds:

    HEALTHCHECK --interval=5m --timeout=3s \
        CMD curl -f http://localhost/ || exit 1

To help debug failing probes, any output text (UTF-8 encoded) that the command writes on stdout or stderr will be stored in the health status and can be queried with `docker inspect`. Such output should be kept short (only the first 4096 bytes are stored currently).

When the health status of a container changes, a `health_status` event is generated with the new status. The health status is also displayed in the `docker ps` output.

Signed-off-by: Thomas Leonard <thomas.leonard@docker.com>
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
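To make the exit-code contract concrete, a probe could be shipped as a small binary inside the image. A minimal Go sketch follows; the readiness-marker path and the endpoint are hypothetical, not part of this PR:

```go
package main

import (
	"net/http"
	"os"
	"time"
)

func main() {
	// Exit 2 ("starting") until an assumed readiness marker appears.
	if _, err := os.Stat("/run/app-ready"); err != nil {
		os.Exit(2)
	}
	client := &http.Client{Timeout: 3 * time.Second}
	resp, err := client.Get("http://localhost/") // hypothetical endpoint
	if err != nil {
		os.Exit(1) // unhealthy: server not answering
	}
	resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		os.Exit(1) // unhealthy: bad response
	}
	os.Exit(0) // healthy
}
```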
This commit is contained in:
parent d1b1b6a98e, commit b6c7becbfe

27 changed files with 1170 additions and 80 deletions
@@ -17,7 +17,7 @@ type execBackend interface {
	ContainerExecCreate(name string, config *types.ExecConfig) (string, error)
	ContainerExecInspect(id string) (*backend.ExecInspect, error)
	ContainerExecResize(name string, height, width int) error
-	ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error
+	ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) error
	ExecExists(name string) (bool, error)
}
@@ -106,7 +106,8 @@ func (s *containerRouter) postContainerExecStart(ctx context.Context, w http.Res
	}

	// Now run the user process in container.
-	if err := s.backend.ContainerExecStart(execName, stdin, stdout, stderr); err != nil {
+	// Maybe we should pass ctx here if we're not detaching?
+	if err := s.backend.ContainerExecStart(context.Background(), execName, stdin, stdout, stderr); err != nil {
		if execStartCheck.Detach {
			return err
		}
@@ -22,15 +22,16 @@ import (
)

var validCommitCommands = map[string]bool{
	"cmd":         true,
	"entrypoint":  true,
+	"healthcheck": true,
	"env":         true,
	"expose":      true,
	"label":       true,
	"onbuild":     true,
	"user":        true,
	"volume":      true,
	"workdir":     true,
}

// BuiltinAllowedBuildArgs is list of built-in allowed build args
@@ -3,40 +3,42 @@ package command

// Define constants for the command strings
const (
	Env         = "env"
	Label       = "label"
	Maintainer  = "maintainer"
	Add         = "add"
	Copy        = "copy"
	From        = "from"
	Onbuild     = "onbuild"
	Workdir     = "workdir"
	Run         = "run"
	Cmd         = "cmd"
	Entrypoint  = "entrypoint"
	Expose      = "expose"
	Volume      = "volume"
	User        = "user"
	StopSignal  = "stopsignal"
	Arg         = "arg"
+	Healthcheck = "healthcheck"
)

// Commands is list of all Dockerfile commands
var Commands = map[string]struct{}{
	Env:         {},
	Label:       {},
	Maintainer:  {},
	Add:         {},
	Copy:        {},
	From:        {},
	Onbuild:     {},
	Workdir:     {},
	Run:         {},
	Cmd:         {},
	Entrypoint:  {},
	Expose:      {},
	Volume:      {},
	User:        {},
	StopSignal:  {},
	Arg:         {},
+	Healthcheck: {},
}
@@ -12,7 +12,9 @@ import (
	"regexp"
	"runtime"
	"sort"
+	"strconv"
	"strings"
+	"time"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/api"
@@ -426,6 +428,111 @@ func cmd(b *Builder, args []string, attributes map[string]bool, original string)
	return nil
}

// parseOptInterval(flag) is the duration of flag.Value, or 0 if
// empty. An error is reported if the value is given and is not positive.
func parseOptInterval(f *Flag) (time.Duration, error) {
	s := f.Value
	if s == "" {
		return 0, nil
	}
	d, err := time.ParseDuration(s)
	if err != nil {
		return 0, err
	}
	if d <= 0 {
		return 0, fmt.Errorf("Interval %#v must be positive", f.name)
	}
	return d, nil
}

// HEALTHCHECK foo
//
// Set the default healthcheck command to run in the container (which may be empty).
// Argument handling is the same as RUN.
//
func healthcheck(b *Builder, args []string, attributes map[string]bool, original string) error {
	if len(args) == 0 {
		return fmt.Errorf("HEALTHCHECK requires an argument")
	}
	typ := strings.ToUpper(args[0])
	args = args[1:]
	if typ == "NONE" {
		if len(args) != 0 {
			return fmt.Errorf("HEALTHCHECK NONE takes no arguments")
		}
		test := strslice.StrSlice{typ}
		b.runConfig.Healthcheck = &container.HealthConfig{
			Test: test,
		}
	} else {
		if b.runConfig.Healthcheck != nil {
			oldCmd := b.runConfig.Healthcheck.Test
			if len(oldCmd) > 0 && oldCmd[0] != "NONE" {
				fmt.Fprintf(b.Stdout, "Note: overriding previous HEALTHCHECK: %v\n", oldCmd)
			}
		}

		healthcheck := container.HealthConfig{}

		flInterval := b.flags.AddString("interval", "")
		flTimeout := b.flags.AddString("timeout", "")
		flRetries := b.flags.AddString("retries", "")

		if err := b.flags.Parse(); err != nil {
			return err
		}

		switch typ {
		case "CMD":
			cmdSlice := handleJSONArgs(args, attributes)
			if len(cmdSlice) == 0 {
				return fmt.Errorf("Missing command after HEALTHCHECK CMD")
			}

			if !attributes["json"] {
				typ = "CMD-SHELL"
			}

			healthcheck.Test = strslice.StrSlice(append([]string{typ}, cmdSlice...))
		default:
			return fmt.Errorf("Unknown type %#v in HEALTHCHECK (try CMD)", typ)
		}

		interval, err := parseOptInterval(flInterval)
		if err != nil {
			return err
		}
		healthcheck.Interval = interval

		timeout, err := parseOptInterval(flTimeout)
		if err != nil {
			return err
		}
		healthcheck.Timeout = timeout

		if flRetries.Value != "" {
			retries, err := strconv.ParseInt(flRetries.Value, 10, 32)
			if err != nil {
				return err
			}
			if retries < 1 {
				return fmt.Errorf("--retries must be at least 1 (not %d)", retries)
			}
			healthcheck.Retries = int(retries)
		} else {
			healthcheck.Retries = 0
		}

		b.runConfig.Healthcheck = &healthcheck
	}

	if err := b.commit("", b.runConfig.Cmd, fmt.Sprintf("HEALTHCHECK %q", b.runConfig.Healthcheck)); err != nil {
		return err
	}

	return nil
}

// ENTRYPOINT /usr/sbin/nginx
//
// Set the entrypoint (which defaults to sh -c on linux, or cmd /S /C on Windows) to
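Since parseOptInterval defers to Go's `time.ParseDuration`, `--interval` and `--timeout` accept any positive Go duration string. A quick standalone sketch of which values pass the checks above (illustrative only):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	for _, s := range []string{"30s", "5m", "1h30m", "0s", "-10s", "10"} {
		d, err := time.ParseDuration(s)
		if err != nil || d <= 0 {
			// parseOptInterval rejects these: parse errors ("10" has no
			// unit) and non-positive durations such as "0s" and "-10s".
			fmt.Printf("%-6s rejected (err=%v, d=%v)\n", s, err, d)
			continue
		}
		fmt.Printf("%-6s accepted: %v\n", s, d)
	}
}
```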
@@ -58,22 +58,23 @@ var evaluateTable map[string]func(*Builder, []string, map[string]bool, string) e

func init() {
	evaluateTable = map[string]func(*Builder, []string, map[string]bool, string) error{
		command.Env:         env,
		command.Label:       label,
		command.Maintainer:  maintainer,
		command.Add:         add,
		command.Copy:        dispatchCopy, // copy() is a go builtin
		command.From:        from,
		command.Onbuild:     onbuild,
		command.Workdir:     workdir,
		command.Run:         run,
		command.Cmd:         cmd,
		command.Entrypoint:  entrypoint,
		command.Expose:      expose,
		command.Volume:      volume,
		command.User:        user,
		command.StopSignal:  stopSignal,
		command.Arg:         arg,
+		command.Healthcheck: healthcheck,
	}
}
@@ -329,3 +329,32 @@ func parseMaybeJSONToList(rest string) (*Node, map[string]bool, error) {

	return parseStringsWhitespaceDelimited(rest)
}

// The HEALTHCHECK command is like parseMaybeJSON, but has an extra type argument.
func parseHealthConfig(rest string) (*Node, map[string]bool, error) {
	// Find end of first argument
	var sep int
	for ; sep < len(rest); sep++ {
		if unicode.IsSpace(rune(rest[sep])) {
			break
		}
	}
	next := sep
	for ; next < len(rest); next++ {
		if !unicode.IsSpace(rune(rest[next])) {
			break
		}
	}

	if sep == 0 {
		return nil, nil, nil
	}

	typ := rest[:sep]
	cmd, attrs, err := parseMaybeJSON(rest[next:])
	if err != nil {
		return nil, nil, err
	}

	return &Node{Value: typ, Next: cmd, Attributes: attrs}, nil, err
}
@@ -66,22 +66,23 @@ func init() {
	// functions. Errors are propagated up by Parse() and the resulting AST can
	// be incorporated directly into the existing AST as a next.
	dispatch = map[string]func(string) (*Node, map[string]bool, error){
		command.User:        parseString,
		command.Onbuild:     parseSubCommand,
		command.Workdir:     parseString,
		command.Env:         parseEnv,
		command.Label:       parseLabel,
		command.Maintainer:  parseString,
		command.From:        parseString,
		command.Add:         parseMaybeJSONToList,
		command.Copy:        parseMaybeJSONToList,
		command.Run:         parseMaybeJSON,
		command.Cmd:         parseMaybeJSON,
		command.Entrypoint:  parseMaybeJSON,
		command.Expose:      parseStringsWhitespaceDelimited,
		command.Volume:      parseMaybeJSONToList,
		command.StopSignal:  parseString,
		command.Arg:         parseNameOrNameVal,
+		command.Healthcheck: parseHealthConfig,
	}
}
builder/dockerfile/parser/testfiles/health/Dockerfile (new file, 10 lines)

FROM debian
ADD check.sh main.sh /app/
CMD /app/main.sh
HEALTHCHECK
HEALTHCHECK --interval=5s --timeout=3s --retries=1 \
  CMD /app/check.sh --quiet
HEALTHCHECK CMD
HEALTHCHECK CMD a b
HEALTHCHECK --timeout=3s CMD ["foo"]
HEALTHCHECK CONNECT TCP 7000
builder/dockerfile/parser/testfiles/health/result (new file, 9 lines)

(from "debian")
(add "check.sh" "main.sh" "/app/")
(cmd "/app/main.sh")
(healthcheck)
(healthcheck ["--interval=5s" "--timeout=3s" "--retries=1"] "CMD" "/app/check.sh --quiet")
(healthcheck "CMD")
(healthcheck "CMD" "a b")
(healthcheck ["--timeout=3s"] "CMD" "foo")
(healthcheck "CONNECT" "TCP 7000")
container/health.go (new file, 49 lines)

package container

import (
	"github.com/Sirupsen/logrus"
	"github.com/docker/engine-api/types"
)

// Health holds the current container health-check state
type Health struct {
	types.Health
	stop chan struct{} // Write struct{} to stop the monitor
}

// String returns a human-readable description of the health-check state
func (s *Health) String() string {
	if s.stop == nil {
		return "no healthcheck"
	}
	switch s.Status {
	case types.Starting:
		return "health: starting"
	default: // Healthy and Unhealthy are clear on their own
		return s.Status
	}
}

// OpenMonitorChannel creates and returns a new monitor channel. If there already is one,
// it returns nil.
func (s *Health) OpenMonitorChannel() chan struct{} {
	if s.stop == nil {
		logrus.Debugf("OpenMonitorChannel")
		s.stop = make(chan struct{})
		return s.stop
	}
	return nil
}

// CloseMonitorChannel closes any existing monitor channel.
func (s *Health) CloseMonitorChannel() {
	if s.stop != nil {
		logrus.Debugf("CloseMonitorChannel: waiting for probe to stop")
		// This channel does not buffer. Once the write succeeds, the monitor
		// has read the stop request and will not make any further updates
		// to c.State.Health.
		s.stop <- struct{}{}
		s.stop = nil
		logrus.Debugf("CloseMonitorChannel done")
	}
}
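The unbuffered `stop` channel is what gives CloseMonitorChannel its guarantee: the send cannot complete until the monitor goroutine has received it. A self-contained sketch of the same handshake (names and timings are illustrative, not the daemon's):

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	stop := make(chan struct{}) // unbuffered, as in Health.stop

	go func() { // stands in for the monitor goroutine
		for {
			select {
			case <-stop:
				fmt.Println("monitor: stop received, no further state updates")
				return
			case <-time.After(50 * time.Millisecond):
				fmt.Println("monitor: probe tick")
			}
		}
	}()

	time.Sleep(120 * time.Millisecond)
	stop <- struct{}{} // blocks until the monitor has read the request
	fmt.Println("main: monitor has stopped")
}
```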
@@ -27,6 +27,7 @@ type State struct {
	StartedAt  time.Time
	FinishedAt time.Time
	waitChan   chan struct{}
+	Health     *Health
}

// NewState creates a default state object with a fresh channel for state changes.

@@ -46,6 +47,9 @@ func (s *State) String() string {
		return fmt.Sprintf("Restarting (%d) %s ago", s.ExitCode, units.HumanDuration(time.Now().UTC().Sub(s.FinishedAt)))
	}

+	if h := s.Health; h != nil {
+		return fmt.Sprintf("Up %s (%s)", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)), h.String())
+	}
	return fmt.Sprintf("Up %s", units.HumanDuration(time.Now().UTC().Sub(s.StartedAt)))
}
@@ -80,6 +80,25 @@ func merge(userConf, imageConf *containertypes.Config) error {
			userConf.Entrypoint = imageConf.Entrypoint
		}
	}
+	if imageConf.Healthcheck != nil {
+		if userConf.Healthcheck == nil {
+			userConf.Healthcheck = imageConf.Healthcheck
+		} else {
+			if len(userConf.Healthcheck.Test) == 0 {
+				userConf.Healthcheck.Test = imageConf.Healthcheck.Test
+			}
+			if userConf.Healthcheck.Interval == 0 {
+				userConf.Healthcheck.Interval = imageConf.Healthcheck.Interval
+			}
+			if userConf.Healthcheck.Timeout == 0 {
+				userConf.Healthcheck.Timeout = imageConf.Healthcheck.Timeout
+			}
+			if userConf.Healthcheck.Retries == 0 {
+				userConf.Healthcheck.Retries = imageConf.Healthcheck.Retries
+			}
+		}
+	}
+
	if userConf.WorkingDir == "" {
		userConf.WorkingDir = imageConf.WorkingDir
	}
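The merge is field-wise: any healthcheck field the user left at its zero value is inherited from the image. A standalone sketch of the effective config (the local HealthConfig struct is a stand-in for the engine-api type):

```go
package main

import (
	"fmt"
	"time"
)

// stand-in for container.HealthConfig
type HealthConfig struct {
	Test     []string
	Interval time.Duration
	Timeout  time.Duration
	Retries  int
}

func main() {
	image := &HealthConfig{Test: []string{"CMD-SHELL", "/check.sh"}, Interval: 30 * time.Second}
	user := &HealthConfig{Interval: 5 * time.Second} // only --health-interval given

	// Same zero-value rule as merge(): keep user settings, inherit the rest.
	if len(user.Test) == 0 {
		user.Test = image.Test
	}
	if user.Interval == 0 {
		user.Interval = image.Interval
	}
	if user.Timeout == 0 {
		user.Timeout = image.Timeout
	}
	if user.Retries == 0 {
		user.Retries = image.Retries
	}
	fmt.Printf("%+v\n", *user) // test from image, interval from user
}
```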
@@ -14,11 +14,15 @@ import (
	"github.com/docker/docker/errors"
	"github.com/docker/docker/libcontainerd"
	"github.com/docker/docker/pkg/pools"
+	"github.com/docker/docker/pkg/signal"
	"github.com/docker/docker/pkg/term"
	"github.com/docker/engine-api/types"
	"github.com/docker/engine-api/types/strslice"
)

+// Seconds to wait after sending TERM before trying KILL
+const termProcessTimeout = 10
+
func (d *Daemon) registerExecCommand(container *container.Container, config *exec.Config) {
	// Storing execs in container in order to kill them gracefully whenever the container is stopped or removed.
	container.ExecCommands.Add(config.ID, config)

@@ -130,7 +134,8 @@ func (d *Daemon) ContainerExecCreate(name string, config *types.ExecConfig) (str

// ContainerExecStart starts a previously set up exec instance. The
// std streams are set up.
-func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) {
+// If ctx is cancelled, the process is terminated.
+func (d *Daemon) ContainerExecStart(ctx context.Context, name string, stdin io.ReadCloser, stdout io.Writer, stderr io.Writer) (err error) {
	var (
		cStdin           io.ReadCloser
		cStdout, cStderr io.Writer

@@ -197,15 +202,28 @@ func (d *Daemon) ContainerExecStart(name string, stdin io.ReadCloser, stdout io.
		return nil
	}

-	attachErr := container.AttachStreams(context.Background(), ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys)
+	attachErr := container.AttachStreams(ctx, ec.StreamConfig, ec.OpenStdin, true, ec.Tty, cStdin, cStdout, cStderr, ec.DetachKeys)

	if err := d.containerd.AddProcess(c.ID, name, p); err != nil {
		return err
	}

-	err = <-attachErr
-	if err != nil {
-		return fmt.Errorf("attach failed with error: %v", err)
+	select {
+	case <-ctx.Done():
+		logrus.Debugf("Sending TERM signal to process %v in container %v", name, c.ID)
+		d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["TERM"]))
+		select {
+		case <-time.After(termProcessTimeout * time.Second):
+			logrus.Infof("Container %v, process %v failed to exit within %d seconds of signal TERM - using the force", c.ID, name, termProcessTimeout)
+			d.containerd.SignalProcess(c.ID, name, int(signal.SignalMap["KILL"]))
+		case <-attachErr:
+			// TERM signal worked
+		}
+		return fmt.Errorf("context cancelled")
+	case err := <-attachErr:
+		if err != nil {
+			return fmt.Errorf("attach failed with error: %v", err)
+		}
	}
	return nil
}
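The new cancellation path escalates from TERM to KILL if the process does not exit within termProcessTimeout. A compressed, runnable sketch of the same nested select (timings scaled down; prints stand in for SignalProcess):

```go
package main

import (
	"context"
	"fmt"
	"time"
)

func main() {
	// The daemon uses termProcessTimeout = 10 seconds; scaled down here.
	const termTimeout = 200 * time.Millisecond

	ctx, cancel := context.WithTimeout(context.Background(), 50*time.Millisecond)
	defer cancel()

	attachErr := make(chan error, 1)
	go func() { // stands in for the attached exec process
		time.Sleep(120 * time.Millisecond) // exits shortly after "TERM"
		attachErr <- nil
	}()

	select {
	case <-ctx.Done():
		fmt.Println("ctx cancelled: sending TERM")
		select {
		case <-time.After(termTimeout):
			fmt.Println("no exit within timeout: sending KILL")
		case <-attachErr:
			fmt.Println("TERM worked")
		}
	case err := <-attachErr:
		fmt.Println("attach finished:", err)
	}
}
```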
daemon/health.go (new file, 314 lines)

package daemon

import (
	"bytes"
	"fmt"
	"runtime"
	"strings"
	"time"

	"golang.org/x/net/context"

	"github.com/Sirupsen/logrus"
	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/exec"
	"github.com/docker/engine-api/types"
	"github.com/docker/engine-api/types/strslice"
)

const (
	// Longest healthcheck probe output message to store. Longer messages will be truncated.
	maxOutputLen = 4096

	// Default interval between probe runs (from the end of the first to the start of the second).
	// Also the time before the first probe.
	defaultProbeInterval = 30 * time.Second

	// The maximum length of time a single probe run should take. If the probe takes longer
	// than this, the check is considered to have failed.
	defaultProbeTimeout = 30 * time.Second

	// Shut down a container if it becomes Unhealthy.
	defaultExitOnUnhealthy = true

	// Maximum number of entries to record
	maxLogEntries = 5
)

const (
	// Exit status codes that can be returned by the probe command.

	exitStatusHealthy   = 0 // Container is healthy
	exitStatusUnhealthy = 1 // Container is unhealthy
	exitStatusStarting  = 2 // Container needs more time to start
)

// probe implementations know how to run a particular type of probe.
type probe interface {
	// Perform one run of the check. Returns the exit code and an optional
	// short diagnostic string.
	run(context.Context, *Daemon, *container.Container) (*types.HealthcheckResult, error)
}

// cmdProbe implements the "CMD" probe type.
type cmdProbe struct {
	// Run the command with the system's default shell instead of execing it directly.
	shell bool
}

// exec the healthcheck command in the container.
// Returns the exit code and probe output (if any)
func (p *cmdProbe) run(ctx context.Context, d *Daemon, container *container.Container) (*types.HealthcheckResult, error) {
	cmdSlice := strslice.StrSlice(container.Config.Healthcheck.Test)[1:]
	if p.shell {
		if runtime.GOOS != "windows" {
			cmdSlice = append([]string{"/bin/sh", "-c"}, cmdSlice...)
		} else {
			cmdSlice = append([]string{"cmd", "/S", "/C"}, cmdSlice...)
		}
	}
	entrypoint, args := d.getEntrypointAndArgs(strslice.StrSlice{}, cmdSlice)
	execConfig := exec.NewConfig()
	execConfig.OpenStdin = false
	execConfig.OpenStdout = true
	execConfig.OpenStderr = true
	execConfig.ContainerID = container.ID
	execConfig.DetachKeys = []byte{}
	execConfig.Entrypoint = entrypoint
	execConfig.Args = args
	execConfig.Tty = false
	execConfig.Privileged = false
	execConfig.User = container.Config.User

	d.registerExecCommand(container, execConfig)
	d.LogContainerEvent(container, "exec_create: "+execConfig.Entrypoint+" "+strings.Join(execConfig.Args, " "))

	output := &limitedBuffer{}
	err := d.ContainerExecStart(ctx, execConfig.ID, nil, output, output)
	if err != nil {
		return nil, err
	}
	info, err := d.getExecConfig(execConfig.ID)
	if err != nil {
		return nil, err
	}
	if info.ExitCode == nil {
		return nil, fmt.Errorf("Healthcheck has no exit code!")
	}
	// Note: Go's json package will handle invalid UTF-8 for us
	out := output.String()
	return &types.HealthcheckResult{
		End:      time.Now(),
		ExitCode: *info.ExitCode,
		Output:   out,
	}, nil
}

// Update the container's Status.Health struct based on the latest probe's result.
func handleProbeResult(d *Daemon, c *container.Container, result *types.HealthcheckResult) {
	c.Lock()
	defer c.Unlock()

	retries := c.Config.Healthcheck.Retries
	if retries <= 0 {
		retries = 1 // Default if unset or set to an invalid value
	}

	h := c.State.Health
	oldStatus := h.Status

	if len(h.Log) >= maxLogEntries {
		h.Log = append(h.Log[len(h.Log)+1-maxLogEntries:], result)
	} else {
		h.Log = append(h.Log, result)
	}

	if result.ExitCode == exitStatusHealthy {
		h.FailingStreak = 0
		h.Status = types.Healthy
	} else if result.ExitCode == exitStatusStarting && c.State.Health.Status == types.Starting {
		// The container is not ready yet. Remain in the starting state.
	} else {
		// Failure (including invalid exit code)
		h.FailingStreak++
		if c.State.Health.FailingStreak >= retries {
			h.Status = types.Unhealthy
		}
		// Else we're starting or healthy. Stay in that state.
	}

	if oldStatus != h.Status {
		d.LogContainerEvent(c, "health_status: "+h.Status)
	}
}

// Run the container's monitoring thread until notified via "stop".
// There is never more than one monitor thread running per container at a time.
func monitor(d *Daemon, c *container.Container, stop chan struct{}, probe probe) {
	probeTimeout := timeoutWithDefault(c.Config.Healthcheck.Timeout, defaultProbeTimeout)
	probeInterval := timeoutWithDefault(c.Config.Healthcheck.Interval, defaultProbeInterval)
	for {
		select {
		case <-stop:
			logrus.Debugf("Stop healthcheck monitoring (received while idle)")
			return
		case <-time.After(probeInterval):
			logrus.Debugf("Running health check...")
			startTime := time.Now()
			ctx, cancelProbe := context.WithTimeout(context.Background(), probeTimeout)
			results := make(chan *types.HealthcheckResult)
			go func() {
				result, err := probe.run(ctx, d, c)
				if err != nil {
					logrus.Warnf("Health check error: %v", err)
					results <- &types.HealthcheckResult{
						ExitCode: -1,
						Output:   err.Error(),
						Start:    startTime,
						End:      time.Now(),
					}
				} else {
					result.Start = startTime
					logrus.Debugf("Health check done (exitCode=%d)", result.ExitCode)
					results <- result
				}
				close(results)
			}()
			select {
			case <-stop:
				logrus.Debugf("Stop healthcheck monitoring (received while probing)")
				// Stop timeout and kill probe, but don't wait for probe to exit.
				cancelProbe()
				return
			case result := <-results:
				handleProbeResult(d, c, result)
				// Stop timeout
				cancelProbe()
			case <-ctx.Done():
				logrus.Debugf("Health check taking too long")
				handleProbeResult(d, c, &types.HealthcheckResult{
					ExitCode: -1,
					Output:   fmt.Sprintf("Health check exceeded timeout (%v)", probeTimeout),
					Start:    startTime,
					End:      time.Now(),
				})
				cancelProbe()
				// Wait for probe to exit (it might take a while to respond to the TERM
				// signal and we don't want dying probes to pile up).
				<-results
			}
		}
	}
}

// Get a suitable probe implementation for the container's healthcheck configuration.
func getProbe(c *container.Container) probe {
	config := c.Config.Healthcheck
	if config == nil || len(config.Test) == 0 {
		return nil
	}
	switch config.Test[0] {
	case "CMD":
		return &cmdProbe{shell: false}
	case "CMD-SHELL":
		return &cmdProbe{shell: true}
	default:
		logrus.Warnf("Unknown healthcheck type '%s' (expected 'CMD')", config.Test[0])
		return nil
	}
}

// Ensure the health-check monitor is running or not, depending on the current
// state of the container.
// Called from monitor.go, with c locked.
func (d *Daemon) updateHealthMonitor(c *container.Container) {
	h := c.State.Health
	if h == nil {
		return // No healthcheck configured
	}

	probe := getProbe(c)
	wantRunning := c.Running && !c.Paused && probe != nil
	if wantRunning {
		if stop := h.OpenMonitorChannel(); stop != nil {
			go monitor(d, c, stop, probe)
		}
	} else {
		h.CloseMonitorChannel()
	}
}

// Reset the health state for a newly-started, restarted or restored container.
// initHealthMonitor is called from monitor.go and we should never be running
// two instances at once.
// Called with c locked.
func (d *Daemon) initHealthMonitor(c *container.Container) {
	if c.Config.Healthcheck == nil {
		return
	}

	// This is needed in case we're auto-restarting
	d.stopHealthchecks(c)

	if c.State.Health == nil {
		h := &container.Health{}
		h.Status = types.Starting
		h.FailingStreak = 0
		c.State.Health = h
	}

	d.updateHealthMonitor(c)
}

// Called when the container is being stopped (whether because the health check is
// failing or for any other reason).
func (d *Daemon) stopHealthchecks(c *container.Container) {
	h := c.State.Health
	if h != nil {
		h.CloseMonitorChannel()
	}
}

// Buffer up to maxOutputLen bytes. Further data is discarded.
type limitedBuffer struct {
	buf       bytes.Buffer
	truncated bool // indicates that data has been lost
}

// Append to limitedBuffer while there is room.
func (b *limitedBuffer) Write(data []byte) (int, error) {
	bufLen := b.buf.Len()
	dataLen := len(data)
	keep := min(maxOutputLen-bufLen, dataLen)
	if keep > 0 {
		b.buf.Write(data[:keep])
	}
	if keep < dataLen {
		b.truncated = true
	}
	return dataLen, nil
}

// The contents of the buffer, with "..." appended if it overflowed.
func (b *limitedBuffer) String() string {
	out := b.buf.String()
	if b.truncated {
		out = out + "..."
	}
	return out
}

// If configuredValue is zero, use defaultValue instead.
func timeoutWithDefault(configuredValue time.Duration, defaultValue time.Duration) time.Duration {
	if configuredValue == 0 {
		return defaultValue
	}
	return configuredValue
}

func min(x, y int) int {
	if x < y {
		return x
	}
	return y
}
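One subtle line in handleProbeResult is the log trim, which keeps only the most recent maxLogEntries results. A standalone check of that slice arithmetic:

```go
package main

import "fmt"

const maxLogEntries = 5 // same cap as daemon/health.go

func main() {
	var log []int
	for result := 1; result <= 8; result++ {
		if len(log) >= maxLogEntries {
			// Drop the oldest entries so the new result still fits.
			log = append(log[len(log)+1-maxLogEntries:], result)
		} else {
			log = append(log, result)
		}
	}
	fmt.Println(log) // [4 5 6 7 8]: only the last five results survive
}
```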
daemon/health_test.go (new file, 112 lines)

package daemon

import (
	"testing"
	"time"

	"github.com/docker/docker/container"
	"github.com/docker/docker/daemon/events"
	"github.com/docker/engine-api/types"
	containertypes "github.com/docker/engine-api/types/container"
	eventtypes "github.com/docker/engine-api/types/events"
)

func reset(c *container.Container) {
	c.State = &container.State{}
	c.State.Health = &container.Health{}
	c.State.Health.Status = types.Starting
}

func TestHealthStates(t *testing.T) {
	e := events.New()
	_, l, _ := e.Subscribe()
	defer e.Evict(l)

	expect := func(expected string) {
		select {
		case event := <-l:
			ev := event.(eventtypes.Message)
			if ev.Status != expected {
				t.Errorf("Expecting event %#v, but got %#v\n", expected, ev.Status)
			}
		case <-time.After(1 * time.Second):
			t.Errorf("Expecting event %#v, but got nothing\n", expected)
		}
	}

	c := &container.Container{
		CommonContainer: container.CommonContainer{
			ID:   "container_id",
			Name: "container_name",
			Config: &containertypes.Config{
				Image: "image_name",
			},
		},
	}
	daemon := &Daemon{
		EventsService: e,
	}

	c.Config.Healthcheck = &containertypes.HealthConfig{
		Retries: 1,
	}

	reset(c)

	handleResult := func(startTime time.Time, exitCode int) {
		handleProbeResult(daemon, c, &types.HealthcheckResult{
			Start:    startTime,
			End:      startTime,
			ExitCode: exitCode,
		})
	}

	// starting -> failed -> success -> failed

	handleResult(c.State.StartedAt.Add(1*time.Second), 1)
	expect("health_status: unhealthy")

	handleResult(c.State.StartedAt.Add(2*time.Second), 0)
	expect("health_status: healthy")

	handleResult(c.State.StartedAt.Add(3*time.Second), 1)
	expect("health_status: unhealthy")

	// starting -> starting -> starting ->
	// healthy -> starting (invalid transition)

	reset(c)

	handleResult(c.State.StartedAt.Add(20*time.Second), 2)
	handleResult(c.State.StartedAt.Add(40*time.Second), 2)
	if c.State.Health.Status != types.Starting {
		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
	}

	handleResult(c.State.StartedAt.Add(50*time.Second), 0)
	expect("health_status: healthy")
	handleResult(c.State.StartedAt.Add(60*time.Second), 2)
	expect("health_status: unhealthy")

	// Test retries

	reset(c)
	c.Config.Healthcheck.Retries = 3

	handleResult(c.State.StartedAt.Add(20*time.Second), 1)
	handleResult(c.State.StartedAt.Add(40*time.Second), 1)
	if c.State.Health.Status != types.Starting {
		t.Errorf("Expecting starting, but got %#v\n", c.State.Health.Status)
	}
	if c.State.Health.FailingStreak != 2 {
		t.Errorf("Expecting FailingStreak=2, but got %d\n", c.State.Health.FailingStreak)
	}
	handleResult(c.State.StartedAt.Add(60*time.Second), 1)
	expect("health_status: unhealthy")

	handleResult(c.State.StartedAt.Add(80*time.Second), 0)
	expect("health_status: healthy")
	if c.State.Health.FailingStreak != 0 {
		t.Errorf("Expecting FailingStreak=0, but got %d\n", c.State.Health.FailingStreak)
	}
}
@@ -108,6 +108,15 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool)
		hostConfig.Links = append(hostConfig.Links, fmt.Sprintf("%s:%s", child.Name, linkAlias))
	}

+	var containerHealth *types.Health
+	if container.State.Health != nil {
+		containerHealth = &types.Health{
+			Status:        container.State.Health.Status,
+			FailingStreak: container.State.Health.FailingStreak,
+			Log:           append([]*types.HealthcheckResult{}, container.State.Health.Log...),
+		}
+	}
+
	containerState := &types.ContainerState{
		Status:  container.State.StateString(),
		Running: container.State.Running,

@@ -120,6 +129,7 @@ func (daemon *Daemon) getInspectData(container *container.Container, size bool)
		Error:      container.State.Error,
		StartedAt:  container.State.StartedAt.Format(time.RFC3339Nano),
		FinishedAt: container.State.FinishedAt.Format(time.RFC3339Nano),
+		Health:     containerHealth,
	}

	contJSONBase := &types.ContainerJSONBase{
@@ -25,6 +25,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
		if runtime.GOOS == "windows" {
			return errors.New("Received StateOOM from libcontainerd on Windows. This should never happen.")
		}
+		daemon.updateHealthMonitor(c)
		daemon.LogContainerEvent(c, "oom")
	case libcontainerd.StateExit:
		c.Lock()

@@ -35,6 +36,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
			attributes := map[string]string{
				"exitCode": strconv.Itoa(int(e.ExitCode)),
			}
+			daemon.updateHealthMonitor(c)
			daemon.LogContainerEventWithAttributes(c, "die", attributes)
			daemon.Cleanup(c)
			// FIXME: here is race condition between two RUN instructions in Dockerfile

@@ -54,6 +56,7 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
			"exitCode": strconv.Itoa(int(e.ExitCode)),
		}
		daemon.LogContainerEventWithAttributes(c, "die", attributes)
+		daemon.updateHealthMonitor(c)
		return c.ToDisk()
	case libcontainerd.StateExitProcess:
		c.Lock()

@@ -74,18 +77,24 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
			logrus.Warnf("Ignoring StateExitProcess for %v but no exec command found", e)
		}
	case libcontainerd.StateStart, libcontainerd.StateRestore:
		// Container is already locked in this case
		c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart)
		c.HasBeenManuallyStopped = false
		if err := c.ToDisk(); err != nil {
			c.Reset(false)
			return err
		}
+		daemon.initHealthMonitor(c)
		daemon.LogContainerEvent(c, "start")
	case libcontainerd.StatePause:
		// Container is already locked in this case
		c.Paused = true
+		daemon.updateHealthMonitor(c)
		daemon.LogContainerEvent(c, "pause")
	case libcontainerd.StateResume:
		// Container is already locked in this case
		c.Paused = false
+		daemon.updateHealthMonitor(c)
		daemon.LogContainerEvent(c, "unpause")
	}
@@ -41,6 +41,8 @@ func (daemon *Daemon) containerStop(container *container.Container, seconds int)
		return nil
	}

+	daemon.stopHealthchecks(container)
+
	stopSignal := container.StopSignal()
	// 1. Send a stop signal
	if err := daemon.killPossiblyDeadProcess(container, stopSignal); err != nil {
@@ -1470,6 +1470,73 @@
This signal can be a valid unsigned number that matches a position in the kernel's syscall table, for instance 9,
or a signal name in the format SIGNAME, for instance SIGKILL.

## HEALTHCHECK

The `HEALTHCHECK` instruction has two forms:

* `HEALTHCHECK [OPTIONS] CMD command` (check container health by running a command inside the container)
* `HEALTHCHECK NONE` (disable any healthcheck inherited from the base image)

The `HEALTHCHECK` instruction tells Docker how to test a container to check that
it is still working. This can detect cases such as a web server that is stuck in
an infinite loop and unable to handle new connections, even though the server
process is still running.

When a container has a healthcheck specified, it has a _health status_ in
addition to its normal status. This status is initially `starting`. Whenever a
health check passes, it becomes `healthy` (whatever state it was previously in).
After a certain number of consecutive failures, it becomes `unhealthy`.

The options that can appear before `CMD` are:

* `--interval=DURATION` (default: `30s`)
* `--timeout=DURATION` (default: `30s`)
* `--retries=N` (default: `1`)

The health check will first run **interval** seconds after the container is
started, and then again **interval** seconds after each previous check completes.

If a single run of the check takes longer than **timeout** seconds then the check
is considered to have failed.

It takes **retries** consecutive failures of the health check for the container
to be considered `unhealthy`.

There can only be one `HEALTHCHECK` instruction in a Dockerfile. If you list
more than one then only the last `HEALTHCHECK` will take effect.

The command after the `CMD` keyword can be either a shell command (e.g. `HEALTHCHECK
CMD /bin/check-running`) or an _exec_ array (as with other Dockerfile commands;
see e.g. `ENTRYPOINT` for details).

The command's exit status indicates the health status of the container.
The possible values are:

- 0: success - the container is healthy and ready for use
- 1: unhealthy - the container is not working correctly
- 2: starting - the container is not ready for use yet, but is working correctly

If the probe returns 2 ("starting") when the container has already moved out of the
"starting" state then it is treated as "unhealthy" instead.

For example, to check every five minutes or so that a web-server is able to
serve the site's main page within three seconds:

    HEALTHCHECK --interval=5m --timeout=3s \
        CMD curl -f http://localhost/ || exit 1

To help debug failing probes, any output text (UTF-8 encoded) that the command writes
on stdout or stderr will be stored in the health status and can be queried with
`docker inspect`. Such output should be kept short (only the first 4096 bytes
are stored currently).

When the health status of a container changes, a `health_status` event is
generated with the new status.

The `HEALTHCHECK` feature was added in Docker 1.12.
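The **retries** rule above is a consecutive-failure counter that any success resets. A compact Go sketch of the transitions, simplified from the daemon's handleProbeResult:

```go
package main

import "fmt"

func main() {
	const retries = 3
	streak, status := 0, "starting"
	for _, exit := range []int{1, 1, 0, 1, 1, 1} { // probe exit codes over time
		switch {
		case exit == 0:
			streak, status = 0, "healthy" // any success resets the streak
		case exit == 2 && status == "starting":
			// still starting: neither healthy nor counted as a failure
		default:
			streak++
			if streak >= retries {
				status = "unhealthy"
			}
		}
		fmt.Printf("exit=%d -> status=%s streak=%d\n", exit, status, streak)
	}
}
```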

## Dockerfile examples

Below you can see some examples of Dockerfile syntax. If you're interested in
@@ -1250,6 +1250,7 @@ Dockerfile instruction and how the operator can override that setting.
    #entrypoint-default-command-to-execute-at-runtime)
 - [EXPOSE (Incoming Ports)](#expose-incoming-ports)
 - [ENV (Environment Variables)](#env-environment-variables)
+- [HEALTHCHECK](#healthcheck)
 - [VOLUME (Shared Filesystems)](#volume-shared-filesystems)
 - [USER](#user)
 - [WORKDIR](#workdir)

@@ -1398,6 +1399,65 @@ above, or already defined by the developer with a Dockerfile `ENV`:

Similarly the operator can set the **hostname** with `-h`.

### HEALTHCHECK

```
  --health-cmd            Command to run to check health
  --health-interval       Time between running the check
  --health-retries        Consecutive failures needed to report unhealthy
  --health-timeout        Maximum time to allow one check to run
  --no-healthcheck        Disable any container-specified HEALTHCHECK
```

Example:

    $ docker run --name=test -d \
        --health-cmd='stat /etc/passwd || exit 1' \
        --health-interval=2s \
        busybox sleep 1d
    $ sleep 2; docker inspect --format='{{.State.Health.Status}}' test
    healthy
    $ docker exec test rm /etc/passwd
    $ sleep 2; docker inspect --format='{{json .State.Health}}' test
    {
      "Status": "unhealthy",
      "FailingStreak": 3,
      "Log": [
        {
          "Start": "2016-05-25T17:22:04.635478668Z",
          "End": "2016-05-25T17:22:04.7272552Z",
          "ExitCode": 0,
          "Output": " File: /etc/passwd\n Size: 334 \tBlocks: 8 IO Block: 4096 regular file\nDevice: 32h/50d\tInode: 12 Links: 1\nAccess: (0664/-rw-rw-r--) Uid: ( 0/ root) Gid: ( 0/ root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..."
        },
        {
          "Start": "2016-05-25T17:22:06.732900633Z",
          "End": "2016-05-25T17:22:06.822168935Z",
          "ExitCode": 0,
          "Output": " File: /etc/passwd\n Size: 334 \tBlocks: 8 IO Block: 4096 regular file\nDevice: 32h/50d\tInode: 12 Links: 1\nAccess: (0664/-rw-rw-r--) Uid: ( 0/ root) Gid: ( 0/ root)\nAccess: 2015-12-05 22:05:32.000000000\nModify: 2015..."
        },
        {
          "Start": "2016-05-25T17:22:08.823956535Z",
          "End": "2016-05-25T17:22:08.897359124Z",
          "ExitCode": 1,
          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
        },
        {
          "Start": "2016-05-25T17:22:10.898802931Z",
          "End": "2016-05-25T17:22:10.969631866Z",
          "ExitCode": 1,
          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
        },
        {
          "Start": "2016-05-25T17:22:12.971033523Z",
          "End": "2016-05-25T17:22:13.082015516Z",
          "ExitCode": 1,
          "Output": "stat: can't stat '/etc/passwd': No such file or directory\n"
        }
      ]
    }

The health status is also displayed in the `docker ps` output.
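Scripting against this JSON is straightforward; a minimal Go sketch that shells out to `docker inspect` and decodes `.State.Health` (the struct fields mirror the JSON shown above, and the container name `test` matches the example):

```go
package main

import (
	"encoding/json"
	"fmt"
	"os/exec"
)

type HealthcheckResult struct {
	Start    string
	End      string
	ExitCode int
	Output   string
}

type Health struct {
	Status        string
	FailingStreak int
	Log           []*HealthcheckResult
}

func main() {
	out, err := exec.Command("docker", "inspect",
		"--format={{json .State.Health}}", "test").Output()
	if err != nil {
		panic(err)
	}
	var h Health
	if err := json.Unmarshal(out, &h); err != nil {
		panic(err)
	}
	fmt.Println(h.Status, h.FailingStreak, len(h.Log))
}
```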

### TMPFS (mount tmpfs filesystems)

```bash
integration-cli/docker_cli_health_test.go (new file, 154 lines)

package main

import (
	"encoding/json"
	"github.com/docker/docker/pkg/integration/checker"
	"github.com/docker/engine-api/types"
	"github.com/go-check/check"
	"strconv"
	"strings"
	"time"
)

func waitForStatus(c *check.C, name string, prev string, expected string) {
	prev = prev + "\n"
	expected = expected + "\n"
	for {
		out, _ := dockerCmd(c, "inspect", "--format={{.State.Status}}", name)
		if out == expected {
			return
		}
		c.Check(out, checker.Equals, prev)
		if out != prev {
			return
		}
		time.Sleep(100 * time.Millisecond)
	}
}

func waitForHealthStatus(c *check.C, name string, prev string, expected string) {
	prev = prev + "\n"
	expected = expected + "\n"
	for {
		out, _ := dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
		if out == expected {
			return
		}
		c.Check(out, checker.Equals, prev)
		if out != prev {
			return
		}
		time.Sleep(100 * time.Millisecond)
	}
}

func getHealth(c *check.C, name string) *types.Health {
	out, _ := dockerCmd(c, "inspect", "--format={{json .State.Health}}", name)
	var health types.Health
	err := json.Unmarshal([]byte(out), &health)
	c.Check(err, checker.Equals, nil)
	return &health
}

func (s *DockerSuite) TestHealth(c *check.C) {
	testRequires(c, DaemonIsLinux) // busybox doesn't work on Windows

	imageName := "testhealth"
	_, err := buildImage(imageName,
		`FROM busybox
		RUN echo OK > /status
		CMD ["/bin/sleep", "120"]
		STOPSIGNAL SIGKILL
		HEALTHCHECK --interval=1s --timeout=30s \
		  CMD cat /status`,
		true)
	c.Check(err, check.IsNil)

	// No health status before starting
	name := "test_health"
	dockerCmd(c, "create", "--name", name, imageName)
	out, _ := dockerCmd(c, "ps", "-a", "--format={{.Status}}")
	c.Check(out, checker.Equals, "Created\n")

	// Inspect the options
	out, _ = dockerCmd(c, "inspect",
		"--format='timeout={{.Config.Healthcheck.Timeout}} "+
			"interval={{.Config.Healthcheck.Interval}} "+
			"retries={{.Config.Healthcheck.Retries}} "+
			"test={{.Config.Healthcheck.Test}}'", name)
	c.Check(out, checker.Equals, "timeout=30s interval=1s retries=0 test=[CMD-SHELL cat /status]\n")

	// Start
	dockerCmd(c, "start", name)
	waitForHealthStatus(c, name, "starting", "healthy")

	// Make it fail
	dockerCmd(c, "exec", name, "rm", "/status")
	waitForHealthStatus(c, name, "healthy", "unhealthy")

	// Inspect the status
	out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", name)
	c.Check(out, checker.Equals, "unhealthy\n")

	// Make it healthy again
	dockerCmd(c, "exec", name, "touch", "/status")
	waitForHealthStatus(c, name, "unhealthy", "healthy")

	// Remove container
	dockerCmd(c, "rm", "-f", name)

	// Disable the check from the CLI
	out, _ = dockerCmd(c, "create", "--name=noh", "--no-healthcheck", imageName)
	out, _ = dockerCmd(c, "inspect", "--format={{.Config.Healthcheck.Test}}", "noh")
	c.Check(out, checker.Equals, "[NONE]\n")
	dockerCmd(c, "rm", "noh")

	// Disable the check with a new build
	_, err = buildImage("no_healthcheck",
		`FROM testhealth
		HEALTHCHECK NONE`, true)
	c.Check(err, check.IsNil)

	out, _ = dockerCmd(c, "inspect", "--format={{.ContainerConfig.Healthcheck.Test}}", "no_healthcheck")
	c.Check(out, checker.Equals, "[NONE]\n")

	// Enable the checks from the CLI
	_, _ = dockerCmd(c, "run", "-d", "--name=fatal_healthcheck",
		"--health-interval=0.5s",
		"--health-retries=3",
		"--health-cmd=cat /status",
		"no_healthcheck")
	waitForHealthStatus(c, "fatal_healthcheck", "starting", "healthy")
	health := getHealth(c, "fatal_healthcheck")
	c.Check(health.Status, checker.Equals, "healthy")
	c.Check(health.FailingStreak, checker.Equals, 0)
	last := health.Log[len(health.Log)-1]
	c.Check(last.ExitCode, checker.Equals, 0)
	c.Check(last.Output, checker.Equals, "OK\n")

	// Fail the check, which should now make it exit
	dockerCmd(c, "exec", "fatal_healthcheck", "rm", "/status")
	waitForStatus(c, "fatal_healthcheck", "running", "exited")

	out, _ = dockerCmd(c, "inspect", "--format={{.State.Health.Status}}", "fatal_healthcheck")
	c.Check(out, checker.Equals, "unhealthy\n")
	failsStr, _ := dockerCmd(c, "inspect", "--format={{.State.Health.FailingStreak}}", "fatal_healthcheck")
	fails, err := strconv.Atoi(strings.TrimSpace(failsStr))
	c.Check(err, check.IsNil)
	c.Check(fails >= 3, checker.Equals, true)
	dockerCmd(c, "rm", "-f", "fatal_healthcheck")

	// Check timeout
	// Note: if the interval is too small, it seems that Docker spends all its time running health
	// checks and never gets around to killing it.
	_, _ = dockerCmd(c, "run", "-d", "--name=test",
		"--health-interval=1s", "--health-cmd=sleep 5m", "--health-timeout=1ms", imageName)
	waitForHealthStatus(c, "test", "starting", "unhealthy")
	health = getHealth(c, "test")
	last = health.Log[len(health.Log)-1]
	c.Check(health.Status, checker.Equals, "unhealthy")
	c.Check(last.ExitCode, checker.Equals, -1)
	c.Check(last.Output, checker.Equals, "Health check exceeded timeout (1ms)")
	dockerCmd(c, "rm", "-f", "test")
}
@@ -190,6 +190,17 @@ func (clnt *client) Signal(containerID string, sig int) error {
	return err
}

+func (clnt *client) SignalProcess(containerID string, pid string, sig int) error {
+	clnt.lock(containerID)
+	defer clnt.unlock(containerID)
+	_, err := clnt.remote.apiClient.Signal(context.Background(), &containerd.SignalRequest{
+		Id:     containerID,
+		Pid:    pid,
+		Signal: uint32(sig),
+	})
+	return err
+}
+
func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {
	clnt.lock(containerID)
	defer clnt.unlock(containerID)
@@ -304,6 +304,25 @@ func (clnt *client) Signal(containerID string, sig int) error {
	return nil
}

+// While Linux has support for the full range of signals, signals aren't really implemented on Windows.
+// We try to terminate the specified process whatever signal is requested.
+func (clnt *client) SignalProcess(containerID string, processFriendlyName string, sig int) error {
+	clnt.lock(containerID)
+	defer clnt.unlock(containerID)
+	cont, err := clnt.getContainer(containerID)
+	if err != nil {
+		return err
+	}
+
+	for _, p := range cont.processes {
+		if p.friendlyName == processFriendlyName {
+			return hcsshim.TerminateProcessInComputeSystem(containerID, p.systemPid)
+		}
+	}
+
+	return fmt.Errorf("SignalProcess could not find process %s in %s", processFriendlyName, containerID)
+}
+
// Resize handles a CLI event to resize an interactive docker run or docker exec
// window.
func (clnt *client) Resize(containerID, processFriendlyName string, width, height int) error {
@@ -34,6 +34,7 @@ type Backend interface {
type Client interface {
	Create(containerID string, spec Spec, options ...CreateOption) error
	Signal(containerID string, sig int) error
+	SignalProcess(containerID string, processFriendlyName string, sig int) error
	AddProcess(containerID, processFriendlyName string, process Process) error
	Resize(containerID, processFriendlyName string, width, height int) error
	Pause(containerID string) error
@@ -100,6 +100,12 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
		flStopSignal = cmd.String([]string{"-stop-signal"}, signal.DefaultStopSignal, fmt.Sprintf("Signal to stop a container, %v by default", signal.DefaultStopSignal))
		flIsolation  = cmd.String([]string{"-isolation"}, "", "Container isolation technology")
		flShmSize    = cmd.String([]string{"-shm-size"}, "", "Size of /dev/shm, default value is 64MB")
+		// Healthcheck
+		flNoHealthcheck  = cmd.Bool([]string{"-no-healthcheck"}, false, "Disable any container-specified HEALTHCHECK")
+		flHealthCmd      = cmd.String([]string{"-health-cmd"}, "", "Command to run to check health")
+		flHealthInterval = cmd.Duration([]string{"-health-interval"}, 0, "Time between running the check")
+		flHealthTimeout  = cmd.Duration([]string{"-health-timeout"}, 0, "Maximum time to allow one check to run")
+		flHealthRetries  = cmd.Int([]string{"-health-retries"}, 0, "Consecutive failures needed to report unhealthy")
	)

	cmd.Var(&flAttach, []string{"a", "-attach"}, "Attach to STDIN, STDOUT or STDERR")

@@ -351,6 +357,39 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
		return nil, nil, nil, cmd, err
	}

+	// Healthcheck
+	var healthConfig *container.HealthConfig
+	haveHealthSettings := *flHealthCmd != "" ||
+		*flHealthInterval != 0 ||
+		*flHealthTimeout != 0 ||
+		*flHealthRetries != 0
+	if *flNoHealthcheck {
+		if haveHealthSettings {
+			return nil, nil, nil, cmd, fmt.Errorf("--no-healthcheck conflicts with --health-* options")
+		}
+		test := strslice.StrSlice{"NONE"}
+		healthConfig = &container.HealthConfig{Test: test}
+	} else if haveHealthSettings {
+		var probe strslice.StrSlice
+		if *flHealthCmd != "" {
+			args := []string{"CMD-SHELL", *flHealthCmd}
+			probe = strslice.StrSlice(args)
+		}
+		if *flHealthInterval < 0 {
+			return nil, nil, nil, cmd, fmt.Errorf("--health-interval cannot be negative")
+		}
+		if *flHealthTimeout < 0 {
+			return nil, nil, nil, cmd, fmt.Errorf("--health-timeout cannot be negative")
+		}
+
+		healthConfig = &container.HealthConfig{
+			Test:     probe,
+			Interval: *flHealthInterval,
+			Timeout:  *flHealthTimeout,
+			Retries:  *flHealthRetries,
+		}
+	}
+
	resources := container.Resources{
		CgroupParent: *flCgroupParent,
		Memory:       flMemory,

@@ -399,6 +438,7 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host
		Entrypoint:  entrypoint,
		WorkingDir:  *flWorkingDir,
		Labels:      ConvertKVStringsToMap(labels),
+		Healthcheck: healthConfig,
	}
	if cmd.IsSet("-stop-signal") {
		config.StopSignal = *flStopSignal
@@ -9,6 +9,7 @@ import (
	"runtime"
	"strings"
	"testing"
+	"time"

	flag "github.com/docker/docker/pkg/mflag"
	"github.com/docker/docker/runconfig"

@@ -584,6 +585,45 @@ func TestParseRestartPolicy(t *testing.T) {
	}
}

+func TestParseHealth(t *testing.T) {
+	checkOk := func(args ...string) *container.HealthConfig {
+		config, _, _, _, err := parseRun(args)
+		if err != nil {
+			t.Fatalf("%#v: %v", args, err)
+		}
+		return config.Healthcheck
+	}
+	checkError := func(expected string, args ...string) {
+		config, _, _, _, err := parseRun(args)
+		if err == nil {
+			t.Fatalf("Expected error, but got %#v", config)
+		}
+		if err.Error() != expected {
+			t.Fatalf("Expected %#v, got %#v", expected, err)
+		}
+	}
+	health := checkOk("--no-healthcheck", "img", "cmd")
+	if health == nil || len(health.Test) != 1 || health.Test[0] != "NONE" {
+		t.Fatalf("--no-healthcheck failed: %#v", health)
+	}
+
+	health = checkOk("--health-cmd=/check.sh -q", "img", "cmd")
+	if len(health.Test) != 2 || health.Test[0] != "CMD-SHELL" || health.Test[1] != "/check.sh -q" {
+		t.Fatalf("--health-cmd: got %#v", health.Test)
+	}
+	if health.Timeout != 0 {
+		t.Fatalf("--health-cmd: timeout = %f", health.Timeout)
+	}
+
+	checkError("--no-healthcheck conflicts with --health-* options",
+		"--no-healthcheck", "--health-cmd=/check.sh -q", "img", "cmd")
+
+	health = checkOk("--health-timeout=2s", "--health-retries=3", "--health-interval=4.5s", "img", "cmd")
+	if health.Timeout != 2*time.Second || health.Retries != 3 || health.Interval != 4500*time.Millisecond {
+		t.Fatalf("--health-*: got %#v", health)
+	}
+}
+
func TestParseLoggingOpts(t *testing.T) {
	// logging opts ko
	if _, _, _, _, err := parseRun([]string{"--log-driver=none", "--log-opt=anything", "img", "cmd"}); err == nil || err.Error() != "invalid logging opts for driver none" {