From e4c03623c2fcd3013343d246e3432409850c8c37 Mon Sep 17 00:00:00 2001 From: Brian Goff Date: Thu, 9 Feb 2017 21:57:35 -0500 Subject: [PATCH] Use counter for tracking container states Container state counts are used for reporting in the `/info` endpoint. Currently when `/info` is called, each container is iterated over and the container's `StateString()` is called. This is not very efficient with lots of containers, and is also racy since `StateString()` is not using a mutex and the mutex is not otherwise locked. We could just lock the container mutex, but this has proven to be problematic since there are frequent deadlock scenarios and we should always have the `/info` endpoint available since this endpoint is used to get general information about the docker host. Really, these metrics on `/info` should be deprecated. But until then, we can just keep a running tally in memory for each of the reported states. Signed-off-by: Brian Goff --- daemon/create.go | 1 + daemon/daemon.go | 1 + daemon/delete.go | 1 + daemon/info.go | 15 +---- daemon/metrics.go | 66 ++++++++++++++++++- daemon/monitor.go | 20 +++++- vendor.conf | 2 +- vendor/github.com/docker/go-metrics/README.md | 63 +++++++++++++++++- .../github.com/docker/go-metrics/namespace.go | 38 +++++++---- 9 files changed, 176 insertions(+), 31 deletions(-) diff --git a/daemon/create.go b/daemon/create.go index 55a106c646..c6d2b40648 100644 --- a/daemon/create.go +++ b/daemon/create.go @@ -151,6 +151,7 @@ func (daemon *Daemon) create(params types.ContainerCreateConfig, managed bool) ( return nil, err } daemon.Register(container) + stateCtr.set(container.ID, "stopped") daemon.LogContainerEvent(container, "create") return container, nil } diff --git a/daemon/daemon.go b/daemon/daemon.go index 59d84a0260..35ff2a66ef 100644 --- a/daemon/daemon.go +++ b/daemon/daemon.go @@ -198,6 +198,7 @@ func (daemon *Daemon) restore() error { if err := backportMountSpec(c); err != nil { logrus.Error("Failed to migrate old mounts to use 
new spec format") } + daemon.setStateCounter(c) if c.IsRunning() || c.IsPaused() { c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking diff --git a/daemon/delete.go b/daemon/delete.go index fd1759ee1c..483241db53 100644 --- a/daemon/delete.go +++ b/daemon/delete.go @@ -124,6 +124,7 @@ func (daemon *Daemon) cleanupContainer(container *container.Container, forceRemo logrus.Error(e) } daemon.LogContainerEvent(container, "destroy") + stateCtr.del(container.ID) } }() diff --git a/daemon/info.go b/daemon/info.go index b6c2565f44..c3f1c3b8b1 100644 --- a/daemon/info.go +++ b/daemon/info.go @@ -4,14 +4,12 @@ import ( "fmt" "os" "runtime" - "sync/atomic" "time" "github.com/Sirupsen/logrus" "github.com/docker/docker/api" "github.com/docker/docker/api/types" "github.com/docker/docker/cli/debug" - "github.com/docker/docker/container" "github.com/docker/docker/daemon/logger" "github.com/docker/docker/dockerversion" "github.com/docker/docker/pkg/fileutils" @@ -58,18 +56,7 @@ func (daemon *Daemon) SystemInfo() (*types.Info, error) { } sysInfo := sysinfo.New(true) - - var cRunning, cPaused, cStopped int32 - daemon.containers.ApplyAll(func(c *container.Container) { - switch c.StateString() { - case "paused": - atomic.AddInt32(&cPaused, 1) - case "running": - atomic.AddInt32(&cRunning, 1) - default: - atomic.AddInt32(&cStopped, 1) - } - }) + cRunning, cPaused, cStopped := stateCtr.get() securityOptions := []string{} if sysInfo.AppArmor { diff --git a/daemon/metrics.go b/daemon/metrics.go index 69dbfd9378..65d92901ce 100644 --- a/daemon/metrics.go +++ b/daemon/metrics.go @@ -1,9 +1,15 @@ package daemon -import "github.com/docker/go-metrics" +import ( + "sync" + + "github.com/docker/go-metrics" + "github.com/prometheus/client_golang/prometheus" +) var ( containerActions metrics.LabeledTimer + containerStates metrics.LabeledGauge imageActions metrics.LabeledTimer networkActions metrics.LabeledTimer engineVersion metrics.LabeledGauge 
@@ -11,6 +17,8 @@ var ( engineMemory metrics.Gauge healthChecksCounter metrics.Counter healthChecksFailedCounter metrics.Counter + + stateCtr *stateCounter ) func init() { @@ -25,6 +33,7 @@ func init() { } { containerActions.WithValues(a).Update(0) } + networkActions = ns.NewLabeledTimer("network_actions", "The number of seconds it takes to process each network action", "action") engineVersion = ns.NewLabeledGauge("engine", "The version and commit information for the engine process", metrics.Unit("info"), "version", @@ -38,5 +47,60 @@ func init() { healthChecksCounter = ns.NewCounter("health_checks", "The total number of health checks") healthChecksFailedCounter = ns.NewCounter("health_checks_failed", "The total number of failed health checks") imageActions = ns.NewLabeledTimer("image_actions", "The number of seconds it takes to process each image action", "action") + + stateCtr = newStateCounter(ns.NewDesc("container_states", "The count of containers in various states", metrics.Unit("containers"), "state")) + ns.Add(stateCtr) + metrics.Register(ns) } + +type stateCounter struct { + mu sync.Mutex + states map[string]string + desc *prometheus.Desc +} + +func newStateCounter(desc *prometheus.Desc) *stateCounter { + return &stateCounter{ + states: make(map[string]string), + desc: desc, + } +} + +func (ctr *stateCounter) get() (running int, paused int, stopped int) { + ctr.mu.Lock() + defer ctr.mu.Unlock() + + states := map[string]int{ + "running": 0, + "paused": 0, + "stopped": 0, + } + for _, state := range ctr.states { + states[state]++ + } + return states["running"], states["paused"], states["stopped"] +} + +func (ctr *stateCounter) set(id, label string) { + ctr.mu.Lock() + ctr.states[id] = label + ctr.mu.Unlock() +} + +func (ctr *stateCounter) del(id string) { + ctr.mu.Lock() + delete(ctr.states, id) + ctr.mu.Unlock() +} + +func (ctr *stateCounter) Describe(ch chan<- *prometheus.Desc) { + ch <- ctr.desc +} + +func (ctr *stateCounter) Collect(ch chan<- 
prometheus.Metric) { + running, paused, stopped := ctr.get() + ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(running), "running") + ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(paused), "paused") + ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(stopped), "stopped") +} diff --git a/daemon/monitor.go b/daemon/monitor.go index 9227525e72..b243b74784 100644 --- a/daemon/monitor.go +++ b/daemon/monitor.go @@ -9,10 +9,22 @@ import ( "github.com/Sirupsen/logrus" "github.com/docker/docker/api/types" + "github.com/docker/docker/container" "github.com/docker/docker/libcontainerd" "github.com/docker/docker/restartmanager" ) +func (daemon *Daemon) setStateCounter(c *container.Container) { + switch c.StateString() { + case "paused": + stateCtr.set(c.ID, "paused") + case "running": + stateCtr.set(c.ID, "running") + default: + stateCtr.set(c.ID, "stopped") + } +} + // StateChanged updates daemon state changes from containerd func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error { c := daemon.containers.Get(id) @@ -81,6 +93,8 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error { }() } + daemon.setStateCounter(c) + defer c.Unlock() if err := c.ToDisk(); err != nil { return err @@ -109,15 +123,19 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error { c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart) c.HasBeenManuallyStopped = false c.HasBeenStartedBefore = true + daemon.setStateCounter(c) + if err := c.ToDisk(); err != nil { c.Reset(false) return err } daemon.initHealthMonitor(c) + daemon.LogContainerEvent(c, "start") case libcontainerd.StatePause: // Container is already locked in this case c.Paused = true + daemon.setStateCounter(c) if err := c.ToDisk(); err != nil { return err } @@ -126,12 +144,12 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error { case 
libcontainerd.StateResume: // Container is already locked in this case c.Paused = false + daemon.setStateCounter(c) if err := c.ToDisk(); err != nil { return err } daemon.updateHealthMonitor(c) daemon.LogContainerEvent(c, "unpause") } - return nil } diff --git a/vendor.conf b/vendor.conf index 9632ed26aa..ed3c2a5fca 100644 --- a/vendor.conf +++ b/vendor.conf @@ -133,7 +133,7 @@ github.com/flynn-archive/go-shlex 3f9db97f856818214da2e1057f8ad84803971cff github.com/Nvveen/Gotty a8b993ba6abdb0e0c12b0125c603323a71c7790c https://github.com/ijc25/Gotty # metrics -github.com/docker/go-metrics 86138d05f285fd9737a99bee2d9be30866b59d72 +github.com/docker/go-metrics 8fd5772bf1584597834c6f7961a530f06cbfbb87 # composefile github.com/mitchellh/mapstructure f3009df150dadf309fdee4a54ed65c124afad715 diff --git a/vendor/github.com/docker/go-metrics/README.md b/vendor/github.com/docker/go-metrics/README.md index 7407f34ce8..fdf7fb746f 100644 --- a/vendor/github.com/docker/go-metrics/README.md +++ b/vendor/github.com/docker/go-metrics/README.md @@ -2,10 +2,67 @@ This package is small wrapper around the prometheus go client to help enforce convention and best practices for metrics collection in Docker projects. -## Status +## Best Practices -This project is a work in progress. -It is under heavy development and not intended to be used. +This package is meant to be used for collecting metrics in Docker projects. +It is not meant to be used as a replacement for the prometheus client but to help enforce consistent naming across metrics collected. +If you have not already read the prometheus best practices around naming and labels you can read the page [here](https://prometheus.io/docs/practices/naming/). + +The following are a few Docker-specific rules that will help you name and work with metrics in your project. + +1. Namespace and Subsystem + +This package provides you with a namespace type that allows you to specify the same namespace and subsystem for your metrics. 
+ +```go +ns := metrics.NewNamespace("engine", "daemon", metrics.Labels{ + "version": dockerversion.Version, + "commit": dockerversion.GitCommit, +}) +``` + +In the example above we are creating metrics for the Docker engine's daemon package. +`engine` would be the namespace in this example where `daemon` is the subsystem or package where we are collecting the metrics. + +A namespace also allows you to attach constant labels to the metrics such as the git commit and version that it is collecting. + +2. Declaring your Metrics + +Try to keep all your metric declarations in one file. +This makes it easy for others to see what constant labels are defined on the namespace and what labels are defined on the metrics when they are created. + +3. Use labels instead of multiple metrics + +Labels allow you to define one metric such as the time it takes to perform a certain action on an object. +If we wanted to collect timings on various container actions such as create, start, and delete then we can define one metric called `container_actions` and use labels to specify the type of action. + + +```go +containerActions = ns.NewLabeledTimer("container_actions", "The number of seconds it takes to process each container action", "action") +``` + +The last parameter is the label name or key. +When adding a data point to the metric you will use the `WithValues` function to specify the `action` that you are collecting for. + +```go +containerActions.WithValues("create").UpdateSince(start) +``` + +4. Always use a unit + +The metric name should describe what you are measuring but you also need to provide the unit that it is being measured with. +For a timer, the standard unit is seconds and a counter's standard unit is a total. +For gauges you must provide the unit. +This package provides a standard set of units for use within the Docker projects. 
+ +```go +Nanoseconds Unit = "nanoseconds" +Seconds Unit = "seconds" +Bytes Unit = "bytes" +Total Unit = "total" +``` + +If you need to use a unit but it is not defined in the package please open a PR to add it but first try to see if one of the already created units will work for your metric, i.e. seconds or nanoseconds vs adding milliseconds. ## Docs diff --git a/vendor/github.com/docker/go-metrics/namespace.go b/vendor/github.com/docker/go-metrics/namespace.go index f49d88266d..27dab786df 100644 --- a/vendor/github.com/docker/go-metrics/namespace.go +++ b/vendor/github.com/docker/go-metrics/namespace.go @@ -40,21 +40,25 @@ type Namespace struct { // Only metrics created with the returned namespace will get the new constant // labels. The returned namespace must be registered separately. func (n *Namespace) WithConstLabels(labels Labels) *Namespace { - ns := *n - ns.metrics = nil // blank this out - ns.labels = mergeLabels(ns.labels, labels) - return &ns + n.mu.Lock() + ns := &Namespace{ + name: n.name, + subsystem: n.subsystem, + labels: mergeLabels(n.labels, labels), + } + n.mu.Unlock() + return ns } func (n *Namespace) NewCounter(name, help string) Counter { c := &counter{pc: prometheus.NewCounter(n.newCounterOpts(name, help))} - n.addMetric(c) + n.Add(c) return c } func (n *Namespace) NewLabeledCounter(name, help string, labels ...string) LabeledCounter { c := &labeledCounter{pc: prometheus.NewCounterVec(n.newCounterOpts(name, help), labels)} - n.addMetric(c) + n.Add(c) return c } @@ -72,7 +76,7 @@ func (n *Namespace) NewTimer(name, help string) Timer { t := &timer{ m: prometheus.NewHistogram(n.newTimerOpts(name, help)), } - n.addMetric(t) + n.Add(t) return t } @@ -80,7 +84,7 @@ func (n *Namespace) NewLabeledTimer(name, help string, labels ...string) Labeled t := &labeledTimer{ m: prometheus.NewHistogramVec(n.newTimerOpts(name, help), labels), } - n.addMetric(t) + n.Add(t) return t } @@ -98,7 +102,7 @@ func (n *Namespace) NewGauge(name, help string, unit 
Unit) Gauge { g := &gauge{ pg: prometheus.NewGauge(n.newGaugeOpts(name, help, unit)), } - n.addMetric(g) + n.Add(g) return g } @@ -106,7 +110,7 @@ func (n *Namespace) NewLabeledGauge(name, help string, unit Unit, labels ...stri g := &labeledGauge{ pg: prometheus.NewGaugeVec(n.newGaugeOpts(name, help, unit), labels), } - n.addMetric(g) + n.Add(g) return g } @@ -138,12 +142,24 @@ func (n *Namespace) Collect(ch chan<- prometheus.Metric) { } } -func (n *Namespace) addMetric(collector prometheus.Collector) { +func (n *Namespace) Add(collector prometheus.Collector) { n.mu.Lock() n.metrics = append(n.metrics, collector) n.mu.Unlock() } +func (n *Namespace) NewDesc(name, help string, unit Unit, labels ...string) *prometheus.Desc { + if string(unit) != "" { + name = fmt.Sprintf("%s_%s", name, unit) + } + namespace := n.name + if n.subsystem != "" { + namespace = fmt.Sprintf("%s_%s", namespace, n.subsystem) + } + name = fmt.Sprintf("%s_%s", namespace, name) + return prometheus.NewDesc(name, help, labels, prometheus.Labels(n.labels)) +} + // mergeLabels merges two or more labels objects into a single map, favoring // the later labels. func mergeLabels(lbs ...Labels) Labels {