Use counter for tracking container states

Container state counts are used for reporting in the `/info` endpoint.
Currently when `/info` is called, each container is iterated over and
the container's `StateString()` is called. This is not very efficient
with lots of containers, and is also racy since `StateString()` is not
using a mutex and the mutex is not otherwise locked.

We could just lock the container mutex, but this has proven to be
problematic since there are frequent deadlock scenarios and we should
always have the `/info` endpoint available since this endpoint is used
to get general information about the docker host.

Really, these metrics on `/info` should be deprecated. But until then,
we can just keep a running tally in memory for each of the reported
states.

Signed-off-by: Brian Goff <cpuguy83@gmail.com>
This commit is contained in:
Brian Goff 2017-02-09 21:57:35 -05:00
parent 52bded9868
commit e4c03623c2
9 changed files with 176 additions and 31 deletions

View File

@ -151,6 +151,7 @@ func (daemon *Daemon) create(params types.ContainerCreateConfig, managed bool) (
return nil, err
}
daemon.Register(container)
stateCtr.set(container.ID, "stopped")
daemon.LogContainerEvent(container, "create")
return container, nil
}

View File

@ -198,6 +198,7 @@ func (daemon *Daemon) restore() error {
if err := backportMountSpec(c); err != nil {
logrus.Error("Failed to migrate old mounts to use new spec format")
}
daemon.setStateCounter(c)
if c.IsRunning() || c.IsPaused() {
c.RestartManager().Cancel() // manually start containers because some need to wait for swarm networking

View File

@ -124,6 +124,7 @@ func (daemon *Daemon) cleanupContainer(container *container.Container, forceRemo
logrus.Error(e)
}
daemon.LogContainerEvent(container, "destroy")
stateCtr.del(container.ID)
}
}()

View File

@ -4,14 +4,12 @@ import (
"fmt"
"os"
"runtime"
"sync/atomic"
"time"
"github.com/Sirupsen/logrus"
"github.com/docker/docker/api"
"github.com/docker/docker/api/types"
"github.com/docker/docker/cli/debug"
"github.com/docker/docker/container"
"github.com/docker/docker/daemon/logger"
"github.com/docker/docker/dockerversion"
"github.com/docker/docker/pkg/fileutils"
@ -58,18 +56,7 @@ func (daemon *Daemon) SystemInfo() (*types.Info, error) {
}
sysInfo := sysinfo.New(true)
var cRunning, cPaused, cStopped int32
daemon.containers.ApplyAll(func(c *container.Container) {
switch c.StateString() {
case "paused":
atomic.AddInt32(&cPaused, 1)
case "running":
atomic.AddInt32(&cRunning, 1)
default:
atomic.AddInt32(&cStopped, 1)
}
})
cRunning, cPaused, cStopped := stateCtr.get()
securityOptions := []string{}
if sysInfo.AppArmor {

View File

@ -1,9 +1,15 @@
package daemon
import "github.com/docker/go-metrics"
import (
"sync"
"github.com/docker/go-metrics"
"github.com/prometheus/client_golang/prometheus"
)
var (
containerActions metrics.LabeledTimer
containerStates metrics.LabeledGauge
imageActions metrics.LabeledTimer
networkActions metrics.LabeledTimer
engineVersion metrics.LabeledGauge
@ -11,6 +17,8 @@ var (
engineMemory metrics.Gauge
healthChecksCounter metrics.Counter
healthChecksFailedCounter metrics.Counter
stateCtr *stateCounter
)
func init() {
@ -25,6 +33,7 @@ func init() {
} {
containerActions.WithValues(a).Update(0)
}
networkActions = ns.NewLabeledTimer("network_actions", "The number of seconds it takes to process each network action", "action")
engineVersion = ns.NewLabeledGauge("engine", "The version and commit information for the engine process", metrics.Unit("info"),
"version",
@ -38,5 +47,60 @@ func init() {
healthChecksCounter = ns.NewCounter("health_checks", "The total number of health checks")
healthChecksFailedCounter = ns.NewCounter("health_checks_failed", "The total number of failed health checks")
imageActions = ns.NewLabeledTimer("image_actions", "The number of seconds it takes to process each image action", "action")
stateCtr = newStateCounter(ns.NewDesc("container_states", "The count of containers in various states", metrics.Unit("containers"), "state"))
ns.Add(stateCtr)
metrics.Register(ns)
}
// stateCounter keeps a per-container record of the last state label set for
// it, so that totals per state can be computed from this record alone.
// Its Describe and Collect methods allow it to be added to a metrics
// namespace as a collector.
type stateCounter struct {
// mu guards states.
mu sync.Mutex
// states maps a container ID to its most recently recorded state label.
states map[string]string
// desc is the descriptor sent by Describe and used for the metrics emitted by Collect.
desc *prometheus.Desc
}
// newStateCounter returns a stateCounter with an empty, ready-to-use state
// map that reports its metrics under the given descriptor.
func newStateCounter(desc *prometheus.Desc) *stateCounter {
	ctr := &stateCounter{desc: desc}
	ctr.states = make(map[string]string)
	return ctr
}
// get tallies the recorded container states and returns how many containers
// are currently labelled "running", "paused" and "stopped", in that order.
// Entries carrying any other label are not included in any of the totals.
func (ctr *stateCounter) get() (running int, paused int, stopped int) {
	ctr.mu.Lock()
	defer ctr.mu.Unlock()

	for _, label := range ctr.states {
		switch label {
		case "running":
			running++
		case "paused":
			paused++
		case "stopped":
			stopped++
		}
	}
	return running, paused, stopped
}
// set records label as the current state of the container identified by id,
// replacing any label previously stored for that id.
func (ctr *stateCounter) set(id, label string) {
	ctr.mu.Lock()
	defer ctr.mu.Unlock()

	ctr.states[id] = label
}
// del removes the state entry stored for the container identified by id.
// Deleting an id that has no entry is a no-op.
func (ctr *stateCounter) del(id string) {
	ctr.mu.Lock()
	defer ctr.mu.Unlock()

	delete(ctr.states, id)
}
// Describe sends the counter's single metric descriptor on ch.
func (ctr *stateCounter) Describe(ch chan<- *prometheus.Desc) {
	desc := ctr.desc
	ch <- desc
}
// Collect takes a snapshot of the per-state totals and sends one gauge metric
// per state on ch, labelled "running", "paused" and "stopped", in that order.
func (ctr *stateCounter) Collect(ch chan<- prometheus.Metric) {
	running, paused, stopped := ctr.get()

	samples := []struct {
		label string
		count int
	}{
		{"running", running},
		{"paused", paused},
		{"stopped", stopped},
	}
	for _, s := range samples {
		ch <- prometheus.MustNewConstMetric(ctr.desc, prometheus.GaugeValue, float64(s.count), s.label)
	}
}

View File

@ -9,10 +9,22 @@ import (
"github.com/Sirupsen/logrus"
"github.com/docker/docker/api/types"
"github.com/docker/docker/container"
"github.com/docker/docker/libcontainerd"
"github.com/docker/docker/restartmanager"
)
// setStateCounter records the container's current state in the package-level
// state counter. The container's state string is stored as-is when it is
// "paused" or "running"; every other state string is recorded as "stopped".
func (daemon *Daemon) setStateCounter(c *container.Container) {
	label := "stopped"
	switch state := c.StateString(); state {
	case "paused", "running":
		label = state
	}
	stateCtr.set(c.ID, label)
}
// StateChanged updates daemon state changes from containerd
func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
c := daemon.containers.Get(id)
@ -81,6 +93,8 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
}()
}
daemon.setStateCounter(c)
defer c.Unlock()
if err := c.ToDisk(); err != nil {
return err
@ -109,15 +123,19 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
c.SetRunning(int(e.Pid), e.State == libcontainerd.StateStart)
c.HasBeenManuallyStopped = false
c.HasBeenStartedBefore = true
daemon.setStateCounter(c)
if err := c.ToDisk(); err != nil {
c.Reset(false)
return err
}
daemon.initHealthMonitor(c)
daemon.LogContainerEvent(c, "start")
case libcontainerd.StatePause:
// Container is already locked in this case
c.Paused = true
daemon.setStateCounter(c)
if err := c.ToDisk(); err != nil {
return err
}
@ -126,12 +144,12 @@ func (daemon *Daemon) StateChanged(id string, e libcontainerd.StateInfo) error {
case libcontainerd.StateResume:
// Container is already locked in this case
c.Paused = false
daemon.setStateCounter(c)
if err := c.ToDisk(); err != nil {
return err
}
daemon.updateHealthMonitor(c)
daemon.LogContainerEvent(c, "unpause")
}
return nil
}

View File

@ -133,7 +133,7 @@ github.com/flynn-archive/go-shlex 3f9db97f856818214da2e1057f8ad84803971cff
github.com/Nvveen/Gotty a8b993ba6abdb0e0c12b0125c603323a71c7790c https://github.com/ijc25/Gotty
# metrics
github.com/docker/go-metrics 86138d05f285fd9737a99bee2d9be30866b59d72
github.com/docker/go-metrics 8fd5772bf1584597834c6f7961a530f06cbfbb87
# composefile
github.com/mitchellh/mapstructure f3009df150dadf309fdee4a54ed65c124afad715

View File

@ -2,10 +2,67 @@
This package is a small wrapper around the prometheus go client to help enforce convention and best practices for metrics collection in Docker projects.
## Status
## Best Practices
This project is a work in progress.
It is under heavy development and not intended to be used.
This package is meant to be used for collecting metrics in Docker projects.
It is not meant to be used as a replacement for the prometheus client but to help enforce consistent naming across metrics collected.
If you have not already read the prometheus best practices around naming and labels you can read the page [here](https://prometheus.io/docs/practices/naming/).
The following are a few Docker specific rules that will help you name and work with metrics in your project.
1. Namespace and Subsystem
This package provides you with a namespace type that allows you to specify the same namespace and subsystem for your metrics.
```go
ns := metrics.NewNamespace("engine", "daemon", metrics.Labels{
"version": dockerversion.Version,
"commit": dockerversion.GitCommit,
})
```
In the example above we are creating metrics for the Docker engine's daemon package.
`engine` would be the namespace in this example where `daemon` is the subsystem or package where we are collecting the metrics.
A namespace also allows you to attach constant labels to the metrics such as the git commit and version that it is collecting.
2. Declaring your Metrics
Try to keep all your metric declarations in one file.
This makes it easy for others to see what constant labels are defined on the namespace and what labels are defined on the metrics when they are created.
3. Use labels instead of multiple metrics
Labels allow you to define one metric such as the time it takes to perform a certain action on an object.
If we wanted to collect timings on various container actions such as create, start, and delete then we can define one metric called `container_actions` and use labels to specify the type of action.
```go
containerActions = ns.NewLabeledTimer("container_actions", "The number of seconds it takes to process each container action", "action")
```
The last parameter is the label name or key.
When adding a data point to the metric you will use the `WithValues` function to specify the `action` that you are collecting for.
```go
containerActions.WithValues("create").UpdateSince(start)
```
4. Always use a unit
The metric name should describe what you are measuring but you also need to provide the unit that it is being measured with.
For a timer, the standard unit is seconds and a counter's standard unit is a total.
For gauges you must provide the unit.
This package provides a standard set of units for use within the Docker projects.
```go
Nanoseconds Unit = "nanoseconds"
Seconds Unit = "seconds"
Bytes Unit = "bytes"
Total Unit = "total"
```
If you need a unit that is not defined in the package, please open a PR to add it. First, though, check whether one of the existing units will work for your metric, e.g. seconds or nanoseconds instead of adding milliseconds.
## Docs

View File

@ -40,21 +40,25 @@ type Namespace struct {
// Only metrics created with the returned namespace will get the new constant
// labels. The returned namespace must be registered separately.
func (n *Namespace) WithConstLabels(labels Labels) *Namespace {
ns := *n
ns.metrics = nil // blank this out
ns.labels = mergeLabels(ns.labels, labels)
return &ns
// Hold the receiver's lock while its fields are read into the new namespace.
n.mu.Lock()
// Only name, subsystem and the merged labels are carried over; every other
// field of the new Namespace is left at its zero value.
ns := &Namespace{
name: n.name,
subsystem: n.subsystem,
labels: mergeLabels(n.labels, labels),
}
n.mu.Unlock()
return ns
}
// NewCounter builds a counter from the namespace's counter options for name
// and help, adds it to the namespace's collectors, and returns it.
func (n *Namespace) NewCounter(name, help string) Counter {
c := &counter{pc: prometheus.NewCounter(n.newCounterOpts(name, help))}
n.addMetric(c)
n.Add(c)
return c
}
// NewLabeledCounter builds a counter vector from the namespace's counter
// options for name and help, using labels as its label names, adds it to the
// namespace's collectors, and returns it.
func (n *Namespace) NewLabeledCounter(name, help string, labels ...string) LabeledCounter {
c := &labeledCounter{pc: prometheus.NewCounterVec(n.newCounterOpts(name, help), labels)}
n.addMetric(c)
n.Add(c)
return c
}
@ -72,7 +76,7 @@ func (n *Namespace) NewTimer(name, help string) Timer {
t := &timer{
m: prometheus.NewHistogram(n.newTimerOpts(name, help)),
}
n.addMetric(t)
n.Add(t)
return t
}
@ -80,7 +84,7 @@ func (n *Namespace) NewLabeledTimer(name, help string, labels ...string) Labeled
t := &labeledTimer{
m: prometheus.NewHistogramVec(n.newTimerOpts(name, help), labels),
}
n.addMetric(t)
n.Add(t)
return t
}
@ -98,7 +102,7 @@ func (n *Namespace) NewGauge(name, help string, unit Unit) Gauge {
g := &gauge{
pg: prometheus.NewGauge(n.newGaugeOpts(name, help, unit)),
}
n.addMetric(g)
n.Add(g)
return g
}
@ -106,7 +110,7 @@ func (n *Namespace) NewLabeledGauge(name, help string, unit Unit, labels ...stri
g := &labeledGauge{
pg: prometheus.NewGaugeVec(n.newGaugeOpts(name, help, unit), labels),
}
n.addMetric(g)
n.Add(g)
return g
}
@ -138,12 +142,24 @@ func (n *Namespace) Collect(ch chan<- prometheus.Metric) {
}
}
func (n *Namespace) addMetric(collector prometheus.Collector) {
// Add appends collector to the namespace's list of metrics while holding the
// namespace lock.
func (n *Namespace) Add(collector prometheus.Collector) {
	n.mu.Lock()
	defer n.mu.Unlock()

	n.metrics = append(n.metrics, collector)
}
// NewDesc returns a prometheus descriptor whose name is built as
// <namespace>[_<subsystem>]_<name>[_<unit>]: the subsystem part is omitted
// when the namespace has no subsystem, and the unit suffix is omitted when
// unit is empty. The namespace's labels are attached as constant labels and
// labels become the descriptor's variable label names.
func (n *Namespace) NewDesc(name, help string, unit Unit, labels ...string) *prometheus.Desc {
	prefix := n.name
	if n.subsystem != "" {
		prefix = fmt.Sprintf("%s_%s", prefix, n.subsystem)
	}

	metric := name
	if string(unit) != "" {
		metric = fmt.Sprintf("%s_%s", metric, unit)
	}

	fqName := fmt.Sprintf("%s_%s", prefix, metric)
	return prometheus.NewDesc(fqName, help, labels, prometheus.Labels(n.labels))
}
// mergeLabels merges two or more labels objects into a single map, favoring
// the later labels.
func mergeLabels(lbs ...Labels) Labels {