From 115f91d7575d6de6c7781a96a082f144fd17e400 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 6 Mar 2017 17:29:09 +0000 Subject: [PATCH] Correct CPU usage calculation in presence of offline CPUs and newer Linux In https://github.com/torvalds/linux/commit/5ca3726 (released in v4.7-rc1) the content of the `cpuacct.usage_percpu` file in sysfs was changed to include both online and offline cpus. This broke the arithmetic in the stats helpers used by `docker stats`, since it was using the length of the PerCPUUsage array as a proxy for the number of online CPUs. Add current number of online CPUs to types.StatsJSON and use it in the calculation. Keep a fallback to `len(v.CPUStats.CPUUsage.PercpuUsage)` so this code continues to work when talking to an older daemon. An old client talking to a new daemon will ignore the new field and behave as before. Fixes #28941. Signed-off-by: Ian Campbell --- api/swagger.yaml | 6 ++++++ api/types/stats.go | 3 +++ cli/command/container/stats_helpers.go | 6 +++++- daemon/stats/collector.go | 7 +++++++ daemon/stats/collector_unix.go | 13 +++++++++++++ daemon/stats/collector_windows.go | 4 ++++ docs/api/version-history.md | 1 + 7 files changed, 39 insertions(+), 1 deletion(-) diff --git a/api/swagger.yaml b/api/swagger.yaml index 568379f29d..6ca3ffee8b 100644 --- a/api/swagger.yaml +++ b/api/swagger.yaml @@ -3468,6 +3468,10 @@ paths: The `precpu_stats` is the CPU statistic of last read, which is used for calculating the CPU usage percentage. It is not the same as the `cpu_stats` field. + + If either `precpu_stats.online_cpus` or `cpu_stats.online_cpus` is + nil then for compatibility with older daemons the length of the + corresponding `cpu_usage.percpu_usage` array should be used. operationId: "ContainerStats" produces: ["application/json"] responses: @@ -3546,6 +3550,7 @@ paths: total_usage: 100215355 usage_in_kernelmode: 30000000 system_cpu_usage: 739306590000000 + online_cpus: 4 throttling_data: periods: 0 throttled_periods: 0 @@ -3561,6 +3566,7 @@ paths: total_usage: 100093996 usage_in_kernelmode: 30000000 system_cpu_usage: 9492140000000 + online_cpus: 4 throttling_data: periods: 0 throttled_periods: 0 diff --git a/api/types/stats.go b/api/types/stats.go index 9bf1928b8c..7ca76a5b63 100644 --- a/api/types/stats.go +++ b/api/types/stats.go @@ -47,6 +47,9 @@ type CPUStats struct { // System Usage. Linux only. SystemUsage uint64 `json:"system_cpu_usage,omitempty"` + // Online CPUs. Linux only. + OnlineCPUs uint32 `json:"online_cpus,omitempty"` + // Throttling Data. Linux only. ThrottlingData ThrottlingData `json:"throttling_data,omitempty"` } diff --git a/cli/command/container/stats_helpers.go b/cli/command/container/stats_helpers.go index 5fad740435..3dc939a137 100644 --- a/cli/command/container/stats_helpers.go +++ b/cli/command/container/stats_helpers.go @@ -178,10 +178,14 @@ func calculateCPUPercentUnix(previousCPU, previousSystem uint64, v *types.StatsJ cpuDelta = float64(v.CPUStats.CPUUsage.TotalUsage) - float64(previousCPU) // calculate the change for the entire system between readings systemDelta = float64(v.CPUStats.SystemUsage) - float64(previousSystem) + onlineCPUs = float64(v.CPUStats.OnlineCPUs) ) + if onlineCPUs == 0.0 { + onlineCPUs = float64(len(v.CPUStats.CPUUsage.PercpuUsage)) + } if systemDelta > 0.0 && cpuDelta > 0.0 { - cpuPercent = (cpuDelta / systemDelta) * float64(len(v.CPUStats.CPUUsage.PercpuUsage)) * 100.0 + cpuPercent = (cpuDelta / systemDelta) * onlineCPUs * 100.0 } return cpuPercent } diff --git a/daemon/stats/collector.go b/daemon/stats/collector.go index 4627154024..71bcff446e 100644 --- a/daemon/stats/collector.go +++ b/daemon/stats/collector.go @@ -80,6 +80,12 @@ func (s *Collector) Run() { continue } + onlineCPUs, err := s.getNumberOnlineCPUs() + if err != nil { + logrus.Errorf("collecting system online cpu count: %v", err) + continue + } + for _, pair := range pairs { stats, err := s.supervisor.GetContainerStats(pair.container) if err != nil { @@ -97,6 +103,7 @@ func (s *Collector) Run() { } // FIXME: move to containerd on Linux (not Windows) stats.CPUStats.SystemUsage = systemUsage + stats.CPUStats.OnlineCPUs = onlineCPUs pair.publisher.Publish(*stats) } diff --git a/daemon/stats/collector_unix.go b/daemon/stats/collector_unix.go index 611df1ed69..5ad9658690 100644 --- a/daemon/stats/collector_unix.go +++ b/daemon/stats/collector_unix.go @@ -11,6 +11,11 @@ import ( "github.com/opencontainers/runc/libcontainer/system" ) +/* +#include +*/ +import "C" + // platformNewStatsCollector performs platform specific initialisation of the // Collector structure. func platformNewStatsCollector(s *Collector) { @@ -64,3 +69,11 @@ func (s *Collector) getSystemCPUUsage() (uint64, error) { } return 0, fmt.Errorf("invalid stat format. Error trying to parse the '/proc/stat' file") } + +func (s *Collector) getNumberOnlineCPUs() (uint32, error) { + i, err := C.sysconf(C._SC_NPROCESSORS_ONLN) + if err != nil { + return 0, err + } + return uint32(i), nil +} diff --git a/daemon/stats/collector_windows.go b/daemon/stats/collector_windows.go index e51495f048..5fb27ced66 100644 --- a/daemon/stats/collector_windows.go +++ b/daemon/stats/collector_windows.go @@ -13,3 +13,7 @@ func platformNewStatsCollector(s *Collector) { func (s *Collector) getSystemCPUUsage() (uint64, error) { return 0, nil } + +func (s *Collector) getNumberOnlineCPUs() (uint32, error) { + return 0, nil +} diff --git a/docs/api/version-history.md b/docs/api/version-history.md index 541111c14a..5d8bb15b98 100644 --- a/docs/api/version-history.md +++ b/docs/api/version-history.md @@ -24,6 +24,7 @@ keywords: "API, Docker, rcli, REST, documentation" * `POST /build` now accepts `extrahosts` parameter to specify a host to ip mapping to use during the build. * `POST /services/create` and `POST /services/(id or name)/update` now accept a `rollback` value for `FailureAction`. * `POST /services/create` and `POST /services/(id or name)/update` now accept an optional `RollbackConfig` object which specifies rollback options. +* `GET /containers/(id or name)/stats` now includes an `online_cpus` field in both `precpu_stats` and `cpu_stats`. If this field is `nil` then for compatibility with older daemons the length of the corresponding `cpu_usage.percpu_usage` array should be used. ## v1.26 API changes