From fd0e24b7189374e0fe7c55b6d26ee916d3ee1655 Mon Sep 17 00:00:00 2001 From: Stephen J Day Date: Wed, 7 Mar 2018 13:20:21 -0800 Subject: [PATCH] daemon/stats: more resilient cpu sampling To avoid noise in sampling CPU usage metrics, we now sample the system usage closer to the actual response from the underlying runtime. Because the response from the runtime may be delayed, this makes the sampling more resilient under loaded conditions. In addition to this, we also replace the tick with a sleep to avoid situations where ticks can back up under loaded conditions. The trade-off here is slightly more load from reading the system CPU usage for each container. There may be an optimization required for large numbers of containers, but the cost is on the order of 15 ms per 1000 containers. If this becomes a problem, we can time-slot the sampling, but the complexity may not be worth it unless we can test further. Unfortunately, there aren't really any good tests for this condition. Triggering this behavior is highly system-dependent. As a matter of course, we should qualify the fix with the users that are affected. 
Signed-off-by: Stephen J Day --- daemon/stats/collector.go | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/daemon/stats/collector.go b/daemon/stats/collector.go index 39c76128b0..24d41a3d2e 100644 --- a/daemon/stats/collector.go +++ b/daemon/stats/collector.go @@ -57,7 +57,7 @@ func (s *Collector) Run() { // it will grow enough in first iteration var pairs []publishersPair - for range time.Tick(s.interval) { + for { // it does not make sense in the first iteration, // but saves allocations in further iterations pairs = pairs[:0] @@ -72,12 +72,6 @@ func (s *Collector) Run() { continue } - systemUsage, err := s.getSystemCPUUsage() - if err != nil { - logrus.Errorf("collecting system cpu usage: %v", err) - continue - } - onlineCPUs, err := s.getNumberOnlineCPUs() if err != nil { logrus.Errorf("collecting system online cpu count: %v", err) @@ -89,6 +83,14 @@ func (s *Collector) Run() { switch err.(type) { case nil: + // Sample system CPU usage close to container usage to avoid + // noise in metric calculations. + systemUsage, err := s.getSystemCPUUsage() + if err != nil { + logrus.WithError(err).WithField("container_id", pair.container.ID).Errorf("collecting system cpu usage") + continue + } + // FIXME: move to containerd on Linux (not Windows) stats.CPUStats.SystemUsage = systemUsage stats.CPUStats.OnlineCPUs = onlineCPUs @@ -106,6 +108,8 @@ func (s *Collector) Run() { logrus.Errorf("collecting stats for %s: %v", pair.container.ID, err) } } + + time.Sleep(s.interval) } }