Mirror of https://github.com/moby/moby.git

daemon: switch to semaphore-gated WaitGroup for startup tasks

Many startup tasks have to run for each container, and using a bare
WaitGroup (which places no limit on the number of parallel tasks) means
Docker can trivially exceed the NOFILE limit. A better solution is to
cap the parallelism with a semaphore.

In addition, several startup tasks were previously not parallelised,
which resulted in very long startup times. In my testing, 20K dead
containers resulted in ~6 minute startup times (during which Docker is
completely unusable).

This patch fixes both issues. The parallelism factor chosen
(128 * NumCPU) is based on my own significant testing of the 20K
container case; on my machines it reduces the startup time from 6
minutes to less than a minute (ideally this could be reduced further by
removing the need to scan all dead containers on startup -- but that's
beyond the scope of this patchset).

To avoid the NOFILE limit problem, we also check the limit at startup:
if NOFILE < 2*128*NumCPU we reduce the parallelism factor accordingly,
and emit a warning, since this is almost certainly a misconfiguration.

Signed-off-by: Aleksa Sarai <asarai@suse.de>
This commit is contained in:
Aleksa Sarai 2018-12-05 03:44:45 +11:00
parent 1895e082b6
commit 5a52917e4d
3 changed files with 179 additions and 73 deletions
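All of the per-container startup loops touched by this commit follow the same shape: a sync.WaitGroup still tracks completion, while a weighted semaphore from golang.org/x/sync/semaphore caps how many goroutines do real work at once. The following is a minimal, self-contained sketch of that pattern, not daemon code -- the loadOne task and the container count are made up for illustration:

package main

import (
	"context"
	"fmt"
	"runtime"
	"sync"

	"golang.org/x/sync/semaphore"
)

// loadOne stands in for a per-container startup task (e.g. loading state
// from disk), which is where file descriptors actually get opened.
func loadOne(id int) {
	fmt.Println("loaded container", id)
}

func main() {
	// Cap the number of goroutines doing work at once; the WaitGroup alone
	// only waits for completion, it never throttles.
	parallelLimit := 128 * runtime.NumCPU()

	var group sync.WaitGroup
	sem := semaphore.NewWeighted(int64(parallelLimit))

	for id := 0; id < 20000; id++ {
		group.Add(1)
		go func(id int) {
			defer group.Done()
			// Acquire with context.Background() blocks until a slot is free
			// and never returns an error here, matching the patch's style.
			_ = sem.Acquire(context.Background(), 1)
			defer sem.Release(1)

			loadOne(id)
		}(id)
	}
	group.Wait()
}

Acquire is taken before any real work and released on return, so at most parallelLimit tasks hold file descriptors at the same time even though all 20K goroutines are spawned up front.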

View file

@@ -67,6 +67,7 @@ import (
 	"github.com/docker/libnetwork/cluster"
 	nwconfig "github.com/docker/libnetwork/config"
 	"github.com/pkg/errors"
+	"golang.org/x/sync/semaphore"
 )
 
 // ContainersNamespace is the name of the namespace used for users containers
@@ -198,6 +199,7 @@ func (daemon *Daemon) NewResolveOptionsFunc() resolver.ResolveOptionsFunc {
 }
 
 func (daemon *Daemon) restore() error {
+	var mapLock sync.Mutex
 	containers := make(map[string]*container.Container)
 
 	logrus.Info("Loading containers: start.")
@@ -207,68 +209,99 @@ func (daemon *Daemon) restore() error {
 		return err
 	}
 
-	for _, v := range dir {
-		id := v.Name()
-		container, err := daemon.load(id)
-		if err != nil {
-			logrus.Errorf("Failed to load container %v: %v", id, err)
-			continue
-		}
-		if !system.IsOSSupported(container.OS) {
-			logrus.Errorf("Failed to load container %v: %s (%q)", id, system.ErrNotSupportedOperatingSystem, container.OS)
-			continue
-		}
-		// Ignore the container if it does not support the current driver being used by the graph
-		currentDriverForContainerOS := daemon.graphDrivers[container.OS]
-		if (container.Driver == "" && currentDriverForContainerOS == "aufs") || container.Driver == currentDriverForContainerOS {
-			rwlayer, err := daemon.imageService.GetLayerByID(container.ID, container.OS)
-			if err != nil {
-				logrus.Errorf("Failed to load container mount %v: %v", id, err)
-				continue
-			}
-			container.RWLayer = rwlayer
-			logrus.Debugf("Loaded container %v, isRunning: %v", container.ID, container.IsRunning())
-			containers[container.ID] = container
-		} else {
-			logrus.Debugf("Cannot load container %s because it was created with another graph driver.", container.ID)
-		}
+	// parallelLimit is the maximum number of parallel startup jobs that we
+	// allow (this is the limit used for all startup semaphores). The multiplier
+	// (128) was chosen after some fairly significant benchmarking -- don't change
+	// it unless you've tested it significantly (this value is adjusted if
+	// RLIMIT_NOFILE is small to avoid EMFILE).
+	parallelLimit := adjustParallelLimit(len(dir), 128*runtime.NumCPU())
+
+	// Re-used for all parallel startup jobs.
+	var group sync.WaitGroup
+	sem := semaphore.NewWeighted(int64(parallelLimit))
+
+	for _, v := range dir {
+		group.Add(1)
+		go func(id string) {
+			defer group.Done()
+			_ = sem.Acquire(context.Background(), 1)
+			defer sem.Release(1)
+
+			container, err := daemon.load(id)
+			if err != nil {
+				logrus.Errorf("Failed to load container %v: %v", id, err)
+				return
+			}
+			if !system.IsOSSupported(container.OS) {
+				logrus.Errorf("Failed to load container %v: %s (%q)", id, system.ErrNotSupportedOperatingSystem, container.OS)
+				return
+			}
+
+			// Ignore the container if it does not support the current driver being used by the graph
+			currentDriverForContainerOS := daemon.graphDrivers[container.OS]
+			if (container.Driver == "" && currentDriverForContainerOS == "aufs") || container.Driver == currentDriverForContainerOS {
+				rwlayer, err := daemon.imageService.GetLayerByID(container.ID, container.OS)
+				if err != nil {
+					logrus.Errorf("Failed to load container mount %v: %v", id, err)
+					return
+				}
+				container.RWLayer = rwlayer
+				logrus.Debugf("Loaded container %v, isRunning: %v", container.ID, container.IsRunning())
+
+				mapLock.Lock()
+				containers[container.ID] = container
+				mapLock.Unlock()
+			} else {
+				logrus.Debugf("Cannot load container %s because it was created with another graph driver.", container.ID)
+			}
+		}(v.Name())
 	}
+	group.Wait()
 
 	removeContainers := make(map[string]*container.Container)
 	restartContainers := make(map[*container.Container]chan struct{})
 	activeSandboxes := make(map[string]interface{})
 	for id, c := range containers {
-		if err := daemon.registerName(c); err != nil {
-			logrus.Errorf("Failed to register container name %s: %s", c.ID, err)
-			delete(containers, id)
-			continue
-		}
-		if err := daemon.Register(c); err != nil {
-			logrus.Errorf("Failed to register container %s: %s", c.ID, err)
-			delete(containers, id)
-			continue
-		}
-		// The LogConfig.Type is empty if the container was created before docker 1.12 with default log driver.
-		// We should rewrite it to use the daemon defaults.
-		// Fixes https://github.com/docker/docker/issues/22536
-		if c.HostConfig.LogConfig.Type == "" {
-			if err := daemon.mergeAndVerifyLogConfig(&c.HostConfig.LogConfig); err != nil {
-				logrus.Errorf("Failed to verify log config for container %s: %q", c.ID, err)
-				continue
-			}
-		}
-	}
-
-	var (
-		wg      sync.WaitGroup
-		mapLock sync.Mutex
-	)
-	for _, c := range containers {
-		wg.Add(1)
+		group.Add(1)
 		go func(c *container.Container) {
-			defer wg.Done()
+			defer group.Done()
+			_ = sem.Acquire(context.Background(), 1)
+			defer sem.Release(1)
+
+			if err := daemon.registerName(c); err != nil {
+				logrus.Errorf("Failed to register container name %s: %s", c.ID, err)
+				mapLock.Lock()
+				delete(containers, id)
+				mapLock.Unlock()
+				return
+			}
+			if err := daemon.Register(c); err != nil {
+				logrus.Errorf("Failed to register container %s: %s", c.ID, err)
+				mapLock.Lock()
+				delete(containers, id)
+				mapLock.Unlock()
+				return
+			}
+
+			// The LogConfig.Type is empty if the container was created before docker 1.12 with default log driver.
+			// We should rewrite it to use the daemon defaults.
+			// Fixes https://github.com/docker/docker/issues/22536
+			if c.HostConfig.LogConfig.Type == "" {
+				if err := daemon.mergeAndVerifyLogConfig(&c.HostConfig.LogConfig); err != nil {
+					logrus.Errorf("Failed to verify log config for container %s: %q", c.ID, err)
+				}
+			}
+		}(c)
+	}
+	group.Wait()
+
+	for _, c := range containers {
+		group.Add(1)
+		go func(c *container.Container) {
+			defer group.Done()
+			_ = sem.Acquire(context.Background(), 1)
+			defer sem.Release(1)
+
 			daemon.backportMountSpec(c)
 			if err := daemon.checkpointAndSave(c); err != nil {
 				logrus.WithError(err).WithField("container", c.ID).Error("error saving backported mountspec to disk")
@@ -415,7 +448,8 @@ func (daemon *Daemon) restore() error {
 			c.Unlock()
 		}(c)
 	}
-	wg.Wait()
+	group.Wait()
+
 	daemon.netController, err = daemon.initNetworkController(daemon.configStore, activeSandboxes)
 	if err != nil {
 		return fmt.Errorf("Error initializing network controller: %v", err)
@@ -423,18 +457,24 @@ func (daemon *Daemon) restore() error {
 
 	// Now that all the containers are registered, register the links
 	for _, c := range containers {
-		if err := daemon.registerLinks(c, c.HostConfig); err != nil {
-			logrus.Errorf("failed to register link for container %s: %v", c.ID, err)
-		}
-	}
+		group.Add(1)
+		go func(c *container.Container) {
+			_ = sem.Acquire(context.Background(), 1)
+
+			if err := daemon.registerLinks(c, c.HostConfig); err != nil {
+				logrus.Errorf("failed to register link for container %s: %v", c.ID, err)
+			}
+
+			sem.Release(1)
+			group.Done()
+		}(c)
+	}
+	group.Wait()
 
-	group := sync.WaitGroup{}
 	for c, notifier := range restartContainers {
 		group.Add(1)
 		go func(c *container.Container, chNotify chan struct{}) {
-			defer group.Done()
+			_ = sem.Acquire(context.Background(), 1)
 
 			logrus.Debugf("Starting container %s", c.ID)
 
 			// ignore errors here as this is a best effort to wait for children to be
@@ -456,22 +496,27 @@ func (daemon *Daemon) restore() error {
 				logrus.Errorf("Failed to start container %s: %s", c.ID, err)
 			}
 			close(chNotify)
-		}(c, notifier)
+
+			sem.Release(1)
+			group.Done()
+		}(c, notifier)
 	}
 	group.Wait()
 
-	removeGroup := sync.WaitGroup{}
 	for id := range removeContainers {
-		removeGroup.Add(1)
+		group.Add(1)
 		go func(cid string) {
+			_ = sem.Acquire(context.Background(), 1)
+
 			if err := daemon.ContainerRm(cid, &types.ContainerRmConfig{ForceRemove: true, RemoveVolume: true}); err != nil {
 				logrus.Errorf("Failed to remove container %s: %s", cid, err)
 			}
-			removeGroup.Done()
+
+			sem.Release(1)
+			group.Done()
 		}(id)
 	}
-	removeGroup.Wait()
+	group.Wait()
 
 	// any containers that were started above would already have had this done,
 	// however we need to now prepare the mountpoints for the rest of the containers as well.
@@ -492,13 +537,16 @@ func (daemon *Daemon) restore() error {
 		group.Add(1)
 		go func(c *container.Container) {
-			defer group.Done()
+			_ = sem.Acquire(context.Background(), 1)
 			if err := daemon.prepareMountPoints(c); err != nil {
 				logrus.Error(err)
 			}
+
+			sem.Release(1)
+			group.Done()
 		}(c)
 	}
 	group.Wait()
 
 	logrus.Info("Loading containers: done.")
@@ -509,7 +557,18 @@ func (daemon *Daemon) restore() error {
 // RestartSwarmContainers restarts any autostart container which has a
 // swarm endpoint.
 func (daemon *Daemon) RestartSwarmContainers() {
-	group := sync.WaitGroup{}
+	ctx := context.Background()
+
+	// parallelLimit is the maximum number of parallel startup jobs that we
+	// allow (this is the limit used for all startup semaphores). The multiplier
+	// (128) was chosen after some fairly significant benchmarking -- don't change
+	// it unless you've tested it significantly (this value is adjusted if
+	// RLIMIT_NOFILE is small to avoid EMFILE).
+	parallelLimit := adjustParallelLimit(len(daemon.List()), 128*runtime.NumCPU())
+
+	var group sync.WaitGroup
+	sem := semaphore.NewWeighted(int64(parallelLimit))
+
 	for _, c := range daemon.List() {
 		if !c.IsRunning() && !c.IsPaused() {
 			// Autostart all the containers which has a
@@ -518,14 +577,21 @@ func (daemon *Daemon) RestartSwarmContainers() {
 			if daemon.configStore.AutoRestart && c.ShouldRestart() && c.NetworkSettings.HasSwarmEndpoint && c.HasBeenStartedBefore {
 				group.Add(1)
 				go func(c *container.Container) {
-					defer group.Done()
+					if err := sem.Acquire(ctx, 1); err != nil {
+						// ctx is done.
+						group.Done()
+						return
+					}
 					if err := daemon.containerStart(c, "", "", true); err != nil {
 						logrus.Error(err)
 					}
+
+					sem.Release(1)
+					group.Done()
 				}(c)
 			}
 		}
 	}
 	group.Wait()
 }

View file

@@ -257,6 +257,41 @@ func getBlkioThrottleDevices(devs []*blkiodev.ThrottleDevice) ([]specs.LinuxThro
 	return throttleDevices, nil
 }
 
+// adjustParallelLimit takes a number of objects and a proposed limit and
+// figures out if it's reasonable (and adjusts it accordingly). This is only
+// used for daemon startup, which does a lot of parallel loading of containers
+// (and if we exceed RLIMIT_NOFILE then we're in trouble).
+func adjustParallelLimit(n int, limit int) int {
+	// Rule-of-thumb overhead factor (how many files will each goroutine open
+	// simultaneously). Yes, this is ugly but to be frank this whole thing is
+	// ugly.
+	const overhead = 2
+
+	// On Linux, we need to ensure that parallelStartupJobs doesn't cause us to
+	// exceed RLIMIT_NOFILE. If parallelStartupJobs is too large, we reduce it
+	// and give a warning (since in theory the user should increase their
+	// ulimits to the largest possible value for dockerd).
+	var rlim unix.Rlimit
+	if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil {
+		logrus.Warnf("Couldn't find dockerd's RLIMIT_NOFILE to double-check startup parallelism factor: %v", err)
+		return limit
+	}
+	softRlimit := int(rlim.Cur)
+
+	// Much fewer containers than RLIMIT_NOFILE. No need to adjust anything.
+	if softRlimit > overhead*n {
+		return limit
+	}
+
+	// RLIMIT_NOFILE big enough, no need to adjust anything.
+	if softRlimit > overhead*limit {
+		return limit
+	}
+
+	logrus.Warnf("Found dockerd's open file ulimit (%v) is far too small -- consider increasing it significantly (at least %v)", softRlimit, overhead*limit)
+	return softRlimit / overhead
+}
+
 func checkKernel() error {
 	// Check for unsupported kernel versions
 	// FIXME: it would be cleaner to not test for specific versions, but rather
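To make the adjustment concrete (the core count and container count below are illustrative, not taken from the patch): with overhead = 2 on an 8-core machine the proposed limit is 128 * 8 = 1024, so adjustParallelLimit keeps it only if the soft RLIMIT_NOFILE is above 2 * 1024 = 2048, or if there are fewer than softRlimit / 2 containers to load in the first place. With a soft limit of 1024 open files and 20K dead containers, neither early return fires, so the function logs the warning and returns 1024 / 2 = 512 parallel jobs.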

View file

@@ -40,6 +40,11 @@ const (
 	windowsMaxCPUPercent = 100
 )
 
+// Windows doesn't really have rlimits.
+func adjustParallelLimit(n int, limit int) int {
+	return limit
+}
+
 // Windows has no concept of an execution state directory. So use config.Root here.
 func getPluginExecRoot(root string) string {
 	return filepath.Join(root, "plugins")