mirror of https://github.com/moby/moby.git
daemon: switch to semaphore-gated WaitGroup for startup tasks
Many startup tasks have to run for each container, so using a plain WaitGroup (which puts no limit on the number of parallel tasks) can quite trivially push Docker past the NOFILE limit. A better solution is to cap parallelism with a semaphore.

In addition, several startup tasks were not parallelised previously, which resulted in very long startup times. According to my testing, 20K dead containers resulted in ~6 minute startup times (during which Docker is completely unusable).

This patch fixes both issues. The parallel startup limit chosen (128 * NumCPU) is based on my own significant testing of the 20K container case, and on my machines the patch reduces the startup time from 6 minutes to less than a minute (ideally this could be reduced further by removing the need to scan all dead containers on startup -- but that's beyond the scope of this patchset).

To avoid the NOFILE limit problem, we also detect it on startup: if NOFILE < 2*128*NumCPU we reduce the parallelism factor so we don't hit the NOFILE limit (and emit a warning, since this is almost certainly a mis-configuration).

Signed-off-by: Aleksa Sarai <asarai@suse.de>
parent 1895e082b6
commit 5a52917e4d
3 changed files with 179 additions and 73 deletions
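The core pattern introduced here, a sync.WaitGroup whose goroutines are gated by a weighted semaphore so that at most parallelLimit startup jobs run at once, can be sketched in isolation roughly as follows. This is an illustrative sketch, not the daemon code itself; the jobs slice and the printed "work" line are hypothetical stand-ins for the per-container startup tasks.

package main

import (
    "context"
    "fmt"
    "runtime"
    "sync"

    "golang.org/x/sync/semaphore"
)

func main() {
    jobs := []string{"a", "b", "c"} // stand-ins for container IDs found on disk

    // Cap how many jobs run at once; the daemon uses 128 * NumCPU,
    // possibly reduced further to respect RLIMIT_NOFILE.
    parallelLimit := 128 * runtime.NumCPU()
    sem := semaphore.NewWeighted(int64(parallelLimit))

    var group sync.WaitGroup
    for _, id := range jobs {
        group.Add(1)
        go func(id string) {
            defer group.Done()
            // Acquire only fails if the context is cancelled, so with
            // context.Background() the error can safely be ignored.
            _ = sem.Acquire(context.Background(), 1)
            defer sem.Release(1)

            fmt.Println("loading", id) // the per-container work would go here
        }(id)
    }
    group.Wait()
}

Because Acquire blocks until a slot is free, no more than parallelLimit goroutines are past the semaphore at any moment, while the WaitGroup still waits for every job to finish.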
daemon/daemon.go

@@ -67,6 +67,7 @@ import (
     "github.com/docker/libnetwork/cluster"
     nwconfig "github.com/docker/libnetwork/config"
     "github.com/pkg/errors"
+    "golang.org/x/sync/semaphore"
 )
 
 // ContainersNamespace is the name of the namespace used for users containers
@@ -198,6 +199,7 @@ func (daemon *Daemon) NewResolveOptionsFunc() resolver.ResolveOptionsFunc {
 }
 
 func (daemon *Daemon) restore() error {
+    var mapLock sync.Mutex
     containers := make(map[string]*container.Container)
 
     logrus.Info("Loading containers: start.")
@@ -207,68 +209,99 @@ func (daemon *Daemon) restore() error {
         return err
     }
 
-    for _, v := range dir {
-        id := v.Name()
-        container, err := daemon.load(id)
-        if err != nil {
-            logrus.Errorf("Failed to load container %v: %v", id, err)
-            continue
-        }
-        if !system.IsOSSupported(container.OS) {
-            logrus.Errorf("Failed to load container %v: %s (%q)", id, system.ErrNotSupportedOperatingSystem, container.OS)
-            continue
-        }
-        // Ignore the container if it does not support the current driver being used by the graph
-        currentDriverForContainerOS := daemon.graphDrivers[container.OS]
-        if (container.Driver == "" && currentDriverForContainerOS == "aufs") || container.Driver == currentDriverForContainerOS {
-            rwlayer, err := daemon.imageService.GetLayerByID(container.ID, container.OS)
-            if err != nil {
-                logrus.Errorf("Failed to load container mount %v: %v", id, err)
-                continue
-            }
-            container.RWLayer = rwlayer
-            logrus.Debugf("Loaded container %v, isRunning: %v", container.ID, container.IsRunning())
+    // parallelLimit is the maximum number of parallel startup jobs that we
+    // allow (this is the limited used for all startup semaphores). The multipler
+    // (128) was chosen after some fairly significant benchmarking -- don't change
+    // it unless you've tested it significantly (this value is adjusted if
+    // RLIMIT_NOFILE is small to avoid EMFILE).
+    parallelLimit := adjustParallelLimit(len(dir), 128*runtime.NumCPU())
 
-            containers[container.ID] = container
-        } else {
-            logrus.Debugf("Cannot load container %s because it was created with another graph driver.", container.ID)
-        }
+    // Re-used for all parallel startup jobs.
+    var group sync.WaitGroup
+    sem := semaphore.NewWeighted(int64(parallelLimit))
+
+    for _, v := range dir {
+        group.Add(1)
+        go func(id string) {
+            defer group.Done()
+            _ = sem.Acquire(context.Background(), 1)
+            defer sem.Release(1)
+
+            container, err := daemon.load(id)
+            if err != nil {
+                logrus.Errorf("Failed to load container %v: %v", id, err)
+                return
+            }
+            if !system.IsOSSupported(container.OS) {
+                logrus.Errorf("Failed to load container %v: %s (%q)", id, system.ErrNotSupportedOperatingSystem, container.OS)
+                return
+            }
+            // Ignore the container if it does not support the current driver being used by the graph
+            currentDriverForContainerOS := daemon.graphDrivers[container.OS]
+            if (container.Driver == "" && currentDriverForContainerOS == "aufs") || container.Driver == currentDriverForContainerOS {
+                rwlayer, err := daemon.imageService.GetLayerByID(container.ID, container.OS)
+                if err != nil {
+                    logrus.Errorf("Failed to load container mount %v: %v", id, err)
+                    return
+                }
+                container.RWLayer = rwlayer
+                logrus.Debugf("Loaded container %v, isRunning: %v", container.ID, container.IsRunning())
+
+                mapLock.Lock()
+                containers[container.ID] = container
+                mapLock.Unlock()
+            } else {
+                logrus.Debugf("Cannot load container %s because it was created with another graph driver.", container.ID)
+            }
+        }(v.Name())
     }
+    group.Wait()
 
     removeContainers := make(map[string]*container.Container)
     restartContainers := make(map[*container.Container]chan struct{})
     activeSandboxes := make(map[string]interface{})
+
     for id, c := range containers {
-        if err := daemon.registerName(c); err != nil {
-            logrus.Errorf("Failed to register container name %s: %s", c.ID, err)
-            delete(containers, id)
-            continue
-        }
-        if err := daemon.Register(c); err != nil {
-            logrus.Errorf("Failed to register container %s: %s", c.ID, err)
-            delete(containers, id)
-            continue
-        }
-
-        // The LogConfig.Type is empty if the container was created before docker 1.12 with default log driver.
-        // We should rewrite it to use the daemon defaults.
-        // Fixes https://github.com/docker/docker/issues/22536
-        if c.HostConfig.LogConfig.Type == "" {
-            if err := daemon.mergeAndVerifyLogConfig(&c.HostConfig.LogConfig); err != nil {
-                logrus.Errorf("Failed to verify log config for container %s: %q", c.ID, err)
-                continue
-            }
-        }
-    }
-
-    var (
-        wg      sync.WaitGroup
-        mapLock sync.Mutex
-    )
-    for _, c := range containers {
-        wg.Add(1)
+        group.Add(1)
         go func(c *container.Container) {
-            defer wg.Done()
+            defer group.Done()
+            _ = sem.Acquire(context.Background(), 1)
+            defer sem.Release(1)
+
+            if err := daemon.registerName(c); err != nil {
+                logrus.Errorf("Failed to register container name %s: %s", c.ID, err)
+                mapLock.Lock()
+                delete(containers, id)
+                mapLock.Unlock()
+                return
+            }
+            if err := daemon.Register(c); err != nil {
+                logrus.Errorf("Failed to register container %s: %s", c.ID, err)
+                mapLock.Lock()
+                delete(containers, id)
+                mapLock.Unlock()
+                return
+            }
+
+            // The LogConfig.Type is empty if the container was created before docker 1.12 with default log driver.
+            // We should rewrite it to use the daemon defaults.
+            // Fixes https://github.com/docker/docker/issues/22536
+            if c.HostConfig.LogConfig.Type == "" {
+                if err := daemon.mergeAndVerifyLogConfig(&c.HostConfig.LogConfig); err != nil {
+                    logrus.Errorf("Failed to verify log config for container %s: %q", c.ID, err)
+                }
+            }
+        }(c)
+    }
+    group.Wait()
+
+    for _, c := range containers {
+        group.Add(1)
+        go func(c *container.Container) {
+            defer group.Done()
+            _ = sem.Acquire(context.Background(), 1)
+            defer sem.Release(1)
+
             daemon.backportMountSpec(c)
             if err := daemon.checkpointAndSave(c); err != nil {
                 logrus.WithError(err).WithField("container", c.ID).Error("error saving backported mountspec to disk")
@@ -415,7 +448,8 @@ func (daemon *Daemon) restore() error {
             c.Unlock()
         }(c)
     }
-    wg.Wait()
+    group.Wait()
+
     daemon.netController, err = daemon.initNetworkController(daemon.configStore, activeSandboxes)
     if err != nil {
         return fmt.Errorf("Error initializing network controller: %v", err)
@@ -423,18 +457,24 @@ func (daemon *Daemon) restore() error {
 
     // Now that all the containers are registered, register the links
     for _, c := range containers {
-        if err := daemon.registerLinks(c, c.HostConfig); err != nil {
-            logrus.Errorf("failed to register link for container %s: %v", c.ID, err)
-        }
-    }
+        group.Add(1)
+        go func(c *container.Container) {
+            _ = sem.Acquire(context.Background(), 1)
+
+            if err := daemon.registerLinks(c, c.HostConfig); err != nil {
+                logrus.Errorf("failed to register link for container %s: %v", c.ID, err)
+            }
+
+            sem.Release(1)
+            group.Done()
+        }(c)
+    }
+    group.Wait()
 
-    group := sync.WaitGroup{}
     for c, notifier := range restartContainers {
         group.Add(1)
-
         go func(c *container.Container, chNotify chan struct{}) {
-            defer group.Done()
+            _ = sem.Acquire(context.Background(), 1)
 
             logrus.Debugf("Starting container %s", c.ID)
 
             // ignore errors here as this is a best effort to wait for children to be
@@ -456,22 +496,27 @@ func (daemon *Daemon) restore() error {
                 logrus.Errorf("Failed to start container %s: %s", c.ID, err)
             }
             close(chNotify)
-        }(c, notifier)
+
+            sem.Release(1)
+            group.Done()
+        }(c, notifier)
     }
     group.Wait()
 
-    removeGroup := sync.WaitGroup{}
     for id := range removeContainers {
-        removeGroup.Add(1)
+        group.Add(1)
         go func(cid string) {
+            _ = sem.Acquire(context.Background(), 1)
+
             if err := daemon.ContainerRm(cid, &types.ContainerRmConfig{ForceRemove: true, RemoveVolume: true}); err != nil {
                 logrus.Errorf("Failed to remove container %s: %s", cid, err)
             }
-            removeGroup.Done()
+
+            sem.Release(1)
+            group.Done()
         }(id)
     }
-    removeGroup.Wait()
+    group.Wait()
 
     // any containers that were started above would already have had this done,
     // however we need to now prepare the mountpoints for the rest of the containers as well.
@@ -492,13 +537,16 @@ func (daemon *Daemon) restore() error {
 
         group.Add(1)
         go func(c *container.Container) {
-            defer group.Done()
+            _ = sem.Acquire(context.Background(), 1)
+
             if err := daemon.prepareMountPoints(c); err != nil {
                 logrus.Error(err)
             }
+
+            sem.Release(1)
+            group.Done()
         }(c)
     }
 
     group.Wait()
 
     logrus.Info("Loading containers: done.")
@@ -509,7 +557,18 @@ func (daemon *Daemon) restore() error {
 // RestartSwarmContainers restarts any autostart container which has a
 // swarm endpoint.
 func (daemon *Daemon) RestartSwarmContainers() {
-    group := sync.WaitGroup{}
+    ctx := context.Background()
+
+    // parallelLimit is the maximum number of parallel startup jobs that we
+    // allow (this is the limited used for all startup semaphores). The multipler
+    // (128) was chosen after some fairly significant benchmarking -- don't change
+    // it unless you've tested it significantly (this value is adjusted if
+    // RLIMIT_NOFILE is small to avoid EMFILE).
+    parallelLimit := adjustParallelLimit(len(daemon.List()), 128*runtime.NumCPU())
+
+    var group sync.WaitGroup
+    sem := semaphore.NewWeighted(int64(parallelLimit))
+
     for _, c := range daemon.List() {
         if !c.IsRunning() && !c.IsPaused() {
             // Autostart all the containers which has a
@@ -518,14 +577,21 @@ func (daemon *Daemon) RestartSwarmContainers() {
             if daemon.configStore.AutoRestart && c.ShouldRestart() && c.NetworkSettings.HasSwarmEndpoint && c.HasBeenStartedBefore {
                 group.Add(1)
                 go func(c *container.Container) {
-                    defer group.Done()
+                    if err := sem.Acquire(ctx, 1); err != nil {
+                        // ctx is done.
+                        group.Done()
+                        return
+                    }
+
                     if err := daemon.containerStart(c, "", "", true); err != nil {
                         logrus.Error(err)
                     }
+
+                    sem.Release(1)
+                    group.Done()
                 }(c)
             }
         }
 
     }
     group.Wait()
 }
daemon/daemon_unix.go

@@ -257,6 +257,41 @@ func getBlkioThrottleDevices(devs []*blkiodev.ThrottleDevice) ([]specs.LinuxThro
     return throttleDevices, nil
 }
 
+// adjustParallelLimit takes a number of objects and a proposed limit and
+// figures out if it's reasonable (and adjusts it accordingly). This is only
+// used for daemon startup, which does a lot of parallel loading of containers
+// (and if we exceed RLIMIT_NOFILE then we're in trouble).
+func adjustParallelLimit(n int, limit int) int {
+    // Rule-of-thumb overhead factor (how many files will each goroutine open
+    // simultaneously). Yes, this is ugly but to be frank this whole thing is
+    // ugly.
+    const overhead = 2
+
+    // On Linux, we need to ensure that parallelStartupJobs doesn't cause us to
+    // exceed RLIMIT_NOFILE. If parallelStartupJobs is too large, we reduce it
+    // and give a warning (since in theory the user should increase their
+    // ulimits to the largest possible value for dockerd).
+    var rlim unix.Rlimit
+    if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil {
+        logrus.Warnf("Couldn't find dockerd's RLIMIT_NOFILE to double-check startup parallelism factor: %v", err)
+        return limit
+    }
+    softRlimit := int(rlim.Cur)
+
+    // Much fewer containers than RLIMIT_NOFILE. No need to adjust anything.
+    if softRlimit > overhead*n {
+        return limit
+    }
+
+    // RLIMIT_NOFILE big enough, no need to adjust anything.
+    if softRlimit > overhead*limit {
+        return limit
+    }
+
+    logrus.Warnf("Found dockerd's open file ulimit (%v) is far too small -- consider increasing it significantly (at least %v)", softRlimit, overhead*limit)
+    return softRlimit / overhead
+}
+
 func checkKernel() error {
     // Check for unsupported kernel versions
     // FIXME: it would be cleaner to not test for specific versions, but rather
daemon/daemon_windows.go

@@ -40,6 +40,11 @@ const (
     windowsMaxCPUPercent = 100
 )
 
+// Windows doesn't really have rlimits.
+func adjustParallelLimit(n int, limit int) int {
+    return limit
+}
+
 // Windows has no concept of an execution state directory. So use config.Root here.
 func getPluginExecRoot(root string) string {
     return filepath.Join(root, "plugins")
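For a rough sense of how the RLIMIT_NOFILE adjustment in daemon_unix.go plays out (illustrative numbers, not taken from the patch): on a 16-CPU machine the proposed limit is 128 * 16 = 2048; with the overhead factor of 2 that wants a soft NOFILE above 4096, so on a host with the common soft limit of 1024 (and more than 512 containers on disk) the limit drops to 1024 / 2 = 512 and a warning is logged. The sketch below mirrors that arithmetic with the soft rlimit passed in as a parameter instead of read via unix.Getrlimit; adjustedLimit is a hypothetical helper, not the daemon's function.

package main

import "fmt"

// adjustedLimit mirrors the arithmetic of adjustParallelLimit above, with the
// soft RLIMIT_NOFILE passed in explicitly (hypothetical helper for illustration).
func adjustedLimit(n, limit, softRlimit int) int {
    const overhead = 2 // rule-of-thumb files opened per startup goroutine

    if softRlimit > overhead*n {
        return limit // far fewer containers than NOFILE allows
    }
    if softRlimit > overhead*limit {
        return limit // NOFILE is big enough for the full limit
    }
    return softRlimit / overhead // would risk EMFILE; scale the limit down
}

func main() {
    // 20000 containers, 16 CPUs (limit 128*16 = 2048), soft NOFILE of 1024.
    fmt.Println(adjustedLimit(20000, 2048, 1024)) // prints 512
}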