From c9fb551d60584ac4ad01561e2f56b7b7cc9483b9 Mon Sep 17 00:00:00 2001 From: Jana Radhakrishnan Date: Fri, 9 Sep 2016 09:55:57 -0700 Subject: [PATCH] Fix autostart for swarm scope connected containers The swarm scope network connected containers with autostart enabled there was a dependency problem with the cluster to be initialized before we can autostart them. With the current container restart code happening before cluster init, these containers were not getting autostarted properly. Added a fix to delay the container start of those containers which has atleast one swarm scope endpoint to until after the cluster is initialized. Signed-off-by: Jana Radhakrishnan --- cmd/dockerd/daemon.go | 5 ++++ daemon/cluster/cluster.go | 37 ++++++++++++++++++------ daemon/container_operations.go | 5 ++++ daemon/daemon.go | 32 +++++++++++++++++++- daemon/network/settings.go | 1 + integration-cli/docker_cli_swarm_test.go | 22 ++++++++++++++ 6 files changed, 92 insertions(+), 10 deletions(-) diff --git a/cmd/dockerd/daemon.go b/cmd/dockerd/daemon.go index eecbd00480..e860c34d8e 100644 --- a/cmd/dockerd/daemon.go +++ b/cmd/dockerd/daemon.go @@ -262,6 +262,11 @@ func (cli *DaemonCli) start(opts daemonOptions) (err error) { logrus.Fatalf("Error creating cluster component: %v", err) } + // Restart all autostart containers which has a swarm endpoint + // and is not yet running now that we have successfully + // initialized the cluster. + d.RestartSwarmContainers() + logrus.Info("Daemon has completed initialization") logrus.WithFields(logrus.Fields{ diff --git a/daemon/cluster/cluster.go b/daemon/cluster/cluster.go index 8106115837..97f36995ff 100644 --- a/daemon/cluster/cluster.go +++ b/daemon/cluster/cluster.go @@ -135,10 +135,11 @@ type Cluster struct { // helps in identifying the attachment ID via the taskID and the // corresponding attachment configuration obtained from the manager. type attacher struct { - taskID string - config *network.NetworkingConfig - attachWaitCh chan *network.NetworkingConfig - detachWaitCh chan struct{} + taskID string + config *network.NetworkingConfig + attachWaitCh chan *network.NetworkingConfig + attachCompleteCh chan struct{} + detachWaitCh chan struct{} } type node struct { @@ -1262,12 +1263,24 @@ func (c *Cluster) WaitForDetachment(ctx context.Context, networkName, networkID, agent := c.node.Agent() c.RUnlock() - if ok && attacher != nil && attacher.detachWaitCh != nil { + if ok && attacher != nil && + attacher.detachWaitCh != nil && + attacher.attachCompleteCh != nil { + // Attachment may be in progress still so wait for + // attachment to complete. select { - case <-attacher.detachWaitCh: + case <-attacher.attachCompleteCh: case <-ctx.Done(): return ctx.Err() } + + if attacher.taskID == taskID { + select { + case <-attacher.detachWaitCh: + case <-ctx.Done(): + return ctx.Err() + } + } } return agent.ResourceAllocator().DetachNetwork(ctx, taskID) @@ -1289,9 +1302,11 @@ func (c *Cluster) AttachNetwork(target string, containerID string, addresses []s agent := c.node.Agent() attachWaitCh := make(chan *network.NetworkingConfig) detachWaitCh := make(chan struct{}) + attachCompleteCh := make(chan struct{}) c.attachers[aKey] = &attacher{ - attachWaitCh: attachWaitCh, - detachWaitCh: detachWaitCh, + attachWaitCh: attachWaitCh, + attachCompleteCh: attachCompleteCh, + detachWaitCh: detachWaitCh, } c.Unlock() @@ -1306,6 +1321,11 @@ func (c *Cluster) AttachNetwork(target string, containerID string, addresses []s return nil, fmt.Errorf("Could not attach to network %s: %v", target, err) } + c.Lock() + c.attachers[aKey].taskID = taskID + close(attachCompleteCh) + c.Unlock() + logrus.Debugf("Successfully attached to network %s with tid %s", target, taskID) var config *network.NetworkingConfig @@ -1316,7 +1336,6 @@ func (c *Cluster) AttachNetwork(target string, containerID string, addresses []s } c.Lock() - c.attachers[aKey].taskID = taskID c.attachers[aKey].config = config c.Unlock() return config, nil diff --git a/daemon/container_operations.go b/daemon/container_operations.go index 818c82ccbf..cb30d77f2e 100644 --- a/daemon/container_operations.go +++ b/daemon/container_operations.go @@ -384,6 +384,9 @@ func (daemon *Daemon) findAndAttachNetwork(container *container.Container, idOrN return nil, nil, err } + // This container has attachment to a swarm scope + // network. Update the container network settings accordingly. + container.NetworkSettings.HasSwarmEndpoint = true return n, config, nil } @@ -492,6 +495,7 @@ func (daemon *Daemon) allocateNetwork(container *container.Container) error { // on first network connecting. defaultNetName := runconfig.DefaultDaemonNetworkMode().NetworkName() if nConf, ok := container.NetworkSettings.Networks[defaultNetName]; ok { + cleanOperationalData(nConf) if err := daemon.connectToNetwork(container, defaultNetName, nConf.EndpointSettings, updateSettings); err != nil { return err } @@ -512,6 +516,7 @@ func (daemon *Daemon) allocateNetwork(container *container.Container) error { } for i, epConf := range epConfigs { + cleanOperationalData(epConf) if err := daemon.connectToNetwork(container, networks[i], epConf.EndpointSettings, updateSettings); err != nil { return err } diff --git a/daemon/daemon.go b/daemon/daemon.go index e28faa6af7..335dbf37c8 100644 --- a/daemon/daemon.go +++ b/daemon/daemon.go @@ -202,7 +202,13 @@ func (daemon *Daemon) restore() error { // fixme: only if not running // get list of containers we need to restart if !c.IsRunning() && !c.IsPaused() { - if daemon.configStore.AutoRestart && c.ShouldRestart() { + // Do not autostart containers which + // has endpoints in a swarm scope + // network yet since the cluster is + // not initialized yet. We will start + // it after the cluster is + // initialized. + if daemon.configStore.AutoRestart && c.ShouldRestart() && !c.NetworkSettings.HasSwarmEndpoint { mapLock.Lock() restartContainers[c] = make(chan struct{}) mapLock.Unlock() @@ -346,6 +352,30 @@ func (daemon *Daemon) restore() error { return nil } +// RestartSwarmContainers restarts any autostart container which has a +// swarm endpoint. +func (daemon *Daemon) RestartSwarmContainers() { + group := sync.WaitGroup{} + for _, c := range daemon.List() { + if !c.IsRunning() && !c.IsPaused() { + // Autostart all the containers which has a + // swarm endpoint now that the cluster is + // initialized. + if daemon.configStore.AutoRestart && c.ShouldRestart() && c.NetworkSettings.HasSwarmEndpoint { + group.Add(1) + go func(c *container.Container) { + defer group.Done() + if err := daemon.containerStart(c, ""); err != nil { + logrus.Error(err) + } + }(c) + } + } + + } + group.Wait() +} + // waitForNetworks is used during daemon initialization when starting up containers // It ensures that all of a container's networks are available before the daemon tries to start the container. // In practice it just makes sure the discovery service is available for containers which use a network that require discovery. diff --git a/daemon/network/settings.go b/daemon/network/settings.go index 1958f2330a..8f6b7dd59e 100644 --- a/daemon/network/settings.go +++ b/daemon/network/settings.go @@ -21,6 +21,7 @@ type Settings struct { SecondaryIPAddresses []networktypes.Address SecondaryIPv6Addresses []networktypes.Address IsAnonymousEndpoint bool + HasSwarmEndpoint bool } // EndpointSettings is a package local wrapper for diff --git a/integration-cli/docker_cli_swarm_test.go b/integration-cli/docker_cli_swarm_test.go index a1ffcd9027..e9c7ace3f3 100644 --- a/integration-cli/docker_cli_swarm_test.go +++ b/integration-cli/docker_cli_swarm_test.go @@ -242,3 +242,25 @@ func (s *DockerSwarmSuite) TestSwarmServiceWithGroup(c *check.C) { c.Assert(err, checker.IsNil) c.Assert(strings.TrimSpace(out), checker.Equals, "uid=0(root) gid=0(root) groups=10(wheel),29(audio),50(staff),777") } + +func (s *DockerSwarmSuite) TestSwarmContainerAutoStart(c *check.C) { + d := s.AddDaemon(c, true, true) + + out, err := d.Cmd("network", "create", "--attachable", "-d", "overlay", "foo") + c.Assert(err, checker.IsNil) + c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "") + + out, err = d.Cmd("run", "-id", "--restart=always", "--net=foo", "--name=test", "busybox", "top") + c.Assert(err, checker.IsNil) + c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "") + + out, err = d.Cmd("ps", "-q") + c.Assert(err, checker.IsNil) + c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "") + + d.Restart() + + out, err = d.Cmd("ps", "-q") + c.Assert(err, checker.IsNil) + c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "") +}