From 849e345e2c366d875b03f8b33b31308ba16fb4fd Mon Sep 17 00:00:00 2001 From: Jana Radhakrishnan Date: Mon, 17 Oct 2016 14:33:56 -0700 Subject: [PATCH] Retry AttachNetwork when it fails to find network When trying to attach to swarm scope network for an unmanaged container sometimes even if attaching to network succeeds, we may not find the network because some other container which was using the network went down and removed the network. So if it is not found, try to detach and reattach to re-download the network from the manager. Fixes #26588 Signed-off-by: Jana Radhakrishnan --- daemon/container_operations.go | 55 ++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 15 deletions(-) diff --git a/daemon/container_operations.go b/daemon/container_operations.go index 5fee8a43d2..b3585cf137 100644 --- a/daemon/container_operations.go +++ b/daemon/container_operations.go @@ -357,26 +357,51 @@ func (daemon *Daemon) findAndAttachNetwork(container *container.Container, idOrN } } - // In all other cases, attempt to attach to the network to - // trigger attachment in the swarm cluster manager. - var config *networktypes.NetworkingConfig - if daemon.clusterProvider != nil { - var err error - config, err = daemon.clusterProvider.AttachNetwork(idOrName, container.ID, addresses) - if err != nil { - return nil, nil, err - } - } + var ( + config *networktypes.NetworkingConfig + retryCount int + ) - n, err = daemon.FindNetwork(idOrName) - if err != nil { + for { + // In all other cases, attempt to attach to the network to + // trigger attachment in the swarm cluster manager. 
if daemon.clusterProvider != nil { - if err := daemon.clusterProvider.DetachNetwork(idOrName, container.ID); err != nil { - logrus.Warnf("Could not rollback attachment for container %s to network %s: %v", container.ID, idOrName, err) + var err error + config, err = daemon.clusterProvider.AttachNetwork(idOrName, container.ID, addresses) + if err != nil { + return nil, nil, err } } - return nil, nil, err + n, err = daemon.FindNetwork(idOrName) + if err != nil { + if daemon.clusterProvider != nil { + if err := daemon.clusterProvider.DetachNetwork(idOrName, container.ID); err != nil { + logrus.Warnf("Could not rollback attachment for container %s to network %s: %v", container.ID, idOrName, err) + } + } + + // Retry network attach again if we failed to + // find the network after successful + // attachment because the only reason that + // would happen is if some other container + // attached to the swarm scope network went down + // and removed the network while we were in + // the process of attaching. + if config != nil { + if _, ok := err.(libnetwork.ErrNoSuchNetwork); ok { + if retryCount >= 5 { + return nil, nil, fmt.Errorf("could not find network %s after successful attachment", idOrName) + } + retryCount++ + continue + } + } + + return nil, nil, err + } + + break } // This container has attachment to a swarm scope