Fix race in attachable network attachment
Attachable networks are networks created on the cluster which can then be attached to by non-swarm containers. These networks are lazily created on the node that wants to attach to them.

When no container is currently attached to one of these networks on a node, and multiple containers which want that network are then started concurrently, a race condition can occur in the network attachment where we essentially try to attach the same network to the node twice.

To easily reproduce this issue you must use a multi-node cluster with a worker node that has lots of CPUs (I used a 36 CPU node).

Repro steps:

1. On the manager: `docker network create -d overlay --attachable test`
2. On the worker: `docker create --restart=always --network test busybox top`, many times... 200 is a good number (but not much more due to subnet size restrictions)
3. Restart the daemon

When the daemon restarts, it attempts to start all of those containers simultaneously. Note that you could try to do this yourself over the API, but it is harder to trigger there due to the added latency of going over the API.

The error is produced when the daemon tries to start a container and allocates its network resources:

```
attaching to network failed, make sure your network options are correct and check manager logs: context deadline exceeded
```

What happens here is that the worker makes a network attachment request to the manager. This is an async call which, in the happy case, causes a task to be placed on the node; the worker waits on that task to get the network configuration. When the race is hit, the error occurs on the manager like this:

```
task allocation failure" error="failed during network allocation for task n7bwwwbymj2o2h9asqkza8gom: failed to allocate network IP for task n7bwwwbymj2o2h9asqkza8gom network rj4szie2zfauqnpgh4eri1yue: could not find an available IP" module=node node.id=u3489c490fx1df8onlyfo1v6e
```

The task is never created, and the worker times out waiting for it.

---

The mitigation is to make sure that only one attachment request is in flight for a given network at a time, *when the network doesn't already exist on the node*. If the network already exists on the node, there is no need for synchronization: the network is already allocated and present, so there is no need to request it from the manager.

This basically comes down to a race on `Find(network) || Create(network)` without any sort of synchronization.

Signed-off-by: Brian Goff <cpuguy83@gmail.com>
This commit is contained in:

parent 53a58da551
commit c379d2681f

2 changed files with 15 additions and 5 deletions
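Before the hunks themselves, here is a minimal, self-contained Go sketch of the `Find(network) || Create(network)` race and the per-network lock that closes it. All names in the sketch (`store`, `findNetwork`, `createNetwork`, `attach`) are illustrative stand-ins, not the daemon's actual API; only the `pkg/locker` import is the real package the fix uses.

```go
package main

import (
	"fmt"
	"sync"

	"github.com/docker/docker/pkg/locker"
)

// store is a hypothetical stand-in for the node's view of its networks.
type store struct {
	mu       sync.Mutex
	networks map[string]bool
	creates  int // times the "manager" was asked to allocate the network
	keyed    *locker.Locker
}

func (s *store) findNetwork(id string) bool {
	s.mu.Lock()
	defer s.mu.Unlock()
	return s.networks[id]
}

func (s *store) createNetwork(id string) {
	s.mu.Lock()
	defer s.mu.Unlock()
	s.networks[id] = true
	s.creates++
}

// attach is the racy Find || Create made safe: if the network is not yet
// on the node, serialize on the network ID, then re-check before creating.
func (s *store) attach(id string) {
	if s.findNetwork(id) {
		return // already allocated on this node; no lock needed
	}
	s.keyed.Lock(id)
	defer s.keyed.Unlock(id)
	if !s.findNetwork(id) { // re-check now that we hold the lock
		s.createNetwork(id)
	}
}

func main() {
	s := &store{networks: map[string]bool{}, keyed: locker.New()}
	var wg sync.WaitGroup
	for i := 0; i < 200; i++ { // 200 concurrent containers, as in the repro
		wg.Add(1)
		go func() {
			defer wg.Done()
			s.attach("test")
		}()
	}
	wg.Wait()
	fmt.Println("manager allocations:", s.creates) // 1 with the per-ID lock
}
```

With the per-ID lock in place, the 200 concurrent attaches produce a single allocation; remove the lock and re-check, and `creates` can exceed 1, which is the manager-side double allocation the commit message describes.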
daemon/container_operations.go:

```diff
@@ -346,7 +346,9 @@ func (daemon *Daemon) updateNetwork(container *container.Container) error {
 }
 
 func (daemon *Daemon) findAndAttachNetwork(container *container.Container, idOrName string, epConfig *networktypes.EndpointSettings) (libnetwork.Network, *networktypes.NetworkingConfig, error) {
-	n, err := daemon.FindNetwork(getNetworkID(idOrName, epConfig))
+	id := getNetworkID(idOrName, epConfig)
+
+	n, err := daemon.FindNetwork(id)
 	if err != nil {
 		// We should always be able to find the network for a
 		// managed container.
@@ -379,21 +381,26 @@ func (daemon *Daemon) findAndAttachNetwork(container *container.Container, idOrN
 		retryCount int
 	)
 
+	if n == nil && daemon.attachableNetworkLock != nil {
+		daemon.attachableNetworkLock.Lock(id)
+		defer daemon.attachableNetworkLock.Unlock(id)
+	}
+
 	for {
 		// In all other cases, attempt to attach to the network to
 		// trigger attachment in the swarm cluster manager.
 		if daemon.clusterProvider != nil {
 			var err error
-			config, err = daemon.clusterProvider.AttachNetwork(getNetworkID(idOrName, epConfig), container.ID, addresses)
+			config, err = daemon.clusterProvider.AttachNetwork(id, container.ID, addresses)
 			if err != nil {
 				return nil, nil, err
 			}
 		}
 
-		n, err = daemon.FindNetwork(getNetworkID(idOrName, epConfig))
+		n, err = daemon.FindNetwork(id)
 		if err != nil {
 			if daemon.clusterProvider != nil {
-				if err := daemon.clusterProvider.DetachNetwork(getNetworkID(idOrName, epConfig), container.ID); err != nil {
+				if err := daemon.clusterProvider.DetachNetwork(id, container.ID); err != nil {
 					logrus.Warnf("Could not rollback attachment for container %s to network %s: %v", container.ID, idOrName, err)
 				}
 			}
```
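Two details in the hunk above are worth noting. First, the per-network lock is taken only when the initial `FindNetwork` misses (`n == nil`); if the network is already on the node, no manager round-trip is needed and the lock is skipped, exactly as the commit message describes. Second, the `daemon.attachableNetworkLock != nil` guard exists because the locker is only initialized in `setClusterProvider` (see the daemon.go hunk below); a daemon that never joins a cluster has no attachable networks to synchronize. With the lock held, a goroutine that was blocked behind another attachment issues its request only after the first completes, so the manager never processes two concurrent allocation attempts for the same network on the node.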
daemon/daemon.go:

```diff
@@ -43,6 +43,7 @@ import (
 	"github.com/docker/docker/migrate/v1"
 	"github.com/docker/docker/pkg/containerfs"
 	"github.com/docker/docker/pkg/idtools"
+	"github.com/docker/docker/pkg/locker"
 	"github.com/docker/docker/pkg/plugingetter"
 	"github.com/docker/docker/pkg/sysinfo"
 	"github.com/docker/docker/pkg/system"
@@ -120,7 +121,8 @@ type Daemon struct {
 	hosts       map[string]bool // hosts stores the addresses the daemon is listening on
 	startupDone chan struct{}
 
-	attachmentStore network.AttachmentStore
+	attachmentStore       network.AttachmentStore
+	attachableNetworkLock *locker.Locker
 }
 
 // StoreHosts stores the addresses the daemon is listening on
@@ -564,6 +566,7 @@ func (daemon *Daemon) DaemonLeavesCluster() {
 func (daemon *Daemon) setClusterProvider(clusterProvider cluster.Provider) {
 	daemon.clusterProvider = clusterProvider
 	daemon.netController.SetClusterProvider(clusterProvider)
+	daemon.attachableNetworkLock = locker.New()
 }
 
 // IsSwarmCompatible verifies if the current daemon
```
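For reference, `github.com/docker/docker/pkg/locker` (the new import above) provides named locks: `Lock(name)` blocks until no other goroutine holds that name, and, as I understand the package, entries are reference-counted internally so names that are no longer held do not accumulate. A minimal usage sketch follows; the exact error sentinel is stated from memory and should be treated as an assumption.

```go
package main

import (
	"fmt"

	"github.com/docker/docker/pkg/locker"
)

func main() {
	l := locker.New()

	l.Lock("net-id") // blocks while another goroutine holds "net-id"
	// ... critical section: at most one goroutine per name ...
	if err := l.Unlock("net-id"); err != nil {
		// Unlock errors on names that were never locked
		// (assumption: the sentinel is locker.ErrNoSuchLock).
		fmt.Println("unlock:", err)
	}

	// Locks are independent per name: "other-net" never contends
	// with "net-id".
	l.Lock("other-net")
	_ = l.Unlock("other-net")
}
```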