From 7c97896747726554165480d102d9e46c54334cba Mon Sep 17 00:00:00 2001
From: Steffen Butzer
Date: Tue, 22 Jun 2021 00:00:52 +0200
Subject: [PATCH] libnetwork: processEndpointDelete: Fix deadlock between
 getSvcRecords and processEndpointDelete

We had some hosts with quite a bit of cycling containers that
occasionally cause docker daemons to lock up. Most prominently,
`docker run` commands do not respond and nothing happens anymore.

Looking at the stack traces, the following is at least sometimes a
cause of that:

Two goroutines g0 and g1 can race against each other:

* (g0) 1. getSvcRecords is called and calls (*network).Lock()
  --> Network is locked.
* (g1) 2. processEndpointDelete is called and calls (*controller).Lock()
  --> Controller is locked.
* (g1) 3. processEndpointDelete calls (*network).ID(), which tries to
  acquire (*network).Lock().
* (g0) 4. getSvcRecords calls (*controller).Lock().

Steps 3 and 4 are deadlocked against each other: each goroutine holds
the lock the other one needs.

References https://github.com/moby/libnetwork/blob/b5dc37037049d9b9ef68a3c4611e5eb1b35dd2af/network.go

Signed-off-by: Steffen Butzer
---
 libnetwork/store.go | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/libnetwork/store.go b/libnetwork/store.go
index 65ae281fad..576110b668 100644
--- a/libnetwork/store.go
+++ b/libnetwork/store.go
@@ -399,11 +399,14 @@ func (c *controller) processEndpointDelete(nmap map[string]*netWatch, ep *endpoi
 		return
 	}
 
+	networkID := n.ID()
+	endpointID := ep.ID()
+
 	c.Lock()
-	nw, ok := nmap[n.ID()]
+	nw, ok := nmap[networkID]
 
 	if ok {
-		delete(nw.localEps, ep.ID())
+		delete(nw.localEps, endpointID)
 		c.Unlock()
 
 		// Update the svc db about local endpoint leave right away
@@ -417,9 +420,9 @@ func (c *controller) processEndpointDelete(nmap map[string]*netWatch, ep *endpoi
 
 			// This is the last container going away for the network. Destroy
 			// this network's svc db entry
-			delete(c.svcRecords, n.ID())
+			delete(c.svcRecords, networkID)
 
-			delete(nmap, n.ID())
+			delete(nmap, networkID)
 		}
 	}
 	c.Unlock()
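
Note: the following is a minimal, self-contained Go sketch, not the
libnetwork code itself; the network/controller types, field names, and
method names are simplified stand-ins. It only illustrates the
lock-ordering problem described in the commit message and the fix
pattern used by this patch: read the values guarded by the network lock
before taking the controller lock, so the two locks are never held at
the same time.

package main

import (
	"fmt"
	"sync"
)

// Simplified stand-in for libnetwork's network type.
type network struct {
	mu sync.Mutex
	id string
}

// ID takes the network lock before returning the id, mirroring (*network).ID().
func (n *network) ID() string {
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.id
}

// Simplified stand-in for libnetwork's controller type.
type controller struct {
	mu         sync.Mutex
	svcRecords map[string]struct{}
}

// Deadlock-prone shape (before the patch): n.ID() acquires the network lock
// while the controller lock is already held. If another goroutine holds the
// network lock and is waiting for the controller lock, both block forever.
func (c *controller) deleteSvcRecordsBroken(n *network) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.svcRecords, n.ID())
}

// Fixed shape (after the patch): read the id before taking the controller
// lock, so the network lock and the controller lock are never held together.
func (c *controller) deleteSvcRecordsFixed(n *network) {
	networkID := n.ID() // network lock is taken and released here

	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.svcRecords, networkID)
}

func main() {
	c := &controller{svcRecords: map[string]struct{}{"net1": {}}}
	n := &network{id: "net1"}

	c.deleteSvcRecordsFixed(n)
	fmt.Println("svc records left:", len(c.svcRecords))
}

The same reasoning applies to ep.ID(): any accessor that takes the
network or endpoint lock must be called before (*controller).Lock(),
which is why the patch hoists both ID reads to the top of
processEndpointDelete.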