From 7c97896747726554165480d102d9e46c54334cba Mon Sep 17 00:00:00 2001
From: Steffen Butzer
Date: Tue, 22 Jun 2021 00:00:52 +0200
Subject: [PATCH] libnetwork: processEndpointDelete: Fix deadlock between
 getSvcRecords and processEndpointDelete

We had some hosts with quite a bit of cycling containers that
occasionally cause docker daemons to lock up. Most prominently,
`docker run` commands do not respond and nothing happens anymore.

Looking at the stack traces, the following is at least sometimes a
cause of that:

Two goroutines g0 and g1 can race against each other:

* (g0) 1. getSvcRecords is called and calls (*network).Lock()
  --> Network is locked.
* (g1) 2. processEndpointDelete is called and calls (*controller).Lock()
  --> Controller is locked.
* (g1) 3. processEndpointDelete calls (*network).ID(), which tries to
  acquire (*network).Lock().
* (g0) 4. getSvcRecords calls (*controller).Lock().

Steps 3 and 4 are deadlocked against each other: each goroutine holds
the lock the other one needs.

References https://github.com/moby/libnetwork/blob/b5dc37037049d9b9ef68a3c4611e5eb1b35dd2af/network.go

Signed-off-by: Steffen Butzer
---
 libnetwork/store.go | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/libnetwork/store.go b/libnetwork/store.go
index 65ae281fad..576110b668 100644
--- a/libnetwork/store.go
+++ b/libnetwork/store.go
@@ -399,11 +399,14 @@ func (c *controller) processEndpointDelete(nmap map[string]*netWatch, ep *endpoi
 		return
 	}
 
+	networkID := n.ID()
+	endpointID := ep.ID()
+
 	c.Lock()
-	nw, ok := nmap[n.ID()]
+	nw, ok := nmap[networkID]
 
 	if ok {
-		delete(nw.localEps, ep.ID())
+		delete(nw.localEps, endpointID)
 		c.Unlock()
 
 		// Update the svc db about local endpoint leave right away
@@ -417,9 +420,9 @@ func (c *controller) processEndpointDelete(nmap map[string]*netWatch, ep *endpoi
 
 			// This is the last container going away for the network. Destroy
 			// this network's svc db entry
-			delete(c.svcRecords, n.ID())
+			delete(c.svcRecords, networkID)
 
-			delete(nmap, n.ID())
+			delete(nmap, networkID)
 		}
 	}
 	c.Unlock()
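
Note: the following is a minimal, self-contained Go sketch, not the
libnetwork code itself; the network/controller types, field names, and
method names are simplified stand-ins. It only illustrates the
lock-ordering problem described in the commit message and the fix
pattern used by this patch: read the values guarded by the network lock
before taking the controller lock, so the two locks are never held at
the same time.

package main

import (
	"fmt"
	"sync"
)

// Simplified stand-in for libnetwork's network type.
type network struct {
	mu sync.Mutex
	id string
}

// ID takes the network lock before returning the id, mirroring (*network).ID().
func (n *network) ID() string {
	n.mu.Lock()
	defer n.mu.Unlock()
	return n.id
}

// Simplified stand-in for libnetwork's controller type.
type controller struct {
	mu         sync.Mutex
	svcRecords map[string]struct{}
}

// Deadlock-prone shape (before the patch): n.ID() acquires the network lock
// while the controller lock is already held. If another goroutine holds the
// network lock and is waiting for the controller lock, both block forever.
func (c *controller) deleteSvcRecordsBroken(n *network) {
	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.svcRecords, n.ID())
}

// Fixed shape (after the patch): read the id before taking the controller
// lock, so the network lock and the controller lock are never held together.
func (c *controller) deleteSvcRecordsFixed(n *network) {
	networkID := n.ID() // network lock is taken and released here

	c.mu.Lock()
	defer c.mu.Unlock()
	delete(c.svcRecords, networkID)
}

func main() {
	c := &controller{svcRecords: map[string]struct{}{"net1": {}}}
	n := &network{id: "net1"}

	c.deleteSvcRecordsFixed(n)
	fmt.Println("svc records left:", len(c.svcRecords))
}

The same reasoning applies to ep.ID(): any accessor that takes the
network or endpoint lock must be called before (*controller).Lock(),
which is why the patch hoists both ID reads to the top of
processEndpointDelete.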