From e41b4765bdcd28a1f305a59b2a0e3f05e216520a Mon Sep 17 00:00:00 2001 From: Jana Radhakrishnan Date: Wed, 7 Oct 2015 20:01:38 -0700 Subject: [PATCH] Cleanup dangling sandboxes on boot up Currently when docker exits ungracefully it may leave dangling sandboxes which may hold onto precious network resources. Added checkpoint state for sandboxes which on boot up will be used to clean up the sandboxes and network resources. On bootup the remaining dangling state in the checkpoint are read and cleaned up before accepting any new network allocation requests. Signed-off-by: Jana Radhakrishnan --- libnetwork/controller.go | 14 ++ libnetwork/endpoint.go | 2 +- libnetwork/libnetwork_test.go | 78 +------ libnetwork/sandbox.go | 15 +- libnetwork/sandbox_store.go | 212 +++++++++++++++++++ libnetwork/store_test.go | 4 +- libnetwork/test/integration/dnet/bridge.bats | 25 +++ 7 files changed, 274 insertions(+), 76 deletions(-) create mode 100644 libnetwork/sandbox_store.go diff --git a/libnetwork/controller.go b/libnetwork/controller.go index dbe13a5490..ab19174732 100644 --- a/libnetwork/controller.go +++ b/libnetwork/controller.go @@ -191,6 +191,8 @@ func New(cfgOptions ...config.Option) (NetworkController, error) { return nil, err } + c.sandboxCleanup() + if err := c.startExternalKeyListener(); err != nil { return nil, err } @@ -500,6 +502,18 @@ func (c *controller) NewSandbox(containerID string, options ...SandboxOption) (S c.Lock() c.sandboxes[sb.id] = sb c.Unlock() + defer func() { + if err != nil { + c.Lock() + delete(c.sandboxes, sb.id) + c.Unlock() + } + }() + + err = sb.storeUpdate() + if err != nil { + return nil, fmt.Errorf("updating the store state of sandbox failed: %v", err) + } return sb, nil } diff --git a/libnetwork/endpoint.go b/libnetwork/endpoint.go index a76cb235e7..c0a05b6c2f 100644 --- a/libnetwork/endpoint.go +++ b/libnetwork/endpoint.go @@ -221,7 +221,7 @@ func (ep *endpoint) Exists() bool { } func (ep *endpoint) Skip() bool { - return ep.getNetwork().Skip() || ep.DataScope() == datastore.LocalScope + return ep.getNetwork().Skip() } func (ep *endpoint) processOptions(options ...EndpointOption) { diff --git a/libnetwork/libnetwork_test.go b/libnetwork/libnetwork_test.go index e50b7c09a2..8e40c05f5d 100644 --- a/libnetwork/libnetwork_test.go +++ b/libnetwork/libnetwork_test.go @@ -1479,21 +1479,11 @@ func TestLeaveAll(t *testing.T) { if err != nil { t.Fatal(err) } - defer func() { - if err := ep1.Delete(); err != nil { - t.Fatal(err) - } - }() ep2, err := n.CreateEndpoint("ep2") if err != nil { t.Fatal(err) } - defer func() { - if err := ep2.Delete(); err != nil { - t.Fatal(err) - } - }() cnt, err := controller.NewSandbox("leaveall") if err != nil { @@ -1607,21 +1597,11 @@ func TestEndpointUpdateParent(t *testing.T) { if err != nil { t.Fatal(err) } - defer func() { - if err := ep1.Delete(); err != nil { - t.Fatal(err) - } - }() ep2, err := n.CreateEndpoint("ep2") if err != nil { t.Fatal(err) } - defer func() { - if err := ep2.Delete(); err != nil { - t.Fatal(err) - } - }() sbx1, err := controller.NewSandbox(containerID, libnetwork.OptionHostname("test"), @@ -1661,12 +1641,6 @@ func TestEndpointUpdateParent(t *testing.T) { if err != nil { t.Fatal(err) } - - err = ep2.Leave(sbx2) - runtime.LockOSThread() - if err != nil { - t.Fatal(err) - } } func TestEnableIPv6(t *testing.T) { @@ -1714,11 +1688,6 @@ func TestEnableIPv6(t *testing.T) { if err != nil { t.Fatal(err) } - defer func() { - if err := ep1.Delete(); err != nil { - t.Fatal(err) - } - }() if err := ioutil.WriteFile("/etc/resolv.conf", tmpResolvConf, 0644); err != nil { t.Fatal(err) @@ -1741,13 +1710,6 @@ func TestEnableIPv6(t *testing.T) { if err != nil { t.Fatal(err) } - defer func() { - err = ep1.Leave(sb) - runtime.LockOSThread() - if err != nil { - t.Fatal(err) - } - }() content, err := ioutil.ReadFile(resolvConfPath) if err != nil { @@ -1884,11 +1846,6 @@ func TestResolvConf(t *testing.T) { if err != nil { t.Fatal(err) } - defer func() { - if err := ep.Delete(); err != nil { - t.Fatal(err) - } - }() if err := ioutil.WriteFile("/etc/resolv.conf", tmpResolvConf1, 0644); err != nil { t.Fatal(err) @@ -2204,24 +2161,9 @@ func parallelLeave(t *testing.T, rc libnetwork.Sandbox, ep libnetwork.Endpoint, debugf("L%d.", thrNumber) var err error - cid := fmt.Sprintf("%drace", thrNumber) sb := sboxes[thrNumber-1] - if thrNumber == first { - err = ep.Leave(sb) - } else { - err = sb.Delete() - // re add sandbox - defer func() { - if err == nil { - var e error - if sboxes[thrNumber-1], e = controller.NewSandbox(cid); e != nil { - t.Fatalf("Failed to recreate sandbox %s: %v", cid, e) - } - } - }() - } - + err = ep.Leave(sb) runtime.LockOSThread() if err != nil { if _, ok := err.(types.ForbiddenError); !ok { @@ -2324,11 +2266,10 @@ func runParallelTests(t *testing.T, thrNumber int) { debugf("\n") - err = ep.Delete() + err = sb.Delete() if err != nil { t.Fatal(err) } - if thrNumber == first { for thrdone := range done { select { @@ -2337,19 +2278,14 @@ func runParallelTests(t *testing.T, thrNumber int) { } testns.Close() - err = sb.Delete() - if err != nil { - t.Fatal(err) - } - - ep.Delete() - if err != nil { - t.Fatal(err) - } - if err := net2.Delete(); err != nil { t.Fatal(err) } + } else { + err = ep.Delete() + if err != nil { + t.Fatal(err) + } } } diff --git a/libnetwork/sandbox.go b/libnetwork/sandbox.go index eb1941a6ec..b56431112c 100644 --- a/libnetwork/sandbox.go +++ b/libnetwork/sandbox.go @@ -64,6 +64,8 @@ type sandbox struct { endpoints epHeap epPriority map[string]int joinLeaveDone chan struct{} + dbIndex uint64 + dbExists bool sync.Mutex } @@ -153,15 +155,24 @@ func (sb *sandbox) Delete() error { if ep.endpointInGWNetwork() { continue } + if err := ep.Leave(sb); err != nil { log.Warnf("Failed detaching sandbox %s from endpoint %s: %v\n", sb.ID(), ep.ID(), err) } + + if err := ep.Delete(); err != nil { + log.Warnf("Failed deleting endpoint %s: %v\n", ep.ID(), err) + } } if sb.osSbox != nil { sb.osSbox.Destroy() } + if err := sb.storeDelete(); err != nil { + log.Warnf("Failed to delete sandbox %s from store: %v", sb.ID(), err) + } + c.Lock() delete(c.sandboxes, sb.ID()) c.Unlock() @@ -369,7 +380,7 @@ func (sb *sandbox) populateNetworkResources(ep *endpoint) error { } } } - return nil + return sb.storeUpdate() } func (sb *sandbox) clearNetworkResources(origEp *endpoint) error { @@ -442,7 +453,7 @@ func (sb *sandbox) clearNetworkResources(origEp *endpoint) error { sb.updateGateway(gwepAfter) } - return nil + return sb.storeUpdate() } const ( diff --git a/libnetwork/sandbox_store.go b/libnetwork/sandbox_store.go new file mode 100644 index 0000000000..56abbb6424 --- /dev/null +++ b/libnetwork/sandbox_store.go @@ -0,0 +1,212 @@ +package libnetwork + +import ( + "container/heap" + "encoding/json" + + "github.com/Sirupsen/logrus" + "github.com/docker/libnetwork/datastore" + "github.com/docker/libnetwork/osl" +) + +const ( + sandboxPrefix = "sandbox" +) + +type epState struct { + Eid string + Nid string +} + +type sbState struct { + ID string + Cid string + c *controller + dbIndex uint64 + dbExists bool + Eps []epState +} + +func (sbs *sbState) Key() []string { + return []string{sandboxPrefix, sbs.ID} +} + +func (sbs *sbState) KeyPrefix() []string { + return []string{sandboxPrefix} +} + +func (sbs *sbState) Value() []byte { + b, err := json.Marshal(sbs) + if err != nil { + return nil + } + return b +} + +func (sbs *sbState) SetValue(value []byte) error { + return json.Unmarshal(value, sbs) +} + +func (sbs *sbState) Index() uint64 { + sbi, err := sbs.c.SandboxByID(sbs.ID) + if err != nil { + return sbs.dbIndex + } + + sb := sbi.(*sandbox) + maxIndex := sb.dbIndex + if sbs.dbIndex > maxIndex { + maxIndex = sbs.dbIndex + } + + return maxIndex +} + +func (sbs *sbState) SetIndex(index uint64) { + sbs.dbIndex = index + sbs.dbExists = true + + sbi, err := sbs.c.SandboxByID(sbs.ID) + if err != nil { + return + } + + sb := sbi.(*sandbox) + sb.dbIndex = index + sb.dbExists = true +} + +func (sbs *sbState) Exists() bool { + if sbs.dbExists { + return sbs.dbExists + } + + sbi, err := sbs.c.SandboxByID(sbs.ID) + if err != nil { + return false + } + + sb := sbi.(*sandbox) + return sb.dbExists +} + +func (sbs *sbState) Skip() bool { + return false +} + +func (sbs *sbState) New() datastore.KVObject { + return &sbState{c: sbs.c} +} + +func (sbs *sbState) CopyTo(o datastore.KVObject) error { + dstSbs := o.(*sbState) + dstSbs.c = sbs.c + dstSbs.ID = sbs.ID + dstSbs.Cid = sbs.Cid + dstSbs.dbIndex = sbs.dbIndex + dstSbs.dbExists = sbs.dbExists + + for _, eps := range sbs.Eps { + dstSbs.Eps = append(dstSbs.Eps, eps) + } + + return nil +} + +func (sbs *sbState) DataScope() string { + return datastore.LocalScope +} + +func (sb *sandbox) storeUpdate() error { + sbs := &sbState{ + c: sb.controller, + ID: sb.id, + } + + for _, ep := range sb.getConnectedEndpoints() { + eps := epState{ + Nid: ep.getNetwork().ID(), + Eid: ep.ID(), + } + + sbs.Eps = append(sbs.Eps, eps) + } + + return sb.controller.updateToStore(sbs) +} + +func (sb *sandbox) storeDelete() error { + sbs := &sbState{ + c: sb.controller, + ID: sb.id, + Cid: sb.containerID, + dbIndex: sb.dbIndex, + dbExists: sb.dbExists, + } + + return sb.controller.deleteFromStore(sbs) +} + +func (c *controller) sandboxCleanup() { + store := c.getStore(datastore.LocalScope) + if store == nil { + logrus.Errorf("Could not find local scope store while trying to cleanup sandboxes") + return + } + + kvol, err := store.List(datastore.Key(sandboxPrefix), &sbState{c: c}) + if err != nil && err != datastore.ErrKeyNotFound { + logrus.Errorf("failed to get sandboxes for scope %s: %v", store.Scope(), err) + return + } + + // It's normal for no sandboxes to be found. Just bail out. + if err == datastore.ErrKeyNotFound { + return + } + + for _, kvo := range kvol { + sbs := kvo.(*sbState) + + logrus.Printf("sandboxcleanup sbs = %+v", sbs) + sb := &sandbox{ + id: sbs.ID, + controller: sbs.c, + containerID: sbs.Cid, + endpoints: epHeap{}, + epPriority: map[string]int{}, + dbIndex: sbs.dbIndex, + dbExists: true, + } + + sb.osSbox, err = osl.NewSandbox(sb.Key(), true) + if err != nil { + logrus.Errorf("failed to create new osl sandbox while trying to build sandbox for cleanup: %v", err) + continue + } + + for _, eps := range sbs.Eps { + n, err := c.getNetworkFromStore(eps.Nid) + if err != nil { + logrus.Errorf("getNetworkFromStore for nid %s failed while trying to build sandbox for cleanup: %v", eps.Nid, err) + continue + } + + ep, err := n.getEndpointFromStore(eps.Eid) + if err != nil { + logrus.Errorf("getEndpointFromStore for eid %s failed while trying to build sandbox for cleanup: %v", eps.Eid, err) + continue + } + + heap.Push(&sb.endpoints, ep) + } + + c.Lock() + c.sandboxes[sb.id] = sb + c.Unlock() + + if err := sb.Delete(); err != nil { + logrus.Errorf("failed to delete sandbox %s while trying to cleanup: %v", sb.id, err) + } + } +} diff --git a/libnetwork/store_test.go b/libnetwork/store_test.go index eb49535150..b435a788d2 100644 --- a/libnetwork/store_test.go +++ b/libnetwork/store_test.go @@ -76,8 +76,8 @@ func testLocalBackend(t *testing.T, provider, url string, storeConfig *store.Con if exists, err := store.Exists(datastore.Key(datastore.NetworkKeyPrefix, string(nw.ID()))); !exists || err != nil { t.Fatalf("Network key should have been created.") } - if exists, err := store.Exists(datastore.Key([]string{datastore.EndpointKeyPrefix, string(nw.ID()), string(ep.ID())}...)); exists || err != nil { - t.Fatalf("Endpoint key shouldn't have been created.") + if exists, err := store.Exists(datastore.Key([]string{datastore.EndpointKeyPrefix, string(nw.ID()), string(ep.ID())}...)); !exists || err != nil { + t.Fatalf("Endpoint key should have been created.") } ctrl.(*controller).getStore(datastore.LocalScope).Close() diff --git a/libnetwork/test/integration/dnet/bridge.bats b/libnetwork/test/integration/dnet/bridge.bats index 12c265f0ae..ac07e9b3d2 100644 --- a/libnetwork/test/integration/dnet/bridge.bats +++ b/libnetwork/test/integration/dnet/bridge.bats @@ -30,6 +30,10 @@ function test_single_network_connectivity() { done done + if [ -n "$3" ]; then + return + fi + # Teardown the container connections and the network for i in `seq ${start} ${end}`; do @@ -70,6 +74,27 @@ function test_single_network_connectivity() { dnet_cmd $(inst_id2port 1) network rm singlehost } +@test "Test bridge network dnet ungraceful restart" { + skip_for_circleci + + echo $(docker ps) + dnet_cmd $(inst_id2port 1) network create -d bridge singlehost + + for iter in `seq 1 2`; + do + if [ "$iter" -eq 1 ]; then + test_single_network_connectivity singlehost 3 skip + else + test_single_network_connectivity singlehost 3 + fi + + docker restart dnet-1-bridge + sleep 5 + done + + dnet_cmd $(inst_id2port 1) network rm singlehost +} + @test "Test multiple bridge networks" { skip_for_circleci