From e98b152bacafec4676161749785a5a5ff2fd4bc3 Mon Sep 17 00:00:00 2001
From: Santhosh Manohar
Date: Fri, 30 Sep 2016 14:03:10 -0700
Subject: [PATCH] Reap failed nodes after 24 hours

Signed-off-by: Santhosh Manohar
---
 libnetwork/networkdb/cluster.go        | 25 ++++++++++++++++++++++---
 libnetwork/networkdb/event_delegate.go |  2 ++
 libnetwork/networkdb/networkdb.go      |  2 ++
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/libnetwork/networkdb/cluster.go b/libnetwork/networkdb/cluster.go
index 562f971af3..1ba1b14b0d 100644
--- a/libnetwork/networkdb/cluster.go
+++ b/libnetwork/networkdb/cluster.go
@@ -16,9 +16,11 @@ import (
 )
 
 const (
-	reapInterval  = 60 * time.Second
-	reapPeriod    = 5 * time.Second
-	retryInterval = 1 * time.Second
+	reapInterval     = 60 * time.Second
+	reapPeriod       = 5 * time.Second
+	retryInterval    = 1 * time.Second
+	nodeReapInterval = 24 * time.Hour
+	nodeReapPeriod   = 2 * time.Hour
 )
 
 type logWriter struct{}
@@ -147,6 +149,7 @@ func (nDB *NetworkDB) clusterInit() error {
 		{config.GossipInterval, nDB.gossip},
 		{config.PushPullInterval, nDB.bulkSyncTables},
 		{retryInterval, nDB.reconnectNode},
+		{nodeReapPeriod, nDB.reapDeadNode},
 	} {
 		t := time.NewTicker(trigger.interval)
 		go nDB.triggerFunc(trigger.interval, t.C, nDB.stopCh, trigger.fn)
@@ -234,6 +237,22 @@ func (nDB *NetworkDB) triggerFunc(stagger time.Duration, C <-chan time.Time, sto
 	}
 }
 
+// reapDeadNode is triggered every nodeReapPeriod. It ages each failed node
+// by one period and forgets the nodes whose reapTime, set to
+// nodeReapInterval in NotifyLeave, has run out.
+func (nDB *NetworkDB) reapDeadNode() {
+	nDB.Lock()
+	defer nDB.Unlock()
+	for id, n := range nDB.failedNodes {
+		if n.reapTime > 0 {
+			// Time is left: charge one reaper period and keep the node.
+			n.reapTime -= nodeReapPeriod
+			continue
+		}
+		logrus.Debugf("Removing failed node %v from gossip cluster", n.Name)
+		delete(nDB.failedNodes, id)
+	}
+}
+
 func (nDB *NetworkDB) reconnectNode() {
 	nDB.RLock()
 	if len(nDB.failedNodes) == 0 {
diff --git a/libnetwork/networkdb/event_delegate.go b/libnetwork/networkdb/event_delegate.go
index 019cafbd06..c22d09eba3 100644
--- a/libnetwork/networkdb/event_delegate.go
+++ b/libnetwork/networkdb/event_delegate.go
@@ -29,6 +29,8 @@ func (e *eventDelegate) NotifyLeave(mn *memberlist.Node) {
 	e.nDB.Lock()
 	if n, ok := e.nDB.nodes[mn.Name]; ok {
 		delete(e.nDB.nodes, mn.Name)
+
+		n.reapTime = nodeReapInterval
 		e.nDB.failedNodes[mn.Name] = n
 	}
 	e.nDB.Unlock()
diff --git a/libnetwork/networkdb/networkdb.go b/libnetwork/networkdb/networkdb.go
index 69fc4fe249..a79b4231d2 100644
--- a/libnetwork/networkdb/networkdb.go
+++ b/libnetwork/networkdb/networkdb.go
@@ -94,6 +94,8 @@ type NetworkDB struct {
 type node struct {
 	memberlist.Node
 	ltime serf.LamportTime
+	// Time left before the reaper removes the node
+	reapTime time.Duration
 }
 
 // network describes the node/network attachment.