Avoid alignment of reapNetwork and tableEntries

Make sure that the network is garbage collected after
its entries. Deleting an entry requires that the network
is still present.

Signed-off-by: Flavio Crisciani <flavio.crisciani@docker.com>
This commit is contained in:
Flavio Crisciani 2017-09-22 10:23:07 -07:00
parent 053a534ab1
commit a4e64d05c1
3 changed files with 19 additions and 13 deletions

View File

@ -17,11 +17,15 @@ import (
)
const (
reapInterval = 30 * time.Minute
reapPeriod = 5 * time.Second
retryInterval = 1 * time.Second
nodeReapInterval = 24 * time.Hour
nodeReapPeriod = 2 * time.Hour
// The garbage collection logic for entries leverages the presence of the network.
// For this reason the expiration time of the network is set slightly higher than the entry expiration, so that
// there are at least 5 extra cycles to make sure that all the entries are properly deleted before deleting the network.
reapEntryInterval = 30 * time.Minute
reapNetworkInterval = reapEntryInterval + 5*reapPeriod
reapPeriod = 5 * time.Second
retryInterval = 1 * time.Second
nodeReapInterval = 24 * time.Hour
nodeReapPeriod = 2 * time.Hour
)
type logWriter struct{}
@ -300,8 +304,9 @@ func (nDB *NetworkDB) reconnectNode() {
// the reaper runs. NOTE nDB.reapTableEntries updates the reapTime with a readlock. This
// is safe as long as no other concurrent path touches the reapTime field.
func (nDB *NetworkDB) reapState() {
nDB.reapNetworks()
// reapTableEntries leverages the presence of the network, so garbage collect the entries first
nDB.reapTableEntries()
nDB.reapNetworks()
}
func (nDB *NetworkDB) reapNetworks() {
@ -414,8 +419,8 @@ func (nDB *NetworkDB) gossip() {
// Collect stats and print the queue info; note that this code is placed here so that empty queues are reported as well
network.qMessagesSent += len(msgs)
if printStats {
logrus.Infof("NetworkDB stats - net:%s Entries:%d Queue qLen:%d netPeers:%d netMsg/s:%d",
nid, network.entriesNumber, broadcastQ.NumQueued(), broadcastQ.NumNodes(),
logrus.Infof("NetworkDB stats - netID:%s leaving:%t netPeers:%d entries:%d Queue qLen:%d netMsg/s:%d",
nid, network.leaving, broadcastQ.NumNodes(), network.entriesNumber, broadcastQ.NumQueued(),
network.qMessagesSent/int((nDB.config.StatsPrintPeriod/time.Second)))
network.qMessagesSent = 0
}

View File

@ -165,7 +165,7 @@ func (nDB *NetworkDB) handleNetworkEvent(nEvent *NetworkEvent) bool {
n.ltime = nEvent.LTime
n.leaving = nEvent.Type == NetworkEventTypeLeave
if n.leaving {
n.reapTime = reapInterval
n.reapTime = reapNetworkInterval
// The remote node is leaving the network, but not the gossip cluster.
// Mark all its entries in deleted state, this will guarantee that
@ -242,7 +242,7 @@ func (nDB *NetworkDB) handleTableEvent(tEvent *TableEvent) bool {
// field. If that is not the case, this can be a BUG
if e.deleting && e.reapTime == 0 {
logrus.Warnf("handleTableEvent object %+v has a 0 reapTime, is the cluster running the same docker engine version?", tEvent)
e.reapTime = reapInterval
e.reapTime = reapEntryInterval
}
nDB.Lock()

View File

@ -405,7 +405,7 @@ func (nDB *NetworkDB) DeleteEntry(tname, nid, key string) error {
node: nDB.config.NodeName,
value: value,
deleting: true,
reapTime: reapInterval,
reapTime: reapEntryInterval,
}
if err := nDB.sendTableEvent(TableEventTypeDelete, nid, tname, key, entry); err != nil {
@ -478,7 +478,7 @@ func (nDB *NetworkDB) deleteNodeNetworkEntries(nid, node string) {
node: oldEntry.node,
value: oldEntry.value,
deleting: true,
reapTime: reapInterval,
reapTime: reapEntryInterval,
}
// we arrived at this point in 2 cases:
@ -619,8 +619,9 @@ func (nDB *NetworkDB) LeaveNetwork(nid string) error {
return fmt.Errorf("could not find network %s while trying to leave", nid)
}
logrus.Debugf("%s: leaving network %s", nDB.config.NodeName, nid)
n.ltime = ltime
n.reapTime = reapInterval
n.reapTime = reapNetworkInterval
n.leaving = true
return nil
}