mirror of
https://github.com/moby/moby.git
synced 2022-11-09 12:21:53 -05:00
Purge stale nodes with same prefix and IP
Since the node name randomization fix, we need to make sure that we purge the old node with the same prefix and same IP from the nodes database if it is still present, because it causes unnecessary reconnect attempts. Also added a change to avoid unnecessary updates of the local lamport time and only do it if we are ready to do a push pull on a join. Join should happen only when the node is bootstrapped or when trying to reconnect with a failed node. Signed-off-by: Jana Radhakrishnan <mrjana@docker.com>
This commit is contained in:
parent
84862ee8fe
commit
7b905d3c63
3 changed files with 49 additions and 18 deletions
|
@ -237,13 +237,6 @@ func (nDB *NetworkDB) reconnectNode() {
|
|||
}
|
||||
nDB.RUnlock()
|
||||
|
||||
// Update all the local state to a new time to force update on
|
||||
// the node we are trying to rejoin, just in case that node
|
||||
// has these in leaving/deleting state still. This is
|
||||
// facilitate fast convergence after recovering from a gossip
|
||||
// failure.
|
||||
nDB.updateLocalStateTime()
|
||||
|
||||
node := nodes[randomOffset(len(nodes))]
|
||||
addr := net.UDPAddr{IP: node.Addr, Port: int(node.Port)}
|
||||
|
||||
|
@ -256,6 +249,13 @@ func (nDB *NetworkDB) reconnectNode() {
|
|||
return
|
||||
}
|
||||
|
||||
// Update all the local table state to a new time to
|
||||
// force update on the node we are trying to rejoin, just in
|
||||
// case that node has these in deleting state still. This is
|
||||
// facilitate fast convergence after recovering from a gossip
|
||||
// failure.
|
||||
nDB.updateLocalTableTime()
|
||||
|
||||
logrus.Debugf("Initiating bulk sync with node %s after reconnect", node.Name)
|
||||
nDB.bulkSync([]string{node.Name}, true)
|
||||
}
|
||||
|
|
|
@ -3,6 +3,7 @@ package networkdb
|
|||
import (
|
||||
"fmt"
|
||||
"net"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/Sirupsen/logrus"
|
||||
|
@ -31,7 +32,7 @@ func (nDB *NetworkDB) checkAndGetNode(nEvent *NodeEvent) *node {
|
|||
return nil
|
||||
}
|
||||
|
||||
delete(nDB.failedNodes, n.Name)
|
||||
delete(nodes, n.Name)
|
||||
return n
|
||||
}
|
||||
}
|
||||
|
@ -39,16 +40,36 @@ func (nDB *NetworkDB) checkAndGetNode(nEvent *NodeEvent) *node {
|
|||
return nil
|
||||
}
|
||||
|
||||
func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
|
||||
// Update our local clock if the received messages has newer
|
||||
// time.
|
||||
nDB.networkClock.Witness(nEvent.LTime)
|
||||
func (nDB *NetworkDB) purgeSameNode(n *node) {
|
||||
nDB.Lock()
|
||||
defer nDB.Unlock()
|
||||
|
||||
prefix := strings.Split(n.Name, "-")[0]
|
||||
for _, nodes := range []map[string]*node{
|
||||
nDB.failedNodes,
|
||||
nDB.leftNodes,
|
||||
nDB.nodes,
|
||||
} {
|
||||
var nodeNames []string
|
||||
for name, node := range nodes {
|
||||
if strings.HasPrefix(name, prefix) && n.Addr.Equal(node.Addr) {
|
||||
nodeNames = append(nodeNames, name)
|
||||
}
|
||||
}
|
||||
|
||||
for _, name := range nodeNames {
|
||||
delete(nodes, name)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (nDB *NetworkDB) handleNodeEvent(nEvent *NodeEvent) bool {
|
||||
n := nDB.checkAndGetNode(nEvent)
|
||||
if n == nil {
|
||||
return false
|
||||
}
|
||||
|
||||
nDB.purgeSameNode(n)
|
||||
n.ltime = nEvent.LTime
|
||||
|
||||
switch nEvent.Type {
|
||||
|
@ -357,6 +378,15 @@ func (d *delegate) GetBroadcasts(overhead, limit int) [][]byte {
|
|||
}
|
||||
|
||||
func (d *delegate) LocalState(join bool) []byte {
|
||||
if join {
|
||||
// Update all the local node/network state to a new time to
|
||||
// force update on the node we are trying to rejoin, just in
|
||||
// case that node has these in leaving state still. This is
|
||||
// facilitate fast convergence after recovering from a gossip
|
||||
// failure.
|
||||
d.nDB.updateLocalNetworkTime()
|
||||
}
|
||||
|
||||
d.nDB.RLock()
|
||||
defer d.nDB.RUnlock()
|
||||
|
||||
|
@ -408,10 +438,6 @@ func (d *delegate) MergeRemoteState(buf []byte, isJoin bool) {
|
|||
return
|
||||
}
|
||||
|
||||
if pp.LTime > 0 {
|
||||
d.nDB.networkClock.Witness(pp.LTime)
|
||||
}
|
||||
|
||||
nodeEvent := &NodeEvent{
|
||||
LTime: pp.LTime,
|
||||
NodeName: pp.NodeName,
|
||||
|
|
|
@ -524,7 +524,7 @@ func (nDB *NetworkDB) findCommonNetworks(nodeName string) []string {
|
|||
return networks
|
||||
}
|
||||
|
||||
func (nDB *NetworkDB) updateLocalStateTime() {
|
||||
func (nDB *NetworkDB) updateLocalNetworkTime() {
|
||||
nDB.Lock()
|
||||
defer nDB.Unlock()
|
||||
|
||||
|
@ -532,8 +532,13 @@ func (nDB *NetworkDB) updateLocalStateTime() {
|
|||
for _, n := range nDB.networks[nDB.config.NodeName] {
|
||||
n.ltime = ltime
|
||||
}
|
||||
}
|
||||
|
||||
ltime = nDB.tableClock.Increment()
|
||||
func (nDB *NetworkDB) updateLocalTableTime() {
|
||||
nDB.Lock()
|
||||
defer nDB.Unlock()
|
||||
|
||||
ltime := nDB.tableClock.Increment()
|
||||
nDB.indexes[byTable].Walk(func(path string, v interface{}) bool {
|
||||
entry := v.(*entry)
|
||||
if entry.node != nDB.config.NodeName {
|
||||
|
|
Loading…
Add table
Reference in a new issue