2016-03-28 20:28:57 -04:00
|
|
|
package networkdb
|
|
|
|
|
2017-01-31 12:13:08 -05:00
|
|
|
import (
|
|
|
|
"encoding/json"
|
|
|
|
"net"
|
|
|
|
|
|
|
|
"github.com/hashicorp/memberlist"
|
2017-07-26 17:18:31 -04:00
|
|
|
"github.com/sirupsen/logrus"
|
2017-01-31 12:13:08 -05:00
|
|
|
)
|
2016-03-28 20:28:57 -04:00
|
|
|
|
|
|
|
type eventDelegate struct {
|
|
|
|
nDB *NetworkDB
|
|
|
|
}
|
|
|
|
|
2017-01-31 12:13:08 -05:00
|
|
|
func (e *eventDelegate) broadcastNodeEvent(addr net.IP, op opType) {
|
|
|
|
value, err := json.Marshal(&NodeAddr{addr})
|
|
|
|
if err == nil {
|
|
|
|
e.nDB.broadcaster.Write(makeEvent(op, NodeTable, "", "", value))
|
|
|
|
} else {
|
|
|
|
logrus.Errorf("Error marshalling node broadcast event %s", addr.String())
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-11-16 19:30:27 -05:00
|
|
|
func (e *eventDelegate) purgeReincarnation(mn *memberlist.Node) {
|
|
|
|
for name, node := range e.nDB.failedNodes {
|
|
|
|
if node.Addr.Equal(mn.Addr) {
|
|
|
|
logrus.Infof("Node %s/%s, is the new incarnation of the failed node %s/%s", mn.Name, mn.Addr, name, node.Addr)
|
|
|
|
delete(e.nDB.failedNodes, name)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
for name, node := range e.nDB.leftNodes {
|
|
|
|
if node.Addr.Equal(mn.Addr) {
|
|
|
|
logrus.Infof("Node %s/%s, is the new incarnation of the shutdown node %s/%s", mn.Name, mn.Addr, name, node.Addr)
|
|
|
|
delete(e.nDB.leftNodes, name)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-09-15 01:24:14 -04:00
|
|
|
func (e *eventDelegate) NotifyJoin(mn *memberlist.Node) {
|
2017-05-22 21:36:43 -04:00
|
|
|
logrus.Infof("Node %s/%s, joined gossip cluster", mn.Name, mn.Addr)
|
2017-01-31 12:13:08 -05:00
|
|
|
e.broadcastNodeEvent(mn.Addr, opCreate)
|
2016-03-28 20:28:57 -04:00
|
|
|
e.nDB.Lock()
|
2017-11-16 19:30:27 -05:00
|
|
|
defer e.nDB.Unlock()
|
2016-09-15 01:24:14 -04:00
|
|
|
// In case the node is rejoining after a failure or leave,
|
|
|
|
// wait until an explicit join message arrives before adding
|
|
|
|
// it to the nodes just to make sure this is not a stale
|
|
|
|
// join. If you don't know about this node add it immediately.
|
|
|
|
_, fOk := e.nDB.failedNodes[mn.Name]
|
|
|
|
_, lOk := e.nDB.leftNodes[mn.Name]
|
|
|
|
if fOk || lOk {
|
|
|
|
return
|
|
|
|
}
|
|
|
|
|
2017-11-16 19:30:27 -05:00
|
|
|
// Every node has a unique ID
|
|
|
|
// Check on the base of the IP address if the new node that joined is actually a new incarnation of a previous
|
|
|
|
// failed or shutdown one
|
|
|
|
e.purgeReincarnation(mn)
|
|
|
|
|
2016-09-15 01:24:14 -04:00
|
|
|
e.nDB.nodes[mn.Name] = &node{Node: *mn}
|
2017-05-22 21:36:43 -04:00
|
|
|
logrus.Infof("Node %s/%s, added to nodes list", mn.Name, mn.Addr)
|
2016-03-28 20:28:57 -04:00
|
|
|
}
|
|
|
|
|
2016-09-15 01:24:14 -04:00
|
|
|
func (e *eventDelegate) NotifyLeave(mn *memberlist.Node) {
|
2017-05-22 21:36:43 -04:00
|
|
|
var failed bool
|
|
|
|
logrus.Infof("Node %s/%s, left gossip cluster", mn.Name, mn.Addr)
|
2017-01-31 12:13:08 -05:00
|
|
|
e.broadcastNodeEvent(mn.Addr, opDelete)
|
2017-07-17 11:36:43 -04:00
|
|
|
// The node left or failed, delete all the entries created by it.
|
|
|
|
// If the node was temporary down, deleting the entries will guarantee that the CREATE events will be accepted
|
|
|
|
// If the node instead left because was going down, then it makes sense to just delete all its state
|
2016-03-28 20:28:57 -04:00
|
|
|
e.nDB.Lock()
|
2017-11-16 19:30:27 -05:00
|
|
|
defer e.nDB.Unlock()
|
2017-07-17 11:36:43 -04:00
|
|
|
e.nDB.deleteNetworkEntriesForNode(mn.Name)
|
|
|
|
e.nDB.deleteNodeTableEntries(mn.Name)
|
2016-09-15 01:24:14 -04:00
|
|
|
if n, ok := e.nDB.nodes[mn.Name]; ok {
|
|
|
|
delete(e.nDB.nodes, mn.Name)
|
2016-09-30 17:03:10 -04:00
|
|
|
|
2017-11-16 19:30:27 -05:00
|
|
|
// Check if a new incarnation of the same node already joined
|
|
|
|
// In that case this node can simply be removed and no further action are needed
|
|
|
|
for name, node := range e.nDB.nodes {
|
|
|
|
if node.Addr.Equal(mn.Addr) {
|
|
|
|
logrus.Infof("Node %s/%s, is the new incarnation of the failed node %s/%s", name, node.Addr, mn.Name, mn.Addr)
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2017-05-22 20:11:56 -04:00
|
|
|
// In case of node failure, keep retrying to reconnect every retryInterval (1sec) for nodeReapInterval (24h)
|
|
|
|
// Explicit leave will have already removed the node from the list of nodes (nDB.nodes) and put it into the leftNodes map
|
|
|
|
n.reapTime = nodeReapInterval
|
2016-09-15 01:24:14 -04:00
|
|
|
e.nDB.failedNodes[mn.Name] = n
|
2017-05-22 21:36:43 -04:00
|
|
|
failed = true
|
2016-09-15 01:24:14 -04:00
|
|
|
}
|
2017-11-16 19:30:27 -05:00
|
|
|
|
2017-05-22 21:36:43 -04:00
|
|
|
if failed {
|
|
|
|
logrus.Infof("Node %s/%s, added to failed nodes list", mn.Name, mn.Addr)
|
|
|
|
}
|
2016-03-28 20:28:57 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func (e *eventDelegate) NotifyUpdate(n *memberlist.Node) {
|
|
|
|
}
|