mirror of
https://github.com/moby/moby.git
synced 2022-11-09 12:21:53 -05:00
5f5dad3c02
Currently, if there is a transient gossip failure on any node, the recovery process depends on other nodes propagating the information indirectly. If such transient failures affect all the nodes that this node has in its memberlist, then this node will be permanently cut off from the gossip channel. Added node state management code in networkdb to address these problems by trying to rejoin the cluster via the failed nodes when there is a failure. This also necessitates adding new messages, called node event messages, to differentiate between node leave and node failure. Signed-off-by: Jana Radhakrishnan <mrjana@docker.com>
163 lines
3.2 KiB
Go
163 lines
3.2 KiB
Go
package networkdb
|
|
|
|
import (
|
|
"fmt"
|
|
"time"
|
|
|
|
"github.com/hashicorp/memberlist"
|
|
"github.com/hashicorp/serf/serf"
|
|
)
|
|
|
|
// broadcastTimeout bounds how long sendNodeEvent waits for a queued node
// event broadcast to be reported finished before giving up.
const broadcastTimeout = 5 * time.Second
|
|
|
|
// networkEventMessage is the broadcast wrapper queued for an encoded
// network event.
type networkEventMessage struct {
	// id is the network ID and node is the originating node name; together
	// they are what Invalidates compares.
	id, node string
	// msg is the already-encoded payload returned by Message.
	msg []byte
}
|
|
|
|
func (m *networkEventMessage) Invalidates(other memberlist.Broadcast) bool {
|
|
otherm := other.(*networkEventMessage)
|
|
return m.id == otherm.id && m.node == otherm.node
|
|
}
|
|
|
|
func (m *networkEventMessage) Message() []byte {
|
|
return m.msg
|
|
}
|
|
|
|
func (m *networkEventMessage) Finished() {
|
|
}
|
|
|
|
func (nDB *NetworkDB) sendNetworkEvent(nid string, event NetworkEvent_Type, ltime serf.LamportTime) error {
|
|
nEvent := NetworkEvent{
|
|
Type: event,
|
|
LTime: ltime,
|
|
NodeName: nDB.config.NodeName,
|
|
NetworkID: nid,
|
|
}
|
|
|
|
raw, err := encodeMessage(MessageTypeNetworkEvent, &nEvent)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
nDB.networkBroadcasts.QueueBroadcast(&networkEventMessage{
|
|
msg: raw,
|
|
id: nid,
|
|
node: nDB.config.NodeName,
|
|
})
|
|
return nil
|
|
}
|
|
|
|
// nodeEventMessage is the broadcast wrapper queued for an encoded node event.
type nodeEventMessage struct {
	// msg is the already-encoded payload returned by Message.
	msg []byte
	// notify, when non-nil, is closed by Finished to signal the sender.
	notify chan<- struct{}
}
|
|
|
|
func (m *nodeEventMessage) Invalidates(other memberlist.Broadcast) bool {
|
|
return false
|
|
}
|
|
|
|
func (m *nodeEventMessage) Message() []byte {
|
|
return m.msg
|
|
}
|
|
|
|
func (m *nodeEventMessage) Finished() {
|
|
if m.notify != nil {
|
|
close(m.notify)
|
|
}
|
|
}
|
|
|
|
func (nDB *NetworkDB) sendNodeEvent(event NodeEvent_Type) error {
|
|
nEvent := NodeEvent{
|
|
Type: event,
|
|
LTime: nDB.networkClock.Increment(),
|
|
NodeName: nDB.config.NodeName,
|
|
}
|
|
|
|
raw, err := encodeMessage(MessageTypeNodeEvent, &nEvent)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
notifyCh := make(chan struct{})
|
|
nDB.nodeBroadcasts.QueueBroadcast(&nodeEventMessage{
|
|
msg: raw,
|
|
notify: notifyCh,
|
|
})
|
|
|
|
// Wait for the broadcast
|
|
select {
|
|
case <-notifyCh:
|
|
case <-time.After(broadcastTimeout):
|
|
return fmt.Errorf("timed out broadcasting node event")
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
// tableEventMessage is the broadcast wrapper queued for an encoded table
// event.
type tableEventMessage struct {
	// id is the network ID, tname the table name and key the entry key;
	// these three are what Invalidates compares.
	id, tname, key string
	// msg is the already-encoded payload returned by Message.
	msg []byte
	// node is the name of the node that queued the message.
	node string
}
|
|
|
|
func (m *tableEventMessage) Invalidates(other memberlist.Broadcast) bool {
|
|
otherm := other.(*tableEventMessage)
|
|
return m.id == otherm.id && m.tname == otherm.tname && m.key == otherm.key
|
|
}
|
|
|
|
func (m *tableEventMessage) Message() []byte {
|
|
return m.msg
|
|
}
|
|
|
|
func (m *tableEventMessage) Finished() {
|
|
}
|
|
|
|
func (nDB *NetworkDB) sendTableEvent(event TableEvent_Type, nid string, tname string, key string, entry *entry) error {
|
|
tEvent := TableEvent{
|
|
Type: event,
|
|
LTime: entry.ltime,
|
|
NodeName: nDB.config.NodeName,
|
|
NetworkID: nid,
|
|
TableName: tname,
|
|
Key: key,
|
|
Value: entry.value,
|
|
}
|
|
|
|
raw, err := encodeMessage(MessageTypeTableEvent, &tEvent)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
var broadcastQ *memberlist.TransmitLimitedQueue
|
|
nDB.RLock()
|
|
thisNodeNetworks, ok := nDB.networks[nDB.config.NodeName]
|
|
if ok {
|
|
// The network may have been removed
|
|
network, networkOk := thisNodeNetworks[nid]
|
|
if !networkOk {
|
|
nDB.RUnlock()
|
|
return nil
|
|
}
|
|
|
|
broadcastQ = network.tableBroadcasts
|
|
}
|
|
nDB.RUnlock()
|
|
|
|
// The network may have been removed
|
|
if broadcastQ == nil {
|
|
return nil
|
|
}
|
|
|
|
broadcastQ.QueueBroadcast(&tableEventMessage{
|
|
msg: raw,
|
|
id: nid,
|
|
tname: tname,
|
|
key: key,
|
|
node: nDB.config.NodeName,
|
|
})
|
|
return nil
|
|
}
|