1
0
Fork 0
mirror of https://github.com/moby/moby.git synced 2022-11-09 12:21:53 -05:00
moby--moby/libnetwork/networkdb/broadcast.go
Jana Radhakrishnan 5f5dad3c02 Recover from transient gossip failures
Currently if there is any transient gossip failure in any node the
recoevry process depends on other nodes propogating the information
indirectly. In cases if these transient failures affects all the nodes
that this node has in its memberlist then this node will be permenantly
cutoff from the the gossip channel. Added node state management code in
networkdb to address these problems by trying to rejoin the cluster via
the failed nodes when there is a failure. This also necessitates the
need to add new messages called node event messages to differentiate
between node leave and node failure.

Signed-off-by: Jana Radhakrishnan <mrjana@docker.com>
2016-09-19 15:58:14 -07:00

163 lines
3.2 KiB
Go

package networkdb
import (
"fmt"
"time"
"github.com/hashicorp/memberlist"
"github.com/hashicorp/serf/serf"
)
const broadcastTimeout = 5 * time.Second
type networkEventMessage struct {
id string
node string
msg []byte
}
func (m *networkEventMessage) Invalidates(other memberlist.Broadcast) bool {
otherm := other.(*networkEventMessage)
return m.id == otherm.id && m.node == otherm.node
}
func (m *networkEventMessage) Message() []byte {
return m.msg
}
func (m *networkEventMessage) Finished() {
}
func (nDB *NetworkDB) sendNetworkEvent(nid string, event NetworkEvent_Type, ltime serf.LamportTime) error {
nEvent := NetworkEvent{
Type: event,
LTime: ltime,
NodeName: nDB.config.NodeName,
NetworkID: nid,
}
raw, err := encodeMessage(MessageTypeNetworkEvent, &nEvent)
if err != nil {
return err
}
nDB.networkBroadcasts.QueueBroadcast(&networkEventMessage{
msg: raw,
id: nid,
node: nDB.config.NodeName,
})
return nil
}
type nodeEventMessage struct {
msg []byte
notify chan<- struct{}
}
func (m *nodeEventMessage) Invalidates(other memberlist.Broadcast) bool {
return false
}
func (m *nodeEventMessage) Message() []byte {
return m.msg
}
func (m *nodeEventMessage) Finished() {
if m.notify != nil {
close(m.notify)
}
}
func (nDB *NetworkDB) sendNodeEvent(event NodeEvent_Type) error {
nEvent := NodeEvent{
Type: event,
LTime: nDB.networkClock.Increment(),
NodeName: nDB.config.NodeName,
}
raw, err := encodeMessage(MessageTypeNodeEvent, &nEvent)
if err != nil {
return err
}
notifyCh := make(chan struct{})
nDB.nodeBroadcasts.QueueBroadcast(&nodeEventMessage{
msg: raw,
notify: notifyCh,
})
// Wait for the broadcast
select {
case <-notifyCh:
case <-time.After(broadcastTimeout):
return fmt.Errorf("timed out broadcasting node event")
}
return nil
}
type tableEventMessage struct {
id string
tname string
key string
msg []byte
node string
}
func (m *tableEventMessage) Invalidates(other memberlist.Broadcast) bool {
otherm := other.(*tableEventMessage)
return m.id == otherm.id && m.tname == otherm.tname && m.key == otherm.key
}
func (m *tableEventMessage) Message() []byte {
return m.msg
}
func (m *tableEventMessage) Finished() {
}
func (nDB *NetworkDB) sendTableEvent(event TableEvent_Type, nid string, tname string, key string, entry *entry) error {
tEvent := TableEvent{
Type: event,
LTime: entry.ltime,
NodeName: nDB.config.NodeName,
NetworkID: nid,
TableName: tname,
Key: key,
Value: entry.value,
}
raw, err := encodeMessage(MessageTypeTableEvent, &tEvent)
if err != nil {
return err
}
var broadcastQ *memberlist.TransmitLimitedQueue
nDB.RLock()
thisNodeNetworks, ok := nDB.networks[nDB.config.NodeName]
if ok {
// The network may have been removed
network, networkOk := thisNodeNetworks[nid]
if !networkOk {
nDB.RUnlock()
return nil
}
broadcastQ = network.tableBroadcasts
}
nDB.RUnlock()
// The network may have been removed
if broadcastQ == nil {
return nil
}
broadcastQ.QueueBroadcast(&tableEventMessage{
msg: raw,
id: nid,
tname: tname,
key: key,
node: nDB.config.NodeName,
})
return nil
}