Mirror of https://github.com/moby/moby.git (synced 2022-11-09 12:21:53 -05:00)
b000d5321a
full diff: 18e7e58ea1...59163bf75d
- Add missing return when configuring VXLAN port
- Prevent possible panic in cnmallocator.IsAttachmentAllocated()
- update github.com/pivotal-golang/clock
  - new name for package: code.cloudfoundry.org/clock
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
285 lines
10 KiB
Go
package manager

import (
	"context"
	"time"

	"code.cloudfoundry.org/clock"
	"github.com/docker/swarmkit/api"
	"github.com/docker/swarmkit/log"
	"github.com/docker/swarmkit/manager/state/raft"
	"github.com/docker/swarmkit/manager/state/raft/membership"
	"github.com/docker/swarmkit/manager/state/store"
)

const (
	// roleReconcileInterval is how often to retry reconciling a node's role or removing a
	// raft member, if a previous reconciliation or removal failed
	roleReconcileInterval = 5 * time.Second

	// removalTimeout is how long to wait for a raft member removal to be applied to the
	// store before giving up
	removalTimeout = 5 * time.Second
)

// roleManager reconciles the raft member list with desired role changes.
type roleManager struct {
	ctx    context.Context
	cancel func()

	store    *store.MemoryStore
	raft     *raft.Node
	doneChan chan struct{}

	// pendingReconciliation contains changed nodes that have not yet been reconciled in
	// the raft member list.
	pendingReconciliation map[string]*api.Node

	// pendingRemoval contains the IDs of nodes that have been deleted - if these correspond
	// to members in the raft cluster, those members need to be removed from raft
	pendingRemoval map[string]struct{}

	// leave this nil except for tests which need to inject a fake time source
	clocksource clock.Clock
}

// newRoleManager creates a new roleManager.
func newRoleManager(store *store.MemoryStore, raftNode *raft.Node) *roleManager {
	ctx, cancel := context.WithCancel(context.Background())
	return &roleManager{
		ctx:                   ctx,
		cancel:                cancel,
		store:                 store,
		raft:                  raftNode,
		doneChan:              make(chan struct{}),
		pendingReconciliation: make(map[string]*api.Node),
		pendingRemoval:        make(map[string]struct{}),
	}
}

// getTicker returns a ticker based on the configured clock source
func (rm *roleManager) getTicker(interval time.Duration) clock.Ticker {
	if rm.clocksource == nil {
		return clock.NewClock().NewTicker(interval)
	}
	return rm.clocksource.NewTicker(interval)
}

// Run is roleManager's main loop. On startup, it looks at every node object in the cluster and
// attempts to reconcile the raft member list with all the nodes' desired roles. If any nodes
// need to be demoted or promoted, it will add them to a reconciliation queue, and if any raft
// members' nodes have been deleted, it will add them to a removal queue.
//
// These queues are processed immediately, and any nodes that failed to be processed are
// processed again in the next reconciliation interval, so that nodes will hopefully eventually
// be reconciled. As node updates come in, any promotions or demotions are also added to the
// reconciliation queue and reconciled. As node removals come in, they are added to the removal
// queue to be removed from the raft cluster.
//
// Removal from a raft cluster is idempotent (and it's the only raft cluster change that will occur
// during reconciliation or removal), so it's fine if a node is in both the removal and reconciliation
// queues.
//
// The ctx param is only used for logging.
func (rm *roleManager) Run(ctx context.Context) {
	defer close(rm.doneChan)

	var (
		nodes []*api.Node

		// ticker and tickerCh are used to time the reconciliation interval, which will
		// periodically attempt to re-reconcile nodes that failed to reconcile the first
		// time through
		ticker   clock.Ticker
		tickerCh <-chan time.Time
	)

	watcher, cancelWatch, err := store.ViewAndWatch(rm.store,
		func(readTx store.ReadTx) error {
			var err error
			nodes, err = store.FindNodes(readTx, store.All)
			return err
		},
		api.EventUpdateNode{},
		api.EventDeleteNode{})
	defer cancelWatch()

	if err != nil {
		log.G(ctx).WithError(err).Error("failed to check nodes for role changes")
	} else {
		// Assume all raft members have been deleted from the cluster, until the node list
		// tells us otherwise. We can make this assumption because the node object must
		// exist before the raft member object.
		//
		// Background life-cycle for a manager: it joins the cluster, getting a new TLS
		// certificate. To get a TLS certificate, it makes an RPC call to the CA server,
		// which on successful join adds its information to the cluster node list and
		// eventually generates a TLS certificate for it. Once it has a TLS certificate,
		// it can contact the other nodes, and makes an RPC call to request to join the
		// raft cluster. The node it contacts will add the node to the raft membership.
		for _, member := range rm.raft.GetMemberlist() {
			rm.pendingRemoval[member.NodeID] = struct{}{}
		}
		for _, node := range nodes {
			// if the node exists, we don't necessarily want it removed from the raft
			// membership cluster
			delete(rm.pendingRemoval, node.ID)

			// reconcile each existing node
			rm.pendingReconciliation[node.ID] = node
			rm.reconcileRole(ctx, node)
		}
		for nodeID := range rm.pendingRemoval {
			rm.evictRemovedNode(ctx, nodeID)
		}
		// If any reconciliations or member removals failed, we want to try again, so start
		// the ticker to retry every roleReconcileInterval until both queues are empty.
		if len(rm.pendingReconciliation) != 0 || len(rm.pendingRemoval) != 0 {
			ticker = rm.getTicker(roleReconcileInterval)
			tickerCh = ticker.C()
		}
	}

	for {
		select {
		case event := <-watcher:
			switch ev := event.(type) {
			case api.EventUpdateNode:
				rm.pendingReconciliation[ev.Node.ID] = ev.Node
				rm.reconcileRole(ctx, ev.Node)
			case api.EventDeleteNode:
				rm.pendingRemoval[ev.Node.ID] = struct{}{}
				rm.evictRemovedNode(ctx, ev.Node.ID)
			}
			// If any reconciliations or member removals failed, we want to try again, so start
			// the ticker to retry every roleReconcileInterval until both queues are empty.
			if (len(rm.pendingReconciliation) != 0 || len(rm.pendingRemoval) != 0) && ticker == nil {
				ticker = rm.getTicker(roleReconcileInterval)
				tickerCh = ticker.C()
			}
		case <-tickerCh:
			for _, node := range rm.pendingReconciliation {
				rm.reconcileRole(ctx, node)
			}
			for nodeID := range rm.pendingRemoval {
				rm.evictRemovedNode(ctx, nodeID)
			}
			if len(rm.pendingReconciliation) == 0 && len(rm.pendingRemoval) == 0 {
				ticker.Stop()
				ticker = nil
				tickerCh = nil
			}
		case <-rm.ctx.Done():
			if ticker != nil {
				ticker.Stop()
			}
			return
		}
	}
}

// evictRemovedNode evicts a removed node from the raft cluster membership. This is to cover an edge case in which
// a node might have been removed, but somehow the role was not reconciled (possibly a demotion and a removal happened
// in rapid succession before the raft membership configuration went through).
func (rm *roleManager) evictRemovedNode(ctx context.Context, nodeID string) {
	// Check if the member still exists in the membership
	member := rm.raft.GetMemberByNodeID(nodeID)
	if member != nil {
		// We first try to remove the raft node from the raft cluster. On the next tick, if the node
		// has been removed from the cluster membership, we then delete it from the removed list
		rm.removeMember(ctx, member)
		return
	}
	delete(rm.pendingRemoval, nodeID)
}

// removeMember removes a member from the raft cluster membership
func (rm *roleManager) removeMember(ctx context.Context, member *membership.Member) {
	// Quorum safeguard - quorum should have been checked before a node was allowed to be demoted, but if in the
	// intervening time some other node disconnected, removing this node would result in a loss of cluster quorum.
	// We leave the member in place so the removal can be retried on the next reconciliation tick.
	if !rm.raft.CanRemoveMember(member.RaftID) {
		// TODO(aaronl): Retry later
		log.G(ctx).Debugf("can't demote node %s at this time: removing member from raft would result in a loss of quorum", member.NodeID)
		return
	}

	rmCtx, rmCancel := context.WithTimeout(rm.ctx, removalTimeout)
	defer rmCancel()

	if member.RaftID == rm.raft.Config.ID {
		// Don't use rmCtx, because we expect to lose
		// leadership, which will cancel this context.
		log.G(ctx).Info("demoted; transferring leadership")
		err := rm.raft.TransferLeadership(context.Background())
		if err == nil {
			return
		}
		log.G(ctx).WithError(err).Info("failed to transfer leadership")
	}
	if err := rm.raft.RemoveMember(rmCtx, member.RaftID); err != nil {
		// TODO(aaronl): Retry later
		log.G(ctx).WithError(err).Debugf("can't demote node %s at this time", member.NodeID)
	}
}

// reconcileRole looks at the desired role for a node, and if it is being demoted or promoted, updates the
// node role accordingly. If the node is being demoted, it also removes the node from the raft cluster membership.
func (rm *roleManager) reconcileRole(ctx context.Context, node *api.Node) {
	if node.Role == node.Spec.DesiredRole {
		// Nothing to do.
		delete(rm.pendingReconciliation, node.ID)
		return
	}

	// Promotion can proceed right away.
	if node.Spec.DesiredRole == api.NodeRoleManager && node.Role == api.NodeRoleWorker {
		err := rm.store.Update(func(tx store.Tx) error {
			updatedNode := store.GetNode(tx, node.ID)
			if updatedNode == nil || updatedNode.Spec.DesiredRole != node.Spec.DesiredRole || updatedNode.Role != node.Role {
				return nil
			}
			updatedNode.Role = api.NodeRoleManager
			return store.UpdateNode(tx, updatedNode)
		})
		if err != nil {
			log.G(ctx).WithError(err).Errorf("failed to promote node %s", node.ID)
		} else {
			delete(rm.pendingReconciliation, node.ID)
		}
	} else if node.Spec.DesiredRole == api.NodeRoleWorker && node.Role == api.NodeRoleManager {
		// Check for node in memberlist
		member := rm.raft.GetMemberByNodeID(node.ID)
		if member != nil {
			// We first try to remove the raft node from the raft cluster. On the next tick, if the node
			// has been removed from the cluster membership, we then update the store to reflect the fact
			// that it has been successfully demoted, and if that works, remove it from the pending list.
			rm.removeMember(ctx, member)
			return
		}

		err := rm.store.Update(func(tx store.Tx) error {
			updatedNode := store.GetNode(tx, node.ID)
			if updatedNode == nil || updatedNode.Spec.DesiredRole != node.Spec.DesiredRole || updatedNode.Role != node.Role {
				return nil
			}
			updatedNode.Role = api.NodeRoleWorker

			return store.UpdateNode(tx, updatedNode)
		})
		if err != nil {
			log.G(ctx).WithError(err).Errorf("failed to demote node %s", node.ID)
		} else {
			delete(rm.pendingReconciliation, node.ID)
		}
	}
}

// Stop stops the roleManager and waits for the main loop to exit.
func (rm *roleManager) Stop() {
	rm.cancel()
	<-rm.doneChan
}
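For context, a minimal usage sketch (not part of the file above) of how the surrounding manager could drive roleManager, assuming a *store.MemoryStore and *raft.Node already exist; the runRoleManager helper name is illustrative, not taken from this file.

// Illustrative sketch: wiring up and shutting down the role manager.
// memoryStore and raftNode are assumed to be created elsewhere by the manager.
func runRoleManager(ctx context.Context, memoryStore *store.MemoryStore, raftNode *raft.Node) *roleManager {
	rm := newRoleManager(memoryStore, raftNode)
	go rm.Run(ctx) // reconciles roles and raft membership until Stop is called
	return rm
}

// Later, during manager shutdown:
//	rm.Stop() // cancels the internal context and waits for Run's loop to exit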