2018-02-05 16:05:59 -05:00
package cluster // import "github.com/docker/docker/daemon/cluster"
2016-06-13 22:52:49 -04:00
2016-11-16 17:17:18 -05:00
//
// ## Swarmkit integration
//
// Cluster - static configurable object for accessing everything swarm related.
// Contains methods for connecting and controlling the cluster. Exists always,
// even if swarm mode is not enabled.
//
// NodeRunner - Manager for starting the swarmkit node. Is present only and
// always if swarm mode is enabled. Implements backoff restart loop in case of
// errors.
//
// NodeState - Information about the current node status including access to
// gRPC clients if a manager is active.
//
// ### Locking
//
// `cluster.controlMutex` - taken for the whole lifecycle of the processes that
// can reconfigure the cluster (init/join/leave etc.). Ensures that one
// reconfiguration action has fully completed before another can start.
//
// `cluster.mu` - taken when the actual changes in cluster configurations
// happen. Different from `controlMutex` because in some cases we need to
// access current cluster state even if the long-running reconfiguration is
// going on. For example network stack may ask for the current cluster state in
// the middle of the shutdown. Any time current cluster state is asked you
// should take the read lock of `cluster.mu`. If you are writing an API
// responder that returns synchronously, hold `cluster.mu.RLock()` for the
// duration of the whole handler function. That ensures that node will not be
// shut down until the handler has finished.
//
// NodeRunner implements its internal locks that should not be used outside of
// the struct. Instead, you should just call the `nodeRunner.State()` method to
// get the current state of the cluster (you still need `cluster.mu.RLock()` to
// access the `cluster.nr` reference itself). Most of the changes in NodeRunner
// happen because of an external event (network problem, unexpected swarmkit
// error) and Docker shouldn't take any locks that delay these changes from
// happening.
//
2016-06-13 22:52:49 -04:00
import (
2018-04-19 18:30:59 -04:00
"context"
2016-06-13 22:52:49 -04:00
"fmt"
2018-10-29 20:44:11 -04:00
"math"
2016-06-30 21:07:35 -04:00
"net"
2016-06-13 22:52:49 -04:00
"os"
"path/filepath"
2019-10-12 20:29:21 -04:00
"runtime"
2016-06-13 22:52:49 -04:00
"sync"
"time"
2016-08-23 19:50:15 -04:00
"github.com/docker/docker/api/types/network"
2016-09-06 14:18:12 -04:00
types "github.com/docker/docker/api/types/swarm"
2017-06-07 13:07:01 -04:00
"github.com/docker/docker/daemon/cluster/controllers/plugin"
2016-06-13 22:52:49 -04:00
executorpkg "github.com/docker/docker/daemon/cluster/executor"
2021-04-05 20:24:47 -04:00
lncluster "github.com/docker/docker/libnetwork/cluster"
2021-07-15 11:33:55 -04:00
"github.com/docker/docker/pkg/stack"
2022-04-21 17:33:07 -04:00
swarmapi "github.com/moby/swarmkit/v2/api"
swarmnode "github.com/moby/swarmkit/v2/node"
2016-10-21 21:07:55 -04:00
"github.com/pkg/errors"
2017-07-26 17:42:13 -04:00
"github.com/sirupsen/logrus"
2019-06-10 12:06:11 -04:00
"google.golang.org/grpc"
2016-06-13 22:52:49 -04:00
)
// Defaults shared by the cluster package: on-disk names, timeouts, reconnect
// back-off bounds and gRPC limits.
const (
	// swarmDirName is the directory, relative to Config.Root, in which swarm
	// state is kept.
	swarmDirName = "swarm"
	// controlSocket is the file name of the swarm control socket.
	controlSocket = "control.sock"
	// stateFile is the file name of the persisted node state.
	stateFile = "docker-state.json"

	// swarmConnectTimeout bounds how long Start waits for the node to become
	// ready.
	swarmConnectTimeout = 20 * time.Second
	// swarmRequestTimeout is the deadline applied to request contexts.
	swarmRequestTimeout = 20 * time.Second

	// defaultAddr is the default swarm address (all interfaces, port 2377).
	defaultAddr = "0.0.0.0:2377"
	// isWindows reports whether the daemon was built for Windows.
	isWindows = runtime.GOOS == "windows"

	// initialReconnectDelay and maxReconnectDelay are the lower and upper
	// bounds of the reconnect delay.
	initialReconnectDelay = 100 * time.Millisecond
	maxReconnectDelay     = 30 * time.Second

	// contextPrefix is the "com.docker.swarm" namespace prefix.
	contextPrefix = "com.docker.swarm"

	// defaultRecvSizeForListResponse is passed as the gRPC receive limit for
	// list calls; it is the max recv limit of grpc <1.4.0.
	defaultRecvSizeForListResponse = math.MaxInt32
)
2016-06-30 21:07:35 -04:00
// NetworkSubnetsProvider is implemented by components that can report the
// subnets of the networks managed by Docker, so that those subnets can be
// filtered.
type NetworkSubnetsProvider interface {
	// Subnets returns two lists of subnets.
	// NOTE(review): the meaning of each list is not visible here; confirm
	// against the implementation.
	Subnets() ([]net.IPNet, []net.IPNet)
}
// Config provides values for Cluster.
type Config struct {
2016-06-30 21:07:35 -04:00
Root string
Name string
Backend executorpkg . Backend
2018-02-02 17:18:46 -05:00
ImageBackend executorpkg . ImageBackend
2017-06-07 13:07:01 -04:00
PluginBackend plugin . Backend
2018-03-22 17:11:03 -04:00
VolumeBackend executorpkg . VolumeBackend
2016-06-30 21:07:35 -04:00
NetworkSubnetsProvider NetworkSubnetsProvider
// DefaultAdvertiseAddr is the default host/IP or network interface to use
// if no AdvertiseAddr value is specified.
DefaultAdvertiseAddr string
2016-08-19 16:06:28 -04:00
// path to store runtime state, such as the swarm control socket
RuntimeRoot string
2017-04-02 18:21:56 -04:00
// WatchStream is a channel to pass watch API notifications to daemon
WatchStream chan * swarmapi . WatchMessage
2018-03-28 19:54:43 -04:00
// RaftHeartbeatTick is the number of ticks for heartbeat of quorum members
RaftHeartbeatTick uint32
// RaftElectionTick is the number of ticks to elapse before followers propose a new round of leader election
// This value should be 10x that of RaftHeartbeatTick
RaftElectionTick uint32
2016-06-13 22:52:49 -04:00
}
2016-06-24 14:52:28 -04:00
// Cluster provides capabilities to participate in a cluster as a worker or a
// manager.
2016-06-13 22:52:49 -04:00
type Cluster struct {
2016-11-16 17:17:18 -05:00
mu sync . RWMutex
controlMutex sync . RWMutex // protect init/join/leave user operations
nr * nodeRunner
root string
runtimeRoot string
config Config
2017-04-30 17:51:43 -04:00
configEvent chan lncluster . ConfigEventType // todo: make this array and goroutine safe
2016-11-16 17:17:18 -05:00
attachers map [ string ] * attacher
2017-04-02 18:21:56 -04:00
watchStream chan * swarmapi . WatchMessage
2016-08-23 19:50:15 -04:00
}
// attacher manages the in-memory attachment state of a container
// attachment to a global scope network managed by swarm manager. It
// helps in identifying the attachment ID via the taskID and the
// corresponding attachment configuration obtained from the manager.
type attacher struct {
2016-09-09 12:55:57 -04:00
taskID string
config * network . NetworkingConfig
2017-03-24 09:43:23 -04:00
inProgress bool
2016-09-09 12:55:57 -04:00
attachWaitCh chan * network . NetworkingConfig
attachCompleteCh chan struct { }
detachWaitCh chan struct { }
2016-06-20 19:35:33 -04:00
}
2016-06-13 22:52:49 -04:00
// New creates a new Cluster instance using provided config.
func New ( config Config ) ( * Cluster , error ) {
root := filepath . Join ( config . Root , swarmDirName )
if err := os . MkdirAll ( root , 0700 ) ; err != nil {
return nil , err
}
2016-08-19 16:06:28 -04:00
if config . RuntimeRoot == "" {
config . RuntimeRoot = root
}
2018-03-28 19:54:43 -04:00
if config . RaftHeartbeatTick == 0 {
config . RaftHeartbeatTick = 1
}
if config . RaftElectionTick == 0 {
// 10X heartbeat tick is the recommended ratio according to etcd docs.
config . RaftElectionTick = 10 * config . RaftHeartbeatTick
}
2016-08-19 16:06:28 -04:00
if err := os . MkdirAll ( config . RuntimeRoot , 0700 ) ; err != nil {
return nil , err
}
2016-06-13 22:52:49 -04:00
c := & Cluster {
2016-06-20 19:35:33 -04:00
root : root ,
config : config ,
2017-04-30 17:51:43 -04:00
configEvent : make ( chan lncluster . ConfigEventType , 10 ) ,
2016-08-19 16:06:28 -04:00
runtimeRoot : config . RuntimeRoot ,
2016-08-23 19:50:15 -04:00
attachers : make ( map [ string ] * attacher ) ,
2017-04-02 18:21:56 -04:00
watchStream : config . WatchStream ,
2016-06-13 22:52:49 -04:00
}
2017-04-30 17:51:43 -04:00
return c , nil
}
// Start the Cluster instance
// TODO The split between New and Start can be join again when the SendClusterEvent
// method is no longer required
func ( c * Cluster ) Start ( ) error {
root := filepath . Join ( c . config . Root , swarmDirName )
2016-06-13 22:52:49 -04:00
2016-11-16 17:17:18 -05:00
nodeConfig , err := loadPersistentState ( root )
2016-06-13 22:52:49 -04:00
if err != nil {
if os . IsNotExist ( err ) {
2017-04-30 17:51:43 -04:00
return nil
2016-06-13 22:52:49 -04:00
}
2017-04-30 17:51:43 -04:00
return err
2016-06-13 22:52:49 -04:00
}
2016-11-16 17:17:18 -05:00
nr , err := c . newNodeRunner ( * nodeConfig )
2016-06-13 22:52:49 -04:00
if err != nil {
2017-04-30 17:51:43 -04:00
return err
2016-06-13 22:52:49 -04:00
}
2016-11-16 17:17:18 -05:00
c . nr = nr
2016-06-13 22:52:49 -04:00
2019-01-09 13:24:03 -05:00
timer := time . NewTimer ( swarmConnectTimeout )
defer timer . Stop ( )
2016-06-13 22:52:49 -04:00
select {
2019-01-09 13:24:03 -05:00
case <- timer . C :
2016-11-01 00:05:01 -04:00
logrus . Error ( "swarm component could not be started before timeout was reached" )
2016-11-16 17:17:18 -05:00
case err := <- nr . Ready ( ) :
2016-06-13 22:52:49 -04:00
if err != nil {
2017-03-07 19:50:39 -05:00
logrus . WithError ( err ) . Error ( "swarm component could not be started" )
2017-04-30 17:51:43 -04:00
return nil
2016-06-13 22:52:49 -04:00
}
}
2017-04-30 17:51:43 -04:00
return nil
2016-06-13 22:52:49 -04:00
}
2016-11-16 17:17:18 -05:00
func ( c * Cluster ) newNodeRunner ( conf nodeStartConfig ) ( * nodeRunner , error ) {
2016-06-14 12:13:53 -04:00
if err := c . config . Backend . IsSwarmCompatible ( ) ; err != nil {
2016-06-20 19:35:33 -04:00
return nil , err
2016-06-13 22:52:49 -04:00
}
2016-06-30 21:07:35 -04:00
2016-10-21 16:31:45 -04:00
actualLocalAddr := conf . LocalAddr
2016-06-30 21:07:35 -04:00
if actualLocalAddr == "" {
// If localAddr was not specified, resolve it automatically
// based on the route to joinAddr. localAddr can only be left
// empty on "join".
2016-10-21 16:31:45 -04:00
listenHost , _ , err := net . SplitHostPort ( conf . ListenAddr )
2016-06-30 21:07:35 -04:00
if err != nil {
return nil , fmt . Errorf ( "could not parse listen address: %v" , err )
}
listenAddrIP := net . ParseIP ( listenHost )
if listenAddrIP == nil || ! listenAddrIP . IsUnspecified ( ) {
actualLocalAddr = listenHost
} else {
2016-10-21 16:31:45 -04:00
if conf . RemoteAddr == "" {
2016-06-30 21:07:35 -04:00
// Should never happen except using swarms created by
// old versions that didn't save remoteAddr.
2016-10-21 16:31:45 -04:00
conf . RemoteAddr = "8.8.8.8:53"
2016-06-30 21:07:35 -04:00
}
2016-10-21 16:31:45 -04:00
conn , err := net . Dial ( "udp" , conf . RemoteAddr )
2016-06-30 21:07:35 -04:00
if err != nil {
return nil , fmt . Errorf ( "could not find local IP address: %v" , err )
}
localHostPort := conn . LocalAddr ( ) . String ( )
actualLocalAddr , _ , _ = net . SplitHostPort ( localHostPort )
conn . Close ( )
}
}
2016-11-16 17:17:18 -05:00
nr := & nodeRunner { cluster : c }
nr . actualLocalAddr = actualLocalAddr
2016-10-21 21:07:55 -04:00
2016-11-16 17:17:18 -05:00
if err := nr . Start ( conf ) ; err != nil {
2016-06-20 19:35:33 -04:00
return nil , err
2016-06-13 22:52:49 -04:00
}
2016-06-30 21:07:35 -04:00
2017-01-13 23:14:03 -05:00
c . config . Backend . DaemonJoinsCluster ( c )
2016-06-13 22:52:49 -04:00
2016-11-16 17:17:18 -05:00
return nr , nil
2016-06-13 22:52:49 -04:00
}
2016-07-15 13:58:21 -04:00
func ( c * Cluster ) getRequestContext ( ) ( context . Context , func ( ) ) { // TODO: not needed when requests don't block on qourum lost
return context . WithTimeout ( context . Background ( ) , swarmRequestTimeout )
2016-06-13 22:52:49 -04:00
}
2016-06-24 14:52:28 -04:00
// IsManager returns true if Cluster is participating as a manager.
2016-06-13 22:52:49 -04:00
func ( c * Cluster ) IsManager ( ) bool {
2016-11-16 17:17:18 -05:00
c . mu . RLock ( )
defer c . mu . RUnlock ( )
return c . currentNodeState ( ) . IsActiveManager ( )
2016-06-13 22:52:49 -04:00
}
2016-06-24 14:52:28 -04:00
// IsAgent returns true if Cluster is participating as a worker/agent.
2016-06-13 22:52:49 -04:00
func ( c * Cluster ) IsAgent ( ) bool {
2016-11-16 17:17:18 -05:00
c . mu . RLock ( )
defer c . mu . RUnlock ( )
return c . currentNodeState ( ) . status == types . LocalNodeStateActive
2016-06-13 22:52:49 -04:00
}
2016-06-30 21:07:35 -04:00
// GetLocalAddress returns the local address.
func ( c * Cluster ) GetLocalAddress ( ) string {
2016-11-16 17:17:18 -05:00
c . mu . RLock ( )
defer c . mu . RUnlock ( )
return c . currentNodeState ( ) . actualLocalAddr
2016-06-30 21:07:35 -04:00
}
2016-09-22 21:43:54 -04:00
// GetListenAddress returns the listen address.
func ( c * Cluster ) GetListenAddress ( ) string {
2016-11-16 17:17:18 -05:00
c . mu . RLock ( )
defer c . mu . RUnlock ( )
if c . nr != nil {
return c . nr . config . ListenAddr
2016-10-21 16:31:45 -04:00
}
return ""
2016-09-22 21:43:54 -04:00
}
2016-06-30 21:07:35 -04:00
// GetAdvertiseAddress returns the remotely reachable address of this node.
func ( c * Cluster ) GetAdvertiseAddress ( ) string {
2016-11-16 17:17:18 -05:00
c . mu . RLock ( )
defer c . mu . RUnlock ( )
if c . nr != nil && c . nr . config . AdvertiseAddr != "" {
advertiseHost , _ , _ := net . SplitHostPort ( c . nr . config . AdvertiseAddr )
2016-06-30 21:07:35 -04:00
return advertiseHost
2016-06-13 22:52:49 -04:00
}
2016-11-16 17:17:18 -05:00
return c . currentNodeState ( ) . actualLocalAddr
2016-06-13 22:52:49 -04:00
}
2017-04-14 19:54:17 -04:00
// GetDataPathAddress returns the address to be used for the data path traffic,
// if specified. It is the empty string when no node runner is present.
func (c *Cluster) GetDataPathAddress() string {
	c.mu.RLock()
	defer c.mu.RUnlock()

	if c.nr == nil {
		return ""
	}
	return c.nr.config.DataPathAddr
}
2017-04-27 20:06:16 -04:00
// GetRemoteAddressList returns the advertise address for each of the remote managers if
2016-06-13 22:52:49 -04:00
// available.
2017-04-27 20:06:16 -04:00
func ( c * Cluster ) GetRemoteAddressList ( ) [ ] string {
2016-11-16 17:17:18 -05:00
c . mu . RLock ( )
defer c . mu . RUnlock ( )
2017-04-27 20:06:16 -04:00
return c . getRemoteAddressList ( )
2016-06-13 22:52:49 -04:00
}
2018-04-12 14:44:20 -04:00
// GetWatchStream returns the channel used to pass changes from the store watch
// API. The channel is read under the cluster read lock.
func (c *Cluster) GetWatchStream() chan *swarmapi.WatchMessage {
	c.mu.RLock()
	defer c.mu.RUnlock()

	stream := c.watchStream
	return stream
}
2017-04-27 20:06:16 -04:00
func ( c * Cluster ) getRemoteAddressList ( ) [ ] string {
2016-11-16 17:17:18 -05:00
state := c . currentNodeState ( )
if state . swarmNode == nil {
2017-04-27 20:06:16 -04:00
return [ ] string { }
2016-06-13 22:52:49 -04:00
}
2017-04-27 20:06:16 -04:00
2016-11-16 17:17:18 -05:00
nodeID := state . swarmNode . NodeID ( )
2017-04-27 20:06:16 -04:00
remotes := state . swarmNode . Remotes ( )
addressList := make ( [ ] string , 0 , len ( remotes ) )
for _ , r := range remotes {
2016-06-13 22:52:49 -04:00
if r . NodeID != nodeID {
2017-04-27 20:06:16 -04:00
addressList = append ( addressList , r . Addr )
2016-06-13 22:52:49 -04:00
}
}
2017-04-27 20:06:16 -04:00
return addressList
2016-06-13 22:52:49 -04:00
}
// ListenClusterEvents returns a channel that receives messages on cluster
// participation changes.
// todo: make cancelable and accessible to multiple callers
2017-04-30 17:51:43 -04:00
func ( c * Cluster ) ListenClusterEvents ( ) <- chan lncluster . ConfigEventType {
2016-06-13 22:52:49 -04:00
return c . configEvent
}
2016-11-16 17:17:18 -05:00
// currentNodeState should not be called without a read lock
func ( c * Cluster ) currentNodeState ( ) nodeState {
return c . nr . State ( )
2016-11-08 21:03:47 -05:00
}
2016-06-23 16:52:41 -04:00
// errNoManager returns error describing why manager commands can't be used.
// Call with read lock.
2016-11-16 17:17:18 -05:00
func ( c * Cluster ) errNoManager ( st nodeState ) error {
if st . swarmNode == nil {
2020-04-17 06:01:01 -04:00
if errors . Is ( st . err , errSwarmLocked ) {
2016-12-02 04:14:32 -05:00
return errSwarmLocked
2016-10-21 21:07:55 -04:00
}
2016-12-02 04:14:32 -05:00
if st . err == errSwarmCertificatesExpired {
return errSwarmCertificatesExpired
2016-11-08 21:03:47 -05:00
}
2017-07-19 10:20:13 -04:00
return errors . WithStack ( notAvailableError ( "This node is not a swarm manager. Use \"docker swarm init\" or \"docker swarm join\" to connect this node to swarm and try again." ) )
2016-06-23 16:52:41 -04:00
}
2016-11-16 17:17:18 -05:00
if st . swarmNode . Manager ( ) != nil {
2017-07-19 10:20:13 -04:00
return errors . WithStack ( notAvailableError ( "This node is not a swarm manager. Manager is being prepared or has trouble connecting to the cluster." ) )
2016-06-23 16:52:41 -04:00
}
2017-07-19 10:20:13 -04:00
return errors . WithStack ( notAvailableError ( "This node is not a swarm manager. Worker nodes can't be used to view or modify cluster state. Please run this command on a manager node or promote the current node to a manager." ) )
2016-06-23 16:52:41 -04:00
}
2016-06-13 22:52:49 -04:00
// Cleanup stops active swarm node. This is run before daemon shutdown.
func ( c * Cluster ) Cleanup ( ) {
2016-11-16 17:17:18 -05:00
c . controlMutex . Lock ( )
defer c . controlMutex . Unlock ( )
c . mu . Lock ( )
node := c . nr
2016-06-13 22:52:49 -04:00
if node == nil {
2016-11-16 17:17:18 -05:00
c . mu . Unlock ( )
2016-06-13 22:52:49 -04:00
return
}
2016-11-16 17:17:18 -05:00
state := c . currentNodeState ( )
2017-04-07 21:27:35 -04:00
c . mu . Unlock ( )
2016-11-16 17:17:18 -05:00
if state . IsActiveManager ( ) {
active , reachable , unreachable , err := managerStats ( state . controlClient , state . NodeID ( ) )
2016-06-13 22:52:49 -04:00
if err == nil {
2016-08-19 16:49:58 -04:00
singlenode := active && isLastManager ( reachable , unreachable )
if active && ! singlenode && removingManagerCausesLossOfQuorum ( reachable , unreachable ) {
2016-06-13 22:52:49 -04:00
logrus . Errorf ( "Leaving cluster with %v managers left out of %v. Raft quorum will be lost." , reachable - 1 , reachable + unreachable )
}
}
}
2017-04-07 21:27:35 -04:00
2016-11-16 17:17:18 -05:00
if err := node . Stop ( ) ; err != nil {
logrus . Errorf ( "failed to shut down cluster node: %v" , err )
2021-07-15 11:33:55 -04:00
stack . Dump ( )
2016-11-16 17:17:18 -05:00
}
2017-04-07 21:27:35 -04:00
c . mu . Lock ( )
2016-11-16 17:17:18 -05:00
c . nr = nil
2017-04-07 21:27:35 -04:00
c . mu . Unlock ( )
2016-06-13 22:52:49 -04:00
}
2016-11-16 17:17:18 -05:00
func managerStats ( client swarmapi . ControlClient , currentNodeID string ) ( current bool , reachable int , unreachable int , err error ) {
2016-07-15 13:58:21 -04:00
ctx , cancel := context . WithTimeout ( context . Background ( ) , 5 * time . Second )
defer cancel ( )
2019-06-10 12:06:11 -04:00
nodes , err := client . ListNodes (
ctx , & swarmapi . ListNodesRequest { } ,
grpc . MaxCallRecvMsgSize ( defaultRecvSizeForListResponse ) ,
)
2016-06-13 22:52:49 -04:00
if err != nil {
return false , 0 , 0 , err
}
for _ , n := range nodes . Nodes {
if n . ManagerStatus != nil {
2016-06-14 20:23:01 -04:00
if n . ManagerStatus . Reachability == swarmapi . RaftMemberStatus_REACHABLE {
2016-06-13 22:52:49 -04:00
reachable ++
2016-11-16 17:17:18 -05:00
if n . ID == currentNodeID {
2016-06-13 22:52:49 -04:00
current = true
}
}
2016-06-14 20:23:01 -04:00
if n . ManagerStatus . Reachability == swarmapi . RaftMemberStatus_UNREACHABLE {
2016-06-13 22:52:49 -04:00
unreachable ++
}
}
}
return
}
2016-10-21 21:07:55 -04:00
func detectLockedError ( err error ) error {
2016-10-27 21:50:49 -04:00
if err == swarmnode . ErrInvalidUnlockKey {
2016-12-02 04:14:32 -05:00
return errors . WithStack ( errSwarmLocked )
2016-10-21 21:07:55 -04:00
}
return err
}
2017-02-28 05:12:11 -05:00
// lockedManagerAction runs fn against the current node state while holding the
// cluster read lock.
//
// If the node is not an active manager, fn is not called and the error from
// errNoManager is returned instead. Otherwise fn receives a request context
// that is cancelled when lockedManagerAction returns, and its result is
// passed through.
func (c *Cluster) lockedManagerAction(fn func(ctx context.Context, state nodeState) error) error {
	c.mu.RLock()
	defer c.mu.RUnlock()

	st := c.currentNodeState()
	if st.IsActiveManager() {
		ctx, cancel := c.getRequestContext()
		defer cancel()
		return fn(ctx, st)
	}
	return c.errNoManager(st)
}
2017-04-30 17:51:43 -04:00
// SendClusterEvent publishes event on the configEvent channel, i.e. the one
// handed out by ListenClusterEvents.
//
// TODO This method should not be exposed.
// Currently it is used to notify the network controller that the keys are
// available.
//
// NOTE(review): the send is a plain channel send performed while the cluster
// read lock is held, so it blocks, lock included, for as long as the channel
// buffer is full.
func (c *Cluster) SendClusterEvent(event lncluster.ConfigEventType) {
	c.mu.RLock()
	defer c.mu.RUnlock()

	c.configEvent <- event
}