mirror of
https://github.com/moby/moby.git
synced 2022-11-09 12:21:53 -05:00
7ff546467d
fix error message and typos in swarm cluster
1189 lines
30 KiB
Go
1189 lines
30 KiB
Go
package cluster
|
|
|
|
import (
|
|
"encoding/json"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"os"
|
|
"path/filepath"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"google.golang.org/grpc"
|
|
|
|
"github.com/Sirupsen/logrus"
|
|
"github.com/docker/distribution/digest"
|
|
"github.com/docker/docker/daemon/cluster/convert"
|
|
executorpkg "github.com/docker/docker/daemon/cluster/executor"
|
|
"github.com/docker/docker/daemon/cluster/executor/container"
|
|
"github.com/docker/docker/errors"
|
|
"github.com/docker/docker/opts"
|
|
"github.com/docker/docker/pkg/ioutils"
|
|
"github.com/docker/docker/runconfig"
|
|
apitypes "github.com/docker/engine-api/types"
|
|
types "github.com/docker/engine-api/types/swarm"
|
|
swarmagent "github.com/docker/swarmkit/agent"
|
|
swarmapi "github.com/docker/swarmkit/api"
|
|
"golang.org/x/net/context"
|
|
)
|
|
|
|
const swarmDirName = "swarm"
|
|
const controlSocket = "control.sock"
|
|
const swarmConnectTimeout = 20 * time.Second
|
|
const stateFile = "docker-state.json"
|
|
const defaultAddr = "0.0.0.0:2377"
|
|
|
|
const (
|
|
initialReconnectDelay = 100 * time.Millisecond
|
|
maxReconnectDelay = 30 * time.Second
|
|
)
|
|
|
|
// ErrNoSwarm is returned on leaving a cluster that was never initialized
|
|
var ErrNoSwarm = fmt.Errorf("This node is not part of Swarm")
|
|
|
|
// ErrSwarmExists is returned on initialize or join request for a cluster that has already been activated
|
|
var ErrSwarmExists = fmt.Errorf("This node is already part of a Swarm cluster. Use \"docker swarm leave\" to leave this cluster and join another one.")
|
|
|
|
// ErrPendingSwarmExists is returned on initialize or join request for a cluster that is already processing a similar request but has not succeeded yet.
|
|
var ErrPendingSwarmExists = fmt.Errorf("This node is processing an existing join request that has not succeeded yet. Use \"docker swarm leave\" to cancel the current request.")
|
|
|
|
// ErrSwarmJoinTimeoutReached is returned when cluster join could not complete before timeout was reached.
|
|
var ErrSwarmJoinTimeoutReached = fmt.Errorf("Timeout was reached before node was joined. Attempt to join the cluster will continue in the background. Use \"docker info\" command to see the current Swarm status of your node.")
|
|
|
|
// defaultSpec contains some sane defaults if cluster options are missing on init
|
|
var defaultSpec = types.Spec{
|
|
Raft: types.RaftConfig{
|
|
SnapshotInterval: 10000,
|
|
KeepOldSnapshots: 0,
|
|
LogEntriesForSlowFollowers: 500,
|
|
HeartbeatTick: 1,
|
|
ElectionTick: 3,
|
|
},
|
|
CAConfig: types.CAConfig{
|
|
NodeCertExpiry: 90 * 24 * time.Hour,
|
|
},
|
|
Dispatcher: types.DispatcherConfig{
|
|
HeartbeatPeriod: uint64((5 * time.Second).Nanoseconds()),
|
|
},
|
|
Orchestration: types.OrchestrationConfig{
|
|
TaskHistoryRetentionLimit: 10,
|
|
},
|
|
}
|
|
|
|
type state struct {
|
|
ListenAddr string
|
|
}
|
|
|
|
// Config provides values for Cluster.
|
|
type Config struct {
|
|
Root string
|
|
Name string
|
|
Backend executorpkg.Backend
|
|
}
|
|
|
|
// Cluster provides capabilities to participate in a cluster as a worker or a
|
|
// manager.
|
|
type Cluster struct {
|
|
sync.RWMutex
|
|
root string
|
|
config Config
|
|
configEvent chan struct{} // todo: make this array and goroutine safe
|
|
node *swarmagent.Node
|
|
conn *grpc.ClientConn
|
|
client swarmapi.ControlClient
|
|
ready bool
|
|
listenAddr string
|
|
err error
|
|
reconnectDelay time.Duration
|
|
stop bool
|
|
cancelDelay func()
|
|
}
|
|
|
|
// New creates a new Cluster instance using provided config.
|
|
func New(config Config) (*Cluster, error) {
|
|
root := filepath.Join(config.Root, swarmDirName)
|
|
if err := os.MkdirAll(root, 0700); err != nil {
|
|
return nil, err
|
|
}
|
|
c := &Cluster{
|
|
root: root,
|
|
config: config,
|
|
configEvent: make(chan struct{}, 10),
|
|
reconnectDelay: initialReconnectDelay,
|
|
}
|
|
|
|
st, err := c.loadState()
|
|
if err != nil {
|
|
if os.IsNotExist(err) {
|
|
return c, nil
|
|
}
|
|
return nil, err
|
|
}
|
|
|
|
n, ctx, err := c.startNewNode(false, st.ListenAddr, "", "", "", false)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
select {
|
|
case <-time.After(swarmConnectTimeout):
|
|
logrus.Errorf("swarm component could not be started before timeout was reached")
|
|
case <-n.Ready():
|
|
case <-ctx.Done():
|
|
}
|
|
if ctx.Err() != nil {
|
|
return nil, fmt.Errorf("swarm component could not be started")
|
|
}
|
|
go c.reconnectOnFailure(ctx)
|
|
return c, nil
|
|
}
|
|
|
|
func (c *Cluster) loadState() (*state, error) {
|
|
dt, err := ioutil.ReadFile(filepath.Join(c.root, stateFile))
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
// missing certificate means no actual state to restore from
|
|
if _, err := os.Stat(filepath.Join(c.root, "certificates/swarm-node.crt")); err != nil {
|
|
if os.IsNotExist(err) {
|
|
c.clearState()
|
|
}
|
|
return nil, err
|
|
}
|
|
var st state
|
|
if err := json.Unmarshal(dt, &st); err != nil {
|
|
return nil, err
|
|
}
|
|
return &st, nil
|
|
}
|
|
|
|
func (c *Cluster) saveState() error {
|
|
dt, err := json.Marshal(state{ListenAddr: c.listenAddr})
|
|
if err != nil {
|
|
return err
|
|
}
|
|
return ioutils.AtomicWriteFile(filepath.Join(c.root, stateFile), dt, 0600)
|
|
}
|
|
|
|
func (c *Cluster) reconnectOnFailure(ctx context.Context) {
|
|
for {
|
|
<-ctx.Done()
|
|
c.Lock()
|
|
if c.stop || c.node != nil {
|
|
c.Unlock()
|
|
return
|
|
}
|
|
c.reconnectDelay *= 2
|
|
if c.reconnectDelay > maxReconnectDelay {
|
|
c.reconnectDelay = maxReconnectDelay
|
|
}
|
|
logrus.Warnf("Restarting swarm in %.2f seconds", c.reconnectDelay.Seconds())
|
|
delayCtx, cancel := context.WithTimeout(context.Background(), c.reconnectDelay)
|
|
c.cancelDelay = cancel
|
|
c.Unlock()
|
|
<-delayCtx.Done()
|
|
if delayCtx.Err() != context.DeadlineExceeded {
|
|
return
|
|
}
|
|
c.Lock()
|
|
if c.node != nil {
|
|
c.Unlock()
|
|
return
|
|
}
|
|
var err error
|
|
_, ctx, err = c.startNewNode(false, c.listenAddr, c.getRemoteAddress(), "", "", false)
|
|
if err != nil {
|
|
c.err = err
|
|
ctx = delayCtx
|
|
}
|
|
c.Unlock()
|
|
}
|
|
}
|
|
|
|
func (c *Cluster) startNewNode(forceNewCluster bool, listenAddr, joinAddr, secret, cahash string, ismanager bool) (*swarmagent.Node, context.Context, error) {
|
|
if err := c.config.Backend.IsSwarmCompatible(); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
c.node = nil
|
|
c.cancelDelay = nil
|
|
node, err := swarmagent.NewNode(&swarmagent.NodeConfig{
|
|
Hostname: c.config.Name,
|
|
ForceNewCluster: forceNewCluster,
|
|
ListenControlAPI: filepath.Join(c.root, controlSocket),
|
|
ListenRemoteAPI: listenAddr,
|
|
JoinAddr: joinAddr,
|
|
StateDir: c.root,
|
|
CAHash: cahash,
|
|
Secret: secret,
|
|
Executor: container.NewExecutor(c.config.Backend),
|
|
HeartbeatTick: 1,
|
|
ElectionTick: 3,
|
|
IsManager: ismanager,
|
|
})
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
ctx, cancel := context.WithCancel(context.Background())
|
|
if err := node.Start(ctx); err != nil {
|
|
return nil, nil, err
|
|
}
|
|
|
|
c.node = node
|
|
c.listenAddr = listenAddr
|
|
c.saveState()
|
|
c.config.Backend.SetClusterProvider(c)
|
|
go func() {
|
|
err := node.Err(ctx)
|
|
if err != nil {
|
|
logrus.Errorf("cluster exited with error: %v", err)
|
|
}
|
|
c.Lock()
|
|
c.conn = nil
|
|
c.client = nil
|
|
c.node = nil
|
|
c.ready = false
|
|
c.err = err
|
|
c.Unlock()
|
|
cancel()
|
|
}()
|
|
|
|
go func() {
|
|
select {
|
|
case <-node.Ready():
|
|
c.Lock()
|
|
c.reconnectDelay = initialReconnectDelay
|
|
c.Unlock()
|
|
case <-ctx.Done():
|
|
}
|
|
if ctx.Err() == nil {
|
|
c.Lock()
|
|
c.ready = true
|
|
c.err = nil
|
|
c.Unlock()
|
|
}
|
|
c.configEvent <- struct{}{}
|
|
}()
|
|
|
|
go func() {
|
|
for conn := range node.ListenControlSocket(ctx) {
|
|
c.Lock()
|
|
if c.conn != conn {
|
|
c.client = swarmapi.NewControlClient(conn)
|
|
}
|
|
if c.conn != nil {
|
|
c.client = nil
|
|
}
|
|
c.conn = conn
|
|
c.Unlock()
|
|
c.configEvent <- struct{}{}
|
|
}
|
|
}()
|
|
|
|
return node, ctx, nil
|
|
}
|
|
|
|
// Init initializes new cluster from user provided request.
|
|
func (c *Cluster) Init(req types.InitRequest) (string, error) {
|
|
c.Lock()
|
|
if node := c.node; node != nil {
|
|
c.Unlock()
|
|
if !req.ForceNewCluster {
|
|
return "", errSwarmExists(node)
|
|
}
|
|
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
|
defer cancel()
|
|
c.cancelReconnect()
|
|
if err := c.node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
|
|
return "", err
|
|
}
|
|
c.Lock()
|
|
c.node = nil
|
|
c.conn = nil
|
|
c.ready = false
|
|
}
|
|
|
|
if err := validateAndSanitizeInitRequest(&req); err != nil {
|
|
c.Unlock()
|
|
return "", err
|
|
}
|
|
|
|
// todo: check current state existing
|
|
n, ctx, err := c.startNewNode(req.ForceNewCluster, req.ListenAddr, "", "", "", false)
|
|
if err != nil {
|
|
c.Unlock()
|
|
return "", err
|
|
}
|
|
c.Unlock()
|
|
|
|
select {
|
|
case <-n.Ready():
|
|
if err := initClusterSpec(n, req.Spec); err != nil {
|
|
return "", err
|
|
}
|
|
go c.reconnectOnFailure(ctx)
|
|
return n.NodeID(), nil
|
|
case <-ctx.Done():
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
if c.err != nil {
|
|
if !req.ForceNewCluster { // if failure on first attempt don't keep state
|
|
if err := c.clearState(); err != nil {
|
|
return "", err
|
|
}
|
|
}
|
|
return "", c.err
|
|
}
|
|
return "", ctx.Err()
|
|
}
|
|
}
|
|
|
|
// Join makes current Cluster part of an existing swarm cluster.
|
|
func (c *Cluster) Join(req types.JoinRequest) error {
|
|
c.Lock()
|
|
if node := c.node; node != nil {
|
|
c.Unlock()
|
|
return errSwarmExists(node)
|
|
}
|
|
if err := validateAndSanitizeJoinRequest(&req); err != nil {
|
|
c.Unlock()
|
|
return err
|
|
}
|
|
// todo: check current state existing
|
|
n, ctx, err := c.startNewNode(false, req.ListenAddr, req.RemoteAddrs[0], req.Secret, req.CACertHash, req.Manager)
|
|
if err != nil {
|
|
c.Unlock()
|
|
return err
|
|
}
|
|
c.Unlock()
|
|
|
|
certificateRequested := n.CertificateRequested()
|
|
for {
|
|
select {
|
|
case <-certificateRequested:
|
|
if n.NodeMembership() == swarmapi.NodeMembershipPending {
|
|
return fmt.Errorf("Your node is in the process of joining the cluster but needs to be accepted by existing cluster member.\nTo accept this node into cluster run \"docker node accept %v\" in an existing cluster manager. Use \"docker info\" command to see the current Swarm status of your node.", n.NodeID())
|
|
}
|
|
certificateRequested = nil
|
|
case <-time.After(swarmConnectTimeout):
|
|
// attempt to connect will continue in background, also reconnecting
|
|
go c.reconnectOnFailure(ctx)
|
|
return ErrSwarmJoinTimeoutReached
|
|
case <-n.Ready():
|
|
go c.reconnectOnFailure(ctx)
|
|
return nil
|
|
case <-ctx.Done():
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
if c.err != nil {
|
|
return c.err
|
|
}
|
|
return ctx.Err()
|
|
}
|
|
}
|
|
}
|
|
|
|
func (c *Cluster) cancelReconnect() {
|
|
c.stop = true
|
|
if c.cancelDelay != nil {
|
|
c.cancelDelay()
|
|
c.cancelDelay = nil
|
|
}
|
|
}
|
|
|
|
// Leave shuts down Cluster and removes current state.
|
|
func (c *Cluster) Leave(force bool) error {
|
|
c.Lock()
|
|
node := c.node
|
|
if node == nil {
|
|
c.Unlock()
|
|
return ErrNoSwarm
|
|
}
|
|
|
|
if node.Manager() != nil && !force {
|
|
msg := "You are attempting to leave cluster on a node that is participating as a manager. "
|
|
if c.isActiveManager() {
|
|
active, reachable, unreachable, err := c.managerStats()
|
|
if err == nil {
|
|
if active && reachable-2 <= unreachable {
|
|
if reachable == 1 && unreachable == 0 {
|
|
msg += "Leaving last manager will remove all current state of the cluster. Use `--force` to ignore this message. "
|
|
c.Unlock()
|
|
return fmt.Errorf(msg)
|
|
}
|
|
msg += fmt.Sprintf("Leaving cluster will leave you with %v managers out of %v. This means Raft quorum will be lost and your cluster will become inaccessible. ", reachable-1, reachable+unreachable)
|
|
}
|
|
}
|
|
} else {
|
|
msg += "Doing so may lose the consensus of your cluster. "
|
|
}
|
|
|
|
msg += "Only way to restore a cluster that has lost consensus is to reinitialize it with `--force-new-cluster`. Use `--force` to ignore this message."
|
|
c.Unlock()
|
|
return fmt.Errorf(msg)
|
|
}
|
|
c.cancelReconnect()
|
|
c.Unlock()
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
|
|
defer cancel()
|
|
if err := node.Stop(ctx); err != nil && !strings.Contains(err.Error(), "context canceled") {
|
|
return err
|
|
}
|
|
if nodeID := node.NodeID(); nodeID != "" {
|
|
for _, id := range c.config.Backend.ListContainersForNode(nodeID) {
|
|
if err := c.config.Backend.ContainerRm(id, &apitypes.ContainerRmConfig{ForceRemove: true}); err != nil {
|
|
logrus.Errorf("error removing %v: %v", id, err)
|
|
}
|
|
}
|
|
}
|
|
c.Lock()
|
|
defer c.Unlock()
|
|
c.node = nil
|
|
c.conn = nil
|
|
c.ready = false
|
|
c.configEvent <- struct{}{}
|
|
// todo: cleanup optional?
|
|
if err := c.clearState(); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func (c *Cluster) clearState() error {
|
|
// todo: backup this data instead of removing?
|
|
if err := os.RemoveAll(c.root); err != nil {
|
|
return err
|
|
}
|
|
if err := os.MkdirAll(c.root, 0700); err != nil {
|
|
return err
|
|
}
|
|
c.config.Backend.SetClusterProvider(nil)
|
|
return nil
|
|
}
|
|
|
|
func (c *Cluster) getRequestContext() context.Context { // TODO: not needed when requests don't block on qourum lost
|
|
ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
|
|
return ctx
|
|
}
|
|
|
|
// Inspect retrieves the configuration properties of a managed swarm cluster.
|
|
func (c *Cluster) Inspect() (types.Swarm, error) {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return types.Swarm{}, c.errNoManager()
|
|
}
|
|
|
|
swarm, err := getSwarm(c.getRequestContext(), c.client)
|
|
if err != nil {
|
|
return types.Swarm{}, err
|
|
}
|
|
|
|
if err != nil {
|
|
return types.Swarm{}, err
|
|
}
|
|
|
|
return convert.SwarmFromGRPC(*swarm), nil
|
|
}
|
|
|
|
// Update updates configuration of a managed swarm cluster.
|
|
func (c *Cluster) Update(version uint64, spec types.Spec) error {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return c.errNoManager()
|
|
}
|
|
|
|
swarm, err := getSwarm(c.getRequestContext(), c.client)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
swarmSpec, err := convert.SwarmSpecToGRPCandMerge(spec, &swarm.Spec)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
_, err = c.client.UpdateCluster(
|
|
c.getRequestContext(),
|
|
&swarmapi.UpdateClusterRequest{
|
|
ClusterID: swarm.ID,
|
|
Spec: &swarmSpec,
|
|
ClusterVersion: &swarmapi.Version{
|
|
Index: version,
|
|
},
|
|
},
|
|
)
|
|
return err
|
|
}
|
|
|
|
// IsManager returns true if Cluster is participating as a manager.
|
|
func (c *Cluster) IsManager() bool {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
return c.isActiveManager()
|
|
}
|
|
|
|
// IsAgent returns true if Cluster is participating as a worker/agent.
|
|
func (c *Cluster) IsAgent() bool {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
return c.ready
|
|
}
|
|
|
|
// GetListenAddress returns the listening address for current manager's
|
|
// consensus and dispatcher APIs.
|
|
func (c *Cluster) GetListenAddress() string {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
if c.conn != nil {
|
|
return c.listenAddr
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// GetRemoteAddress returns a known advertise address of a remote manager if
|
|
// available.
|
|
// todo: change to array/connect with info
|
|
func (c *Cluster) GetRemoteAddress() string {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
return c.getRemoteAddress()
|
|
}
|
|
|
|
func (c *Cluster) getRemoteAddress() string {
|
|
if c.node == nil {
|
|
return ""
|
|
}
|
|
nodeID := c.node.NodeID()
|
|
for _, r := range c.node.Remotes() {
|
|
if r.NodeID != nodeID {
|
|
return r.Addr
|
|
}
|
|
}
|
|
return ""
|
|
}
|
|
|
|
// ListenClusterEvents returns a channel that receives messages on cluster
|
|
// participation changes.
|
|
// todo: make cancelable and accessible to multiple callers
|
|
func (c *Cluster) ListenClusterEvents() <-chan struct{} {
|
|
return c.configEvent
|
|
}
|
|
|
|
// Info returns information about the current cluster state.
|
|
func (c *Cluster) Info() types.Info {
|
|
var info types.Info
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if c.node == nil {
|
|
info.LocalNodeState = types.LocalNodeStateInactive
|
|
if c.cancelDelay != nil {
|
|
info.LocalNodeState = types.LocalNodeStateError
|
|
}
|
|
} else {
|
|
info.LocalNodeState = types.LocalNodeStatePending
|
|
if c.ready == true {
|
|
info.LocalNodeState = types.LocalNodeStateActive
|
|
}
|
|
}
|
|
if c.err != nil {
|
|
info.Error = c.err.Error()
|
|
}
|
|
|
|
if c.isActiveManager() {
|
|
info.ControlAvailable = true
|
|
if r, err := c.client.ListNodes(c.getRequestContext(), &swarmapi.ListNodesRequest{}); err == nil {
|
|
info.Nodes = len(r.Nodes)
|
|
for _, n := range r.Nodes {
|
|
if n.ManagerStatus != nil {
|
|
info.Managers = info.Managers + 1
|
|
}
|
|
}
|
|
}
|
|
|
|
if swarm, err := getSwarm(c.getRequestContext(), c.client); err == nil && swarm != nil {
|
|
info.CACertHash = swarm.RootCA.CACertHash
|
|
}
|
|
}
|
|
|
|
if c.node != nil {
|
|
for _, r := range c.node.Remotes() {
|
|
info.RemoteManagers = append(info.RemoteManagers, types.Peer{NodeID: r.NodeID, Addr: r.Addr})
|
|
}
|
|
info.NodeID = c.node.NodeID()
|
|
}
|
|
|
|
return info
|
|
}
|
|
|
|
// isActiveManager should not be called without a read lock
|
|
func (c *Cluster) isActiveManager() bool {
|
|
return c.conn != nil
|
|
}
|
|
|
|
// errNoManager returns error describing why manager commands can't be used.
|
|
// Call with read lock.
|
|
func (c *Cluster) errNoManager() error {
|
|
if c.node == nil {
|
|
return fmt.Errorf("This node is not a Swarm manager. Use \"docker swarm init\" or \"docker swarm join --manager\" to connect this node to Swarm and try again.")
|
|
}
|
|
if c.node.Manager() != nil {
|
|
return fmt.Errorf("This node is not a Swarm manager. Manager is being prepared or has trouble connecting to the cluster.")
|
|
}
|
|
return fmt.Errorf("This node is not a Swarm manager. Worker nodes can't be used to view or modify cluster state. Please run this command on a manager node or promote the current node to a manager.")
|
|
}
|
|
|
|
// GetServices returns all services of a managed swarm cluster.
|
|
func (c *Cluster) GetServices(options apitypes.ServiceListOptions) ([]types.Service, error) {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return nil, c.errNoManager()
|
|
}
|
|
|
|
filters, err := newListServicesFilters(options.Filter)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r, err := c.client.ListServices(
|
|
c.getRequestContext(),
|
|
&swarmapi.ListServicesRequest{Filters: filters})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var services []types.Service
|
|
|
|
for _, service := range r.Services {
|
|
services = append(services, convert.ServiceFromGRPC(*service))
|
|
}
|
|
|
|
return services, nil
|
|
}
|
|
|
|
// CreateService creates a new service in a managed swarm cluster.
|
|
func (c *Cluster) CreateService(s types.ServiceSpec) (string, error) {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return "", c.errNoManager()
|
|
}
|
|
|
|
ctx := c.getRequestContext()
|
|
|
|
err := populateNetworkID(ctx, c.client, &s)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
serviceSpec, err := convert.ServiceSpecToGRPC(s)
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
r, err := c.client.CreateService(ctx, &swarmapi.CreateServiceRequest{Spec: &serviceSpec})
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return r.Service.ID, nil
|
|
}
|
|
|
|
// GetService returns a service based on an ID or name.
|
|
func (c *Cluster) GetService(input string) (types.Service, error) {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return types.Service{}, c.errNoManager()
|
|
}
|
|
|
|
service, err := getService(c.getRequestContext(), c.client, input)
|
|
if err != nil {
|
|
return types.Service{}, err
|
|
}
|
|
return convert.ServiceFromGRPC(*service), nil
|
|
}
|
|
|
|
// UpdateService updates existing service to match new properties.
|
|
func (c *Cluster) UpdateService(serviceID string, version uint64, spec types.ServiceSpec) error {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return c.errNoManager()
|
|
}
|
|
|
|
serviceSpec, err := convert.ServiceSpecToGRPC(spec)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
_, err = c.client.UpdateService(
|
|
c.getRequestContext(),
|
|
&swarmapi.UpdateServiceRequest{
|
|
ServiceID: serviceID,
|
|
Spec: &serviceSpec,
|
|
ServiceVersion: &swarmapi.Version{
|
|
Index: version,
|
|
},
|
|
},
|
|
)
|
|
return err
|
|
}
|
|
|
|
// RemoveService removes a service from a managed swarm cluster.
|
|
func (c *Cluster) RemoveService(input string) error {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return c.errNoManager()
|
|
}
|
|
|
|
service, err := getService(c.getRequestContext(), c.client, input)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if _, err := c.client.RemoveService(c.getRequestContext(), &swarmapi.RemoveServiceRequest{ServiceID: service.ID}); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// GetNodes returns a list of all nodes known to a cluster.
|
|
func (c *Cluster) GetNodes(options apitypes.NodeListOptions) ([]types.Node, error) {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return nil, c.errNoManager()
|
|
}
|
|
|
|
filters, err := newListNodesFilters(options.Filter)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r, err := c.client.ListNodes(
|
|
c.getRequestContext(),
|
|
&swarmapi.ListNodesRequest{Filters: filters})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
nodes := []types.Node{}
|
|
|
|
for _, node := range r.Nodes {
|
|
nodes = append(nodes, convert.NodeFromGRPC(*node))
|
|
}
|
|
return nodes, nil
|
|
}
|
|
|
|
// GetNode returns a node based on an ID or name.
|
|
func (c *Cluster) GetNode(input string) (types.Node, error) {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return types.Node{}, c.errNoManager()
|
|
}
|
|
|
|
node, err := getNode(c.getRequestContext(), c.client, input)
|
|
if err != nil {
|
|
return types.Node{}, err
|
|
}
|
|
return convert.NodeFromGRPC(*node), nil
|
|
}
|
|
|
|
// UpdateNode updates existing nodes properties.
|
|
func (c *Cluster) UpdateNode(nodeID string, version uint64, spec types.NodeSpec) error {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return c.errNoManager()
|
|
}
|
|
|
|
nodeSpec, err := convert.NodeSpecToGRPC(spec)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
_, err = c.client.UpdateNode(
|
|
c.getRequestContext(),
|
|
&swarmapi.UpdateNodeRequest{
|
|
NodeID: nodeID,
|
|
Spec: &nodeSpec,
|
|
NodeVersion: &swarmapi.Version{
|
|
Index: version,
|
|
},
|
|
},
|
|
)
|
|
return err
|
|
}
|
|
|
|
// RemoveNode removes a node from a cluster
|
|
func (c *Cluster) RemoveNode(input string) error {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return c.errNoManager()
|
|
}
|
|
|
|
ctx := c.getRequestContext()
|
|
|
|
node, err := getNode(ctx, c.client, input)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if _, err := c.client.RemoveNode(ctx, &swarmapi.RemoveNodeRequest{NodeID: node.ID}); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// GetTasks returns a list of tasks matching the filter options.
|
|
func (c *Cluster) GetTasks(options apitypes.TaskListOptions) ([]types.Task, error) {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return nil, c.errNoManager()
|
|
}
|
|
|
|
filters, err := newListTasksFilters(options.Filter)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
r, err := c.client.ListTasks(
|
|
c.getRequestContext(),
|
|
&swarmapi.ListTasksRequest{Filters: filters})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
tasks := []types.Task{}
|
|
|
|
for _, task := range r.Tasks {
|
|
tasks = append(tasks, convert.TaskFromGRPC(*task))
|
|
}
|
|
return tasks, nil
|
|
}
|
|
|
|
// GetTask returns a task by an ID.
|
|
func (c *Cluster) GetTask(input string) (types.Task, error) {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return types.Task{}, c.errNoManager()
|
|
}
|
|
|
|
task, err := getTask(c.getRequestContext(), c.client, input)
|
|
if err != nil {
|
|
return types.Task{}, err
|
|
}
|
|
return convert.TaskFromGRPC(*task), nil
|
|
}
|
|
|
|
// GetNetwork returns a cluster network by an ID.
|
|
func (c *Cluster) GetNetwork(input string) (apitypes.NetworkResource, error) {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return apitypes.NetworkResource{}, c.errNoManager()
|
|
}
|
|
|
|
network, err := getNetwork(c.getRequestContext(), c.client, input)
|
|
if err != nil {
|
|
return apitypes.NetworkResource{}, err
|
|
}
|
|
return convert.BasicNetworkFromGRPC(*network), nil
|
|
}
|
|
|
|
// GetNetworks returns all current cluster managed networks.
|
|
func (c *Cluster) GetNetworks() ([]apitypes.NetworkResource, error) {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return nil, c.errNoManager()
|
|
}
|
|
|
|
r, err := c.client.ListNetworks(c.getRequestContext(), &swarmapi.ListNetworksRequest{})
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var networks []apitypes.NetworkResource
|
|
|
|
for _, network := range r.Networks {
|
|
networks = append(networks, convert.BasicNetworkFromGRPC(*network))
|
|
}
|
|
|
|
return networks, nil
|
|
}
|
|
|
|
// CreateNetwork creates a new cluster managed network.
|
|
func (c *Cluster) CreateNetwork(s apitypes.NetworkCreateRequest) (string, error) {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return "", c.errNoManager()
|
|
}
|
|
|
|
if runconfig.IsPreDefinedNetwork(s.Name) {
|
|
err := fmt.Errorf("%s is a pre-defined network and cannot be created", s.Name)
|
|
return "", errors.NewRequestForbiddenError(err)
|
|
}
|
|
|
|
networkSpec := convert.BasicNetworkCreateToGRPC(s)
|
|
r, err := c.client.CreateNetwork(c.getRequestContext(), &swarmapi.CreateNetworkRequest{Spec: &networkSpec})
|
|
if err != nil {
|
|
return "", err
|
|
}
|
|
|
|
return r.Network.ID, nil
|
|
}
|
|
|
|
// RemoveNetwork removes a cluster network.
|
|
func (c *Cluster) RemoveNetwork(input string) error {
|
|
c.RLock()
|
|
defer c.RUnlock()
|
|
|
|
if !c.isActiveManager() {
|
|
return c.errNoManager()
|
|
}
|
|
|
|
network, err := getNetwork(c.getRequestContext(), c.client, input)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if _, err := c.client.RemoveNetwork(c.getRequestContext(), &swarmapi.RemoveNetworkRequest{NetworkID: network.ID}); err != nil {
|
|
return err
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func populateNetworkID(ctx context.Context, c swarmapi.ControlClient, s *types.ServiceSpec) error {
|
|
for i, n := range s.Networks {
|
|
apiNetwork, err := getNetwork(ctx, c, n.Target)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
s.Networks[i].Target = apiNetwork.ID
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func getNetwork(ctx context.Context, c swarmapi.ControlClient, input string) (*swarmapi.Network, error) {
|
|
// GetNetwork to match via full ID.
|
|
rg, err := c.GetNetwork(ctx, &swarmapi.GetNetworkRequest{NetworkID: input})
|
|
if err != nil {
|
|
// If any error (including NotFound), ListNetworks to match via ID prefix and full name.
|
|
rl, err := c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{Names: []string{input}}})
|
|
if err != nil || len(rl.Networks) == 0 {
|
|
rl, err = c.ListNetworks(ctx, &swarmapi.ListNetworksRequest{Filters: &swarmapi.ListNetworksRequest_Filters{IDPrefixes: []string{input}}})
|
|
}
|
|
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if len(rl.Networks) == 0 {
|
|
return nil, fmt.Errorf("network %s not found", input)
|
|
}
|
|
|
|
if l := len(rl.Networks); l > 1 {
|
|
return nil, fmt.Errorf("network %s is ambigious (%d matches found)", input, l)
|
|
}
|
|
|
|
return rl.Networks[0], nil
|
|
}
|
|
return rg.Network, nil
|
|
}
|
|
|
|
// Cleanup stops active swarm node. This is run before daemon shutdown.
|
|
func (c *Cluster) Cleanup() {
|
|
c.Lock()
|
|
node := c.node
|
|
if node == nil {
|
|
c.Unlock()
|
|
return
|
|
}
|
|
|
|
if c.isActiveManager() {
|
|
active, reachable, unreachable, err := c.managerStats()
|
|
if err == nil {
|
|
singlenode := active && reachable == 1 && unreachable == 0
|
|
if active && !singlenode && reachable-2 <= unreachable {
|
|
logrus.Errorf("Leaving cluster with %v managers left out of %v. Raft quorum will be lost.", reachable-1, reachable+unreachable)
|
|
}
|
|
}
|
|
}
|
|
c.cancelReconnect()
|
|
c.Unlock()
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
if err := node.Stop(ctx); err != nil {
|
|
logrus.Errorf("error cleaning up cluster: %v", err)
|
|
}
|
|
c.Lock()
|
|
c.node = nil
|
|
c.ready = false
|
|
c.conn = nil
|
|
c.Unlock()
|
|
}
|
|
|
|
func (c *Cluster) managerStats() (current bool, reachable int, unreachable int, err error) {
|
|
ctx, _ := context.WithTimeout(context.Background(), 3*time.Second)
|
|
nodes, err := c.client.ListNodes(ctx, &swarmapi.ListNodesRequest{})
|
|
if err != nil {
|
|
return false, 0, 0, err
|
|
}
|
|
for _, n := range nodes.Nodes {
|
|
if n.ManagerStatus != nil {
|
|
if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_REACHABLE {
|
|
reachable++
|
|
if n.ID == c.node.NodeID() {
|
|
current = true
|
|
}
|
|
}
|
|
if n.ManagerStatus.Reachability == swarmapi.RaftMemberStatus_UNREACHABLE {
|
|
unreachable++
|
|
}
|
|
}
|
|
}
|
|
return
|
|
}
|
|
|
|
func validateAndSanitizeInitRequest(req *types.InitRequest) error {
|
|
var err error
|
|
req.ListenAddr, err = validateAddr(req.ListenAddr)
|
|
if err != nil {
|
|
return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
|
|
}
|
|
|
|
spec := &req.Spec
|
|
// provide sane defaults instead of erroring
|
|
if spec.Name == "" {
|
|
spec.Name = "default"
|
|
}
|
|
if spec.Raft.SnapshotInterval == 0 {
|
|
spec.Raft.SnapshotInterval = defaultSpec.Raft.SnapshotInterval
|
|
}
|
|
if spec.Raft.LogEntriesForSlowFollowers == 0 {
|
|
spec.Raft.LogEntriesForSlowFollowers = defaultSpec.Raft.LogEntriesForSlowFollowers
|
|
}
|
|
if spec.Raft.ElectionTick == 0 {
|
|
spec.Raft.ElectionTick = defaultSpec.Raft.ElectionTick
|
|
}
|
|
if spec.Raft.HeartbeatTick == 0 {
|
|
spec.Raft.HeartbeatTick = defaultSpec.Raft.HeartbeatTick
|
|
}
|
|
if spec.Dispatcher.HeartbeatPeriod == 0 {
|
|
spec.Dispatcher.HeartbeatPeriod = defaultSpec.Dispatcher.HeartbeatPeriod
|
|
}
|
|
if spec.CAConfig.NodeCertExpiry == 0 {
|
|
spec.CAConfig.NodeCertExpiry = defaultSpec.CAConfig.NodeCertExpiry
|
|
}
|
|
if spec.Orchestration.TaskHistoryRetentionLimit == 0 {
|
|
spec.Orchestration.TaskHistoryRetentionLimit = defaultSpec.Orchestration.TaskHistoryRetentionLimit
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func validateAndSanitizeJoinRequest(req *types.JoinRequest) error {
|
|
var err error
|
|
req.ListenAddr, err = validateAddr(req.ListenAddr)
|
|
if err != nil {
|
|
return fmt.Errorf("invalid ListenAddr %q: %v", req.ListenAddr, err)
|
|
}
|
|
if len(req.RemoteAddrs) == 0 {
|
|
return fmt.Errorf("at least 1 RemoteAddr is required to join")
|
|
}
|
|
for i := range req.RemoteAddrs {
|
|
req.RemoteAddrs[i], err = validateAddr(req.RemoteAddrs[i])
|
|
if err != nil {
|
|
return fmt.Errorf("invalid remoteAddr %q: %v", req.RemoteAddrs[i], err)
|
|
}
|
|
}
|
|
if req.CACertHash != "" {
|
|
if _, err := digest.ParseDigest(req.CACertHash); err != nil {
|
|
return fmt.Errorf("invalid CACertHash %q, %v", req.CACertHash, err)
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func validateAddr(addr string) (string, error) {
|
|
if addr == "" {
|
|
return addr, fmt.Errorf("invalid empty address")
|
|
}
|
|
newaddr, err := opts.ParseTCPAddr(addr, defaultAddr)
|
|
if err != nil {
|
|
return addr, nil
|
|
}
|
|
return strings.TrimPrefix(newaddr, "tcp://"), nil
|
|
}
|
|
|
|
func errSwarmExists(node *swarmagent.Node) error {
|
|
if node.NodeMembership() != swarmapi.NodeMembershipAccepted {
|
|
return ErrPendingSwarmExists
|
|
}
|
|
return ErrSwarmExists
|
|
}
|
|
|
|
func initClusterSpec(node *swarmagent.Node, spec types.Spec) error {
|
|
ctx, _ := context.WithTimeout(context.Background(), 5*time.Second)
|
|
for conn := range node.ListenControlSocket(ctx) {
|
|
if ctx.Err() != nil {
|
|
return ctx.Err()
|
|
}
|
|
if conn != nil {
|
|
client := swarmapi.NewControlClient(conn)
|
|
var cluster *swarmapi.Cluster
|
|
for i := 0; ; i++ {
|
|
lcr, err := client.ListClusters(ctx, &swarmapi.ListClustersRequest{})
|
|
if err != nil {
|
|
return fmt.Errorf("error on listing clusters: %v", err)
|
|
}
|
|
if len(lcr.Clusters) == 0 {
|
|
if i < 10 {
|
|
time.Sleep(200 * time.Millisecond)
|
|
continue
|
|
}
|
|
return fmt.Errorf("empty list of clusters was returned")
|
|
}
|
|
cluster = lcr.Clusters[0]
|
|
break
|
|
}
|
|
newspec, err := convert.SwarmSpecToGRPCandMerge(spec, &cluster.Spec)
|
|
if err != nil {
|
|
return fmt.Errorf("error updating cluster settings: %v", err)
|
|
}
|
|
_, err = client.UpdateCluster(ctx, &swarmapi.UpdateClusterRequest{
|
|
ClusterID: cluster.ID,
|
|
ClusterVersion: &cluster.Meta.Version,
|
|
Spec: &newspec,
|
|
})
|
|
if err != nil {
|
|
return fmt.Errorf("error updating cluster settings: %v", err)
|
|
}
|
|
return nil
|
|
}
|
|
}
|
|
return ctx.Err()
|
|
}
|