Service update failure thresholds and rollback

This adds support for two enhancements to swarm service rolling updates:

- Failure thresholds: In Docker 1.12, a service update could be set up
  to either pause or continue after a single failure occurs. This adds
  an --update-max-failure-ratio flag that controls how many tasks need to
  fail to update for the update as a whole to be considered a failure. A
  counterpart flag, --update-monitor, controls how long to monitor each
  task for a failure after starting it during the update.

- Rollback flag: service update --rollback reverts the service to its
  previous version. If a service update encounters task failures, or
  fails to function properly for some other reason, the user can roll back
  the update.

SwarmKit also has the ability to roll back updates automatically after
hitting the failure thresholds, but we've decided not to expose this in
the Docker API/CLI for now, favoring a workflow where the decision to
roll back is always made by an admin. Depending on user feedback, we may
add a "rollback" option to --update-failure-action in the future.

Signed-off-by: Aaron Lehmann <aaron.lehmann@docker.com>
This commit is contained in:
Aaron Lehmann 2016-09-02 14:12:05 -07:00
parent 67bebd6d81
commit 6d4b527699
17 changed files with 398 additions and 207 deletions

View File

@ -15,7 +15,7 @@ type Backend interface {
GetServices(basictypes.ServiceListOptions) ([]types.Service, error)
GetService(string) (types.Service, error)
CreateService(types.ServiceSpec, string) (string, error)
UpdateService(string, uint64, types.ServiceSpec, string) error
UpdateService(string, uint64, types.ServiceSpec, string, string) error
RemoveService(string) error
GetNodes(basictypes.NodeListOptions) ([]types.Node, error)
GetNode(string) (types.Node, error)

View File

@ -156,7 +156,9 @@ func (sr *swarmRouter) updateService(ctx context.Context, w http.ResponseWriter,
// Get returns "" if the header does not exist
encodedAuth := r.Header.Get("X-Registry-Auth")
if err := sr.backend.UpdateService(vars["id"], version, service, encodedAuth); err != nil {
registryAuthFrom := r.URL.Query().Get("registryAuthFrom")
if err := sr.backend.UpdateService(vars["id"], version, service, encodedAuth, registryAuthFrom); err != nil {
logrus.Errorf("Error updating service %s: %v", vars["id"], err)
return err
}

View File

@ -90,16 +90,16 @@ type UpdateConfig struct {
// be used.
Monitor time.Duration `json:",omitempty"`
// AllowedFailureFraction is the fraction of tasks that may fail during
// MaxFailureRatio is the fraction of tasks that may fail during
// an update before the failure action is invoked. Any task created by
// the current update which ends up in one of the states REJECTED,
// COMPLETED or FAILED within Monitor from its creation counts as a
// failure. The number of failures is divided by the number of tasks
// being updated, and if this fraction is greater than
// AllowedFailureFraction, the failure action is invoked.
// MaxFailureRatio, the failure action is invoked.
//
// If the failure action is CONTINUE, there is no effect.
// If the failure action is PAUSE, no more tasks will be updated until
// another update is started.
AllowedFailureFraction float32
MaxFailureRatio float32
}

View File

@ -41,10 +41,14 @@ Placement:
{{- if .HasUpdateConfig }}
UpdateConfig:
Parallelism: {{ .UpdateParallelism }}
{{- if .HasUpdateDelay -}}
{{- if .HasUpdateDelay}}
Delay: {{ .UpdateDelay }}
{{- end }}
On failure: {{ .UpdateOnFailure }}
{{- if .HasUpdateMonitor}}
Monitoring Period: {{ .UpdateMonitor }}
{{- end }}
Max failure ratio: {{ .UpdateMaxFailureRatio }}
{{- end }}
ContainerSpec:
Image: {{ .ContainerImage }}
@ -218,6 +222,18 @@ func (ctx *serviceInspectContext) UpdateOnFailure() string {
return ctx.Service.Spec.UpdateConfig.FailureAction
}
func (ctx *serviceInspectContext) HasUpdateMonitor() bool {
return ctx.Service.Spec.UpdateConfig.Monitor.Nanoseconds() > 0
}
func (ctx *serviceInspectContext) UpdateMonitor() time.Duration {
return ctx.Service.Spec.UpdateConfig.Monitor
}
func (ctx *serviceInspectContext) UpdateMaxFailureRatio() float32 {
return ctx.Service.Spec.UpdateConfig.MaxFailureRatio
}
func (ctx *serviceInspectContext) ContainerImage() string {
return ctx.Service.Spec.TaskTemplate.ContainerSpec.Image
}

View File

@ -267,9 +267,11 @@ func (m *MountOpt) Value() []mounttypes.Mount {
}
type updateOptions struct {
parallelism uint64
delay time.Duration
onFailure string
parallelism uint64
delay time.Duration
monitor time.Duration
onFailure string
maxFailureRatio float32
}
type resourceOptions struct {
@ -458,9 +460,11 @@ func (opts *serviceOptions) ToService() (swarm.ServiceSpec, error) {
Networks: convertNetworks(opts.networks),
Mode: swarm.ServiceMode{},
UpdateConfig: &swarm.UpdateConfig{
Parallelism: opts.update.parallelism,
Delay: opts.update.delay,
FailureAction: opts.update.onFailure,
Parallelism: opts.update.parallelism,
Delay: opts.update.delay,
Monitor: opts.update.monitor,
FailureAction: opts.update.onFailure,
MaxFailureRatio: opts.update.maxFailureRatio,
},
EndpointSpec: opts.endpoint.ToEndpointSpec(),
}
@ -507,7 +511,9 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) {
flags.Uint64Var(&opts.update.parallelism, flagUpdateParallelism, 1, "Maximum number of tasks updated simultaneously (0 to update all at once)")
flags.DurationVar(&opts.update.delay, flagUpdateDelay, time.Duration(0), "Delay between updates")
flags.DurationVar(&opts.update.monitor, flagUpdateMonitor, time.Duration(0), "Duration after each task update to monitor for failure")
flags.StringVar(&opts.update.onFailure, flagUpdateFailureAction, "pause", "Action on update failure (pause|continue)")
flags.Float32Var(&opts.update.maxFailureRatio, flagUpdateMaxFailureRatio, 0, "Failure rate to tolerate during an update")
flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "", "Endpoint mode (vip or dnsrr)")
@ -518,46 +524,48 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) {
}
const (
flagConstraint = "constraint"
flagConstraintRemove = "constraint-rm"
flagConstraintAdd = "constraint-add"
flagContainerLabel = "container-label"
flagContainerLabelRemove = "container-label-rm"
flagContainerLabelAdd = "container-label-add"
flagEndpointMode = "endpoint-mode"
flagEnv = "env"
flagEnvRemove = "env-rm"
flagEnvAdd = "env-add"
flagGroupAdd = "group-add"
flagGroupRemove = "group-rm"
flagLabel = "label"
flagLabelRemove = "label-rm"
flagLabelAdd = "label-add"
flagLimitCPU = "limit-cpu"
flagLimitMemory = "limit-memory"
flagMode = "mode"
flagMount = "mount"
flagMountRemove = "mount-rm"
flagMountAdd = "mount-add"
flagName = "name"
flagNetwork = "network"
flagPublish = "publish"
flagPublishRemove = "publish-rm"
flagPublishAdd = "publish-add"
flagReplicas = "replicas"
flagReserveCPU = "reserve-cpu"
flagReserveMemory = "reserve-memory"
flagRestartCondition = "restart-condition"
flagRestartDelay = "restart-delay"
flagRestartMaxAttempts = "restart-max-attempts"
flagRestartWindow = "restart-window"
flagStopGracePeriod = "stop-grace-period"
flagUpdateDelay = "update-delay"
flagUpdateFailureAction = "update-failure-action"
flagUpdateParallelism = "update-parallelism"
flagUser = "user"
flagWorkdir = "workdir"
flagRegistryAuth = "with-registry-auth"
flagLogDriver = "log-driver"
flagLogOpt = "log-opt"
flagConstraint = "constraint"
flagConstraintRemove = "constraint-rm"
flagConstraintAdd = "constraint-add"
flagContainerLabel = "container-label"
flagContainerLabelRemove = "container-label-rm"
flagContainerLabelAdd = "container-label-add"
flagEndpointMode = "endpoint-mode"
flagEnv = "env"
flagEnvRemove = "env-rm"
flagEnvAdd = "env-add"
flagGroupAdd = "group-add"
flagGroupRemove = "group-rm"
flagLabel = "label"
flagLabelRemove = "label-rm"
flagLabelAdd = "label-add"
flagLimitCPU = "limit-cpu"
flagLimitMemory = "limit-memory"
flagMode = "mode"
flagMount = "mount"
flagMountRemove = "mount-rm"
flagMountAdd = "mount-add"
flagName = "name"
flagNetwork = "network"
flagPublish = "publish"
flagPublishRemove = "publish-rm"
flagPublishAdd = "publish-add"
flagReplicas = "replicas"
flagReserveCPU = "reserve-cpu"
flagReserveMemory = "reserve-memory"
flagRestartCondition = "restart-condition"
flagRestartDelay = "restart-delay"
flagRestartMaxAttempts = "restart-max-attempts"
flagRestartWindow = "restart-window"
flagStopGracePeriod = "stop-grace-period"
flagUpdateDelay = "update-delay"
flagUpdateFailureAction = "update-failure-action"
flagUpdateMaxFailureRatio = "update-max-failure-ratio"
flagUpdateMonitor = "update-monitor"
flagUpdateParallelism = "update-parallelism"
flagUser = "user"
flagWorkdir = "workdir"
flagRegistryAuth = "with-registry-auth"
flagLogDriver = "log-driver"
flagLogOpt = "log-opt"
)

View File

@ -36,6 +36,7 @@ func newUpdateCommand(dockerCli *command.DockerCli) *cobra.Command {
flags := cmd.Flags()
flags.String("image", "", "Service image tag")
flags.String("args", "", "Service command args")
flags.Bool("rollback", false, "Rollback to previous specification")
addServiceFlags(cmd, opts)
flags.Var(newListOptsVar(), flagEnvRemove, "Remove an environment variable")
@ -68,7 +69,20 @@ func runUpdate(dockerCli *command.DockerCli, flags *pflag.FlagSet, serviceID str
return err
}
err = updateService(flags, &service.Spec)
rollback, err := flags.GetBool("rollback")
if err != nil {
return err
}
spec := &service.Spec
if rollback {
spec = service.PreviousSpec
if spec == nil {
return fmt.Errorf("service does not have a previous specification to roll back to")
}
}
err = updateService(flags, spec)
if err != nil {
return err
}
@ -81,15 +95,19 @@ func runUpdate(dockerCli *command.DockerCli, flags *pflag.FlagSet, serviceID str
if sendAuth {
// Retrieve encoded auth token from the image reference
// This would be the old image if it didn't change in this update
image := service.Spec.TaskTemplate.ContainerSpec.Image
image := spec.TaskTemplate.ContainerSpec.Image
encodedAuth, err := command.RetrieveAuthTokenFromImage(ctx, dockerCli, image)
if err != nil {
return err
}
updateOpts.EncodedRegistryAuth = encodedAuth
} else if rollback {
updateOpts.RegistryAuthFrom = types.RegistryAuthFromPreviousSpec
} else {
updateOpts.RegistryAuthFrom = types.RegistryAuthFromSpec
}
err = apiClient.ServiceUpdate(ctx, service.ID, service.Version, service.Spec, updateOpts)
err = apiClient.ServiceUpdate(ctx, service.ID, service.Version, *spec, updateOpts)
if err != nil {
return err
}
@ -111,6 +129,12 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
}
}
updateFloat32 := func(flag string, field *float32) {
if flags.Changed(flag) {
*field, _ = flags.GetFloat32(flag)
}
}
updateDuration := func(flag string, field *time.Duration) {
if flags.Changed(flag) {
*field, _ = flags.GetDuration(flag)
@ -195,13 +219,15 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error {
return err
}
if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateFailureAction) {
if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateMonitor, flagUpdateFailureAction, flagUpdateMaxFailureRatio) {
if spec.UpdateConfig == nil {
spec.UpdateConfig = &swarm.UpdateConfig{}
}
updateUint64(flagUpdateParallelism, &spec.UpdateConfig.Parallelism)
updateDuration(flagUpdateDelay, &spec.UpdateConfig.Delay)
updateDuration(flagUpdateMonitor, &spec.UpdateConfig.Monitor)
updateString(flagUpdateFailureAction, &spec.UpdateConfig.FailureAction)
updateFloat32(flagUpdateMaxFailureRatio, &spec.UpdateConfig.MaxFailureRatio)
}
if flags.Changed(flagEndpointMode) {

View File

@ -1760,9 +1760,12 @@ _docker_service_update() {
--restart-delay
--restart-max-attempts
--restart-window
--rollback
--stop-grace-period
--update-delay
--update-failure-action
--update-max-failure-ratio
--update-monitor
--update-parallelism
--user -u
--workdir -w

View File

@ -1108,6 +1108,8 @@ __docker_service_subcommand() {
"($help)--stop-grace-period=[Time to wait before force killing a container]:grace period: "
"($help)--update-delay=[Delay between updates]:delay: "
"($help)--update-failure-action=[Action on update failure]:mode:(pause continue)"
"($help)--update-max-failure-ratio=[Failure rate to tolerate during an update]:fraction: "
"($help)--update-monitor=[Duration after each task update to monitor for failure]:window: "
"($help)--update-parallelism=[Maximum number of tasks updated simultaneously]:number: "
"($help -u --user)"{-u=,--user=}"[Username or UID]:user:_users"
"($help)--with-registry-auth[Send registry authentication details to swarm agents]"
@ -1185,6 +1187,7 @@ __docker_service_subcommand() {
"($help)*--container-label-rm=[Remove a container label by its key]:label: " \
"($help)*--group-rm=[Remove previously added user groups from the container]:group:_groups" \
"($help)--image=[Service image tag]:image:__docker_repositories" \
"($help)--rollback[Rollback to previous specification]" \
"($help -)1:service:__docker_complete_services" && ret=0
;;
(help)

View File

@ -913,7 +913,7 @@ func (c *Cluster) GetService(input string) (types.Service, error) {
}
// UpdateService updates existing service to match new properties.
func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec types.ServiceSpec, encodedAuth string) error {
func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec types.ServiceSpec, encodedAuth string, registryAuthFrom string) error {
c.RLock()
defer c.RUnlock()
@ -948,7 +948,18 @@ func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec typ
} else {
// this is needed because if the encodedAuth isn't being updated then we
// shouldn't lose it, and continue to use the one that was already present
ctnr := currentService.Spec.Task.GetContainer()
var ctnr *swarmapi.ContainerSpec
switch registryAuthFrom {
case apitypes.RegistryAuthFromSpec, "":
ctnr = currentService.Spec.Task.GetContainer()
case apitypes.RegistryAuthFromPreviousSpec:
if currentService.PreviousSpec == nil {
return fmt.Errorf("service does not have a previous spec")
}
ctnr = currentService.PreviousSpec.Task.GetContainer()
default:
return fmt.Errorf("unsupported registryAuthFromValue")
}
if ctnr == nil {
return fmt.Errorf("service does not use container tasks")
}

View File

@ -12,35 +12,11 @@ import (
// ServiceFromGRPC converts a grpc Service to a Service.
func ServiceFromGRPC(s swarmapi.Service) types.Service {
spec := s.Spec
containerConfig := spec.Task.Runtime.(*swarmapi.TaskSpec_Container).Container
serviceNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks))
for _, n := range spec.Networks {
serviceNetworks = append(serviceNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
}
taskNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Task.Networks))
for _, n := range spec.Task.Networks {
taskNetworks = append(taskNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
}
service := types.Service{
ID: s.ID,
ID: s.ID,
Spec: *serviceSpecFromGRPC(&s.Spec),
PreviousSpec: serviceSpecFromGRPC(s.PreviousSpec),
Spec: types.ServiceSpec{
TaskTemplate: types.TaskSpec{
ContainerSpec: containerSpecFromGRPC(containerConfig),
Resources: resourcesFromGRPC(s.Spec.Task.Resources),
RestartPolicy: restartPolicyFromGRPC(s.Spec.Task.Restart),
Placement: placementFromGRPC(s.Spec.Task.Placement),
LogDriver: driverFromGRPC(s.Spec.Task.LogDriver),
Networks: taskNetworks,
},
Networks: serviceNetworks,
EndpointSpec: endpointSpecFromGRPC(s.Spec.Endpoint),
},
Endpoint: endpointFromGRPC(s.Endpoint),
}
@ -49,36 +25,6 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service {
service.CreatedAt, _ = ptypes.Timestamp(s.Meta.CreatedAt)
service.UpdatedAt, _ = ptypes.Timestamp(s.Meta.UpdatedAt)
// Annotations
service.Spec.Name = s.Spec.Annotations.Name
service.Spec.Labels = s.Spec.Annotations.Labels
// UpdateConfig
if s.Spec.Update != nil {
service.Spec.UpdateConfig = &types.UpdateConfig{
Parallelism: s.Spec.Update.Parallelism,
}
service.Spec.UpdateConfig.Delay, _ = ptypes.Duration(&s.Spec.Update.Delay)
switch s.Spec.Update.FailureAction {
case swarmapi.UpdateConfig_PAUSE:
service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionPause
case swarmapi.UpdateConfig_CONTINUE:
service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue
}
}
// Mode
switch t := s.Spec.GetMode().(type) {
case *swarmapi.ServiceSpec_Global:
service.Spec.Mode.Global = &types.GlobalService{}
case *swarmapi.ServiceSpec_Replicated:
service.Spec.Mode.Replicated = &types.ReplicatedService{
Replicas: &t.Replicated.Replicas,
}
}
// UpdateStatus
service.UpdateStatus = types.UpdateStatus{}
if s.UpdateStatus != nil {
@ -99,6 +45,74 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service {
return service
}
func serviceSpecFromGRPC(spec *swarmapi.ServiceSpec) *types.ServiceSpec {
if spec == nil {
return nil
}
serviceNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks))
for _, n := range spec.Networks {
serviceNetworks = append(serviceNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
}
taskNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Task.Networks))
for _, n := range spec.Task.Networks {
taskNetworks = append(taskNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases})
}
containerConfig := spec.Task.Runtime.(*swarmapi.TaskSpec_Container).Container
convertedSpec := &types.ServiceSpec{
Annotations: types.Annotations{
Name: spec.Annotations.Name,
Labels: spec.Annotations.Labels,
},
TaskTemplate: types.TaskSpec{
ContainerSpec: containerSpecFromGRPC(containerConfig),
Resources: resourcesFromGRPC(spec.Task.Resources),
RestartPolicy: restartPolicyFromGRPC(spec.Task.Restart),
Placement: placementFromGRPC(spec.Task.Placement),
LogDriver: driverFromGRPC(spec.Task.LogDriver),
Networks: taskNetworks,
},
Networks: serviceNetworks,
EndpointSpec: endpointSpecFromGRPC(spec.Endpoint),
}
// UpdateConfig
if spec.Update != nil {
convertedSpec.UpdateConfig = &types.UpdateConfig{
Parallelism: spec.Update.Parallelism,
MaxFailureRatio: spec.Update.MaxFailureRatio,
}
convertedSpec.UpdateConfig.Delay, _ = ptypes.Duration(&spec.Update.Delay)
if spec.Update.Monitor != nil {
convertedSpec.UpdateConfig.Monitor, _ = ptypes.Duration(spec.Update.Monitor)
}
switch spec.Update.FailureAction {
case swarmapi.UpdateConfig_PAUSE:
convertedSpec.UpdateConfig.FailureAction = types.UpdateFailureActionPause
case swarmapi.UpdateConfig_CONTINUE:
convertedSpec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue
}
}
// Mode
switch t := spec.GetMode().(type) {
case *swarmapi.ServiceSpec_Global:
convertedSpec.Mode.Global = &types.GlobalService{}
case *swarmapi.ServiceSpec_Replicated:
convertedSpec.Mode.Replicated = &types.ReplicatedService{
Replicas: &t.Replicated.Replicas,
}
}
return convertedSpec
}
// ServiceSpecToGRPC converts a ServiceSpec to a grpc ServiceSpec.
func ServiceSpecToGRPC(s types.ServiceSpec) (swarmapi.ServiceSpec, error) {
name := s.Name
@ -158,9 +172,13 @@ func ServiceSpecToGRPC(s types.ServiceSpec) (swarmapi.ServiceSpec, error) {
return swarmapi.ServiceSpec{}, fmt.Errorf("unrecongized update failure action %s", s.UpdateConfig.FailureAction)
}
spec.Update = &swarmapi.UpdateConfig{
Parallelism: s.UpdateConfig.Parallelism,
Delay: *ptypes.DurationProto(s.UpdateConfig.Delay),
FailureAction: failureAction,
Parallelism: s.UpdateConfig.Parallelism,
Delay: *ptypes.DurationProto(s.UpdateConfig.Delay),
FailureAction: failureAction,
MaxFailureRatio: s.UpdateConfig.MaxFailureRatio,
}
if s.UpdateConfig.Monitor != 0 {
spec.Update.Monitor = ptypes.DurationProto(s.UpdateConfig.Monitor)
}
}

View File

@ -129,6 +129,7 @@ This section lists each version from latest to oldest. Each listing includes a
* `GET /containers/json` now supports a `is-task` filter to filter
containers that are tasks (part of a service in swarm mode).
* `POST /containers/create` now takes `StopTimeout` field.
* `POST /services/create` and `POST /services/(id or name)/update` now accept `Monitor` and `MaxFailureRatio` parameters, which control the response to failures during service updates.
### v1.24 API changes

View File

@ -4877,7 +4877,9 @@ List services
},
"UpdateConfig": {
"Parallelism": 1,
"FailureAction": "pause"
"FailureAction": "pause",
"Monitor": 15000000000,
"MaxFailureRatio": 0.15
},
"EndpointSpec": {
"Mode": "vip",
@ -5077,8 +5079,8 @@ image](#create-an-image) section for more details.
- **RestartPolicy** Specification for the restart policy which applies to containers created
as part of this service.
- **Condition** Condition for restart (`none`, `on-failure`, or `any`).
- **Delay** Delay between restart attempts.
- **Attempts** Maximum attempts to restart a given container before giving up (default value
- **Delay** Delay between restart attempts, in nanoseconds.
- **MaxAttempts** Maximum attempts to restart a given container before giving up (default value
is 0, which is ignored).
- **Window** Windows is the time window used to evaluate the restart policy (default value is
0, which is unbounded).
@ -5087,9 +5089,12 @@ image](#create-an-image) section for more details.
- **UpdateConfig** Specification for the update strategy of the service.
- **Parallelism** Maximum number of tasks to be updated in one iteration (0 means unlimited
parallelism).
- **Delay** Amount of time between updates.
- **Delay** Amount of time between updates, in nanoseconds.
- **FailureAction** - Action to take if an updated task fails to run, or stops running during the
update. Values are `continue` and `pause`.
- **Monitor** - Amount of time to monitor each updated task for failures, in nanoseconds.
- **MaxFailureRatio** - The fraction of tasks that may fail during an update before the
failure action is invoked, specified as a floating point number between 0 and 1. The default is 0.
- **Networks** Array of network names or IDs to attach the service to.
- **EndpointSpec** Properties that can be configured to access and load balance a service.
- **Mode** The mode of resolution to use for internal load balancing
@ -5259,7 +5264,9 @@ image](#create-an-image) section for more details.
}
},
"UpdateConfig": {
"Parallelism": 1
"Parallelism": 1,
"Monitor": 15000000000,
"MaxFailureRatio": 0.15
},
"EndpointSpec": {
"Mode": "vip"
@ -5314,7 +5321,7 @@ image](#create-an-image) section for more details.
- **RestartPolicy** Specification for the restart policy which applies to containers created
as part of this service.
- **Condition** Condition for restart (`none`, `on-failure`, or `any`).
- **Delay** Delay between restart attempts.
- **Delay** Delay between restart attempts, in nanoseconds.
- **MaxAttempts** Maximum attempts to restart a given container before giving up (default value
is 0, which is ignored).
- **Window** Windows is the time window used to evaluate the restart policy (default value is
@ -5324,7 +5331,12 @@ image](#create-an-image) section for more details.
- **UpdateConfig** Specification for the update strategy of the service.
- **Parallelism** Maximum number of tasks to be updated in one iteration (0 means unlimited
parallelism).
- **Delay** Amount of time between updates.
- **Delay** Amount of time between updates, in nanoseconds.
- **FailureAction** - Action to take if an updated task fails to run, or stops running during the
update. Values are `continue` and `pause`.
- **Monitor** - Amount of time to monitor each updated task for failures, in nanoseconds.
- **MaxFailureRatio** - The fraction of tasks that may fail during an update before the
failure action is invoked, specified as a floating point number between 0 and 1. The default is 0.
- **Networks** Array of network names or IDs to attach the service to.
- **EndpointSpec** Properties that can be configured to access and load balance a service.
- **Mode** The mode of resolution to use for internal load balancing
@ -5338,6 +5350,10 @@ image](#create-an-image) section for more details.
- **version** The version number of the service object being updated. This is
required to avoid conflicting writes.
- **registryAuthFrom** - If the X-Registry-Auth header is not specified, this
parameter indicates where to find registry authorization credentials. The
valid values are `spec` and `previous-spec`. If unspecified, the default is
`spec`.
**Request Headers**:

View File

@ -12,36 +12,38 @@ Usage: docker service create [OPTIONS] IMAGE [COMMAND] [ARG...]
Create a new service
Options:
--constraint value Placement constraints (default [])
--container-label value Service container labels (default [])
--endpoint-mode string Endpoint mode (vip or dnsrr)
-e, --env value Set environment variables (default [])
--group-add value Add additional user groups to the container (default [])
--help Print usage
-l, --label value Service labels (default [])
--limit-cpu value Limit CPUs (default 0.000)
--limit-memory value Limit Memory (default 0 B)
--log-driver string Logging driver for service
--log-opt value Logging driver options (default [])
--mode string Service mode (replicated or global) (default "replicated")
--mount value Attach a mount to the service
--name string Service name
--network value Network attachments (default [])
-p, --publish value Publish a port as a node port (default [])
--replicas value Number of tasks (default none)
--reserve-cpu value Reserve CPUs (default 0.000)
--reserve-memory value Reserve Memory (default 0 B)
--restart-condition string Restart when condition is met (none, on-failure, or any)
--restart-delay value Delay between restart attempts (default none)
--restart-max-attempts value Maximum number of restarts before giving up (default none)
--restart-window value Window used to evaluate the restart policy (default none)
--stop-grace-period value Time to wait before force killing a container (default none)
--update-delay duration Delay between updates
--update-failure-action string Action on update failure (pause|continue) (default "pause")
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
-u, --user string Username or UID (format: <name|uid>[:<group|gid>])
--with-registry-auth Send registry authentication details to Swarm agents
-w, --workdir string Working directory inside the container
--constraint value Placement constraints (default [])
--container-label value Service container labels (default [])
--endpoint-mode string Endpoint mode (vip or dnsrr)
-e, --env value Set environment variables (default [])
--group-add value Add additional user groups to the container (default [])
--help Print usage
-l, --label value Service labels (default [])
--limit-cpu value Limit CPUs (default 0.000)
--limit-memory value Limit Memory (default 0 B)
--log-driver string Logging driver for service
--log-opt value Logging driver options (default [])
--mode string Service mode (replicated or global) (default "replicated")
--mount value Attach a mount to the service
--name string Service name
--network value Network attachments (default [])
-p, --publish value Publish a port as a node port (default [])
--replicas value Number of tasks (default none)
--reserve-cpu value Reserve CPUs (default 0.000)
--reserve-memory value Reserve Memory (default 0 B)
--restart-condition string Restart when condition is met (none, on-failure, or any)
--restart-delay value Delay between restart attempts (default none)
--restart-max-attempts value Maximum number of restarts before giving up (default none)
--restart-window value Window used to evaluate the restart policy (default none)
--stop-grace-period value Time to wait before force killing a container (default none)
--update-delay duration Delay between updates
--update-failure-action string Action on update failure (pause|continue) (default "pause")
--update-max-failure-ratio value Failure rate to tolerate during an update
--update-monitor duration Duration after each task update to monitor for failure (default 0s)
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
-u, --user string Username or UID (format: <name|uid>[:<group|gid>])
--with-registry-auth Send registry authentication details to Swarm agents
-w, --workdir string Working directory inside the container
```
Creates a service as described by the specified parameters. You must run this

View File

@ -12,43 +12,46 @@ Usage: docker service update [OPTIONS] SERVICE
Update a service
Options:
--args string Service command args
--constraint-add value Add or update placement constraints (default [])
--constraint-rm value Remove a constraint (default [])
--container-label-add value Add or update container labels (default [])
--container-label-rm value Remove a container label by its key (default [])
--endpoint-mode string Endpoint mode (vip or dnsrr)
--env-add value Add or update environment variables (default [])
--env-rm value Remove an environment variable (default [])
--group-add value Add additional user groups to the container (default [])
--group-rm value Remove previously added user groups from the container (default [])
--help Print usage
--image string Service image tag
--label-add value Add or update service labels (default [])
--label-rm value Remove a label by its key (default [])
--limit-cpu value Limit CPUs (default 0.000)
--limit-memory value Limit Memory (default 0 B)
--log-driver string Logging driver for service
--log-opt value Logging driver options (default [])
--mount-add value Add or update a mount on a service
--mount-rm value Remove a mount by its target path (default [])
--name string Service name
--publish-add value Add or update a published port (default [])
--publish-rm value Remove a published port by its target port (default [])
--replicas value Number of tasks (default none)
--reserve-cpu value Reserve CPUs (default 0.000)
--reserve-memory value Reserve Memory (default 0 B)
--restart-condition string Restart when condition is met (none, on-failure, or any)
--restart-delay value Delay between restart attempts (default none)
--restart-max-attempts value Maximum number of restarts before giving up (default none)
--restart-window value Window used to evaluate the restart policy (default none)
--stop-grace-period value Time to wait before force killing a container (default none)
--update-delay duration Delay between updates
--update-failure-action string Action on update failure (pause|continue) (default "pause")
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
-u, --user string Username or UID (format: <name|uid>[:<group|gid>])
--with-registry-auth Send registry authentication details to Swarm agents
-w, --workdir string Working directory inside the container
--args string Service command args
--constraint-add value Add or update placement constraints (default [])
--constraint-rm value Remove a constraint (default [])
--container-label-add value Add or update container labels (default [])
--container-label-rm value Remove a container label by its key (default [])
--endpoint-mode string Endpoint mode (vip or dnsrr)
--env-add value Add or update environment variables (default [])
--env-rm value Remove an environment variable (default [])
--group-add value Add additional user groups to the container (default [])
--group-rm value Remove previously added user groups from the container (default [])
--help Print usage
--image string Service image tag
--label-add value Add or update service labels (default [])
--label-rm value Remove a label by its key (default [])
--limit-cpu value Limit CPUs (default 0.000)
--limit-memory value Limit Memory (default 0 B)
--log-driver string Logging driver for service
--log-opt value Logging driver options (default [])
--mount-add value Add or update a mount on a service
--mount-rm value Remove a mount by its target path (default [])
--name string Service name
--publish-add value Add or update a published port (default [])
--publish-rm value Remove a published port by its target port (default [])
--replicas value Number of tasks (default none)
--reserve-cpu value Reserve CPUs (default 0.000)
--reserve-memory value Reserve Memory (default 0 B)
--restart-condition string Restart when condition is met (none, on-failure, or any)
--restart-delay value Delay between restart attempts (default none)
--restart-max-attempts value Maximum number of restarts before giving up (default none)
--restart-window value Window used to evaluate the restart policy (default none)
--rollback Rollback to previous specification
--stop-grace-period value Time to wait before force killing a container (default none)
--update-delay duration Delay between updates
--update-failure-action string Action on update failure (pause|continue) (default "pause")
--update-max-failure-ratio value Failure rate to tolerate during an update
--update-monitor duration Duration after each task update to monitor for failure (default 0s)
--update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1)
-u, --user string Username or UID (format: <name|uid>[:<group|gid>])
--with-registry-auth Send registry authentication details to Swarm agents
-w, --workdir string Working directory inside the container
```
Updates a service as described by the specified parameters. This command has to be run targeting a manager node.

View File

@ -139,8 +139,8 @@ func (d *SwarmDaemon) getServiceTasks(c *check.C, service string) []swarm.Task {
return tasks
}
func (d *SwarmDaemon) checkServiceRunningTasks(c *check.C, service string) func(*check.C) (interface{}, check.CommentInterface) {
return func(*check.C) (interface{}, check.CommentInterface) {
func (d *SwarmDaemon) checkServiceRunningTasks(service string) func(*check.C) (interface{}, check.CommentInterface) {
return func(c *check.C) (interface{}, check.CommentInterface) {
tasks := d.getServiceTasks(c, service)
var runningCount int
for _, task := range tasks {
@ -152,8 +152,15 @@ func (d *SwarmDaemon) checkServiceRunningTasks(c *check.C, service string) func(
}
}
func (d *SwarmDaemon) checkServiceTasks(c *check.C, service string) func(*check.C) (interface{}, check.CommentInterface) {
return func(*check.C) (interface{}, check.CommentInterface) {
func (d *SwarmDaemon) checkServiceUpdateState(service string) func(*check.C) (interface{}, check.CommentInterface) {
return func(c *check.C) (interface{}, check.CommentInterface) {
service := d.getService(c, service)
return service.UpdateStatus.State, nil
}
}
func (d *SwarmDaemon) checkServiceTasks(service string) func(*check.C) (interface{}, check.CommentInterface) {
return func(c *check.C) (interface{}, check.CommentInterface) {
tasks := d.getServiceTasks(c, service)
return len(tasks), nil
}

View File

@ -310,6 +310,63 @@ func (s *DockerSwarmSuite) TestAPISwarmServicesUpdate(c *check.C) {
// 3nd batch
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
map[string]int{image2: instances})
// Roll back to the previous version. This uses the CLI because
// rollback is a client-side operation.
out, err := daemons[0].Cmd("service", "update", "--rollback", id)
c.Assert(err, checker.IsNil, check.Commentf(out))
// first batch
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
map[string]int{image2: instances - parallelism, image1: parallelism})
// 2nd batch
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
map[string]int{image2: instances - 2*parallelism, image1: 2 * parallelism})
// 3nd batch
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
map[string]int{image1: instances})
}
func (s *DockerSwarmSuite) TestApiSwarmServicesFailedUpdate(c *check.C) {
const nodeCount = 3
var daemons [nodeCount]*SwarmDaemon
for i := 0; i < nodeCount; i++ {
daemons[i] = s.AddDaemon(c, true, i == 0)
}
// wait for nodes ready
waitAndAssert(c, 5*time.Second, daemons[0].checkNodeReadyCount, checker.Equals, nodeCount)
// service image at start
image1 := "busybox:latest"
// target image in update
image2 := "busybox:badtag"
// create service
instances := 5
id := daemons[0].createService(c, serviceForUpdate, setInstances(instances))
// wait for tasks ready
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
map[string]int{image1: instances})
// issue service update
service := daemons[0].getService(c, id)
daemons[0].updateService(c, service, setImage(image2), setFailureAction(swarm.UpdateFailureActionPause), setMaxFailureRatio(0.25), setParallelism(1))
// should update 2 tasks and then pause
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceUpdateState(id), checker.Equals, swarm.UpdateStatePaused)
v, _ := daemons[0].checkServiceRunningTasks(id)(c)
c.Assert(v, checker.Equals, instances-2)
// Roll back to the previous version. This uses the CLI because
// rollback is a client-side operation.
out, err := daemons[0].Cmd("service", "update", "--rollback", id)
c.Assert(err, checker.IsNil, check.Commentf(out))
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals,
map[string]int{image1: instances})
}
func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) {
@ -326,7 +383,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) {
instances := 3
id := daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
// wait for tasks ready
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances)
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances)
// validate tasks are running on worker nodes
tasks := daemons[0].getServiceTasks(c, id)
for _, task := range tasks {
@ -340,7 +397,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) {
constraints = []string{"node.role!=worker"}
id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
// wait for tasks ready
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances)
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances)
tasks = daemons[0].getServiceTasks(c, id)
// validate tasks are running on manager nodes
for _, task := range tasks {
@ -354,7 +411,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) {
constraints = []string{"node.role==nosuchrole"}
id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
// wait for tasks created
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(c, id), checker.Equals, instances)
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(id), checker.Equals, instances)
// let scheduler try
time.Sleep(250 * time.Millisecond)
// validate tasks are not assigned to any node
@ -394,7 +451,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) {
constraints := []string{"node.labels.security==high"}
id := daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
// wait for tasks ready
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances)
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances)
tasks := daemons[0].getServiceTasks(c, id)
// validate all tasks are running on nodes[0]
for _, task := range tasks {
@ -407,7 +464,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) {
constraints = []string{"node.labels.security!=high"}
id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
// wait for tasks ready
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances)
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances)
tasks = daemons[0].getServiceTasks(c, id)
// validate all tasks are NOT running on nodes[0]
for _, task := range tasks {
@ -419,7 +476,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) {
constraints = []string{"node.labels.security==medium"}
id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
// wait for tasks created
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(c, id), checker.Equals, instances)
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(id), checker.Equals, instances)
// let scheduler try
time.Sleep(250 * time.Millisecond)
tasks = daemons[0].getServiceTasks(c, id)
@ -437,7 +494,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) {
}
id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances))
// wait for tasks created
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(c, id), checker.Equals, instances)
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(id), checker.Equals, instances)
// let scheduler try
time.Sleep(250 * time.Millisecond)
tasks = daemons[0].getServiceTasks(c, id)
@ -452,7 +509,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) {
}
})
// wait for tasks ready
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances)
waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances)
tasks = daemons[0].getServiceTasks(c, id)
for _, task := range tasks {
c.Assert(task.NodeID, checker.Equals, nodes[1].ID)
@ -1022,6 +1079,24 @@ func setImage(image string) serviceConstructor {
}
}
func setFailureAction(failureAction string) serviceConstructor {
return func(s *swarm.Service) {
s.Spec.UpdateConfig.FailureAction = failureAction
}
}
func setMaxFailureRatio(maxFailureRatio float32) serviceConstructor {
return func(s *swarm.Service) {
s.Spec.UpdateConfig.MaxFailureRatio = maxFailureRatio
}
}
func setParallelism(parallelism uint64) serviceConstructor {
return func(s *swarm.Service) {
s.Spec.UpdateConfig.Parallelism = parallelism
}
}
func setConstraints(constraints []string) serviceConstructor {
return func(s *swarm.Service) {
if s.Spec.TaskTemplate.Placement == nil {

View File

@ -349,7 +349,7 @@ func (s *DockerSwarmSuite) TestPsListContainersFilterIsTask(c *check.C) {
c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "")
// make sure task has been deployed.
waitAndAssert(c, defaultReconciliationTimeout, d.checkServiceRunningTasks(c, name), checker.Equals, 1)
waitAndAssert(c, defaultReconciliationTimeout, d.checkServiceRunningTasks(name), checker.Equals, 1)
// Filter non-tasks
out, err = d.Cmd("ps", "-a", "-q", "--filter=is-task=false")