diff --git a/api/server/router/swarm/backend.go b/api/server/router/swarm/backend.go index e86ca1261f..a7cc9eef40 100644 --- a/api/server/router/swarm/backend.go +++ b/api/server/router/swarm/backend.go @@ -15,7 +15,7 @@ type Backend interface { GetServices(basictypes.ServiceListOptions) ([]types.Service, error) GetService(string) (types.Service, error) CreateService(types.ServiceSpec, string) (string, error) - UpdateService(string, uint64, types.ServiceSpec, string) error + UpdateService(string, uint64, types.ServiceSpec, string, string) error RemoveService(string) error GetNodes(basictypes.NodeListOptions) ([]types.Node, error) GetNode(string) (types.Node, error) diff --git a/api/server/router/swarm/cluster_routes.go b/api/server/router/swarm/cluster_routes.go index c73daf9491..387333d975 100644 --- a/api/server/router/swarm/cluster_routes.go +++ b/api/server/router/swarm/cluster_routes.go @@ -156,7 +156,9 @@ func (sr *swarmRouter) updateService(ctx context.Context, w http.ResponseWriter, // Get returns "" if the header does not exist encodedAuth := r.Header.Get("X-Registry-Auth") - if err := sr.backend.UpdateService(vars["id"], version, service, encodedAuth); err != nil { + registryAuthFrom := r.URL.Query().Get("registryAuthFrom") + + if err := sr.backend.UpdateService(vars["id"], version, service, encodedAuth, registryAuthFrom); err != nil { logrus.Errorf("Error updating service %s: %v", vars["id"], err) return err } diff --git a/api/types/swarm/service.go b/api/types/swarm/service.go index c23f68327f..0046387366 100644 --- a/api/types/swarm/service.go +++ b/api/types/swarm/service.go @@ -90,16 +90,16 @@ type UpdateConfig struct { // be used. Monitor time.Duration `json:",omitempty"` - // AllowedFailureFraction is the fraction of tasks that may fail during + // MaxFailureRatio is the fraction of tasks that may fail during // an update before the failure action is invoked. Any task created by // the current update which ends up in one of the states REJECTED, // COMPLETED or FAILED within Monitor from its creation counts as a // failure. The number of failures is divided by the number of tasks // being updated, and if this fraction is greater than - // AllowedFailureFraction, the failure action is invoked. + // MaxFailureRatio, the failure action is invoked. // // If the failure action is CONTINUE, there is no effect. // If the failure action is PAUSE, no more tasks will be updated until // another update is started. 
- AllowedFailureFraction float32 + MaxFailureRatio float32 } diff --git a/cli/command/formatter/service.go b/cli/command/formatter/service.go index 71ee4d656a..1549047b72 100644 --- a/cli/command/formatter/service.go +++ b/cli/command/formatter/service.go @@ -41,10 +41,14 @@ Placement: {{- if .HasUpdateConfig }} UpdateConfig: Parallelism: {{ .UpdateParallelism }} -{{- if .HasUpdateDelay -}} +{{- if .HasUpdateDelay}} Delay: {{ .UpdateDelay }} {{- end }} On failure: {{ .UpdateOnFailure }} +{{- if .HasUpdateMonitor}} + Monitoring Period: {{ .UpdateMonitor }} +{{- end }} + Max failure ratio: {{ .UpdateMaxFailureRatio }} {{- end }} ContainerSpec: Image: {{ .ContainerImage }} @@ -218,6 +222,18 @@ func (ctx *serviceInspectContext) UpdateOnFailure() string { return ctx.Service.Spec.UpdateConfig.FailureAction } +func (ctx *serviceInspectContext) HasUpdateMonitor() bool { + return ctx.Service.Spec.UpdateConfig.Monitor.Nanoseconds() > 0 +} + +func (ctx *serviceInspectContext) UpdateMonitor() time.Duration { + return ctx.Service.Spec.UpdateConfig.Monitor +} + +func (ctx *serviceInspectContext) UpdateMaxFailureRatio() float32 { + return ctx.Service.Spec.UpdateConfig.MaxFailureRatio +} + func (ctx *serviceInspectContext) ContainerImage() string { return ctx.Service.Spec.TaskTemplate.ContainerSpec.Image } diff --git a/cli/command/service/opts.go b/cli/command/service/opts.go index 1e966f90c6..cf25b78273 100644 --- a/cli/command/service/opts.go +++ b/cli/command/service/opts.go @@ -267,9 +267,11 @@ func (m *MountOpt) Value() []mounttypes.Mount { } type updateOptions struct { - parallelism uint64 - delay time.Duration - onFailure string + parallelism uint64 + delay time.Duration + monitor time.Duration + onFailure string + maxFailureRatio float32 } type resourceOptions struct { @@ -458,9 +460,11 @@ func (opts *serviceOptions) ToService() (swarm.ServiceSpec, error) { Networks: convertNetworks(opts.networks), Mode: swarm.ServiceMode{}, UpdateConfig: &swarm.UpdateConfig{ - Parallelism: opts.update.parallelism, - Delay: opts.update.delay, - FailureAction: opts.update.onFailure, + Parallelism: opts.update.parallelism, + Delay: opts.update.delay, + Monitor: opts.update.monitor, + FailureAction: opts.update.onFailure, + MaxFailureRatio: opts.update.maxFailureRatio, }, EndpointSpec: opts.endpoint.ToEndpointSpec(), } @@ -507,7 +511,9 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) { flags.Uint64Var(&opts.update.parallelism, flagUpdateParallelism, 1, "Maximum number of tasks updated simultaneously (0 to update all at once)") flags.DurationVar(&opts.update.delay, flagUpdateDelay, time.Duration(0), "Delay between updates") + flags.DurationVar(&opts.update.monitor, flagUpdateMonitor, time.Duration(0), "Duration after each task update to monitor for failure") flags.StringVar(&opts.update.onFailure, flagUpdateFailureAction, "pause", "Action on update failure (pause|continue)") + flags.Float32Var(&opts.update.maxFailureRatio, flagUpdateMaxFailureRatio, 0, "Failure rate to tolerate during an update") flags.StringVar(&opts.endpoint.mode, flagEndpointMode, "", "Endpoint mode (vip or dnsrr)") @@ -518,46 +524,48 @@ func addServiceFlags(cmd *cobra.Command, opts *serviceOptions) { } const ( - flagConstraint = "constraint" - flagConstraintRemove = "constraint-rm" - flagConstraintAdd = "constraint-add" - flagContainerLabel = "container-label" - flagContainerLabelRemove = "container-label-rm" - flagContainerLabelAdd = "container-label-add" - flagEndpointMode = "endpoint-mode" - flagEnv = "env" - flagEnvRemove = 
"env-rm" - flagEnvAdd = "env-add" - flagGroupAdd = "group-add" - flagGroupRemove = "group-rm" - flagLabel = "label" - flagLabelRemove = "label-rm" - flagLabelAdd = "label-add" - flagLimitCPU = "limit-cpu" - flagLimitMemory = "limit-memory" - flagMode = "mode" - flagMount = "mount" - flagMountRemove = "mount-rm" - flagMountAdd = "mount-add" - flagName = "name" - flagNetwork = "network" - flagPublish = "publish" - flagPublishRemove = "publish-rm" - flagPublishAdd = "publish-add" - flagReplicas = "replicas" - flagReserveCPU = "reserve-cpu" - flagReserveMemory = "reserve-memory" - flagRestartCondition = "restart-condition" - flagRestartDelay = "restart-delay" - flagRestartMaxAttempts = "restart-max-attempts" - flagRestartWindow = "restart-window" - flagStopGracePeriod = "stop-grace-period" - flagUpdateDelay = "update-delay" - flagUpdateFailureAction = "update-failure-action" - flagUpdateParallelism = "update-parallelism" - flagUser = "user" - flagWorkdir = "workdir" - flagRegistryAuth = "with-registry-auth" - flagLogDriver = "log-driver" - flagLogOpt = "log-opt" + flagConstraint = "constraint" + flagConstraintRemove = "constraint-rm" + flagConstraintAdd = "constraint-add" + flagContainerLabel = "container-label" + flagContainerLabelRemove = "container-label-rm" + flagContainerLabelAdd = "container-label-add" + flagEndpointMode = "endpoint-mode" + flagEnv = "env" + flagEnvRemove = "env-rm" + flagEnvAdd = "env-add" + flagGroupAdd = "group-add" + flagGroupRemove = "group-rm" + flagLabel = "label" + flagLabelRemove = "label-rm" + flagLabelAdd = "label-add" + flagLimitCPU = "limit-cpu" + flagLimitMemory = "limit-memory" + flagMode = "mode" + flagMount = "mount" + flagMountRemove = "mount-rm" + flagMountAdd = "mount-add" + flagName = "name" + flagNetwork = "network" + flagPublish = "publish" + flagPublishRemove = "publish-rm" + flagPublishAdd = "publish-add" + flagReplicas = "replicas" + flagReserveCPU = "reserve-cpu" + flagReserveMemory = "reserve-memory" + flagRestartCondition = "restart-condition" + flagRestartDelay = "restart-delay" + flagRestartMaxAttempts = "restart-max-attempts" + flagRestartWindow = "restart-window" + flagStopGracePeriod = "stop-grace-period" + flagUpdateDelay = "update-delay" + flagUpdateFailureAction = "update-failure-action" + flagUpdateMaxFailureRatio = "update-max-failure-ratio" + flagUpdateMonitor = "update-monitor" + flagUpdateParallelism = "update-parallelism" + flagUser = "user" + flagWorkdir = "workdir" + flagRegistryAuth = "with-registry-auth" + flagLogDriver = "log-driver" + flagLogOpt = "log-opt" ) diff --git a/cli/command/service/update.go b/cli/command/service/update.go index be3218ed60..797c989271 100644 --- a/cli/command/service/update.go +++ b/cli/command/service/update.go @@ -36,6 +36,7 @@ func newUpdateCommand(dockerCli *command.DockerCli) *cobra.Command { flags := cmd.Flags() flags.String("image", "", "Service image tag") flags.String("args", "", "Service command args") + flags.Bool("rollback", false, "Rollback to previous specification") addServiceFlags(cmd, opts) flags.Var(newListOptsVar(), flagEnvRemove, "Remove an environment variable") @@ -68,7 +69,20 @@ func runUpdate(dockerCli *command.DockerCli, flags *pflag.FlagSet, serviceID str return err } - err = updateService(flags, &service.Spec) + rollback, err := flags.GetBool("rollback") + if err != nil { + return err + } + + spec := &service.Spec + if rollback { + spec = service.PreviousSpec + if spec == nil { + return fmt.Errorf("service does not have a previous specification to roll back to") + } + } 
+ + err = updateService(flags, spec) if err != nil { return err } @@ -81,15 +95,19 @@ func runUpdate(dockerCli *command.DockerCli, flags *pflag.FlagSet, serviceID str if sendAuth { // Retrieve encoded auth token from the image reference // This would be the old image if it didn't change in this update - image := service.Spec.TaskTemplate.ContainerSpec.Image + image := spec.TaskTemplate.ContainerSpec.Image encodedAuth, err := command.RetrieveAuthTokenFromImage(ctx, dockerCli, image) if err != nil { return err } updateOpts.EncodedRegistryAuth = encodedAuth + } else if rollback { + updateOpts.RegistryAuthFrom = types.RegistryAuthFromPreviousSpec + } else { + updateOpts.RegistryAuthFrom = types.RegistryAuthFromSpec } - err = apiClient.ServiceUpdate(ctx, service.ID, service.Version, service.Spec, updateOpts) + err = apiClient.ServiceUpdate(ctx, service.ID, service.Version, *spec, updateOpts) if err != nil { return err } @@ -111,6 +129,12 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error { } } + updateFloat32 := func(flag string, field *float32) { + if flags.Changed(flag) { + *field, _ = flags.GetFloat32(flag) + } + } + updateDuration := func(flag string, field *time.Duration) { if flags.Changed(flag) { *field, _ = flags.GetDuration(flag) @@ -195,13 +219,15 @@ func updateService(flags *pflag.FlagSet, spec *swarm.ServiceSpec) error { return err } - if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateFailureAction) { + if anyChanged(flags, flagUpdateParallelism, flagUpdateDelay, flagUpdateMonitor, flagUpdateFailureAction, flagUpdateMaxFailureRatio) { if spec.UpdateConfig == nil { spec.UpdateConfig = &swarm.UpdateConfig{} } updateUint64(flagUpdateParallelism, &spec.UpdateConfig.Parallelism) updateDuration(flagUpdateDelay, &spec.UpdateConfig.Delay) + updateDuration(flagUpdateMonitor, &spec.UpdateConfig.Monitor) updateString(flagUpdateFailureAction, &spec.UpdateConfig.FailureAction) + updateFloat32(flagUpdateMaxFailureRatio, &spec.UpdateConfig.MaxFailureRatio) } if flags.Changed(flagEndpointMode) { diff --git a/contrib/completion/bash/docker b/contrib/completion/bash/docker index 0103b25590..e8bfe37997 100644 --- a/contrib/completion/bash/docker +++ b/contrib/completion/bash/docker @@ -1760,9 +1760,12 @@ _docker_service_update() { --restart-delay --restart-max-attempts --restart-window + --rollback --stop-grace-period --update-delay --update-failure-action + --update-max-failure-ratio + --update-monitor --update-parallelism --user -u --workdir -w diff --git a/contrib/completion/zsh/_docker b/contrib/completion/zsh/_docker index d9246105b9..cb73073905 100644 --- a/contrib/completion/zsh/_docker +++ b/contrib/completion/zsh/_docker @@ -1108,6 +1108,8 @@ __docker_service_subcommand() { "($help)--stop-grace-period=[Time to wait before force killing a container]:grace period: " "($help)--update-delay=[Delay between updates]:delay: " "($help)--update-failure-action=[Action on update failure]:mode:(pause continue)" + "($help)--update-max-failure-ratio=[Failure rate to tolerate during an update]:fraction: " + "($help)--update-monitor=[Duration after each task update to monitor for failure]:window: " "($help)--update-parallelism=[Maximum number of tasks updated simultaneously]:number: " "($help -u --user)"{-u=,--user=}"[Username or UID]:user:_users" "($help)--with-registry-auth[Send registry authentication details to swarm agents]" @@ -1185,6 +1187,7 @@ __docker_service_subcommand() { "($help)*--container-label-rm=[Remove a container label by its key]:label: " \ 
"($help)*--group-rm=[Remove previously added user groups from the container]:group:_groups" \ "($help)--image=[Service image tag]:image:__docker_repositories" \ + "($help)--rollback[Rollback to previous specification]" \ "($help -)1:service:__docker_complete_services" && ret=0 ;; (help) diff --git a/daemon/cluster/cluster.go b/daemon/cluster/cluster.go index 2b31724396..8262537816 100644 --- a/daemon/cluster/cluster.go +++ b/daemon/cluster/cluster.go @@ -913,7 +913,7 @@ func (c *Cluster) GetService(input string) (types.Service, error) { } // UpdateService updates existing service to match new properties. -func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec types.ServiceSpec, encodedAuth string) error { +func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec types.ServiceSpec, encodedAuth string, registryAuthFrom string) error { c.RLock() defer c.RUnlock() @@ -948,7 +948,18 @@ func (c *Cluster) UpdateService(serviceIDOrName string, version uint64, spec typ } else { // this is needed because if the encodedAuth isn't being updated then we // shouldn't lose it, and continue to use the one that was already present - ctnr := currentService.Spec.Task.GetContainer() + var ctnr *swarmapi.ContainerSpec + switch registryAuthFrom { + case apitypes.RegistryAuthFromSpec, "": + ctnr = currentService.Spec.Task.GetContainer() + case apitypes.RegistryAuthFromPreviousSpec: + if currentService.PreviousSpec == nil { + return fmt.Errorf("service does not have a previous spec") + } + ctnr = currentService.PreviousSpec.Task.GetContainer() + default: + return fmt.Errorf("unsupported registryAuthFromValue") + } if ctnr == nil { return fmt.Errorf("service does not use container tasks") } diff --git a/daemon/cluster/convert/service.go b/daemon/cluster/convert/service.go index 55d693b04c..311fe22333 100644 --- a/daemon/cluster/convert/service.go +++ b/daemon/cluster/convert/service.go @@ -12,35 +12,11 @@ import ( // ServiceFromGRPC converts a grpc Service to a Service. 
func ServiceFromGRPC(s swarmapi.Service) types.Service { - spec := s.Spec - containerConfig := spec.Task.Runtime.(*swarmapi.TaskSpec_Container).Container - - serviceNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks)) - for _, n := range spec.Networks { - serviceNetworks = append(serviceNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases}) - } - - taskNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Task.Networks)) - for _, n := range spec.Task.Networks { - taskNetworks = append(taskNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases}) - } - service := types.Service{ - ID: s.ID, + ID: s.ID, + Spec: *serviceSpecFromGRPC(&s.Spec), + PreviousSpec: serviceSpecFromGRPC(s.PreviousSpec), - Spec: types.ServiceSpec{ - TaskTemplate: types.TaskSpec{ - ContainerSpec: containerSpecFromGRPC(containerConfig), - Resources: resourcesFromGRPC(s.Spec.Task.Resources), - RestartPolicy: restartPolicyFromGRPC(s.Spec.Task.Restart), - Placement: placementFromGRPC(s.Spec.Task.Placement), - LogDriver: driverFromGRPC(s.Spec.Task.LogDriver), - Networks: taskNetworks, - }, - - Networks: serviceNetworks, - EndpointSpec: endpointSpecFromGRPC(s.Spec.Endpoint), - }, Endpoint: endpointFromGRPC(s.Endpoint), } @@ -49,36 +25,6 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service { service.CreatedAt, _ = ptypes.Timestamp(s.Meta.CreatedAt) service.UpdatedAt, _ = ptypes.Timestamp(s.Meta.UpdatedAt) - // Annotations - service.Spec.Name = s.Spec.Annotations.Name - service.Spec.Labels = s.Spec.Annotations.Labels - - // UpdateConfig - if s.Spec.Update != nil { - service.Spec.UpdateConfig = &types.UpdateConfig{ - Parallelism: s.Spec.Update.Parallelism, - } - - service.Spec.UpdateConfig.Delay, _ = ptypes.Duration(&s.Spec.Update.Delay) - - switch s.Spec.Update.FailureAction { - case swarmapi.UpdateConfig_PAUSE: - service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionPause - case swarmapi.UpdateConfig_CONTINUE: - service.Spec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue - } - } - - // Mode - switch t := s.Spec.GetMode().(type) { - case *swarmapi.ServiceSpec_Global: - service.Spec.Mode.Global = &types.GlobalService{} - case *swarmapi.ServiceSpec_Replicated: - service.Spec.Mode.Replicated = &types.ReplicatedService{ - Replicas: &t.Replicated.Replicas, - } - } - // UpdateStatus service.UpdateStatus = types.UpdateStatus{} if s.UpdateStatus != nil { @@ -99,6 +45,74 @@ func ServiceFromGRPC(s swarmapi.Service) types.Service { return service } +func serviceSpecFromGRPC(spec *swarmapi.ServiceSpec) *types.ServiceSpec { + if spec == nil { + return nil + } + + serviceNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Networks)) + for _, n := range spec.Networks { + serviceNetworks = append(serviceNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases}) + } + + taskNetworks := make([]types.NetworkAttachmentConfig, 0, len(spec.Task.Networks)) + for _, n := range spec.Task.Networks { + taskNetworks = append(taskNetworks, types.NetworkAttachmentConfig{Target: n.Target, Aliases: n.Aliases}) + } + + containerConfig := spec.Task.Runtime.(*swarmapi.TaskSpec_Container).Container + convertedSpec := &types.ServiceSpec{ + Annotations: types.Annotations{ + Name: spec.Annotations.Name, + Labels: spec.Annotations.Labels, + }, + + TaskTemplate: types.TaskSpec{ + ContainerSpec: containerSpecFromGRPC(containerConfig), + Resources: resourcesFromGRPC(spec.Task.Resources), + RestartPolicy: 
restartPolicyFromGRPC(spec.Task.Restart), + Placement: placementFromGRPC(spec.Task.Placement), + LogDriver: driverFromGRPC(spec.Task.LogDriver), + Networks: taskNetworks, + }, + + Networks: serviceNetworks, + EndpointSpec: endpointSpecFromGRPC(spec.Endpoint), + } + + // UpdateConfig + if spec.Update != nil { + convertedSpec.UpdateConfig = &types.UpdateConfig{ + Parallelism: spec.Update.Parallelism, + MaxFailureRatio: spec.Update.MaxFailureRatio, + } + + convertedSpec.UpdateConfig.Delay, _ = ptypes.Duration(&spec.Update.Delay) + if spec.Update.Monitor != nil { + convertedSpec.UpdateConfig.Monitor, _ = ptypes.Duration(spec.Update.Monitor) + } + + switch spec.Update.FailureAction { + case swarmapi.UpdateConfig_PAUSE: + convertedSpec.UpdateConfig.FailureAction = types.UpdateFailureActionPause + case swarmapi.UpdateConfig_CONTINUE: + convertedSpec.UpdateConfig.FailureAction = types.UpdateFailureActionContinue + } + } + + // Mode + switch t := spec.GetMode().(type) { + case *swarmapi.ServiceSpec_Global: + convertedSpec.Mode.Global = &types.GlobalService{} + case *swarmapi.ServiceSpec_Replicated: + convertedSpec.Mode.Replicated = &types.ReplicatedService{ + Replicas: &t.Replicated.Replicas, + } + } + + return convertedSpec +} + // ServiceSpecToGRPC converts a ServiceSpec to a grpc ServiceSpec. func ServiceSpecToGRPC(s types.ServiceSpec) (swarmapi.ServiceSpec, error) { name := s.Name @@ -158,9 +172,13 @@ func ServiceSpecToGRPC(s types.ServiceSpec) (swarmapi.ServiceSpec, error) { return swarmapi.ServiceSpec{}, fmt.Errorf("unrecongized update failure action %s", s.UpdateConfig.FailureAction) } spec.Update = &swarmapi.UpdateConfig{ - Parallelism: s.UpdateConfig.Parallelism, - Delay: *ptypes.DurationProto(s.UpdateConfig.Delay), - FailureAction: failureAction, + Parallelism: s.UpdateConfig.Parallelism, + Delay: *ptypes.DurationProto(s.UpdateConfig.Delay), + FailureAction: failureAction, + MaxFailureRatio: s.UpdateConfig.MaxFailureRatio, + } + if s.UpdateConfig.Monitor != 0 { + spec.Update.Monitor = ptypes.DurationProto(s.UpdateConfig.Monitor) } } diff --git a/docs/reference/api/docker_remote_api.md b/docs/reference/api/docker_remote_api.md index 1b6617c45a..65c36a2886 100644 --- a/docs/reference/api/docker_remote_api.md +++ b/docs/reference/api/docker_remote_api.md @@ -129,6 +129,7 @@ This section lists each version from latest to oldest. Each listing includes a * `GET /containers/json` now supports a `is-task` filter to filter containers that are tasks (part of a service in swarm mode). * `POST /containers/create` now takes `StopTimeout` field. +* `POST /services/create` and `POST /services/(id or name)/update` now accept `Monitor` and `MaxFailureRatio` parameters, which control the response to failures during service updates. ### v1.24 API changes diff --git a/docs/reference/api/docker_remote_api_v1.25.md b/docs/reference/api/docker_remote_api_v1.25.md index b1b716cbc6..5f52e75000 100644 --- a/docs/reference/api/docker_remote_api_v1.25.md +++ b/docs/reference/api/docker_remote_api_v1.25.md @@ -4877,7 +4877,9 @@ List services }, "UpdateConfig": { "Parallelism": 1, - "FailureAction": "pause" + "FailureAction": "pause", + "Monitor": 15000000000, + "MaxFailureRatio": 0.15 }, "EndpointSpec": { "Mode": "vip", @@ -5077,8 +5079,8 @@ image](#create-an-image) section for more details. - **RestartPolicy** – Specification for the restart policy which applies to containers created as part of this service. - **Condition** – Condition for restart (`none`, `on-failure`, or `any`). 
- - **Delay** – Delay between restart attempts. - - **Attempts** – Maximum attempts to restart a given container before giving up (default value + - **Delay** – Delay between restart attempts, in nanoseconds. + - **MaxAttempts** – Maximum attempts to restart a given container before giving up (default value is 0, which is ignored). - **Window** – Windows is the time window used to evaluate the restart policy (default value is 0, which is unbounded). @@ -5087,9 +5089,12 @@ image](#create-an-image) section for more details. - **UpdateConfig** – Specification for the update strategy of the service. - **Parallelism** – Maximum number of tasks to be updated in one iteration (0 means unlimited parallelism). - - **Delay** – Amount of time between updates. + - **Delay** – Amount of time between updates, in nanoseconds. - **FailureAction** - Action to take if an updated task fails to run, or stops running during the update. Values are `continue` and `pause`. + - **Monitor** - Amount of time to monitor each updated task for failures, in nanoseconds. + - **MaxFailureRatio** - The fraction of tasks that may fail during an update before the + failure action is invoked, specified as a floating point number between 0 and 1. The default is 0. - **Networks** – Array of network names or IDs to attach the service to. - **EndpointSpec** – Properties that can be configured to access and load balance a service. - **Mode** – The mode of resolution to use for internal load balancing @@ -5259,7 +5264,9 @@ image](#create-an-image) section for more details. } }, "UpdateConfig": { - "Parallelism": 1 + "Parallelism": 1, + "Monitor": 15000000000, + "MaxFailureRatio": 0.15 }, "EndpointSpec": { "Mode": "vip" @@ -5314,7 +5321,7 @@ image](#create-an-image) section for more details. - **RestartPolicy** – Specification for the restart policy which applies to containers created as part of this service. - **Condition** – Condition for restart (`none`, `on-failure`, or `any`). - - **Delay** – Delay between restart attempts. + - **Delay** – Delay between restart attempts, in nanoseconds. - **MaxAttempts** – Maximum attempts to restart a given container before giving up (default value is 0, which is ignored). - **Window** – Windows is the time window used to evaluate the restart policy (default value is @@ -5324,7 +5331,12 @@ image](#create-an-image) section for more details. - **UpdateConfig** – Specification for the update strategy of the service. - **Parallelism** – Maximum number of tasks to be updated in one iteration (0 means unlimited parallelism). - - **Delay** – Amount of time between updates. + - **Delay** – Amount of time between updates, in nanoseconds. + - **FailureAction** - Action to take if an updated task fails to run, or stops running during the + update. Values are `continue` and `pause`. + - **Monitor** - Amount of time to monitor each updated task for failures, in nanoseconds. + - **MaxFailureRatio** - The fraction of tasks that may fail during an update before the + failure action is invoked, specified as a floating point number between 0 and 1. The default is 0. - **Networks** – Array of network names or IDs to attach the service to. - **EndpointSpec** – Properties that can be configured to access and load balance a service. - **Mode** – The mode of resolution to use for internal load balancing @@ -5338,6 +5350,10 @@ image](#create-an-image) section for more details. - **version** – The version number of the service object being updated. This is required to avoid conflicting writes. 
+- **registryAuthFrom** - If the X-Registry-Auth header is not specified, this + parameter indicates where to find registry authorization credentials. The + valid values are `spec` and `previous-spec`. If unspecified, the default is + `spec`. **Request Headers**: diff --git a/docs/reference/commandline/service_create.md b/docs/reference/commandline/service_create.md index f4d0815070..93ffb0e9a9 100644 --- a/docs/reference/commandline/service_create.md +++ b/docs/reference/commandline/service_create.md @@ -12,36 +12,38 @@ Usage: docker service create [OPTIONS] IMAGE [COMMAND] [ARG...] Create a new service Options: - --constraint value Placement constraints (default []) - --container-label value Service container labels (default []) - --endpoint-mode string Endpoint mode (vip or dnsrr) - -e, --env value Set environment variables (default []) - --group-add value Add additional user groups to the container (default []) - --help Print usage - -l, --label value Service labels (default []) - --limit-cpu value Limit CPUs (default 0.000) - --limit-memory value Limit Memory (default 0 B) - --log-driver string Logging driver for service - --log-opt value Logging driver options (default []) - --mode string Service mode (replicated or global) (default "replicated") - --mount value Attach a mount to the service - --name string Service name - --network value Network attachments (default []) - -p, --publish value Publish a port as a node port (default []) - --replicas value Number of tasks (default none) - --reserve-cpu value Reserve CPUs (default 0.000) - --reserve-memory value Reserve Memory (default 0 B) - --restart-condition string Restart when condition is met (none, on-failure, or any) - --restart-delay value Delay between restart attempts (default none) - --restart-max-attempts value Maximum number of restarts before giving up (default none) - --restart-window value Window used to evaluate the restart policy (default none) - --stop-grace-period value Time to wait before force killing a container (default none) - --update-delay duration Delay between updates - --update-failure-action string Action on update failure (pause|continue) (default "pause") - --update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1) - -u, --user string Username or UID (format: [:]) - --with-registry-auth Send registry authentication details to Swarm agents - -w, --workdir string Working directory inside the container + --constraint value Placement constraints (default []) + --container-label value Service container labels (default []) + --endpoint-mode string Endpoint mode (vip or dnsrr) + -e, --env value Set environment variables (default []) + --group-add value Add additional user groups to the container (default []) + --help Print usage + -l, --label value Service labels (default []) + --limit-cpu value Limit CPUs (default 0.000) + --limit-memory value Limit Memory (default 0 B) + --log-driver string Logging driver for service + --log-opt value Logging driver options (default []) + --mode string Service mode (replicated or global) (default "replicated") + --mount value Attach a mount to the service + --name string Service name + --network value Network attachments (default []) + -p, --publish value Publish a port as a node port (default []) + --replicas value Number of tasks (default none) + --reserve-cpu value Reserve CPUs (default 0.000) + --reserve-memory value Reserve Memory (default 0 B) + --restart-condition string Restart when condition is met (none, on-failure, 
or any) + --restart-delay value Delay between restart attempts (default none) + --restart-max-attempts value Maximum number of restarts before giving up (default none) + --restart-window value Window used to evaluate the restart policy (default none) + --stop-grace-period value Time to wait before force killing a container (default none) + --update-delay duration Delay between updates + --update-failure-action string Action on update failure (pause|continue) (default "pause") + --update-max-failure-ratio value Failure rate to tolerate during an update + --update-monitor duration Duration after each task update to monitor for failure (default 0s) + --update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1) + -u, --user string Username or UID (format: [:]) + --with-registry-auth Send registry authentication details to Swarm agents + -w, --workdir string Working directory inside the container ``` Creates a service as described by the specified parameters. You must run this diff --git a/docs/reference/commandline/service_update.md b/docs/reference/commandline/service_update.md index f1698c3e01..d70a656837 100644 --- a/docs/reference/commandline/service_update.md +++ b/docs/reference/commandline/service_update.md @@ -12,43 +12,46 @@ Usage: docker service update [OPTIONS] SERVICE Update a service Options: - --args string Service command args - --constraint-add value Add or update placement constraints (default []) - --constraint-rm value Remove a constraint (default []) - --container-label-add value Add or update container labels (default []) - --container-label-rm value Remove a container label by its key (default []) - --endpoint-mode string Endpoint mode (vip or dnsrr) - --env-add value Add or update environment variables (default []) - --env-rm value Remove an environment variable (default []) - --group-add value Add additional user groups to the container (default []) - --group-rm value Remove previously added user groups from the container (default []) - --help Print usage - --image string Service image tag - --label-add value Add or update service labels (default []) - --label-rm value Remove a label by its key (default []) - --limit-cpu value Limit CPUs (default 0.000) - --limit-memory value Limit Memory (default 0 B) - --log-driver string Logging driver for service - --log-opt value Logging driver options (default []) - --mount-add value Add or update a mount on a service - --mount-rm value Remove a mount by its target path (default []) - --name string Service name - --publish-add value Add or update a published port (default []) - --publish-rm value Remove a published port by its target port (default []) - --replicas value Number of tasks (default none) - --reserve-cpu value Reserve CPUs (default 0.000) - --reserve-memory value Reserve Memory (default 0 B) - --restart-condition string Restart when condition is met (none, on-failure, or any) - --restart-delay value Delay between restart attempts (default none) - --restart-max-attempts value Maximum number of restarts before giving up (default none) - --restart-window value Window used to evaluate the restart policy (default none) - --stop-grace-period value Time to wait before force killing a container (default none) - --update-delay duration Delay between updates - --update-failure-action string Action on update failure (pause|continue) (default "pause") - --update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1) - -u, --user string 
Username or UID (format: [:]) - --with-registry-auth Send registry authentication details to Swarm agents - -w, --workdir string Working directory inside the container + --args string Service command args + --constraint-add value Add or update placement constraints (default []) + --constraint-rm value Remove a constraint (default []) + --container-label-add value Add or update container labels (default []) + --container-label-rm value Remove a container label by its key (default []) + --endpoint-mode string Endpoint mode (vip or dnsrr) + --env-add value Add or update environment variables (default []) + --env-rm value Remove an environment variable (default []) + --group-add value Add additional user groups to the container (default []) + --group-rm value Remove previously added user groups from the container (default []) + --help Print usage + --image string Service image tag + --label-add value Add or update service labels (default []) + --label-rm value Remove a label by its key (default []) + --limit-cpu value Limit CPUs (default 0.000) + --limit-memory value Limit Memory (default 0 B) + --log-driver string Logging driver for service + --log-opt value Logging driver options (default []) + --mount-add value Add or update a mount on a service + --mount-rm value Remove a mount by its target path (default []) + --name string Service name + --publish-add value Add or update a published port (default []) + --publish-rm value Remove a published port by its target port (default []) + --replicas value Number of tasks (default none) + --reserve-cpu value Reserve CPUs (default 0.000) + --reserve-memory value Reserve Memory (default 0 B) + --restart-condition string Restart when condition is met (none, on-failure, or any) + --restart-delay value Delay between restart attempts (default none) + --restart-max-attempts value Maximum number of restarts before giving up (default none) + --restart-window value Window used to evaluate the restart policy (default none) + --rollback Rollback to previous specification + --stop-grace-period value Time to wait before force killing a container (default none) + --update-delay duration Delay between updates + --update-failure-action string Action on update failure (pause|continue) (default "pause") + --update-max-failure-ratio value Failure rate to tolerate during an update + --update-monitor duration Duration after each task update to monitor for failure (default 0s) + --update-parallelism uint Maximum number of tasks updated simultaneously (0 to update all at once) (default 1) + -u, --user string Username or UID (format: [:]) + --with-registry-auth Send registry authentication details to Swarm agents + -w, --workdir string Working directory inside the container ``` Updates a service as described by the specified parameters. This command has to be run targeting a manager node. 
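
A minimal client-side sketch of how the new update knobs fit together, assuming the `ServiceUpdate` call shape shown in `cli/command/service/update.go` above. The import paths, the local `serviceUpdater` interface, the helper name, and the concrete values are illustrative assumptions, not part of this patch:

```go
// Illustrative sketch only; not part of this diff. It mirrors how the new
// --update-monitor and --update-max-failure-ratio flags and the
// registryAuthFrom fallback are expressed through the Go API types.
// Import paths are assumptions.
package example

import (
	"context"
	"time"

	"github.com/docker/docker/api/types"
	"github.com/docker/docker/api/types/swarm"
)

// serviceUpdater captures the ServiceUpdate call shape used in
// cli/command/service/update.go at this point in the tree.
type serviceUpdater interface {
	ServiceUpdate(ctx context.Context, serviceID string, version swarm.Version, service swarm.ServiceSpec, options types.ServiceUpdateOptions) error
}

// updateWithMonitor submits an update that watches each task for 15s after
// it is updated and pauses the rollout once more than 10% of the updated
// tasks have failed. serviceID, version and spec are assumed to come from a
// prior service inspect.
func updateWithMonitor(ctx context.Context, cli serviceUpdater, serviceID string, version swarm.Version, spec swarm.ServiceSpec) error {
	spec.UpdateConfig = &swarm.UpdateConfig{
		Parallelism:     1,
		Delay:           10 * time.Second,
		Monitor:         15 * time.Second, // new field in this change
		FailureAction:   swarm.UpdateFailureActionPause,
		MaxFailureRatio: 0.1, // new field in this change
	}

	// No X-Registry-Auth header is sent here, so tell the daemon which spec
	// to take stored credentials from: RegistryAuthFromSpec is the default,
	// and RegistryAuthFromPreviousSpec is what --rollback uses.
	opts := types.ServiceUpdateOptions{RegistryAuthFrom: types.RegistryAuthFromSpec}
	return cli.ServiceUpdate(ctx, serviceID, version, spec, opts)
}
```
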
diff --git a/integration-cli/daemon_swarm.go b/integration-cli/daemon_swarm.go index fe69920e64..50e464cdd2 100644 --- a/integration-cli/daemon_swarm.go +++ b/integration-cli/daemon_swarm.go @@ -139,8 +139,8 @@ func (d *SwarmDaemon) getServiceTasks(c *check.C, service string) []swarm.Task { return tasks } -func (d *SwarmDaemon) checkServiceRunningTasks(c *check.C, service string) func(*check.C) (interface{}, check.CommentInterface) { - return func(*check.C) (interface{}, check.CommentInterface) { +func (d *SwarmDaemon) checkServiceRunningTasks(service string) func(*check.C) (interface{}, check.CommentInterface) { + return func(c *check.C) (interface{}, check.CommentInterface) { tasks := d.getServiceTasks(c, service) var runningCount int for _, task := range tasks { @@ -152,8 +152,15 @@ func (d *SwarmDaemon) checkServiceRunningTasks(c *check.C, service string) func( } } -func (d *SwarmDaemon) checkServiceTasks(c *check.C, service string) func(*check.C) (interface{}, check.CommentInterface) { - return func(*check.C) (interface{}, check.CommentInterface) { +func (d *SwarmDaemon) checkServiceUpdateState(service string) func(*check.C) (interface{}, check.CommentInterface) { + return func(c *check.C) (interface{}, check.CommentInterface) { + service := d.getService(c, service) + return service.UpdateStatus.State, nil + } +} + +func (d *SwarmDaemon) checkServiceTasks(service string) func(*check.C) (interface{}, check.CommentInterface) { + return func(c *check.C) (interface{}, check.CommentInterface) { tasks := d.getServiceTasks(c, service) return len(tasks), nil } diff --git a/integration-cli/docker_api_swarm_test.go b/integration-cli/docker_api_swarm_test.go index 7865d2c7bf..cecd9e5bf1 100644 --- a/integration-cli/docker_api_swarm_test.go +++ b/integration-cli/docker_api_swarm_test.go @@ -310,6 +310,63 @@ func (s *DockerSwarmSuite) TestAPISwarmServicesUpdate(c *check.C) { // 3nd batch waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals, map[string]int{image2: instances}) + + // Roll back to the previous version. This uses the CLI because + // rollback is a client-side operation. 
+ out, err := daemons[0].Cmd("service", "update", "--rollback", id) + c.Assert(err, checker.IsNil, check.Commentf(out)) + + // first batch + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals, + map[string]int{image2: instances - parallelism, image1: parallelism}) + + // 2nd batch + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals, + map[string]int{image2: instances - 2*parallelism, image1: 2 * parallelism}) + + // 3nd batch + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals, + map[string]int{image1: instances}) +} + +func (s *DockerSwarmSuite) TestApiSwarmServicesFailedUpdate(c *check.C) { + const nodeCount = 3 + var daemons [nodeCount]*SwarmDaemon + for i := 0; i < nodeCount; i++ { + daemons[i] = s.AddDaemon(c, true, i == 0) + } + // wait for nodes ready + waitAndAssert(c, 5*time.Second, daemons[0].checkNodeReadyCount, checker.Equals, nodeCount) + + // service image at start + image1 := "busybox:latest" + // target image in update + image2 := "busybox:badtag" + + // create service + instances := 5 + id := daemons[0].createService(c, serviceForUpdate, setInstances(instances)) + + // wait for tasks ready + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals, + map[string]int{image1: instances}) + + // issue service update + service := daemons[0].getService(c, id) + daemons[0].updateService(c, service, setImage(image2), setFailureAction(swarm.UpdateFailureActionPause), setMaxFailureRatio(0.25), setParallelism(1)) + + // should update 2 tasks and then pause + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceUpdateState(id), checker.Equals, swarm.UpdateStatePaused) + v, _ := daemons[0].checkServiceRunningTasks(id)(c) + c.Assert(v, checker.Equals, instances-2) + + // Roll back to the previous version. This uses the CLI because + // rollback is a client-side operation. 
+ out, err := daemons[0].Cmd("service", "update", "--rollback", id) + c.Assert(err, checker.IsNil, check.Commentf(out)) + + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkRunningTaskImages, checker.DeepEquals, + map[string]int{image1: instances}) } func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) { @@ -326,7 +383,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) { instances := 3 id := daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances)) // wait for tasks ready - waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances) + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances) // validate tasks are running on worker nodes tasks := daemons[0].getServiceTasks(c, id) for _, task := range tasks { @@ -340,7 +397,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) { constraints = []string{"node.role!=worker"} id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances)) // wait for tasks ready - waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances) + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances) tasks = daemons[0].getServiceTasks(c, id) // validate tasks are running on manager nodes for _, task := range tasks { @@ -354,7 +411,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintRole(c *check.C) { constraints = []string{"node.role==nosuchrole"} id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances)) // wait for tasks created - waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(c, id), checker.Equals, instances) + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(id), checker.Equals, instances) // let scheduler try time.Sleep(250 * time.Millisecond) // validate tasks are not assigned to any node @@ -394,7 +451,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) { constraints := []string{"node.labels.security==high"} id := daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances)) // wait for tasks ready - waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances) + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances) tasks := daemons[0].getServiceTasks(c, id) // validate all tasks are running on nodes[0] for _, task := range tasks { @@ -407,7 +464,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) { constraints = []string{"node.labels.security!=high"} id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances)) // wait for tasks ready - waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances) + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances) tasks = daemons[0].getServiceTasks(c, id) // validate all tasks are NOT running on nodes[0] for _, task := range tasks { @@ -419,7 +476,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) { constraints = 
[]string{"node.labels.security==medium"} id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances)) // wait for tasks created - waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(c, id), checker.Equals, instances) + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(id), checker.Equals, instances) // let scheduler try time.Sleep(250 * time.Millisecond) tasks = daemons[0].getServiceTasks(c, id) @@ -437,7 +494,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) { } id = daemons[0].createService(c, simpleTestService, setConstraints(constraints), setInstances(instances)) // wait for tasks created - waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(c, id), checker.Equals, instances) + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceTasks(id), checker.Equals, instances) // let scheduler try time.Sleep(250 * time.Millisecond) tasks = daemons[0].getServiceTasks(c, id) @@ -452,7 +509,7 @@ func (s *DockerSwarmSuite) TestAPISwarmServiceConstraintLabel(c *check.C) { } }) // wait for tasks ready - waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(c, id), checker.Equals, instances) + waitAndAssert(c, defaultReconciliationTimeout, daemons[0].checkServiceRunningTasks(id), checker.Equals, instances) tasks = daemons[0].getServiceTasks(c, id) for _, task := range tasks { c.Assert(task.NodeID, checker.Equals, nodes[1].ID) @@ -1022,6 +1079,24 @@ func setImage(image string) serviceConstructor { } } +func setFailureAction(failureAction string) serviceConstructor { + return func(s *swarm.Service) { + s.Spec.UpdateConfig.FailureAction = failureAction + } +} + +func setMaxFailureRatio(maxFailureRatio float32) serviceConstructor { + return func(s *swarm.Service) { + s.Spec.UpdateConfig.MaxFailureRatio = maxFailureRatio + } +} + +func setParallelism(parallelism uint64) serviceConstructor { + return func(s *swarm.Service) { + s.Spec.UpdateConfig.Parallelism = parallelism + } +} + func setConstraints(constraints []string) serviceConstructor { return func(s *swarm.Service) { if s.Spec.TaskTemplate.Placement == nil { diff --git a/integration-cli/docker_cli_swarm_test.go b/integration-cli/docker_cli_swarm_test.go index 0e6a5f49c8..9092ee7ee2 100644 --- a/integration-cli/docker_cli_swarm_test.go +++ b/integration-cli/docker_cli_swarm_test.go @@ -349,7 +349,7 @@ func (s *DockerSwarmSuite) TestPsListContainersFilterIsTask(c *check.C) { c.Assert(strings.TrimSpace(out), checker.Not(checker.Equals), "") // make sure task has been deployed. - waitAndAssert(c, defaultReconciliationTimeout, d.checkServiceRunningTasks(c, name), checker.Equals, 1) + waitAndAssert(c, defaultReconciliationTimeout, d.checkServiceRunningTasks(name), checker.Equals, 1) // Filter non-tasks out, err = d.Cmd("ps", "-a", "-q", "--filter=is-task=false")