2021-08-23 09:14:53 -04:00
//go:build linux || freebsd
2017-11-01 19:37:53 -04:00
// +build linux freebsd
2017-01-23 06:23:07 -05:00
package main
import (
2019-04-19 03:53:58 -04:00
"os/exec"
2020-11-09 09:00:32 -05:00
"github.com/containerd/cgroups"
2017-01-23 06:23:07 -05:00
"github.com/docker/docker/daemon/config"
"github.com/docker/docker/opts"
2018-10-15 03:52:53 -04:00
"github.com/docker/docker/rootless"
2019-08-05 10:37:47 -04:00
units "github.com/docker/go-units"
2019-04-19 03:53:58 -04:00
"github.com/pkg/errors"
2017-01-23 06:23:07 -05:00
"github.com/spf13/pflag"
)
// installConfigFlags adds flags to the pflag.FlagSet to configure the daemon
2018-10-15 03:52:53 -04:00
func installConfigFlags ( conf * config . Config , flags * pflag . FlagSet ) error {
2017-01-23 06:23:07 -05:00
// First handle install flags which are consistent cross-platform
2018-10-15 03:52:53 -04:00
if err := installCommonConfigFlags ( conf , flags ) ; err != nil {
return err
}
2017-01-23 06:23:07 -05:00
// Then install flags common to unix platforms
installUnixConfigFlags ( conf , flags )
conf . Ulimits = make ( map [ string ] * units . Ulimit )
2016-12-13 18:04:59 -05:00
conf . NetworkConfig . DefaultAddressPools = opts . PoolsOpt { }
2017-01-23 06:23:07 -05:00
// Set default value for `--default-shm-size`
conf . ShmSize = opts . MemBytes ( config . DefaultShmSize )
// Then platform-specific install flags
flags . BoolVar ( & conf . EnableSelinuxSupport , "selinux-enabled" , false , "Enable selinux support" )
2017-04-12 03:26:42 -04:00
flags . Var ( opts . NewNamedUlimitOpt ( "default-ulimits" , & conf . Ulimits ) , "default-ulimit" , "Default ulimits for containers" )
2017-01-23 06:23:07 -05:00
flags . BoolVar ( & conf . BridgeConfig . EnableIPTables , "iptables" , true , "Enable addition of iptables rules" )
2020-07-18 14:07:59 -04:00
flags . BoolVar ( & conf . BridgeConfig . EnableIP6Tables , "ip6tables" , false , "Enable addition of ip6tables rules" )
2017-01-23 06:23:07 -05:00
flags . BoolVar ( & conf . BridgeConfig . EnableIPForward , "ip-forward" , true , "Enable net.ipv4.ip_forward" )
flags . BoolVar ( & conf . BridgeConfig . EnableIPMasq , "ip-masq" , true , "Enable IP masquerading" )
flags . BoolVar ( & conf . BridgeConfig . EnableIPv6 , "ipv6" , false , "Enable IPv6 networking" )
flags . StringVar ( & conf . BridgeConfig . FixedCIDRv6 , "fixed-cidr-v6" , "" , "IPv6 subnet for fixed IPs" )
flags . BoolVar ( & conf . BridgeConfig . EnableUserlandProxy , "userland-proxy" , true , "Use userland proxy for loopback traffic" )
2019-04-19 03:53:58 -04:00
defaultUserlandProxyPath := ""
if rootless . RunningWithRootlessKit ( ) {
var err error
// use rootlesskit-docker-proxy for exposing the ports in RootlessKit netns to the initial namespace.
defaultUserlandProxyPath , err = exec . LookPath ( rootless . RootlessKitDockerProxyBinary )
if err != nil {
return errors . Wrapf ( err , "running with RootlessKit, but %s not installed" , rootless . RootlessKitDockerProxyBinary )
}
}
flags . StringVar ( & conf . BridgeConfig . UserlandProxyPath , "userland-proxy-path" , defaultUserlandProxyPath , "Path to the userland proxy binary" )
2017-01-23 06:23:07 -05:00
flags . StringVar ( & conf . CgroupParent , "cgroup-parent" , "" , "Set parent cgroup for all containers" )
flags . StringVar ( & conf . RemappedRoot , "userns-remap" , "" , "User/Group setting for user namespaces" )
flags . BoolVar ( & conf . LiveRestoreEnabled , "live-restore" , false , "Enable live restore of docker when containers are still running" )
2020-10-05 12:55:43 -04:00
flags . IntVar ( & conf . OOMScoreAdjust , "oom-score-adjust" , 0 , "Set the oom_score_adj for the daemon" )
2017-01-23 06:23:07 -05:00
flags . BoolVar ( & conf . Init , "init" , false , "Run an init in the container to forward signals and reap processes" )
flags . StringVar ( & conf . InitPath , "init-path" , "" , "Path to the docker-init binary" )
2019-06-21 11:15:21 -04:00
flags . Int64Var ( & conf . CPURealtimePeriod , "cpu-rt-period" , 0 , "Limit the CPU real-time period in microseconds for the parent cgroup for all containers" )
flags . Int64Var ( & conf . CPURealtimeRuntime , "cpu-rt-runtime" , 0 , "Limit the CPU real-time runtime in microseconds for the parent cgroup for all containers" )
2017-01-23 06:23:07 -05:00
flags . StringVar ( & conf . SeccompProfile , "seccomp-profile" , "" , "Path to seccomp profile" )
flags . Var ( & conf . ShmSize , "default-shm-size" , "Default shm size for containers" )
2017-05-01 10:15:03 -04:00
flags . BoolVar ( & conf . NoNewPrivileges , "no-new-privileges" , false , "Set no-new-privileges by default for new containers" )
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
flags . StringVar ( & conf . IpcMode , "default-ipc-mode" , config . DefaultIpcMode , ` Default mode for containers ipc ("shareable" | "private") ` )
2016-12-13 18:04:59 -05:00
flags . Var ( & conf . NetworkConfig . DefaultAddressPools , "default-address-pool" , "Default address pools for node specific local networks" )
2019-04-19 03:53:58 -04:00
// rootless needs to be explicitly specified for running "rootful" dockerd in rootless dockerd (#38702)
// Note that defaultUserlandProxyPath and honorXDG are configured according to the value of rootless.RunningWithRootlessKit, not the value of --rootless.
2020-04-28 22:06:11 -04:00
flags . BoolVar ( & conf . Rootless , "rootless" , rootless . RunningWithRootlessKit ( ) , "Enable rootless mode; typically used with RootlessKit" )
2019-11-05 04:04:21 -05:00
defaultCgroupNamespaceMode := "host"
2020-11-09 09:00:32 -05:00
if cgroups . Mode ( ) == cgroups . Unified {
2019-11-05 04:04:21 -05:00
defaultCgroupNamespaceMode = "private"
}
flags . StringVar ( & conf . CgroupNamespaceMode , "default-cgroupns-mode" , defaultCgroupNamespaceMode , ` Default mode for containers cgroup namespace ("host" | "private") ` )
2018-10-15 03:52:53 -04:00
return nil
2017-01-23 06:23:07 -05:00
}