moby--moby/cmd/dockerd/config_unix.go

77 lines
4.5 KiB
Go
Raw Normal View History

//go:build linux || freebsd
// +build linux freebsd
package main
import (
"os/exec"
"github.com/containerd/cgroups"
"github.com/docker/docker/daemon/config"
"github.com/docker/docker/opts"
"github.com/docker/docker/rootless"
units "github.com/docker/go-units"
"github.com/pkg/errors"
"github.com/spf13/pflag"
)
// installConfigFlags adds flags to the pflag.FlagSet to configure the daemon
func installConfigFlags(conf *config.Config, flags *pflag.FlagSet) error {
// First handle install flags which are consistent cross-platform
if err := installCommonConfigFlags(conf, flags); err != nil {
return err
}
// Then install flags common to unix platforms
installUnixConfigFlags(conf, flags)
conf.Ulimits = make(map[string]*units.Ulimit)
conf.NetworkConfig.DefaultAddressPools = opts.PoolsOpt{}
// Set default value for `--default-shm-size`
conf.ShmSize = opts.MemBytes(config.DefaultShmSize)
// Then platform-specific install flags
flags.BoolVar(&conf.EnableSelinuxSupport, "selinux-enabled", false, "Enable selinux support")
flags.Var(opts.NewNamedUlimitOpt("default-ulimits", &conf.Ulimits), "default-ulimit", "Default ulimits for containers")
flags.BoolVar(&conf.BridgeConfig.EnableIPTables, "iptables", true, "Enable addition of iptables rules")
flags.BoolVar(&conf.BridgeConfig.EnableIP6Tables, "ip6tables", false, "Enable addition of ip6tables rules")
flags.BoolVar(&conf.BridgeConfig.EnableIPForward, "ip-forward", true, "Enable net.ipv4.ip_forward")
flags.BoolVar(&conf.BridgeConfig.EnableIPMasq, "ip-masq", true, "Enable IP masquerading")
flags.BoolVar(&conf.BridgeConfig.EnableIPv6, "ipv6", false, "Enable IPv6 networking")
flags.StringVar(&conf.BridgeConfig.FixedCIDRv6, "fixed-cidr-v6", "", "IPv6 subnet for fixed IPs")
flags.BoolVar(&conf.BridgeConfig.EnableUserlandProxy, "userland-proxy", true, "Use userland proxy for loopback traffic")
defaultUserlandProxyPath := ""
if rootless.RunningWithRootlessKit() {
var err error
// use rootlesskit-docker-proxy for exposing the ports in RootlessKit netns to the initial namespace.
defaultUserlandProxyPath, err = exec.LookPath(rootless.RootlessKitDockerProxyBinary)
if err != nil {
return errors.Wrapf(err, "running with RootlessKit, but %s not installed", rootless.RootlessKitDockerProxyBinary)
}
}
flags.StringVar(&conf.BridgeConfig.UserlandProxyPath, "userland-proxy-path", defaultUserlandProxyPath, "Path to the userland proxy binary")
flags.StringVar(&conf.CgroupParent, "cgroup-parent", "", "Set parent cgroup for all containers")
flags.StringVar(&conf.RemappedRoot, "userns-remap", "", "User/Group setting for user namespaces")
flags.BoolVar(&conf.LiveRestoreEnabled, "live-restore", false, "Enable live restore of docker when containers are still running")
flags.IntVar(&conf.OOMScoreAdjust, "oom-score-adjust", 0, "Set the oom_score_adj for the daemon")
flags.BoolVar(&conf.Init, "init", false, "Run an init in the container to forward signals and reap processes")
flags.StringVar(&conf.InitPath, "init-path", "", "Path to the docker-init binary")
flags.Int64Var(&conf.CPURealtimePeriod, "cpu-rt-period", 0, "Limit the CPU real-time period in microseconds for the parent cgroup for all containers")
flags.Int64Var(&conf.CPURealtimeRuntime, "cpu-rt-runtime", 0, "Limit the CPU real-time runtime in microseconds for the parent cgroup for all containers")
flags.StringVar(&conf.SeccompProfile, "seccomp-profile", "", "Path to seccomp profile")
flags.Var(&conf.ShmSize, "default-shm-size", "Default shm size for containers")
flags.BoolVar(&conf.NoNewPrivileges, "no-new-privileges", false, "Set no-new-privileges by default for new containers")
Implement none, private, and shareable ipc modes Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and /dev/mqueue between containers") container's /dev/shm is mounted on the host first, then bind-mounted inside the container. This is done that way in order to be able to share this container's IPC namespace (and the /dev/shm mount point) with another container. Unfortunately, this functionality breaks container checkpoint/restore (even if IPC is not shared). Since /dev/shm is an external mount, its contents is not saved by `criu checkpoint`, and so upon restore any application that tries to access data under /dev/shm is severily disappointed (which usually results in a fatal crash). This commit solves the issue by introducing new IPC modes for containers (in addition to 'host' and 'container:ID'). The new modes are: - 'shareable': enables sharing this container's IPC with others (this used to be the implicit default); - 'private': disables sharing this container's IPC. In 'private' mode, container's /dev/shm is truly mounted inside the container, without any bind-mounting from the host, which solves the issue. While at it, let's also implement 'none' mode. The motivation, as eloquently put by Justin Cormack, is: > I wondered a while back about having a none shm mode, as currently it is > not possible to have a totally unwriteable container as there is always > a /dev/shm writeable mount. It is a bit of a niche case (and clearly > should never be allowed to be daemon default) but it would be trivial to > add now so maybe we should... ...so here's yet yet another mode: - 'none': no /dev/shm mount inside the container (though it still has its own private IPC namespace). Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd need to make 'private' the default mode, but unfortunately it breaks the backward compatibility. So, let's make the default container IPC mode per-daemon configurable (with the built-in default set to 'shareable' for now). The default can be changed either via a daemon CLI option (--default-shm-mode) or a daemon.json configuration file parameter of the same name. Note one can only set either 'shareable' or 'private' IPC modes as a daemon default (i.e. in this context 'host', 'container', or 'none' do not make much sense). Some other changes this patch introduces are: 1. A mount for /dev/shm is added to default OCI Linux spec. 2. IpcMode.Valid() is simplified to remove duplicated code that parsed 'container:ID' form. Note the old version used to check that ID does not contain a semicolon -- this is no longer the case (tests are modified accordingly). The motivation is we should either do a proper check for container ID validity, or don't check it at all (since it is checked in other places anyway). I chose the latter. 3. IpcMode.Container() is modified to not return container ID if the mode value does not start with "container:", unifying the check to be the same as in IpcMode.IsContainer(). 3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified to add checks for newly added values. [v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997] [v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833] [v4: addressed the case of upgrading from older daemon, in this case container.HostConfig.IpcMode is unset and this is valid] [v5: document old and new IpcMode values in api/swagger.yaml] [v6: add the 'none' mode, changelog entry to docs/api/version-history.md] Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 21:58:50 +00:00
flags.StringVar(&conf.IpcMode, "default-ipc-mode", config.DefaultIpcMode, `Default mode for containers ipc ("shareable" | "private")`)
flags.Var(&conf.NetworkConfig.DefaultAddressPools, "default-address-pool", "Default address pools for node specific local networks")
// rootless needs to be explicitly specified for running "rootful" dockerd in rootless dockerd (#38702)
// Note that defaultUserlandProxyPath and honorXDG are configured according to the value of rootless.RunningWithRootlessKit, not the value of --rootless.
flags.BoolVar(&conf.Rootless, "rootless", rootless.RunningWithRootlessKit(), "Enable rootless mode; typically used with RootlessKit")
defaultCgroupNamespaceMode := "host"
if cgroups.Mode() == cgroups.Unified {
defaultCgroupNamespaceMode = "private"
}
flags.StringVar(&conf.CgroupNamespaceMode, "default-cgroupns-mode", defaultCgroupNamespaceMode, `Default mode for containers cgroup namespace ("host" | "private")`)
return nil
}