diff --git a/daemon/daemon_linux_test.go b/daemon/daemon_linux_test.go index 88577a2b82..e26c08f24e 100644 --- a/daemon/daemon_linux_test.go +++ b/daemon/daemon_linux_test.go @@ -10,10 +10,7 @@ import ( "testing" containertypes "github.com/docker/docker/api/types/container" - "github.com/docker/docker/container" "github.com/docker/docker/daemon/config" - "github.com/docker/docker/oci" - "github.com/docker/docker/pkg/idtools" "github.com/docker/docker/pkg/mount" "gotest.tools/assert" is "gotest.tools/assert/cmp" @@ -115,54 +112,6 @@ func TestNotCleanupMounts(t *testing.T) { } } -// TestTmpfsDevShmSizeOverride checks that user-specified /dev/tmpfs mount -// size is not overridden by the default shmsize (that should only be used -// for default /dev/shm (as in "shareable" and "private" ipc modes). -// https://github.com/moby/moby/issues/35271 -func TestTmpfsDevShmSizeOverride(t *testing.T) { - size := "777m" - mnt := "/dev/shm" - - d := Daemon{ - idMapping: &idtools.IdentityMapping{}, - } - c := &container.Container{ - HostConfig: &containertypes.HostConfig{ - ShmSize: 48 * 1024, // size we should NOT end up with - }, - } - ms := []container.Mount{ - { - Source: "tmpfs", - Destination: mnt, - Data: "size=" + size, - }, - } - - // convert ms to spec - spec := oci.DefaultSpec() - err := setMounts(&d, &spec, c, ms) - assert.Check(t, err) - - // Check the resulting spec for the correct size - found := false - for _, m := range spec.Mounts { - if m.Destination == mnt { - for _, o := range m.Options { - if !strings.HasPrefix(o, "size=") { - continue - } - t.Logf("%+v\n", m.Options) - assert.Check(t, is.Equal("size="+size, o)) - found = true - } - } - } - if !found { - t.Fatal("/dev/shm not found in spec, or size option missing") - } -} - func TestValidateContainerIsolationLinux(t *testing.T) { d := Daemon{} diff --git a/daemon/exec_linux.go b/daemon/exec_linux.go index 2c4f96a5c8..f712dd31df 100644 --- a/daemon/exec_linux.go +++ b/daemon/exec_linux.go @@ -1,6 +1,8 @@ package daemon // import "github.com/docker/docker/daemon" import ( + "context" + "github.com/docker/docker/container" "github.com/docker/docker/daemon/exec" "github.com/docker/docker/oci/caps" @@ -54,6 +56,6 @@ func (daemon *Daemon) execSetPlatformOpt(c *container.Container, ec *exec.Config } p.ApparmorProfile = appArmorProfile } - daemon.setRlimits(&specs.Spec{Process: p}, c) - return nil + s := &specs.Spec{Process: p} + return WithRlimits(daemon, c)(context.Background(), nil, nil, s) } diff --git a/daemon/oci_linux.go b/daemon/oci_linux.go index e2346633ae..ddb4192060 100644 --- a/daemon/oci_linux.go +++ b/daemon/oci_linux.go @@ -33,27 +33,121 @@ import ( "golang.org/x/sys/unix" ) -const ( - inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary -) +const inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary -func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error { - var rlimits []specs.POSIXRlimit +// WithRlimits sets the container's rlimits along with merging the daemon's rlimits +func WithRlimits(daemon *Daemon, c *container.Container) coci.SpecOpts { + return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + var rlimits []specs.POSIXRlimit - // We want to leave the original HostConfig alone so make a copy here - hostConfig := *c.HostConfig - // Merge with the daemon defaults - daemon.mergeUlimits(&hostConfig) - for _, ul := range hostConfig.Ulimits { - rlimits = append(rlimits, specs.POSIXRlimit{ - Type: "RLIMIT_" + strings.ToUpper(ul.Name), - Soft: uint64(ul.Soft), - Hard: uint64(ul.Hard), - }) + // We want to leave the original HostConfig alone so make a copy here + hostConfig := *c.HostConfig + // Merge with the daemon defaults + daemon.mergeUlimits(&hostConfig) + for _, ul := range hostConfig.Ulimits { + rlimits = append(rlimits, specs.POSIXRlimit{ + Type: "RLIMIT_" + strings.ToUpper(ul.Name), + Soft: uint64(ul.Soft), + Hard: uint64(ul.Hard), + }) + } + + s.Process.Rlimits = rlimits + return nil } +} - s.Process.Rlimits = rlimits - return nil +// WithLibnetwork sets the libnetwork hook +func WithLibnetwork(daemon *Daemon, c *container.Container) coci.SpecOpts { + return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + if s.Hooks == nil { + s.Hooks = &specs.Hooks{} + } + for _, ns := range s.Linux.Namespaces { + if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled { + target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe") + s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{ + Path: target, + Args: []string{ + "libnetwork-setkey", + "-exec-root=" + daemon.configStore.GetExecRoot(), + c.ID, + daemon.netController.ID(), + }, + }) + } + } + return nil + } +} + +// WithRootless sets the spec to the rootless configuration +func WithRootless(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + return specconv.ToRootless(s) +} + +// WithOOMScore sets the oom score +func WithOOMScore(score *int) coci.SpecOpts { + return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + s.Process.OOMScoreAdj = score + return nil + } +} + +// WithSelinux sets the selinux labels +func WithSelinux(c *container.Container) coci.SpecOpts { + return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + s.Process.SelinuxLabel = c.GetProcessLabel() + s.Linux.MountLabel = c.MountLabel + return nil + } +} + +// WithApparmor sets the apparmor profile +func WithApparmor(c *container.Container) coci.SpecOpts { + return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + if apparmor.IsEnabled() { + var appArmorProfile string + if c.AppArmorProfile != "" { + appArmorProfile = c.AppArmorProfile + } else if c.HostConfig.Privileged { + appArmorProfile = "unconfined" + } else { + appArmorProfile = "docker-default" + } + + if appArmorProfile == "docker-default" { + // Unattended upgrades and other fun services can unload AppArmor + // profiles inadvertently. Since we cannot store our profile in + // /etc/apparmor.d, nor can we practically add other ways of + // telling the system to keep our profile loaded, in order to make + // sure that we keep the default profile enabled we dynamically + // reload it if necessary. + if err := ensureDefaultAppArmorProfile(); err != nil { + return err + } + } + s.Process.ApparmorProfile = appArmorProfile + } + return nil + } +} + +// WithCapabilities sets the container's capabilties +func WithCapabilities(c *container.Container) coci.SpecOpts { + return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + capabilities, err := caps.TweakCapabilities( + oci.DefaultCapabilities(), + c.HostConfig.CapAdd, + c.HostConfig.CapDrop, + c.HostConfig.Capabilities, + c.HostConfig.Privileged, + ) + if err != nil { + return err + } + return oci.SetCapabilities(s, capabilities) + } } func readUserFile(c *container.Container, p string) (io.ReadCloser, error) { @@ -119,99 +213,102 @@ func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) { s.Linux.Namespaces = append(s.Linux.Namespaces, ns) } -func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error { - userNS := false - // user - if c.HostConfig.UsernsMode.IsPrivate() { - uidMap := daemon.idMapping.UIDs() - if uidMap != nil { - userNS = true - ns := specs.LinuxNamespace{Type: "user"} - setNamespace(s, ns) - s.Linux.UIDMappings = specMapping(uidMap) - s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs()) +// WithNamespaces sets the container's namespaces +func WithNamespaces(daemon *Daemon, c *container.Container) coci.SpecOpts { + return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + userNS := false + // user + if c.HostConfig.UsernsMode.IsPrivate() { + uidMap := daemon.idMapping.UIDs() + if uidMap != nil { + userNS = true + ns := specs.LinuxNamespace{Type: "user"} + setNamespace(s, ns) + s.Linux.UIDMappings = specMapping(uidMap) + s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs()) + } } - } - // network - if !c.Config.NetworkDisabled { - ns := specs.LinuxNamespace{Type: "network"} - parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2) - if parts[0] == "container" { - nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer()) + // network + if !c.Config.NetworkDisabled { + ns := specs.LinuxNamespace{Type: "network"} + parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2) + if parts[0] == "container" { + nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer()) + if err != nil { + return err + } + ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()) + if userNS { + // to share a net namespace, they must also share a user namespace + nsUser := specs.LinuxNamespace{Type: "user"} + nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()) + setNamespace(s, nsUser) + } + } else if c.HostConfig.NetworkMode.IsHost() { + ns.Path = c.NetworkSettings.SandboxKey + } + setNamespace(s, ns) + } + + // ipc + ipcMode := c.HostConfig.IpcMode + switch { + case ipcMode.IsContainer(): + ns := specs.LinuxNamespace{Type: "ipc"} + ic, err := daemon.getIpcContainer(ipcMode.Container()) if err != nil { return err } - ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID()) + ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()) + setNamespace(s, ns) if userNS { - // to share a net namespace, they must also share a user namespace + // to share an IPC namespace, they must also share a user namespace nsUser := specs.LinuxNamespace{Type: "user"} - nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID()) + nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()) setNamespace(s, nsUser) } - } else if c.HostConfig.NetworkMode.IsHost() { - ns.Path = c.NetworkSettings.SandboxKey + case ipcMode.IsHost(): + oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc")) + case ipcMode.IsEmpty(): + // A container was created by an older version of the daemon. + // The default behavior used to be what is now called "shareable". + fallthrough + case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone(): + ns := specs.LinuxNamespace{Type: "ipc"} + setNamespace(s, ns) + default: + return fmt.Errorf("Invalid IPC mode: %v", ipcMode) } - setNamespace(s, ns) - } - // ipc - ipcMode := c.HostConfig.IpcMode - switch { - case ipcMode.IsContainer(): - ns := specs.LinuxNamespace{Type: "ipc"} - ic, err := daemon.getIpcContainer(ipcMode.Container()) - if err != nil { - return err + // pid + if c.HostConfig.PidMode.IsContainer() { + ns := specs.LinuxNamespace{Type: "pid"} + pc, err := daemon.getPidContainer(c) + if err != nil { + return err + } + ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()) + setNamespace(s, ns) + if userNS { + // to share a PID namespace, they must also share a user namespace + nsUser := specs.LinuxNamespace{Type: "user"} + nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()) + setNamespace(s, nsUser) + } + } else if c.HostConfig.PidMode.IsHost() { + oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid")) + } else { + ns := specs.LinuxNamespace{Type: "pid"} + setNamespace(s, ns) } - ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID()) - setNamespace(s, ns) - if userNS { - // to share an IPC namespace, they must also share a user namespace - nsUser := specs.LinuxNamespace{Type: "user"} - nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID()) - setNamespace(s, nsUser) + // uts + if c.HostConfig.UTSMode.IsHost() { + oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts")) + s.Hostname = "" } - case ipcMode.IsHost(): - oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc")) - case ipcMode.IsEmpty(): - // A container was created by an older version of the daemon. - // The default behavior used to be what is now called "shareable". - fallthrough - case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone(): - ns := specs.LinuxNamespace{Type: "ipc"} - setNamespace(s, ns) - default: - return fmt.Errorf("Invalid IPC mode: %v", ipcMode) - } - // pid - if c.HostConfig.PidMode.IsContainer() { - ns := specs.LinuxNamespace{Type: "pid"} - pc, err := daemon.getPidContainer(c) - if err != nil { - return err - } - ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID()) - setNamespace(s, ns) - if userNS { - // to share a PID namespace, they must also share a user namespace - nsUser := specs.LinuxNamespace{Type: "user"} - nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID()) - setNamespace(s, nsUser) - } - } else if c.HostConfig.PidMode.IsHost() { - oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid")) - } else { - ns := specs.LinuxNamespace{Type: "pid"} - setNamespace(s, ns) + return nil } - // uts - if c.HostConfig.UTSMode.IsHost() { - oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts")) - s.Hostname = "" - } - - return nil } func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping { @@ -361,233 +458,284 @@ func inSlice(slice []string, s string) bool { return false } -func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error { - userMounts := make(map[string]struct{}) - for _, m := range mounts { - userMounts[m.Destination] = struct{}{} - } - - // Copy all mounts from spec to defaultMounts, except for - // - mounts overridden by a user supplied mount; - // - all mounts under /dev if a user supplied /dev is present; - // - /dev/shm, in case IpcMode is none. - // While at it, also - // - set size for /dev/shm from shmsize. - defaultMounts := s.Mounts[:0] - _, mountDev := userMounts["/dev"] - for _, m := range s.Mounts { - if _, ok := userMounts[m.Destination]; ok { - // filter out mount overridden by a user supplied mount - continue - } - if mountDev && strings.HasPrefix(m.Destination, "/dev/") { - // filter out everything under /dev if /dev is user-mounted - continue +// WithMounts sets the container's mounts +func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts { + return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) (err error) { + if err := daemon.setupContainerMountsRoot(c); err != nil { + return err } - if m.Destination == "/dev/shm" { - if c.HostConfig.IpcMode.IsNone() { - // filter out /dev/shm for "none" IpcMode + if err := daemon.setupIpcDirs(c); err != nil { + return err + } + + defer func() { + if err != nil { + daemon.cleanupSecretDir(c) + } + }() + + if err := daemon.setupSecretDir(c); err != nil { + return err + } + + ms, err := daemon.setupMounts(c) + if err != nil { + return err + } + + if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() { + ms = append(ms, c.IpcMounts()...) + } + + tmpfsMounts, err := c.TmpfsMounts() + if err != nil { + return err + } + ms = append(ms, tmpfsMounts...) + + secretMounts, err := c.SecretMounts() + if err != nil { + return err + } + ms = append(ms, secretMounts...) + + sort.Sort(mounts(ms)) + + mounts := ms + + userMounts := make(map[string]struct{}) + for _, m := range mounts { + userMounts[m.Destination] = struct{}{} + } + + // Copy all mounts from spec to defaultMounts, except for + // - mounts overridden by a user supplied mount; + // - all mounts under /dev if a user supplied /dev is present; + // - /dev/shm, in case IpcMode is none. + // While at it, also + // - set size for /dev/shm from shmsize. + defaultMounts := s.Mounts[:0] + _, mountDev := userMounts["/dev"] + for _, m := range s.Mounts { + if _, ok := userMounts[m.Destination]; ok { + // filter out mount overridden by a user supplied mount continue } - // set size for /dev/shm mount from spec - sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10) - m.Options = append(m.Options, sizeOpt) - } - - defaultMounts = append(defaultMounts, m) - } - - s.Mounts = defaultMounts - for _, m := range mounts { - if m.Source == "tmpfs" { - data := m.Data - parser := volumemounts.NewParser("linux") - options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())} - if data != "" { - options = append(options, strings.Split(data, ",")...) - } - - merged, err := mount.MergeTmpfsOptions(options) - if err != nil { - return err - } - - s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged}) - continue - } - - mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"} - - // Determine property of RootPropagation based on volume - // properties. If a volume is shared, then keep root propagation - // shared. This should work for slave and private volumes too. - // - // For slave volumes, it can be either [r]shared/[r]slave. - // - // For private volumes any root propagation value should work. - pFlag := mountPropagationMap[m.Propagation] - switch pFlag { - case mount.SHARED, mount.RSHARED: - if err := ensureShared(m.Source); err != nil { - return err - } - rootpg := mountPropagationMap[s.Linux.RootfsPropagation] - if rootpg != mount.SHARED && rootpg != mount.RSHARED { - s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED] - } - case mount.SLAVE, mount.RSLAVE: - var fallback bool - if err := ensureSharedOrSlave(m.Source); err != nil { - // For backwards compatibility purposes, treat mounts from the daemon root - // as special since we automatically add rslave propagation to these mounts - // when the user did not set anything, so we should fallback to the old - // behavior which is to use private propagation which is normally the - // default. - if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) { - return err - } - - cm, ok := c.MountPoints[m.Destination] - if !ok { - return err - } - if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" { - // This means the user explicitly set a propagation, do not fallback in that case. - return err - } - fallback = true - logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root") - } - if !fallback { - rootpg := mountPropagationMap[s.Linux.RootfsPropagation] - if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE { - s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE] - } - } - } - - bindMode := "rbind" - if m.NonRecursive { - bindMode = "bind" - } - opts := []string{bindMode} - if !m.Writable { - opts = append(opts, "ro") - } - if pFlag != 0 { - opts = append(opts, mountPropagationReverseMap[pFlag]) - } - - // If we are using user namespaces, then we must make sure that we - // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source - // "mount" when we bind-mount. The reason for this is that at the point - // when runc sets up the root filesystem, it is already inside a user - // namespace, and thus cannot change any flags that are locked. - if daemon.configStore.RemappedRoot != "" { - unprivOpts, err := getUnprivilegedMountFlags(m.Source) - if err != nil { - return err - } - opts = append(opts, unprivOpts...) - } - - mt.Options = opts - s.Mounts = append(s.Mounts, mt) - } - - if s.Root.Readonly { - for i, m := range s.Mounts { - switch m.Destination { - case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev": + if mountDev && strings.HasPrefix(m.Destination, "/dev/") { + // filter out everything under /dev if /dev is user-mounted continue } - if _, ok := userMounts[m.Destination]; !ok { - if !inSlice(m.Options, "ro") { - s.Mounts[i].Options = append(s.Mounts[i].Options, "ro") + + if m.Destination == "/dev/shm" { + if c.HostConfig.IpcMode.IsNone() { + // filter out /dev/shm for "none" IpcMode + continue } + // set size for /dev/shm mount from spec + sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10) + m.Options = append(m.Options, sizeOpt) } + + defaultMounts = append(defaultMounts, m) } - } - if c.HostConfig.Privileged { - // clear readonly for /sys - for i := range s.Mounts { - if s.Mounts[i].Destination == "/sys" { - clearReadOnly(&s.Mounts[i]) - } - } - s.Linux.ReadonlyPaths = nil - s.Linux.MaskedPaths = nil - } + s.Mounts = defaultMounts + for _, m := range mounts { + if m.Source == "tmpfs" { + data := m.Data + parser := volumemounts.NewParser("linux") + options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())} + if data != "" { + options = append(options, strings.Split(data, ",")...) + } - // TODO: until a kernel/mount solution exists for handling remount in a user namespace, - // we must clear the readonly flag for the cgroups mount (@mrunalp concurs) - if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged { - for i, m := range s.Mounts { - if m.Type == "cgroup" { - clearReadOnly(&s.Mounts[i]) - } - } - } - - return nil -} - -func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error { - if c.BaseFS == nil { - return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil") - } - linkedEnv, err := daemon.setupLinkedContainers(c) - if err != nil { - return err - } - s.Root = &specs.Root{ - Path: c.BaseFS.Path(), - Readonly: c.HostConfig.ReadonlyRootfs, - } - if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil { - return err - } - cwd := c.Config.WorkingDir - if len(cwd) == 0 { - cwd = "/" - } - s.Process.Args = append([]string{c.Path}, c.Args...) - - // only add the custom init if it is specified and the container is running in its - // own private pid namespace. It does not make sense to add if it is running in the - // host namespace or another container's pid namespace where we already have an init - if c.HostConfig.PidMode.IsPrivate() { - if (c.HostConfig.Init != nil && *c.HostConfig.Init) || - (c.HostConfig.Init == nil && daemon.configStore.Init) { - s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...) - path := daemon.configStore.InitPath - if path == "" { - path, err = exec.LookPath(daemonconfig.DefaultInitBinary) + merged, err := mount.MergeTmpfsOptions(options) if err != nil { return err } + + s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged}) + continue } - s.Mounts = append(s.Mounts, specs.Mount{ - Destination: inContainerInitPath, - Type: "bind", - Source: path, - Options: []string{"bind", "ro"}, - }) + + mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"} + + // Determine property of RootPropagation based on volume + // properties. If a volume is shared, then keep root propagation + // shared. This should work for slave and private volumes too. + // + // For slave volumes, it can be either [r]shared/[r]slave. + // + // For private volumes any root propagation value should work. + pFlag := mountPropagationMap[m.Propagation] + switch pFlag { + case mount.SHARED, mount.RSHARED: + if err := ensureShared(m.Source); err != nil { + return err + } + rootpg := mountPropagationMap[s.Linux.RootfsPropagation] + if rootpg != mount.SHARED && rootpg != mount.RSHARED { + s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED] + } + case mount.SLAVE, mount.RSLAVE: + var fallback bool + if err := ensureSharedOrSlave(m.Source); err != nil { + // For backwards compatibility purposes, treat mounts from the daemon root + // as special since we automatically add rslave propagation to these mounts + // when the user did not set anything, so we should fallback to the old + // behavior which is to use private propagation which is normally the + // default. + if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) { + return err + } + + cm, ok := c.MountPoints[m.Destination] + if !ok { + return err + } + if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" { + // This means the user explicitly set a propagation, do not fallback in that case. + return err + } + fallback = true + logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root") + } + if !fallback { + rootpg := mountPropagationMap[s.Linux.RootfsPropagation] + if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE { + s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE] + } + } + } + + bindMode := "rbind" + if m.NonRecursive { + bindMode = "bind" + } + opts := []string{bindMode} + if !m.Writable { + opts = append(opts, "ro") + } + if pFlag != 0 { + opts = append(opts, mountPropagationReverseMap[pFlag]) + } + + // If we are using user namespaces, then we must make sure that we + // don't drop any of the CL_UNPRIVILEGED "locked" flags of the source + // "mount" when we bind-mount. The reason for this is that at the point + // when runc sets up the root filesystem, it is already inside a user + // namespace, and thus cannot change any flags that are locked. + if daemon.configStore.RemappedRoot != "" { + unprivOpts, err := getUnprivilegedMountFlags(m.Source) + if err != nil { + return err + } + opts = append(opts, unprivOpts...) + } + + mt.Options = opts + s.Mounts = append(s.Mounts, mt) } + + if s.Root.Readonly { + for i, m := range s.Mounts { + switch m.Destination { + case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev": + continue + } + if _, ok := userMounts[m.Destination]; !ok { + if !inSlice(m.Options, "ro") { + s.Mounts[i].Options = append(s.Mounts[i].Options, "ro") + } + } + } + } + + if c.HostConfig.Privileged { + // clear readonly for /sys + for i := range s.Mounts { + if s.Mounts[i].Destination == "/sys" { + clearReadOnly(&s.Mounts[i]) + } + } + s.Linux.ReadonlyPaths = nil + s.Linux.MaskedPaths = nil + } + + // TODO: until a kernel/mount solution exists for handling remount in a user namespace, + // we must clear the readonly flag for the cgroups mount (@mrunalp concurs) + if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged { + for i, m := range s.Mounts { + if m.Type == "cgroup" { + clearReadOnly(&s.Mounts[i]) + } + } + } + + return nil + } - s.Process.Cwd = cwd - s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv) - s.Process.Terminal = c.Config.Tty - - s.Hostname = c.Config.Hostname - setLinuxDomainname(c, s) - - return nil } -func withCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts { +// WithCommonOptions sets common docker options +func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts { + return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + if c.BaseFS == nil { + return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil") + } + linkedEnv, err := daemon.setupLinkedContainers(c) + if err != nil { + return err + } + s.Root = &specs.Root{ + Path: c.BaseFS.Path(), + Readonly: c.HostConfig.ReadonlyRootfs, + } + if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil { + return err + } + cwd := c.Config.WorkingDir + if len(cwd) == 0 { + cwd = "/" + } + s.Process.Args = append([]string{c.Path}, c.Args...) + + // only add the custom init if it is specified and the container is running in its + // own private pid namespace. It does not make sense to add if it is running in the + // host namespace or another container's pid namespace where we already have an init + if c.HostConfig.PidMode.IsPrivate() { + if (c.HostConfig.Init != nil && *c.HostConfig.Init) || + (c.HostConfig.Init == nil && daemon.configStore.Init) { + s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...) + path := daemon.configStore.InitPath + if path == "" { + path, err = exec.LookPath(daemonconfig.DefaultInitBinary) + if err != nil { + return err + } + } + s.Mounts = append(s.Mounts, specs.Mount{ + Destination: inContainerInitPath, + Type: "bind", + Source: path, + Options: []string{"bind", "ro"}, + }) + } + } + s.Process.Cwd = cwd + s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv) + s.Process.Terminal = c.Config.Tty + + s.Hostname = c.Config.Hostname + setLinuxDomainname(c, s) + + return nil + } +} + +// WithCgroups sets the container's cgroups +func WithCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts { return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { var cgroupsPath string scopePrefix := "docker" @@ -636,7 +784,8 @@ func withCgroups(daemon *Daemon, c *container.Container) coci.SpecOpts { } } -func withContainerDevices(daemon *Daemon, c *container.Container) coci.SpecOpts { +// WithDevices sets the container's devices +func WithDevices(daemon *Daemon, c *container.Container) coci.SpecOpts { return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { // Build lists of devices allowed and created within the container. var devs []specs.LinuxDevice @@ -684,7 +833,8 @@ func withContainerDevices(daemon *Daemon, c *container.Container) coci.SpecOpts } } -func withResources(c *container.Container) coci.SpecOpts { +// WithResources applies the container resources +func WithResources(c *container.Container) coci.SpecOpts { return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { r := c.HostConfig.Resources weightDevices, err := getBlkioWeightDevices(r) @@ -738,7 +888,8 @@ func withResources(c *container.Container) coci.SpecOpts { } } -func withSysctls(c *container.Container) coci.SpecOpts { +// WithSysctls sets the container's sysctls +func WithSysctls(c *container.Container) coci.SpecOpts { return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { // We merge the sysctls injected above with the HostConfig (latter takes // precedence for backwards-compatibility reasons). @@ -749,7 +900,8 @@ func withSysctls(c *container.Container) coci.SpecOpts { } } -func withUser(c *container.Container) coci.SpecOpts { +// WithUser sets the container's user +func WithUser(c *container.Container) coci.SpecOpts { return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { uid, gid, additionalGids, err := getUser(c, c.Config.User) if err != nil { @@ -767,133 +919,36 @@ func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, e opts []coci.SpecOpts s = oci.DefaultSpec() ) - if err := daemon.populateCommonSpec(&s, c); err != nil { - return nil, err - } - opts = append(opts, - withCgroups(daemon, c), - withResources(c), - withSysctls(c), - withContainerDevices(daemon, c), - withUser(c), + WithCommonOptions(daemon, c), + WithCgroups(daemon, c), + WithResources(c), + WithSysctls(c), + WithDevices(daemon, c), + WithUser(c), + WithRlimits(daemon, c), + WithNamespaces(daemon, c), + WithCapabilities(c), + WithSeccomp(daemon, c), + WithMounts(daemon, c), + WithLibnetwork(daemon, c), + WithApparmor(c), + WithSelinux(c), + WithOOMScore(&c.HostConfig.OomScoreAdj), ) - - if err := daemon.setRlimits(&s, c); err != nil { - return nil, fmt.Errorf("linux runtime spec rlimits: %v", err) + if c.NoNewPrivileges { + opts = append(opts, coci.WithNoNewPrivileges) } - if err := setNamespaces(daemon, &s, c); err != nil { - return nil, fmt.Errorf("linux spec namespaces: %v", err) - } - capabilities, err := caps.TweakCapabilities(oci.DefaultCapabilities(), c.HostConfig.CapAdd, c.HostConfig.CapDrop, c.HostConfig.Capabilities, c.HostConfig.Privileged) - if err != nil { - return nil, fmt.Errorf("linux spec capabilities: %v", err) - } - if err := oci.SetCapabilities(&s, capabilities); err != nil { - return nil, fmt.Errorf("linux spec capabilities: %v", err) - } - if err := setSeccomp(daemon, &s, c); err != nil { - return nil, fmt.Errorf("linux seccomp: %v", err) - } - - if err := daemon.setupContainerMountsRoot(c); err != nil { - return nil, err - } - - if err := daemon.setupIpcDirs(c); err != nil { - return nil, err - } - - defer func() { - if err != nil { - daemon.cleanupSecretDir(c) - } - }() - - if err := daemon.setupSecretDir(c); err != nil { - return nil, err - } - - ms, err := daemon.setupMounts(c) - if err != nil { - return nil, err - } - - if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() { - ms = append(ms, c.IpcMounts()...) - } - - tmpfsMounts, err := c.TmpfsMounts() - if err != nil { - return nil, err - } - ms = append(ms, tmpfsMounts...) - - secretMounts, err := c.SecretMounts() - if err != nil { - return nil, err - } - ms = append(ms, secretMounts...) - - sort.Sort(mounts(ms)) - if err := setMounts(daemon, &s, c, ms); err != nil { - return nil, fmt.Errorf("linux mounts: %v", err) - } - - if s.Hooks == nil { - s.Hooks = &specs.Hooks{} - } - for _, ns := range s.Linux.Namespaces { - if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled { - target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe") - s.Hooks.Prestart = append(s.Hooks.Prestart, specs.Hook{ - Path: target, - Args: []string{"libnetwork-setkey", "-exec-root=" + daemon.configStore.GetExecRoot(), c.ID, daemon.netController.ID()}, - }) - } - } - - if apparmor.IsEnabled() { - var appArmorProfile string - if c.AppArmorProfile != "" { - appArmorProfile = c.AppArmorProfile - } else if c.HostConfig.Privileged { - appArmorProfile = "unconfined" - } else { - appArmorProfile = "docker-default" - } - - if appArmorProfile == "docker-default" { - // Unattended upgrades and other fun services can unload AppArmor - // profiles inadvertently. Since we cannot store our profile in - // /etc/apparmor.d, nor can we practically add other ways of - // telling the system to keep our profile loaded, in order to make - // sure that we keep the default profile enabled we dynamically - // reload it if necessary. - if err := ensureDefaultAppArmorProfile(); err != nil { - return nil, err - } - } - - s.Process.ApparmorProfile = appArmorProfile - } - s.Process.SelinuxLabel = c.GetProcessLabel() - s.Process.NoNewPrivileges = c.NoNewPrivileges - s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj - s.Linux.MountLabel = c.MountLabel // Set the masked and readonly paths with regard to the host config options if they are set. if c.HostConfig.MaskedPaths != nil { - s.Linux.MaskedPaths = c.HostConfig.MaskedPaths + opts = append(opts, coci.WithMaskedPaths(c.HostConfig.MaskedPaths)) } if c.HostConfig.ReadonlyPaths != nil { - s.Linux.ReadonlyPaths = c.HostConfig.ReadonlyPaths + opts = append(opts, coci.WithReadonlyPaths(c.HostConfig.ReadonlyPaths)) } - if daemon.configStore.Rootless { - if err := specconv.ToRootless(&s); err != nil { - return nil, err - } + opts = append(opts, WithRootless) } return &s, coci.ApplyOpts(context.Background(), nil, &containers.Container{ ID: c.ID, diff --git a/daemon/seccomp_disabled.go b/daemon/seccomp_disabled.go index 3855c7830e..56b30b1aec 100644 --- a/daemon/seccomp_disabled.go +++ b/daemon/seccomp_disabled.go @@ -3,17 +3,22 @@ package daemon // import "github.com/docker/docker/daemon" import ( + "context" "fmt" + "github.com/containerd/containerd/containers" + coci "github.com/containerd/containerd/oci" "github.com/docker/docker/container" - "github.com/opencontainers/runtime-spec/specs-go" ) var supportsSeccomp = false -func setSeccomp(daemon *Daemon, rs *specs.Spec, c *container.Container) error { - if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" { - return fmt.Errorf("seccomp profiles are not supported on this daemon, you cannot specify a custom seccomp profile") +// WithSeccomp sets the seccomp profile +func WithSeccomp(daemon *Daemon, c *container.Container) coci.SpecOpts { + return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" { + return fmt.Errorf("seccomp profiles are not supported on this daemon, you cannot specify a custom seccomp profile") + } + return nil } - return nil } diff --git a/daemon/seccomp_linux.go b/daemon/seccomp_linux.go index 66ab8c768c..27461775d4 100644 --- a/daemon/seccomp_linux.go +++ b/daemon/seccomp_linux.go @@ -3,8 +3,11 @@ package daemon // import "github.com/docker/docker/daemon" import ( + "context" "fmt" + "github.com/containerd/containerd/containers" + coci "github.com/containerd/containerd/oci" "github.com/docker/docker/container" "github.com/docker/docker/profiles/seccomp" "github.com/opencontainers/runtime-spec/specs-go" @@ -13,43 +16,46 @@ import ( var supportsSeccomp = true -func setSeccomp(daemon *Daemon, rs *specs.Spec, c *container.Container) error { - var profile *specs.LinuxSeccomp - var err error +// WithSeccomp sets the seccomp profile +func WithSeccomp(daemon *Daemon, c *container.Container) coci.SpecOpts { + return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + var profile *specs.LinuxSeccomp + var err error - if c.HostConfig.Privileged { - return nil - } + if c.HostConfig.Privileged { + return nil + } - if !daemon.seccompEnabled { - if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" { - return fmt.Errorf("Seccomp is not enabled in your kernel, cannot run a custom seccomp profile.") + if !daemon.seccompEnabled { + if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" { + return fmt.Errorf("Seccomp is not enabled in your kernel, cannot run a custom seccomp profile.") + } + logrus.Warn("Seccomp is not enabled in your kernel, running container without default profile.") + c.SeccompProfile = "unconfined" } - logrus.Warn("Seccomp is not enabled in your kernel, running container without default profile.") - c.SeccompProfile = "unconfined" - } - if c.SeccompProfile == "unconfined" { - return nil - } - if c.SeccompProfile != "" { - profile, err = seccomp.LoadProfile(c.SeccompProfile, rs) - if err != nil { - return err + if c.SeccompProfile == "unconfined" { + return nil } - } else { - if daemon.seccompProfile != nil { - profile, err = seccomp.LoadProfile(string(daemon.seccompProfile), rs) + if c.SeccompProfile != "" { + profile, err = seccomp.LoadProfile(c.SeccompProfile, s) if err != nil { return err } } else { - profile, err = seccomp.GetDefaultProfile(rs) - if err != nil { - return err + if daemon.seccompProfile != nil { + profile, err = seccomp.LoadProfile(string(daemon.seccompProfile), s) + if err != nil { + return err + } + } else { + profile, err = seccomp.GetDefaultProfile(s) + if err != nil { + return err + } } } - } - rs.Linux.Seccomp = profile - return nil + s.Linux.Seccomp = profile + return nil + } } diff --git a/daemon/seccomp_unsupported.go b/daemon/seccomp_unsupported.go index a323fe0be1..0cd5088918 100644 --- a/daemon/seccomp_unsupported.go +++ b/daemon/seccomp_unsupported.go @@ -2,4 +2,19 @@ package daemon // import "github.com/docker/docker/daemon" +import ( + "context" + + "github.com/containerd/containerd/containers" + coci "github.com/containerd/containerd/oci" + "github.com/docker/docker/container" +) + var supportsSeccomp = false + +// WithSeccomp sets the seccomp profile +func WithSeccomp(daemon *Daemon, c *container.Container) coci.SpecOpts { + return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { + return nil + } +}