1
0
Fork 0
mirror of https://github.com/moby/moby.git synced 2022-11-09 12:21:53 -05:00
moby--moby/oci/defaults.go
Kir Kolyshkin 7120976d74 Implement none, private, and shareable ipc modes
Since the commit d88fe447df ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.

Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).

This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:

 - 'shareable':	enables sharing this container's IPC with others
		(this used to be the implicit default);

 - 'private':	disables sharing this container's IPC.

In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.

While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:

> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...

...so here's yet yet another mode:

 - 'none':	no /dev/shm mount inside the container (though it still
		has its own private IPC namespace).

Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.

Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).

Some other changes this patch introduces are:

1. A mount for /dev/shm is added to default OCI Linux spec.

2. IpcMode.Valid() is simplified to remove duplicated code that parsed
   'container:ID' form. Note the old version used to check that ID does
   not contain a semicolon -- this is no longer the case (tests are
   modified accordingly). The motivation is we should either do a
   proper check for container ID validity, or don't check it at all
   (since it is checked in other places anyway). I chose the latter.

3. IpcMode.Container() is modified to not return container ID if the
   mode value does not start with "container:", unifying the check to
   be the same as in IpcMode.IsContainer().

3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
   to add checks for newly added values.

[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
     container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]

Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-08-14 10:50:39 +03:00

227 lines
4.9 KiB
Go

package oci
import (
"os"
"runtime"
"github.com/opencontainers/runtime-spec/specs-go"
)
func iPtr(i int64) *int64 { return &i }
func u32Ptr(i int64) *uint32 { u := uint32(i); return &u }
func fmPtr(i int64) *os.FileMode { fm := os.FileMode(i); return &fm }
func defaultCapabilities() []string {
return []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
}
}
// DefaultSpec returns the default spec used by docker for the current Platform
func DefaultSpec() specs.Spec {
return DefaultOSSpec(runtime.GOOS)
}
// DefaultOSSpec returns the spec for a given OS
func DefaultOSSpec(osName string) specs.Spec {
if osName == "windows" {
return DefaultWindowsSpec()
} else if osName == "solaris" {
return DefaultSolarisSpec()
} else {
return DefaultLinuxSpec()
}
}
// DefaultWindowsSpec create a default spec for running Windows containers
func DefaultWindowsSpec() specs.Spec {
return specs.Spec{
Version: specs.Version,
Platform: specs.Platform{
OS: runtime.GOOS,
Arch: runtime.GOARCH,
},
Windows: &specs.Windows{},
}
}
// DefaultSolarisSpec create a default spec for running Solaris containers
func DefaultSolarisSpec() specs.Spec {
s := specs.Spec{
Version: "0.6.0",
Platform: specs.Platform{
OS: "SunOS",
Arch: runtime.GOARCH,
},
}
s.Solaris = &specs.Solaris{}
return s
}
// DefaultLinuxSpec create a default spec for running Linux containers
func DefaultLinuxSpec() specs.Spec {
s := specs.Spec{
Version: specs.Version,
Platform: specs.Platform{
OS: "linux",
Arch: runtime.GOARCH,
},
}
s.Mounts = []specs.Mount{
{
Destination: "/proc",
Type: "proc",
Source: "proc",
Options: []string{"nosuid", "noexec", "nodev"},
},
{
Destination: "/dev",
Type: "tmpfs",
Source: "tmpfs",
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
},
{
Destination: "/dev/pts",
Type: "devpts",
Source: "devpts",
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
},
{
Destination: "/sys",
Type: "sysfs",
Source: "sysfs",
Options: []string{"nosuid", "noexec", "nodev", "ro"},
},
{
Destination: "/sys/fs/cgroup",
Type: "cgroup",
Source: "cgroup",
Options: []string{"ro", "nosuid", "noexec", "nodev"},
},
{
Destination: "/dev/mqueue",
Type: "mqueue",
Source: "mqueue",
Options: []string{"nosuid", "noexec", "nodev"},
},
{
Destination: "/dev/shm",
Type: "tmpfs",
Source: "shm",
Options: []string{"nosuid", "noexec", "nodev", "mode=1777"},
},
}
s.Process.Capabilities = &specs.LinuxCapabilities{
Bounding: defaultCapabilities(),
Permitted: defaultCapabilities(),
Inheritable: defaultCapabilities(),
Effective: defaultCapabilities(),
}
s.Linux = &specs.Linux{
MaskedPaths: []string{
"/proc/kcore",
"/proc/latency_stats",
"/proc/timer_list",
"/proc/timer_stats",
"/proc/sched_debug",
},
ReadonlyPaths: []string{
"/proc/asound",
"/proc/bus",
"/proc/fs",
"/proc/irq",
"/proc/sys",
"/proc/sysrq-trigger",
},
Namespaces: []specs.LinuxNamespace{
{Type: "mount"},
{Type: "network"},
{Type: "uts"},
{Type: "pid"},
{Type: "ipc"},
},
// Devices implicitly contains the following devices:
// null, zero, full, random, urandom, tty, console, and ptmx.
// ptmx is a bind-mount or symlink of the container's ptmx.
// See also: https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#default-devices
Devices: []specs.LinuxDevice{},
Resources: &specs.LinuxResources{
Devices: []specs.LinuxDeviceCgroup{
{
Allow: false,
Access: "rwm",
},
{
Allow: true,
Type: "c",
Major: iPtr(1),
Minor: iPtr(5),
Access: "rwm",
},
{
Allow: true,
Type: "c",
Major: iPtr(1),
Minor: iPtr(3),
Access: "rwm",
},
{
Allow: true,
Type: "c",
Major: iPtr(1),
Minor: iPtr(9),
Access: "rwm",
},
{
Allow: true,
Type: "c",
Major: iPtr(1),
Minor: iPtr(8),
Access: "rwm",
},
{
Allow: true,
Type: "c",
Major: iPtr(5),
Minor: iPtr(0),
Access: "rwm",
},
{
Allow: true,
Type: "c",
Major: iPtr(5),
Minor: iPtr(1),
Access: "rwm",
},
{
Allow: false,
Type: "c",
Major: iPtr(10),
Minor: iPtr(229),
Access: "rwm",
},
},
},
}
// For LCOW support, don't mask /sys/firmware
if runtime.GOOS != "windows" {
s.Linux.MaskedPaths = append(s.Linux.MaskedPaths, "/sys/firmware")
}
return s
}