2016-03-18 14:50:19 -04:00
|
|
|
package oci
|
|
|
|
|
|
|
|
import (
|
|
|
|
"os"
|
|
|
|
"runtime"
|
|
|
|
|
2016-08-17 12:38:34 -04:00
|
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
2016-03-18 14:50:19 -04:00
|
|
|
)
|
|
|
|
|
|
|
|
// iPtr returns a pointer to a newly allocated int64 holding the value i.
func iPtr(i int64) *int64 {
	p := new(int64)
	*p = i
	return p
}
|
|
|
|
// u32Ptr converts i to uint32 and returns a pointer to the converted value.
// The conversion is a plain Go integer conversion, so values outside the
// uint32 range wrap around rather than being rejected.
func u32Ptr(i int64) *uint32 {
	p := new(uint32)
	*p = uint32(i)
	return p
}
|
|
|
|
// fmPtr converts i to an os.FileMode and returns a pointer to the converted
// value.
func fmPtr(i int64) *os.FileMode {
	mode := new(os.FileMode)
	*mode = os.FileMode(i)
	return mode
}
|
|
|
|
|
2017-04-27 17:52:47 -04:00
|
|
|
// defaultCapabilities returns the list of capability names used as the
// default set. Every call yields a slice backed by its own storage, so a
// caller may modify the result without affecting other callers.
func defaultCapabilities() []string {
	caps := [...]string{
		"CAP_CHOWN",
		"CAP_DAC_OVERRIDE",
		"CAP_FSETID",
		"CAP_FOWNER",
		"CAP_MKNOD",
		"CAP_NET_RAW",
		"CAP_SETGID",
		"CAP_SETUID",
		"CAP_SETFCAP",
		"CAP_SETPCAP",
		"CAP_NET_BIND_SERVICE",
		"CAP_SYS_CHROOT",
		"CAP_KILL",
		"CAP_AUDIT_WRITE",
	}
	return caps[:]
}
|
|
|
|
|
2017-05-26 19:14:18 -04:00
|
|
|
// DefaultSpec returns the default spec used by docker for the current Platform
|
2016-03-18 14:50:19 -04:00
|
|
|
func DefaultSpec() specs.Spec {
|
2017-05-26 19:14:18 -04:00
|
|
|
return DefaultOSSpec(runtime.GOOS)
|
|
|
|
}
|
|
|
|
|
|
|
|
// DefaultOSSpec returns the spec for a given OS
|
|
|
|
func DefaultOSSpec(osName string) specs.Spec {
|
|
|
|
if osName == "windows" {
|
|
|
|
return DefaultWindowsSpec()
|
|
|
|
}
|
2017-10-24 14:32:52 -04:00
|
|
|
return DefaultLinuxSpec()
|
2017-05-26 19:14:18 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
// DefaultWindowsSpec create a default spec for running Windows containers
|
|
|
|
func DefaultWindowsSpec() specs.Spec {
|
|
|
|
return specs.Spec{
|
2016-03-18 14:50:19 -04:00
|
|
|
Version: specs.Version,
|
2017-05-26 19:14:18 -04:00
|
|
|
Windows: &specs.Windows{},
|
2017-08-01 13:00:38 -04:00
|
|
|
Process: &specs.Process{},
|
|
|
|
Root: &specs.Root{},
|
2017-05-26 19:14:18 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// DefaultLinuxSpec create a default spec for running Linux containers
|
|
|
|
func DefaultLinuxSpec() specs.Spec {
|
|
|
|
s := specs.Spec{
|
|
|
|
Version: specs.Version,
|
2017-09-22 09:52:41 -04:00
|
|
|
Process: &specs.Process{
|
|
|
|
Capabilities: &specs.LinuxCapabilities{
|
|
|
|
Bounding: defaultCapabilities(),
|
|
|
|
Permitted: defaultCapabilities(),
|
|
|
|
Inheritable: defaultCapabilities(),
|
|
|
|
Effective: defaultCapabilities(),
|
|
|
|
},
|
|
|
|
},
|
2017-10-26 17:16:43 -04:00
|
|
|
Root: &specs.Root{},
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
s.Mounts = []specs.Mount{
|
|
|
|
{
|
|
|
|
Destination: "/proc",
|
|
|
|
Type: "proc",
|
|
|
|
Source: "proc",
|
|
|
|
Options: []string{"nosuid", "noexec", "nodev"},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Destination: "/dev",
|
|
|
|
Type: "tmpfs",
|
|
|
|
Source: "tmpfs",
|
2017-07-18 16:52:31 -04:00
|
|
|
Options: []string{"nosuid", "strictatime", "mode=755", "size=65536k"},
|
2016-03-18 14:50:19 -04:00
|
|
|
},
|
|
|
|
{
|
|
|
|
Destination: "/dev/pts",
|
|
|
|
Type: "devpts",
|
|
|
|
Source: "devpts",
|
|
|
|
Options: []string{"nosuid", "noexec", "newinstance", "ptmxmode=0666", "mode=0620", "gid=5"},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Destination: "/sys",
|
|
|
|
Type: "sysfs",
|
|
|
|
Source: "sysfs",
|
|
|
|
Options: []string{"nosuid", "noexec", "nodev", "ro"},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Destination: "/sys/fs/cgroup",
|
|
|
|
Type: "cgroup",
|
|
|
|
Source: "cgroup",
|
|
|
|
Options: []string{"ro", "nosuid", "noexec", "nodev"},
|
|
|
|
},
|
|
|
|
{
|
|
|
|
Destination: "/dev/mqueue",
|
|
|
|
Type: "mqueue",
|
|
|
|
Source: "mqueue",
|
|
|
|
Options: []string{"nosuid", "noexec", "nodev"},
|
|
|
|
},
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
|
|
|
{
|
|
|
|
Destination: "/dev/shm",
|
|
|
|
Type: "tmpfs",
|
|
|
|
Source: "shm",
|
|
|
|
Options: []string{"nosuid", "noexec", "nodev", "mode=1777"},
|
|
|
|
},
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
|
2016-09-27 13:26:59 -04:00
|
|
|
s.Linux = &specs.Linux{
|
2016-04-04 17:27:44 -04:00
|
|
|
MaskedPaths: []string{
|
|
|
|
"/proc/kcore",
|
|
|
|
"/proc/latency_stats",
|
Adding /proc/timer_list to the masked paths list
/proc/timer_list seems to leak information about the host. Here is
an example from a busybox container running on docker+kubernetes.
# cat /proc/timer_list | grep -i -e kube
<ffff8800b8cc3db0>, hrtimer_wakeup, S:01, futex_wait_queue_me, kubelet/2497
<ffff880129ac3db0>, hrtimer_wakeup, S:01, futex_wait_queue_me, kube-proxy/3478
<ffff8800b1b77db0>, hrtimer_wakeup, S:01, futex_wait_queue_me, kube-proxy/3470
<ffff8800bb6abdb0>, hrtimer_wakeup, S:01, futex_wait_queue_me, kubelet/2499
Signed-Off-By: Davanum Srinivas <davanum@gmail.com>
Signed-off-by: Davanum Srinivas <davanum@gmail.com>
2016-08-11 15:12:35 -04:00
|
|
|
"/proc/timer_list",
|
2016-04-04 17:27:44 -04:00
|
|
|
"/proc/timer_stats",
|
|
|
|
"/proc/sched_debug",
|
2017-11-03 11:12:22 -04:00
|
|
|
"/proc/scsi",
|
2017-11-08 15:10:42 -05:00
|
|
|
"/sys/firmware",
|
2016-04-04 17:27:44 -04:00
|
|
|
},
|
|
|
|
ReadonlyPaths: []string{
|
|
|
|
"/proc/asound",
|
|
|
|
"/proc/bus",
|
|
|
|
"/proc/fs",
|
|
|
|
"/proc/irq",
|
|
|
|
"/proc/sys",
|
|
|
|
"/proc/sysrq-trigger",
|
|
|
|
},
|
2017-04-27 17:52:47 -04:00
|
|
|
Namespaces: []specs.LinuxNamespace{
|
2016-03-18 14:50:19 -04:00
|
|
|
{Type: "mount"},
|
|
|
|
{Type: "network"},
|
|
|
|
{Type: "uts"},
|
|
|
|
{Type: "pid"},
|
|
|
|
{Type: "ipc"},
|
|
|
|
},
|
2016-04-08 01:28:37 -04:00
|
|
|
// Devices implicitly contains the following devices:
|
|
|
|
// null, zero, full, random, urandom, tty, console, and ptmx.
|
2017-08-19 10:23:38 -04:00
|
|
|
// ptmx is a bind mount or symlink of the container's ptmx.
|
2016-04-08 01:28:37 -04:00
|
|
|
// See also: https://github.com/opencontainers/runtime-spec/blob/master/config-linux.md#default-devices
|
2017-04-27 17:52:47 -04:00
|
|
|
Devices: []specs.LinuxDevice{},
|
|
|
|
Resources: &specs.LinuxResources{
|
|
|
|
Devices: []specs.LinuxDeviceCgroup{
|
2016-03-18 14:50:19 -04:00
|
|
|
{
|
|
|
|
Allow: false,
|
2017-04-27 17:52:47 -04:00
|
|
|
Access: "rwm",
|
2016-03-18 14:50:19 -04:00
|
|
|
},
|
|
|
|
{
|
|
|
|
Allow: true,
|
2017-04-27 17:52:47 -04:00
|
|
|
Type: "c",
|
2016-03-18 14:50:19 -04:00
|
|
|
Major: iPtr(1),
|
|
|
|
Minor: iPtr(5),
|
2017-04-27 17:52:47 -04:00
|
|
|
Access: "rwm",
|
2016-03-18 14:50:19 -04:00
|
|
|
},
|
|
|
|
{
|
|
|
|
Allow: true,
|
2017-04-27 17:52:47 -04:00
|
|
|
Type: "c",
|
2016-03-18 14:50:19 -04:00
|
|
|
Major: iPtr(1),
|
|
|
|
Minor: iPtr(3),
|
2017-04-27 17:52:47 -04:00
|
|
|
Access: "rwm",
|
2016-03-18 14:50:19 -04:00
|
|
|
},
|
|
|
|
{
|
|
|
|
Allow: true,
|
2017-04-27 17:52:47 -04:00
|
|
|
Type: "c",
|
2016-03-18 14:50:19 -04:00
|
|
|
Major: iPtr(1),
|
|
|
|
Minor: iPtr(9),
|
2017-04-27 17:52:47 -04:00
|
|
|
Access: "rwm",
|
2016-03-18 14:50:19 -04:00
|
|
|
},
|
|
|
|
{
|
|
|
|
Allow: true,
|
2017-04-27 17:52:47 -04:00
|
|
|
Type: "c",
|
2016-03-18 14:50:19 -04:00
|
|
|
Major: iPtr(1),
|
|
|
|
Minor: iPtr(8),
|
2017-04-27 17:52:47 -04:00
|
|
|
Access: "rwm",
|
2016-03-18 14:50:19 -04:00
|
|
|
},
|
|
|
|
{
|
|
|
|
Allow: true,
|
2017-04-27 17:52:47 -04:00
|
|
|
Type: "c",
|
2016-03-18 14:50:19 -04:00
|
|
|
Major: iPtr(5),
|
|
|
|
Minor: iPtr(0),
|
2017-04-27 17:52:47 -04:00
|
|
|
Access: "rwm",
|
2016-03-18 14:50:19 -04:00
|
|
|
},
|
|
|
|
{
|
|
|
|
Allow: true,
|
2017-04-27 17:52:47 -04:00
|
|
|
Type: "c",
|
2016-03-18 14:50:19 -04:00
|
|
|
Major: iPtr(5),
|
|
|
|
Minor: iPtr(1),
|
2017-04-27 17:52:47 -04:00
|
|
|
Access: "rwm",
|
2016-03-18 14:50:19 -04:00
|
|
|
},
|
|
|
|
{
|
|
|
|
Allow: false,
|
2017-04-27 17:52:47 -04:00
|
|
|
Type: "c",
|
2016-03-18 14:50:19 -04:00
|
|
|
Major: iPtr(10),
|
|
|
|
Minor: iPtr(229),
|
2017-04-27 17:52:47 -04:00
|
|
|
Access: "rwm",
|
2016-03-18 14:50:19 -04:00
|
|
|
},
|
|
|
|
},
|
|
|
|
},
|
|
|
|
}
|
|
|
|
|
2017-08-01 13:00:38 -04:00
|
|
|
// For LCOW support, populate a blank Windows spec
|
|
|
|
if runtime.GOOS == "windows" {
|
|
|
|
s.Windows = &specs.Windows{}
|
|
|
|
}
|
|
|
|
|
2016-03-18 14:50:19 -04:00
|
|
|
return s
|
|
|
|
}
|