2018-02-05 16:05:59 -05:00
|
|
|
package daemon // import "github.com/docker/docker/daemon"
|
2016-03-18 14:50:19 -04:00
|
|
|
|
|
|
|
import (
|
|
|
|
"fmt"
|
|
|
|
"io"
|
|
|
|
"os"
|
2016-06-27 17:38:47 -04:00
|
|
|
"os/exec"
|
2016-03-18 14:50:19 -04:00
|
|
|
"path/filepath"
|
2016-04-26 04:20:17 -04:00
|
|
|
"sort"
|
2016-03-18 14:50:19 -04:00
|
|
|
"strconv"
|
|
|
|
"strings"
|
|
|
|
|
2016-09-06 14:18:12 -04:00
|
|
|
containertypes "github.com/docker/docker/api/types/container"
|
2016-03-18 14:50:19 -04:00
|
|
|
"github.com/docker/docker/container"
|
2017-04-10 05:25:15 -04:00
|
|
|
daemonconfig "github.com/docker/docker/daemon/config"
|
2016-03-18 14:50:19 -04:00
|
|
|
"github.com/docker/docker/oci"
|
2018-12-16 10:11:37 -05:00
|
|
|
"github.com/docker/docker/oci/caps"
|
2016-03-18 14:50:19 -04:00
|
|
|
"github.com/docker/docker/pkg/idtools"
|
|
|
|
"github.com/docker/docker/pkg/mount"
|
2018-04-17 16:50:28 -04:00
|
|
|
volumemounts "github.com/docker/docker/volume/mounts"
|
2016-03-18 14:50:19 -04:00
|
|
|
"github.com/opencontainers/runc/libcontainer/apparmor"
|
2016-06-07 15:05:43 -04:00
|
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
2016-03-18 14:50:19 -04:00
|
|
|
"github.com/opencontainers/runc/libcontainer/devices"
|
|
|
|
"github.com/opencontainers/runc/libcontainer/user"
|
2018-05-19 07:38:54 -04:00
|
|
|
"github.com/opencontainers/runtime-spec/specs-go"
|
2018-01-24 18:10:01 -05:00
|
|
|
"github.com/pkg/errors"
|
2017-07-26 17:42:13 -04:00
|
|
|
"github.com/sirupsen/logrus"
|
2017-10-15 02:06:20 -04:00
|
|
|
"golang.org/x/sys/unix"
|
2016-03-18 14:50:19 -04:00
|
|
|
)
|
|
|
|
|
2018-08-22 16:05:12 -04:00
|
|
|
const (
|
|
|
|
// inContainerInitPath is "/sbin/" followed by the daemon's default init
// binary name (daemonconfig.DefaultInitBinary).
// NOTE(review): presumably the path at which the init binary is exposed
// inside containers — confirm against the code that mounts it.
inContainerInitPath = "/sbin/" + daemonconfig.DefaultInitBinary
|
|
|
|
)
|
|
|
|
|
2016-03-18 14:50:19 -04:00
|
|
|
func setResources(s *specs.Spec, r containertypes.Resources) error {
|
|
|
|
weightDevices, err := getBlkioWeightDevices(r)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-04-29 16:39:04 -04:00
|
|
|
readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
|
2016-03-18 14:50:19 -04:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-04-29 16:39:04 -04:00
|
|
|
writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
|
2016-03-18 14:50:19 -04:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-04-29 16:39:04 -04:00
|
|
|
readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
|
2016-03-18 14:50:19 -04:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-04-29 16:39:04 -04:00
|
|
|
writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
|
2016-03-18 14:50:19 -04:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
memoryRes := getMemoryResources(r)
|
2017-04-27 17:52:47 -04:00
|
|
|
cpuRes, err := getCPUResources(r)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2016-03-18 14:50:19 -04:00
|
|
|
blkioWeight := r.BlkioWeight
|
|
|
|
|
2017-04-27 17:52:47 -04:00
|
|
|
specResources := &specs.LinuxResources{
|
2016-03-18 14:50:19 -04:00
|
|
|
Memory: memoryRes,
|
|
|
|
CPU: cpuRes,
|
2017-04-27 17:52:47 -04:00
|
|
|
BlockIO: &specs.LinuxBlockIO{
|
2016-03-18 14:50:19 -04:00
|
|
|
Weight: &blkioWeight,
|
|
|
|
WeightDevice: weightDevices,
|
|
|
|
ThrottleReadBpsDevice: readBpsDevice,
|
|
|
|
ThrottleWriteBpsDevice: writeBpsDevice,
|
|
|
|
ThrottleReadIOPSDevice: readIOpsDevice,
|
|
|
|
ThrottleWriteIOPSDevice: writeIOpsDevice,
|
|
|
|
},
|
2017-04-27 17:52:47 -04:00
|
|
|
Pids: &specs.LinuxPids{
|
|
|
|
Limit: r.PidsLimit,
|
2016-03-18 14:50:19 -04:00
|
|
|
},
|
|
|
|
}
|
|
|
|
|
|
|
|
if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
|
|
|
|
specResources.Devices = s.Linux.Resources.Devices
|
|
|
|
}
|
|
|
|
|
|
|
|
s.Linux.Resources = specResources
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func setDevices(s *specs.Spec, c *container.Container) error {
|
|
|
|
// Build lists of devices allowed and created within the container.
|
2017-04-27 17:52:47 -04:00
|
|
|
var devs []specs.LinuxDevice
|
2016-03-24 15:01:12 -04:00
|
|
|
devPermissions := s.Linux.Resources.Devices
|
2016-03-18 14:50:19 -04:00
|
|
|
if c.HostConfig.Privileged {
|
|
|
|
hostDevices, err := devices.HostDevices()
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
for _, d := range hostDevices {
|
2016-11-16 19:18:43 -05:00
|
|
|
devs = append(devs, oci.Device(d))
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
2017-04-27 17:52:47 -04:00
|
|
|
devPermissions = []specs.LinuxDeviceCgroup{
|
2016-03-24 15:01:12 -04:00
|
|
|
{
|
|
|
|
Allow: true,
|
2017-04-27 17:52:47 -04:00
|
|
|
Access: "rwm",
|
2016-03-24 15:01:12 -04:00
|
|
|
},
|
|
|
|
}
|
2016-03-18 14:50:19 -04:00
|
|
|
} else {
|
|
|
|
for _, deviceMapping := range c.HostConfig.Devices {
|
2016-11-16 19:18:43 -05:00
|
|
|
d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
|
2016-03-18 14:50:19 -04:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
devs = append(devs, d...)
|
2016-03-24 15:01:12 -04:00
|
|
|
devPermissions = append(devPermissions, dPermissions...)
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
2016-05-06 18:09:46 -04:00
|
|
|
|
2018-06-15 19:14:17 -04:00
|
|
|
var err error
|
2018-12-10 15:40:40 -05:00
|
|
|
devPermissions, err = oci.AppendDevicePermissionsFromCgroupRules(devPermissions, c.HostConfig.DeviceCgroupRules)
|
2018-06-15 19:14:17 -04:00
|
|
|
if err != nil {
|
|
|
|
return err
|
2016-05-06 18:09:46 -04:00
|
|
|
}
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
s.Linux.Devices = append(s.Linux.Devices, devs...)
|
2016-03-24 15:01:12 -04:00
|
|
|
s.Linux.Resources.Devices = devPermissions
|
2016-03-18 14:50:19 -04:00
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-09-22 09:52:41 -04:00
|
|
|
func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error {
|
2017-08-01 11:51:24 -04:00
|
|
|
var rlimits []specs.POSIXRlimit
|
2016-03-18 14:50:19 -04:00
|
|
|
|
2016-09-08 00:23:56 -04:00
|
|
|
// We want to leave the original HostConfig alone so make a copy here
|
|
|
|
hostConfig := *c.HostConfig
|
|
|
|
// Merge with the daemon defaults
|
|
|
|
daemon.mergeUlimits(&hostConfig)
|
|
|
|
for _, ul := range hostConfig.Ulimits {
|
2017-08-01 11:51:24 -04:00
|
|
|
rlimits = append(rlimits, specs.POSIXRlimit{
|
2016-03-18 14:50:19 -04:00
|
|
|
Type: "RLIMIT_" + strings.ToUpper(ul.Name),
|
|
|
|
Soft: uint64(ul.Soft),
|
|
|
|
Hard: uint64(ul.Hard),
|
|
|
|
})
|
|
|
|
}
|
|
|
|
|
|
|
|
s.Process.Rlimits = rlimits
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func setUser(s *specs.Spec, c *container.Container) error {
|
|
|
|
uid, gid, additionalGids, err := getUser(c, c.Config.User)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
s.Process.User.UID = uid
|
|
|
|
s.Process.User.GID = gid
|
|
|
|
s.Process.User.AdditionalGids = additionalGids
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
|
2017-08-03 20:22:00 -04:00
|
|
|
fp, err := c.GetResourcePath(p)
|
2016-03-18 14:50:19 -04:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
return os.Open(fp)
|
|
|
|
}
|
|
|
|
|
|
|
|
func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
|
|
|
|
passwdPath, err := user.GetPasswdPath()
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, nil, err
|
|
|
|
}
|
|
|
|
groupPath, err := user.GetGroupPath()
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, nil, err
|
|
|
|
}
|
|
|
|
passwdFile, err := readUserFile(c, passwdPath)
|
|
|
|
if err == nil {
|
|
|
|
defer passwdFile.Close()
|
|
|
|
}
|
|
|
|
groupFile, err := readUserFile(c, groupPath)
|
|
|
|
if err == nil {
|
|
|
|
defer groupFile.Close()
|
|
|
|
}
|
|
|
|
|
|
|
|
execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// todo: fix this double read by a change to libcontainer/user pkg
|
|
|
|
groupFile, err = readUserFile(c, groupPath)
|
|
|
|
if err == nil {
|
|
|
|
defer groupFile.Close()
|
|
|
|
}
|
|
|
|
var addGroups []int
|
|
|
|
if len(c.HostConfig.GroupAdd) > 0 {
|
|
|
|
addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
|
|
|
|
if err != nil {
|
|
|
|
return 0, 0, nil, err
|
|
|
|
}
|
|
|
|
}
|
|
|
|
uid := uint32(execUser.Uid)
|
|
|
|
gid := uint32(execUser.Gid)
|
|
|
|
sgids := append(execUser.Sgids, addGroups...)
|
|
|
|
var additionalGids []uint32
|
|
|
|
for _, g := range sgids {
|
|
|
|
additionalGids = append(additionalGids, uint32(g))
|
|
|
|
}
|
|
|
|
return uid, gid, additionalGids, nil
|
|
|
|
}
|
|
|
|
|
2017-04-27 17:52:47 -04:00
|
|
|
func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
|
2016-03-18 14:50:19 -04:00
|
|
|
for i, n := range s.Linux.Namespaces {
|
|
|
|
if n.Type == ns.Type {
|
|
|
|
s.Linux.Namespaces[i] = ns
|
|
|
|
return
|
|
|
|
}
|
|
|
|
}
|
|
|
|
s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
|
|
|
|
}
|
|
|
|
|
|
|
|
func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error {
|
2016-03-21 21:30:21 -04:00
|
|
|
userNS := false
|
|
|
|
// user
|
|
|
|
if c.HostConfig.UsernsMode.IsPrivate() {
|
2017-11-16 01:20:33 -05:00
|
|
|
uidMap := daemon.idMapping.UIDs()
|
2016-03-21 21:30:21 -04:00
|
|
|
if uidMap != nil {
|
|
|
|
userNS = true
|
2017-04-27 17:52:47 -04:00
|
|
|
ns := specs.LinuxNamespace{Type: "user"}
|
2016-03-21 21:30:21 -04:00
|
|
|
setNamespace(s, ns)
|
|
|
|
s.Linux.UIDMappings = specMapping(uidMap)
|
2017-11-16 01:20:33 -05:00
|
|
|
s.Linux.GIDMappings = specMapping(daemon.idMapping.GIDs())
|
2016-03-21 21:30:21 -04:00
|
|
|
}
|
|
|
|
}
|
2016-03-18 14:50:19 -04:00
|
|
|
// network
|
|
|
|
if !c.Config.NetworkDisabled {
|
2017-04-27 17:52:47 -04:00
|
|
|
ns := specs.LinuxNamespace{Type: "network"}
|
2016-03-18 14:50:19 -04:00
|
|
|
parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
|
|
|
|
if parts[0] == "container" {
|
|
|
|
nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
|
2016-03-21 21:30:21 -04:00
|
|
|
if userNS {
|
|
|
|
// to share a net namespace, they must also share a user namespace
|
2017-04-27 17:52:47 -04:00
|
|
|
nsUser := specs.LinuxNamespace{Type: "user"}
|
2016-03-21 21:30:21 -04:00
|
|
|
nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
|
|
|
|
setNamespace(s, nsUser)
|
|
|
|
}
|
2016-03-18 14:50:19 -04:00
|
|
|
} else if c.HostConfig.NetworkMode.IsHost() {
|
|
|
|
ns.Path = c.NetworkSettings.SandboxKey
|
|
|
|
}
|
|
|
|
setNamespace(s, ns)
|
|
|
|
}
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
|
|
|
|
2016-03-18 14:50:19 -04:00
|
|
|
// ipc
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
|
|
|
ipcMode := c.HostConfig.IpcMode
|
|
|
|
switch {
|
|
|
|
case ipcMode.IsContainer():
|
2017-04-27 17:52:47 -04:00
|
|
|
ns := specs.LinuxNamespace{Type: "ipc"}
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
|
|
|
ic, err := daemon.getIpcContainer(ipcMode.Container())
|
2016-03-18 14:50:19 -04:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
|
|
|
|
setNamespace(s, ns)
|
2016-03-21 21:30:21 -04:00
|
|
|
if userNS {
|
|
|
|
// to share an IPC namespace, they must also share a user namespace
|
2017-04-27 17:52:47 -04:00
|
|
|
nsUser := specs.LinuxNamespace{Type: "user"}
|
2016-03-21 21:30:21 -04:00
|
|
|
nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
|
|
|
|
setNamespace(s, nsUser)
|
|
|
|
}
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
|
|
|
case ipcMode.IsHost():
|
2017-04-27 17:52:47 -04:00
|
|
|
oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
|
|
|
case ipcMode.IsEmpty():
|
|
|
|
// A container was created by an older version of the daemon.
|
|
|
|
// The default behavior used to be what is now called "shareable".
|
|
|
|
fallthrough
|
|
|
|
case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
|
2017-04-27 17:52:47 -04:00
|
|
|
ns := specs.LinuxNamespace{Type: "ipc"}
|
2016-03-18 14:50:19 -04:00
|
|
|
setNamespace(s, ns)
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
4. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
|
|
|
default:
|
|
|
|
return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severely
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
4. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
|
|
|
|
2016-03-18 14:50:19 -04:00
|
|
|
// pid
|
2016-05-06 14:56:03 -04:00
|
|
|
if c.HostConfig.PidMode.IsContainer() {
|
2017-04-27 17:52:47 -04:00
|
|
|
ns := specs.LinuxNamespace{Type: "pid"}
|
2016-05-06 14:56:03 -04:00
|
|
|
pc, err := daemon.getPidContainer(c)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
|
|
|
|
setNamespace(s, ns)
|
|
|
|
if userNS {
|
2016-05-07 21:36:10 -04:00
|
|
|
// to share a PID namespace, they must also share a user namespace
|
2017-04-27 17:52:47 -04:00
|
|
|
nsUser := specs.LinuxNamespace{Type: "user"}
|
2016-05-06 14:56:03 -04:00
|
|
|
nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
|
|
|
|
setNamespace(s, nsUser)
|
|
|
|
}
|
|
|
|
} else if c.HostConfig.PidMode.IsHost() {
|
2017-04-27 17:52:47 -04:00
|
|
|
oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
|
2016-05-06 14:56:03 -04:00
|
|
|
} else {
|
2017-04-27 17:52:47 -04:00
|
|
|
ns := specs.LinuxNamespace{Type: "pid"}
|
2016-05-06 14:56:03 -04:00
|
|
|
setNamespace(s, ns)
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
// uts
|
|
|
|
if c.HostConfig.UTSMode.IsHost() {
|
2017-04-27 17:52:47 -04:00
|
|
|
oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
|
2016-03-18 14:50:19 -04:00
|
|
|
s.Hostname = ""
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-04-27 17:52:47 -04:00
|
|
|
func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
|
|
|
|
var ids []specs.LinuxIDMapping
|
2016-03-18 14:50:19 -04:00
|
|
|
for _, item := range s {
|
2017-04-27 17:52:47 -04:00
|
|
|
ids = append(ids, specs.LinuxIDMapping{
|
2016-03-18 14:50:19 -04:00
|
|
|
HostID: uint32(item.HostID),
|
|
|
|
ContainerID: uint32(item.ContainerID),
|
|
|
|
Size: uint32(item.Size),
|
|
|
|
})
|
|
|
|
}
|
|
|
|
return ids
|
|
|
|
}
|
|
|
|
|
|
|
|
// Get the source mount point of directory passed in as argument. Also return
|
|
|
|
// optional fields.
|
|
|
|
func getSourceMount(source string) (string, string, error) {
|
|
|
|
// Ensure any symlinks are resolved.
|
|
|
|
sourcePath, err := filepath.EvalSymlinks(source)
|
|
|
|
if err != nil {
|
|
|
|
return "", "", err
|
|
|
|
}
|
|
|
|
|
getSourceMount(): simplify
The flow of getSourceMount was:
1 get all entries from /proc/self/mountinfo
2 do a linear search for the `source` directory
3 if found, return its data
4 get the parent directory of `source`, goto 2
The repeated linear search through the whole mountinfo (which can have
thousands of records) is inefficient. Instead, let's just
1 collect all the relevant records (only those mount points
that can be a parent of `source`)
2 find the record with the longest mountpath, return its data
This was tested manually with something like
```go
func TestGetSourceMount(t *testing.T) {
mnt, flags, err := getSourceMount("/sys/devices/msr/")
assert.NoError(t, err)
t.Logf("mnt: %v, flags: %v", mnt, flags)
}
```
...but it relies on having a specific mount points on the system
being used for testing.
[v2: add unit tests for ParentsFilter]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2018-01-25 23:13:46 -05:00
|
|
|
mi, err := mount.GetMounts(mount.ParentsFilter(sourcePath))
|
2016-03-18 14:50:19 -04:00
|
|
|
if err != nil {
|
|
|
|
return "", "", err
|
|
|
|
}
|
getSourceMount(): simplify
The flow of getSourceMount was:
1 get all entries from /proc/self/mountinfo
2 do a linear search for the `source` directory
3 if found, return its data
4 get the parent directory of `source`, goto 2
The repeated linear search through the whole mountinfo (which can have
thousands of records) is inefficient. Instead, let's just
1 collect all the relevant records (only those mount points
that can be a parent of `source`)
2 find the record with the longest mountpath, return its data
This was tested manually with something like
```go
func TestGetSourceMount(t *testing.T) {
mnt, flags, err := getSourceMount("/sys/devices/msr/")
assert.NoError(t, err)
t.Logf("mnt: %v, flags: %v", mnt, flags)
}
```
...but it relies on having a specific mount points on the system
being used for testing.
[v2: add unit tests for ParentsFilter]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2018-01-25 23:13:46 -05:00
|
|
|
if len(mi) < 1 {
|
|
|
|
return "", "", fmt.Errorf("Can't find mount point of %s", source)
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
|
getSourceMount(): simplify
The flow of getSourceMount was:
1 get all entries from /proc/self/mountinfo
2 do a linear search for the `source` directory
3 if found, return its data
4 get the parent directory of `source`, goto 2
The repeated linear search through the whole mountinfo (which can have
thousands of records) is inefficient. Instead, let's just
1 collect all the relevant records (only those mount points
that can be a parent of `source`)
2 find the record with the longest mountpath, return its data
This was tested manually with something like
```go
func TestGetSourceMount(t *testing.T) {
mnt, flags, err := getSourceMount("/sys/devices/msr/")
assert.NoError(t, err)
t.Logf("mnt: %v, flags: %v", mnt, flags)
}
```
...but it relies on having a specific mount points on the system
being used for testing.
[v2: add unit tests for ParentsFilter]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2018-01-25 23:13:46 -05:00
|
|
|
// find the longest mount point
|
|
|
|
var idx, maxlen int
|
|
|
|
for i := range mi {
|
|
|
|
if len(mi[i].Mountpoint) > maxlen {
|
|
|
|
maxlen = len(mi[i].Mountpoint)
|
|
|
|
idx = i
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
}
|
2018-05-10 15:01:50 -04:00
|
|
|
return mi[idx].Mountpoint, mi[idx].Optional, nil
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
|
2018-01-24 18:10:01 -05:00
|
|
|
// sharedPropagationOption is the prefix looked for in a mount's optional
// fields to decide that the mount is shared.
const sharedPropagationOption = "shared:"

// slavePropagationOption is the prefix looked for in a mount's optional
// fields to decide that the mount is a slave.
const slavePropagationOption = "master:"
|
|
|
|
|
|
|
|
// hasMountinfoOption reports whether at least one space-separated field of
// opts starts with any of the given vals. It returns false when no vals are
// passed.
func hasMountinfoOption(opts string, vals ...string) bool {
	fields := strings.Split(opts, " ")
	for _, prefix := range vals {
		for _, field := range fields {
			if strings.HasPrefix(field, prefix) {
				return true
			}
		}
	}
	return false
}
|
|
|
|
|
2016-03-18 14:50:19 -04:00
|
|
|
// Ensure mount point on which path is mounted, is shared.
|
|
|
|
func ensureShared(path string) error {
|
|
|
|
sourceMount, optionalOpts, err := getSourceMount(path)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
// Make sure source mount point is shared.
|
2018-01-24 18:10:01 -05:00
|
|
|
if !hasMountinfoOption(optionalOpts, sharedPropagationOption) {
|
|
|
|
return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
// Ensure mount point on which path is mounted, is either shared or slave.
|
|
|
|
func ensureSharedOrSlave(path string) error {
|
|
|
|
sourceMount, optionalOpts, err := getSourceMount(path)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
2018-01-24 18:10:01 -05:00
|
|
|
if !hasMountinfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
|
|
|
|
return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-10-15 02:06:20 -04:00
|
|
|
// Get the set of mount flags that are set on the mount that contains the given
|
|
|
|
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
|
|
|
|
// bind-mounting "with options" will not fail with user namespaces, due to
|
|
|
|
// kernel restrictions that require user namespace mounts to preserve
|
|
|
|
// CL_UNPRIVILEGED locked flags.
|
|
|
|
func getUnprivilegedMountFlags(path string) ([]string, error) {
|
|
|
|
var statfs unix.Statfs_t
|
|
|
|
if err := unix.Statfs(path, &statfs); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
|
|
|
|
unprivilegedFlags := map[uint64]string{
|
|
|
|
unix.MS_RDONLY: "ro",
|
|
|
|
unix.MS_NODEV: "nodev",
|
|
|
|
unix.MS_NOEXEC: "noexec",
|
|
|
|
unix.MS_NOSUID: "nosuid",
|
|
|
|
unix.MS_NOATIME: "noatime",
|
|
|
|
unix.MS_RELATIME: "relatime",
|
|
|
|
unix.MS_NODIRATIME: "nodiratime",
|
|
|
|
}
|
|
|
|
|
|
|
|
var flags []string
|
|
|
|
for mask, flag := range unprivilegedFlags {
|
|
|
|
if uint64(statfs.Flags)&mask == mask {
|
|
|
|
flags = append(flags, flag)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return flags, nil
|
|
|
|
}
|
|
|
|
|
2016-03-18 14:50:19 -04:00
|
|
|
var (
|
|
|
|
mountPropagationMap = map[string]int{
|
|
|
|
"private": mount.PRIVATE,
|
|
|
|
"rprivate": mount.RPRIVATE,
|
|
|
|
"shared": mount.SHARED,
|
|
|
|
"rshared": mount.RSHARED,
|
|
|
|
"slave": mount.SLAVE,
|
|
|
|
"rslave": mount.RSLAVE,
|
|
|
|
}
|
|
|
|
|
|
|
|
mountPropagationReverseMap = map[int]string{
|
|
|
|
mount.PRIVATE: "private",
|
|
|
|
mount.RPRIVATE: "rprivate",
|
|
|
|
mount.SHARED: "shared",
|
|
|
|
mount.RSHARED: "rshared",
|
|
|
|
mount.SLAVE: "slave",
|
|
|
|
mount.RSLAVE: "rslave",
|
|
|
|
}
|
|
|
|
)
|
|
|
|
|
2017-11-10 00:18:48 -05:00
|
|
|
// inSlice reports whether s occurs in slice. Elements are compared with ==,
// so the match is case sensitive; a nil or empty slice never matches.
func inSlice(slice []string, s string) bool {
	for i := 0; i < len(slice); i++ {
		if slice[i] == s {
			return true
		}
	}
	return false
}
|
|
|
|
|
2016-03-18 14:50:19 -04:00
|
|
|
func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error {
|
|
|
|
userMounts := make(map[string]struct{})
|
|
|
|
for _, m := range mounts {
|
|
|
|
userMounts[m.Destination] = struct{}{}
|
|
|
|
}
|
|
|
|
|
2017-10-27 03:21:41 -04:00
|
|
|
// Copy all mounts from spec to defaultMounts, except for
|
2018-08-08 11:45:00 -04:00
|
|
|
// - mounts overridden by a user supplied mount;
|
2017-10-27 03:21:41 -04:00
|
|
|
// - all mounts under /dev if a user supplied /dev is present;
|
|
|
|
// - /dev/shm, in case IpcMode is none.
|
|
|
|
// While at it, also
|
|
|
|
// - set size for /dev/shm from shmsize.
|
2017-08-17 07:48:11 -04:00
|
|
|
defaultMounts := s.Mounts[:0]
|
2016-03-18 14:50:19 -04:00
|
|
|
_, mountDev := userMounts["/dev"]
|
|
|
|
for _, m := range s.Mounts {
|
2017-10-27 03:21:41 -04:00
|
|
|
if _, ok := userMounts[m.Destination]; ok {
|
|
|
|
// filter out mount overridden by a user supplied mount
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
|
|
|
continue
|
|
|
|
}
|
2017-10-27 03:21:41 -04:00
|
|
|
if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
|
|
|
|
// filter out everything under /dev if /dev is user-mounted
|
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
if m.Destination == "/dev/shm" {
|
|
|
|
if c.HostConfig.IpcMode.IsNone() {
|
|
|
|
// filter out /dev/shm for "none" IpcMode
|
2016-03-18 14:50:19 -04:00
|
|
|
continue
|
|
|
|
}
|
2017-10-27 03:21:41 -04:00
|
|
|
// set size for /dev/shm mount from spec
|
|
|
|
sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
|
|
|
|
m.Options = append(m.Options, sizeOpt)
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
2017-10-27 03:21:41 -04:00
|
|
|
|
|
|
|
defaultMounts = append(defaultMounts, m)
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
s.Mounts = defaultMounts
|
|
|
|
for _, m := range mounts {
|
|
|
|
if m.Source == "tmpfs" {
|
2016-09-22 16:14:15 -04:00
|
|
|
data := m.Data
|
2018-04-17 16:50:28 -04:00
|
|
|
parser := volumemounts.NewParser("linux")
|
2017-08-01 13:32:44 -04:00
|
|
|
options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
|
2016-06-06 05:57:11 -04:00
|
|
|
if data != "" {
|
|
|
|
options = append(options, strings.Split(data, ",")...)
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
|
Inconsistent --tmpfs behavior
This fix tries to address the issue raised in #22420. When
`--tmpfs` is specified with `/tmp`, the default value is
`rw,nosuid,nodev,noexec,relatime,size=65536k`. When `--tmpfs`
is specified with `/tmp:rw`, then the value changed to
`rw,nosuid,nodev,noexec,relatime`.
The reason for such an inconsistency is because docker tries
to add `size=65536k` option only when user provides no option.
This fix tries to address this issue by always pre-progating
`size=65536k` along with `rw,nosuid,nodev,noexec,relatime`.
If user provides a different value (e.g., `size=8192k`), it
will override the `size=65536k` anyway since the combined
options will be parsed and merged to remove any duplicates.
Additional test cases have been added to cover the changes
in this fix.
This fix fixes #22420.
Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
2016-04-30 22:42:19 -04:00
|
|
|
merged, err := mount.MergeTmpfsOptions(options)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
|
2016-03-18 14:50:19 -04:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
|
|
|
|
mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
|
|
|
|
|
|
|
|
// Determine property of RootPropagation based on volume
|
|
|
|
// properties. If a volume is shared, then keep root propagation
|
|
|
|
// shared. This should work for slave and private volumes too.
|
|
|
|
//
|
|
|
|
// For slave volumes, it can be either [r]shared/[r]slave.
|
|
|
|
//
|
|
|
|
// For private volumes any root propagation value should work.
|
|
|
|
pFlag := mountPropagationMap[m.Propagation]
|
2018-01-18 16:55:27 -05:00
|
|
|
switch pFlag {
|
|
|
|
case mount.SHARED, mount.RSHARED:
|
2016-03-18 14:50:19 -04:00
|
|
|
if err := ensureShared(m.Source); err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
|
|
|
|
if rootpg != mount.SHARED && rootpg != mount.RSHARED {
|
|
|
|
s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
|
|
|
|
}
|
2018-01-18 16:55:27 -05:00
|
|
|
case mount.SLAVE, mount.RSLAVE:
|
|
|
|
var fallback bool
|
2016-03-18 14:50:19 -04:00
|
|
|
if err := ensureSharedOrSlave(m.Source); err != nil {
|
2018-08-08 11:45:00 -04:00
|
|
|
// For backwards compatibility purposes, treat mounts from the daemon root
|
2018-01-18 16:55:27 -05:00
|
|
|
// as special since we automatically add rslave propagation to these mounts
|
|
|
|
// when the user did not set anything, so we should fallback to the old
|
|
|
|
// behavior which is to use private propagation which is normally the
|
|
|
|
// default.
|
|
|
|
if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
|
|
|
|
cm, ok := c.MountPoints[m.Destination]
|
|
|
|
if !ok {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
|
|
|
|
// This means the user explicitly set a propagation, do not fallback in that case.
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
fallback = true
|
|
|
|
logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
2018-01-18 16:55:27 -05:00
|
|
|
if !fallback {
|
|
|
|
rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
|
|
|
|
if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
|
|
|
|
s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
|
|
|
|
}
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-10-10 06:20:13 -04:00
|
|
|
bindMode := "rbind"
|
|
|
|
if m.NonRecursive {
|
|
|
|
bindMode = "bind"
|
|
|
|
}
|
|
|
|
opts := []string{bindMode}
|
2016-03-18 14:50:19 -04:00
|
|
|
if !m.Writable {
|
|
|
|
opts = append(opts, "ro")
|
|
|
|
}
|
|
|
|
if pFlag != 0 {
|
|
|
|
opts = append(opts, mountPropagationReverseMap[pFlag])
|
|
|
|
}
|
|
|
|
|
2017-10-15 02:06:20 -04:00
|
|
|
// If we are using user namespaces, then we must make sure that we
|
|
|
|
// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
|
|
|
|
// "mount" when we bind-mount. The reason for this is that at the point
|
|
|
|
// when runc sets up the root filesystem, it is already inside a user
|
|
|
|
// namespace, and thus cannot change any flags that are locked.
|
|
|
|
if daemon.configStore.RemappedRoot != "" {
|
|
|
|
unprivOpts, err := getUnprivilegedMountFlags(m.Source)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
opts = append(opts, unprivOpts...)
|
|
|
|
}
|
|
|
|
|
2016-03-18 14:50:19 -04:00
|
|
|
mt.Options = opts
|
|
|
|
s.Mounts = append(s.Mounts, mt)
|
|
|
|
}
|
|
|
|
|
|
|
|
if s.Root.Readonly {
|
|
|
|
for i, m := range s.Mounts {
|
|
|
|
switch m.Destination {
|
daemon/setMounts(): do not make /dev/shm ro
It has been pointed out that if --read-only flag is given, /dev/shm
also becomes read-only in case of --ipc private.
This happens because in this case the mount comes from OCI spec
(since commit 7120976d74195), and is a regression caused by that
commit.
The meaning of --read-only flag is to only have a "main" container
filesystem read-only, not the auxiliary stuff (that includes /dev/shm,
other mounts and volumes, --tmpfs, /proc, /dev and so on).
So, let's make sure /dev/shm that comes from OCI spec is not made
read-only.
Fixes: 7120976d74195 ("Implement none, private, and shareable ipc modes")
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2018-03-07 23:14:16 -05:00
|
|
|
case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
|
2016-03-18 14:50:19 -04:00
|
|
|
continue
|
|
|
|
}
|
|
|
|
if _, ok := userMounts[m.Destination]; !ok {
|
2017-11-10 00:18:48 -05:00
|
|
|
if !inSlice(m.Options, "ro") {
|
2016-03-18 14:50:19 -04:00
|
|
|
s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if c.HostConfig.Privileged {
|
2018-04-06 09:01:38 -04:00
|
|
|
// clear readonly for /sys
|
|
|
|
for i := range s.Mounts {
|
|
|
|
if s.Mounts[i].Destination == "/sys" {
|
|
|
|
clearReadOnly(&s.Mounts[i])
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
}
|
2016-04-04 17:27:44 -04:00
|
|
|
s.Linux.ReadonlyPaths = nil
|
|
|
|
s.Linux.MaskedPaths = nil
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
|
|
|
|
// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
|
2017-11-16 01:20:33 -05:00
|
|
|
if uidMap := daemon.idMapping.UIDs(); uidMap != nil || c.HostConfig.Privileged {
|
2016-03-18 14:50:19 -04:00
|
|
|
for i, m := range s.Mounts {
|
|
|
|
if m.Type == "cgroup" {
|
|
|
|
clearReadOnly(&s.Mounts[i])
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
|
|
|
func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error {
|
2018-03-13 22:45:21 -04:00
|
|
|
if c.BaseFS == nil {
|
|
|
|
return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
|
|
|
|
}
|
2016-03-18 14:50:19 -04:00
|
|
|
linkedEnv, err := daemon.setupLinkedContainers(c)
|
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
2017-08-01 11:51:24 -04:00
|
|
|
s.Root = &specs.Root{
|
2017-08-03 20:22:00 -04:00
|
|
|
Path: c.BaseFS.Path(),
|
2016-03-18 14:50:19 -04:00
|
|
|
Readonly: c.HostConfig.ReadonlyRootfs,
|
|
|
|
}
|
2017-11-16 01:20:33 -05:00
|
|
|
if err := c.SetupWorkingDirectory(daemon.idMapping.RootPair()); err != nil {
|
2016-03-18 14:50:19 -04:00
|
|
|
return err
|
|
|
|
}
|
|
|
|
cwd := c.Config.WorkingDir
|
|
|
|
if len(cwd) == 0 {
|
|
|
|
cwd = "/"
|
|
|
|
}
|
|
|
|
s.Process.Args = append([]string{c.Path}, c.Args...)
|
2016-06-27 17:38:47 -04:00
|
|
|
|
|
|
|
// only add the custom init if it is specified and the container is running in its
|
|
|
|
// own private pid namespace. It does not make sense to add if it is running in the
|
|
|
|
// host namespace or another container's pid namespace where we already have an init
|
|
|
|
if c.HostConfig.PidMode.IsPrivate() {
|
|
|
|
if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
|
|
|
|
(c.HostConfig.Init == nil && daemon.configStore.Init) {
|
2018-08-22 16:05:12 -04:00
|
|
|
s.Process.Args = append([]string{inContainerInitPath, "--", c.Path}, c.Args...)
|
|
|
|
path := daemon.configStore.InitPath
|
|
|
|
if path == "" {
|
2017-04-10 05:25:15 -04:00
|
|
|
path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
|
2016-09-27 06:51:42 -04:00
|
|
|
if err != nil {
|
|
|
|
return err
|
|
|
|
}
|
|
|
|
}
|
2016-06-27 17:38:47 -04:00
|
|
|
s.Mounts = append(s.Mounts, specs.Mount{
|
2018-08-22 16:05:12 -04:00
|
|
|
Destination: inContainerInitPath,
|
2016-06-27 17:38:47 -04:00
|
|
|
Type: "bind",
|
|
|
|
Source: path,
|
|
|
|
Options: []string{"bind", "ro"},
|
|
|
|
})
|
|
|
|
}
|
|
|
|
}
|
2016-03-18 14:50:19 -04:00
|
|
|
s.Process.Cwd = cwd
|
2016-09-28 18:21:33 -04:00
|
|
|
s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
|
2016-03-18 14:50:19 -04:00
|
|
|
s.Process.Terminal = c.Config.Tty
|
2018-06-17 03:05:54 -04:00
|
|
|
|
|
|
|
s.Hostname = c.Config.Hostname
|
|
|
|
// There isn't a field in the OCI for the NIS domainname, but luckily there
|
|
|
|
// is a sysctl which has an identical effect to setdomainname(2) so there's
|
|
|
|
// no explicit need for runtime support.
|
|
|
|
s.Linux.Sysctl = make(map[string]string)
|
|
|
|
if c.Config.Domainname != "" {
|
|
|
|
s.Linux.Sysctl["kernel.domainname"] = c.Config.Domainname
|
|
|
|
}
|
2016-03-18 14:50:19 -04:00
|
|
|
|
|
|
|
return nil
|
|
|
|
}
|
|
|
|
|
2017-06-26 21:46:30 -04:00
|
|
|
func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
|
2016-03-18 14:50:19 -04:00
|
|
|
s := oci.DefaultSpec()
|
|
|
|
if err := daemon.populateCommonSpec(&s, c); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
|
|
|
var cgroupsPath string
|
2016-03-24 12:18:03 -04:00
|
|
|
scopePrefix := "docker"
|
|
|
|
parent := "/docker"
|
|
|
|
useSystemd := UsingSystemd(daemon.configStore)
|
|
|
|
if useSystemd {
|
|
|
|
parent = "system.slice"
|
|
|
|
}
|
|
|
|
|
2016-03-18 14:50:19 -04:00
|
|
|
if c.HostConfig.CgroupParent != "" {
|
2016-03-24 12:18:03 -04:00
|
|
|
parent = c.HostConfig.CgroupParent
|
|
|
|
} else if daemon.configStore.CgroupParent != "" {
|
|
|
|
parent = daemon.configStore.CgroupParent
|
|
|
|
}
|
|
|
|
|
|
|
|
if useSystemd {
|
|
|
|
cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
|
|
|
|
logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
|
2016-03-18 14:50:19 -04:00
|
|
|
} else {
|
2016-03-24 12:18:03 -04:00
|
|
|
cgroupsPath = filepath.Join(parent, c.ID)
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
2017-04-27 17:52:47 -04:00
|
|
|
s.Linux.CgroupsPath = cgroupsPath
|
2016-03-18 14:50:19 -04:00
|
|
|
|
|
|
|
if err := setResources(&s, c.HostConfig.Resources); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux runtime spec resources: %v", err)
|
|
|
|
}
|
2018-06-17 03:05:54 -04:00
|
|
|
// We merge the sysctls injected above with the HostConfig (latter takes
|
|
|
|
// precedence for backwards-compatibility reasons).
|
|
|
|
for k, v := range c.HostConfig.Sysctls {
|
|
|
|
s.Linux.Sysctl[k] = v
|
|
|
|
}
|
2016-06-07 15:05:43 -04:00
|
|
|
|
2017-04-27 17:52:47 -04:00
|
|
|
p := s.Linux.CgroupsPath
|
2016-06-07 15:05:43 -04:00
|
|
|
if useSystemd {
|
2017-04-27 17:52:47 -04:00
|
|
|
initPath, err := cgroups.GetInitCgroup("cpu")
|
2016-06-07 15:05:43 -04:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2017-09-22 09:52:41 -04:00
|
|
|
_, err = cgroups.GetOwnCgroup("cpu")
|
2016-06-07 15:05:43 -04:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2017-09-22 09:52:41 -04:00
|
|
|
p = filepath.Join(initPath, s.Linux.CgroupsPath)
|
2016-06-07 15:05:43 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
// Clean path to guard against things like ../../../BAD
|
|
|
|
parentPath := filepath.Dir(p)
|
|
|
|
if !filepath.IsAbs(parentPath) {
|
|
|
|
parentPath = filepath.Clean("/" + parentPath)
|
|
|
|
}
|
|
|
|
|
|
|
|
if err := daemon.initCgroupsPath(parentPath); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux init cgroups path: %v", err)
|
|
|
|
}
|
2016-03-18 14:50:19 -04:00
|
|
|
if err := setDevices(&s, c); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux runtime spec devices: %v", err)
|
|
|
|
}
|
2017-09-22 09:52:41 -04:00
|
|
|
if err := daemon.setRlimits(&s, c); err != nil {
|
2016-03-18 14:50:19 -04:00
|
|
|
return nil, fmt.Errorf("linux runtime spec rlimits: %v", err)
|
|
|
|
}
|
|
|
|
if err := setUser(&s, c); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux spec user: %v", err)
|
|
|
|
}
|
|
|
|
if err := setNamespaces(daemon, &s, c); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux spec namespaces: %v", err)
|
|
|
|
}
|
2018-12-16 10:11:37 -05:00
|
|
|
capabilities, err := caps.TweakCapabilities(oci.DefaultCapabilities(), c.HostConfig.CapAdd, c.HostConfig.CapDrop, c.HostConfig.Capabilities, c.HostConfig.Privileged)
|
|
|
|
if err != nil {
|
|
|
|
return nil, fmt.Errorf("linux spec capabilities: %v", err)
|
|
|
|
}
|
|
|
|
if err := oci.SetCapabilities(&s, capabilities); err != nil {
|
2016-03-18 14:50:19 -04:00
|
|
|
return nil, fmt.Errorf("linux spec capabilities: %v", err)
|
|
|
|
}
|
|
|
|
if err := setSeccomp(daemon, &s, c); err != nil {
|
|
|
|
return nil, fmt.Errorf("linux seccomp: %v", err)
|
|
|
|
}
|
|
|
|
|
2017-12-18 16:02:23 -05:00
|
|
|
if err := daemon.setupContainerMountsRoot(c); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2016-03-18 14:50:19 -04:00
|
|
|
if err := daemon.setupIpcDirs(c); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2017-06-26 21:46:30 -04:00
|
|
|
defer func() {
|
2018-01-11 17:28:56 -05:00
|
|
|
if err != nil {
|
2018-01-17 10:49:58 -05:00
|
|
|
daemon.cleanupSecretDir(c)
|
2017-06-26 21:46:30 -04:00
|
|
|
}
|
|
|
|
}()
|
2016-10-19 12:22:02 -04:00
|
|
|
|
2018-01-11 17:28:56 -05:00
|
|
|
if err := daemon.setupSecretDir(c); err != nil {
|
2017-03-16 17:23:33 -04:00
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
|
2016-04-26 04:20:17 -04:00
|
|
|
ms, err := daemon.setupMounts(c)
|
2016-03-18 14:50:19 -04:00
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-10-27 03:41:32 -04:00
|
|
|
|
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
|
|
|
if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
|
|
|
|
ms = append(ms, c.IpcMounts()...)
|
|
|
|
}
|
2016-10-19 12:22:02 -04:00
|
|
|
|
2016-09-22 16:14:15 -04:00
|
|
|
tmpfsMounts, err := c.TmpfsMounts()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
|
|
|
ms = append(ms, tmpfsMounts...)
|
2016-10-19 12:22:02 -04:00
|
|
|
|
2017-12-18 16:02:23 -05:00
|
|
|
secretMounts, err := c.SecretMounts()
|
|
|
|
if err != nil {
|
|
|
|
return nil, err
|
2016-10-27 03:41:32 -04:00
|
|
|
}
|
2017-12-18 16:02:23 -05:00
|
|
|
ms = append(ms, secretMounts...)
|
2016-10-26 16:30:53 -04:00
|
|
|
|
2016-04-26 04:20:17 -04:00
|
|
|
sort.Sort(mounts(ms))
|
|
|
|
if err := setMounts(daemon, &s, c, ms); err != nil {
|
2016-03-18 14:50:19 -04:00
|
|
|
return nil, fmt.Errorf("linux mounts: %v", err)
|
|
|
|
}
|
|
|
|
|
|
|
|
for _, ns := range s.Linux.Namespaces {
|
|
|
|
if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
|
2018-01-26 13:40:32 -05:00
|
|
|
target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
|
2017-04-27 17:52:47 -04:00
|
|
|
s.Hooks = &specs.Hooks{
|
2016-03-18 14:50:19 -04:00
|
|
|
Prestart: []specs.Hook{{
|
2018-01-26 13:40:32 -05:00
|
|
|
Path: target,
|
2018-09-14 11:21:43 -04:00
|
|
|
Args: []string{"libnetwork-setkey", "-exec-root=" + daemon.configStore.GetExecRoot(), c.ID, daemon.netController.ID()},
|
2016-03-18 14:50:19 -04:00
|
|
|
}},
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if apparmor.IsEnabled() {
|
2016-12-05 08:12:17 -05:00
|
|
|
var appArmorProfile string
|
|
|
|
if c.AppArmorProfile != "" {
|
2016-03-18 14:50:19 -04:00
|
|
|
appArmorProfile = c.AppArmorProfile
|
2016-04-04 17:27:44 -04:00
|
|
|
} else if c.HostConfig.Privileged {
|
|
|
|
appArmorProfile = "unconfined"
|
2016-12-05 08:12:17 -05:00
|
|
|
} else {
|
|
|
|
appArmorProfile = "docker-default"
|
|
|
|
}
|
|
|
|
|
|
|
|
if appArmorProfile == "docker-default" {
|
|
|
|
// Unattended upgrades and other fun services can unload AppArmor
|
|
|
|
// profiles inadvertently. Since we cannot store our profile in
|
|
|
|
// /etc/apparmor.d, nor can we practically add other ways of
|
|
|
|
// telling the system to keep our profile loaded, in order to make
|
|
|
|
// sure that we keep the default profile enabled we dynamically
|
|
|
|
// reload it if necessary.
|
|
|
|
if err := ensureDefaultAppArmorProfile(); err != nil {
|
|
|
|
return nil, err
|
|
|
|
}
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
2016-12-05 08:12:17 -05:00
|
|
|
|
2016-03-18 14:50:19 -04:00
|
|
|
s.Process.ApparmorProfile = appArmorProfile
|
|
|
|
}
|
|
|
|
s.Process.SelinuxLabel = c.GetProcessLabel()
|
|
|
|
s.Process.NoNewPrivileges = c.NoNewPrivileges
|
2017-08-01 11:51:24 -04:00
|
|
|
s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj
|
2016-04-25 15:55:28 -04:00
|
|
|
s.Linux.MountLabel = c.MountLabel
|
2016-03-18 14:50:19 -04:00
|
|
|
|
2018-03-20 13:29:18 -04:00
|
|
|
// Set the masked and readonly paths with regard to the host config options if they are set.
|
|
|
|
if c.HostConfig.MaskedPaths != nil {
|
|
|
|
s.Linux.MaskedPaths = c.HostConfig.MaskedPaths
|
|
|
|
}
|
|
|
|
if c.HostConfig.ReadonlyPaths != nil {
|
|
|
|
s.Linux.ReadonlyPaths = c.HostConfig.ReadonlyPaths
|
|
|
|
}
|
|
|
|
|
2017-08-24 13:11:44 -04:00
|
|
|
return &s, nil
|
2016-03-18 14:50:19 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
func clearReadOnly(m *specs.Mount) {
|
|
|
|
var opt []string
|
|
|
|
for _, o := range m.Options {
|
|
|
|
if o != "ro" {
|
|
|
|
opt = append(opt, o)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
m.Options = opt
|
|
|
|
}
|
2016-09-08 00:23:56 -04:00
|
|
|
|
|
|
|
// mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
|
|
|
|
func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
|
|
|
|
ulimits := c.Ulimits
|
|
|
|
// Merge ulimits with daemon defaults
|
|
|
|
ulIdx := make(map[string]struct{})
|
|
|
|
for _, ul := range ulimits {
|
|
|
|
ulIdx[ul.Name] = struct{}{}
|
|
|
|
}
|
|
|
|
for name, ul := range daemon.configStore.Ulimits {
|
|
|
|
if _, exists := ulIdx[name]; !exists {
|
|
|
|
ulimits = append(ulimits, ul)
|
|
|
|
}
|
|
|
|
}
|
|
|
|
c.Ulimits = ulimits
|
|
|
|
}
|