1
0
Fork 0
mirror of https://github.com/moby/moby.git synced 2022-11-09 12:21:53 -05:00
moby--moby/container/container_unix.go

473 lines
14 KiB
Go
Raw Normal View History

// +build linux freebsd
package container
import (
"io/ioutil"
"os"
"github.com/docker/docker/api/types"
containertypes "github.com/docker/docker/api/types/container"
mounttypes "github.com/docker/docker/api/types/mount"
"github.com/docker/docker/pkg/chrootarchive"
"github.com/docker/docker/pkg/mount"
"github.com/docker/docker/pkg/stringid"
"github.com/docker/docker/pkg/system"
"github.com/docker/docker/volume"
"github.com/opencontainers/selinux/go-selinux/label"
Implement none, private, and shareable ipc modes Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and /dev/mqueue between containers") container's /dev/shm is mounted on the host first, then bind-mounted inside the container. This is done that way in order to be able to share this container's IPC namespace (and the /dev/shm mount point) with another container. Unfortunately, this functionality breaks container checkpoint/restore (even if IPC is not shared). Since /dev/shm is an external mount, its contents is not saved by `criu checkpoint`, and so upon restore any application that tries to access data under /dev/shm is severily disappointed (which usually results in a fatal crash). This commit solves the issue by introducing new IPC modes for containers (in addition to 'host' and 'container:ID'). The new modes are: - 'shareable': enables sharing this container's IPC with others (this used to be the implicit default); - 'private': disables sharing this container's IPC. In 'private' mode, container's /dev/shm is truly mounted inside the container, without any bind-mounting from the host, which solves the issue. While at it, let's also implement 'none' mode. The motivation, as eloquently put by Justin Cormack, is: > I wondered a while back about having a none shm mode, as currently it is > not possible to have a totally unwriteable container as there is always > a /dev/shm writeable mount. It is a bit of a niche case (and clearly > should never be allowed to be daemon default) but it would be trivial to > add now so maybe we should... ...so here's yet yet another mode: - 'none': no /dev/shm mount inside the container (though it still has its own private IPC namespace). Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd need to make 'private' the default mode, but unfortunately it breaks the backward compatibility. So, let's make the default container IPC mode per-daemon configurable (with the built-in default set to 'shareable' for now). The default can be changed either via a daemon CLI option (--default-shm-mode) or a daemon.json configuration file parameter of the same name. Note one can only set either 'shareable' or 'private' IPC modes as a daemon default (i.e. in this context 'host', 'container', or 'none' do not make much sense). Some other changes this patch introduces are: 1. A mount for /dev/shm is added to default OCI Linux spec. 2. IpcMode.Valid() is simplified to remove duplicated code that parsed 'container:ID' form. Note the old version used to check that ID does not contain a semicolon -- this is no longer the case (tests are modified accordingly). The motivation is we should either do a proper check for container ID validity, or don't check it at all (since it is checked in other places anyway). I chose the latter. 3. IpcMode.Container() is modified to not return container ID if the mode value does not start with "container:", unifying the check to be the same as in IpcMode.IsContainer(). 3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified to add checks for newly added values. [v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997] [v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833] [v4: addressed the case of upgrading from older daemon, in this case container.HostConfig.IpcMode is unset and this is valid] [v5: document old and new IpcMode values in api/swagger.yaml] [v6: add the 'none' mode, changelog entry to docs/api/version-history.md] Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
const (
// DefaultStopTimeout is the timeout (in seconds) for the syscall signal used to stop a container.
DefaultStopTimeout = 10
containerSecretMountPath = "/run/secrets"
)
// TrySetNetworkMount attempts to set the network mounts given a provided destination and
// the path to use for it; return true if the given destination was a network mount file
func (container *Container) TrySetNetworkMount(destination string, path string) bool {
if destination == "/etc/resolv.conf" {
container.ResolvConfPath = path
return true
}
if destination == "/etc/hostname" {
container.HostnamePath = path
return true
}
if destination == "/etc/hosts" {
container.HostsPath = path
return true
}
return false
}
// BuildHostnameFile writes the container's hostname file.
func (container *Container) BuildHostnameFile() error {
hostnamePath, err := container.GetRootResourcePath("hostname")
if err != nil {
return err
}
container.HostnamePath = hostnamePath
return ioutil.WriteFile(container.HostnamePath, []byte(container.Config.Hostname+"\n"), 0644)
}
// NetworkMounts returns the list of network mounts.
func (container *Container) NetworkMounts() []Mount {
var mounts []Mount
shared := container.HostConfig.NetworkMode.IsContainer()
parser := volume.NewParser(container.OS)
if container.ResolvConfPath != "" {
if _, err := os.Stat(container.ResolvConfPath); err != nil {
logrus.Warnf("ResolvConfPath set to %q, but can't stat this filename (err = %v); skipping", container.ResolvConfPath, err)
} else {
if !container.HasMountFor("/etc/resolv.conf") {
label.Relabel(container.ResolvConfPath, container.MountLabel, shared)
}
writable := !container.HostConfig.ReadonlyRootfs
if m, exists := container.MountPoints["/etc/resolv.conf"]; exists {
writable = m.RW
}
mounts = append(mounts, Mount{
Source: container.ResolvConfPath,
Destination: "/etc/resolv.conf",
Writable: writable,
Propagation: string(parser.DefaultPropagationMode()),
})
}
}
if container.HostnamePath != "" {
if _, err := os.Stat(container.HostnamePath); err != nil {
logrus.Warnf("HostnamePath set to %q, but can't stat this filename (err = %v); skipping", container.HostnamePath, err)
} else {
if !container.HasMountFor("/etc/hostname") {
label.Relabel(container.HostnamePath, container.MountLabel, shared)
}
writable := !container.HostConfig.ReadonlyRootfs
if m, exists := container.MountPoints["/etc/hostname"]; exists {
writable = m.RW
}
mounts = append(mounts, Mount{
Source: container.HostnamePath,
Destination: "/etc/hostname",
Writable: writable,
Propagation: string(parser.DefaultPropagationMode()),
})
}
}
if container.HostsPath != "" {
if _, err := os.Stat(container.HostsPath); err != nil {
logrus.Warnf("HostsPath set to %q, but can't stat this filename (err = %v); skipping", container.HostsPath, err)
} else {
if !container.HasMountFor("/etc/hosts") {
label.Relabel(container.HostsPath, container.MountLabel, shared)
}
writable := !container.HostConfig.ReadonlyRootfs
if m, exists := container.MountPoints["/etc/hosts"]; exists {
writable = m.RW
}
mounts = append(mounts, Mount{
Source: container.HostsPath,
Destination: "/etc/hosts",
Writable: writable,
Propagation: string(parser.DefaultPropagationMode()),
})
}
}
return mounts
}
// CopyImagePathContent copies files in destination to the volume.
func (container *Container) CopyImagePathContent(v volume.Volume, destination string) error {
rootfs, err := container.GetResourcePath(destination)
if err != nil {
return err
}
if _, err = ioutil.ReadDir(rootfs); err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
id := stringid.GenerateNonCryptoID()
path, err := v.Mount(id)
if err != nil {
return err
}
defer func() {
if err := v.Unmount(id); err != nil {
logrus.Warnf("error while unmounting volume %s: %v", v.Name(), err)
}
}()
if err := label.Relabel(path, container.MountLabel, true); err != nil && err != unix.ENOTSUP {
return err
}
return copyExistingContents(rootfs, path)
}
// ShmResourcePath returns path to shm
func (container *Container) ShmResourcePath() (string, error) {
return container.GetRootResourcePath("shm")
}
// HasMountFor checks if path is a mountpoint
func (container *Container) HasMountFor(path string) bool {
_, exists := container.MountPoints[path]
return exists
}
Implement none, private, and shareable ipc modes Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and /dev/mqueue between containers") container's /dev/shm is mounted on the host first, then bind-mounted inside the container. This is done that way in order to be able to share this container's IPC namespace (and the /dev/shm mount point) with another container. Unfortunately, this functionality breaks container checkpoint/restore (even if IPC is not shared). Since /dev/shm is an external mount, its contents is not saved by `criu checkpoint`, and so upon restore any application that tries to access data under /dev/shm is severily disappointed (which usually results in a fatal crash). This commit solves the issue by introducing new IPC modes for containers (in addition to 'host' and 'container:ID'). The new modes are: - 'shareable': enables sharing this container's IPC with others (this used to be the implicit default); - 'private': disables sharing this container's IPC. In 'private' mode, container's /dev/shm is truly mounted inside the container, without any bind-mounting from the host, which solves the issue. While at it, let's also implement 'none' mode. The motivation, as eloquently put by Justin Cormack, is: > I wondered a while back about having a none shm mode, as currently it is > not possible to have a totally unwriteable container as there is always > a /dev/shm writeable mount. It is a bit of a niche case (and clearly > should never be allowed to be daemon default) but it would be trivial to > add now so maybe we should... ...so here's yet yet another mode: - 'none': no /dev/shm mount inside the container (though it still has its own private IPC namespace). Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd need to make 'private' the default mode, but unfortunately it breaks the backward compatibility. So, let's make the default container IPC mode per-daemon configurable (with the built-in default set to 'shareable' for now). The default can be changed either via a daemon CLI option (--default-shm-mode) or a daemon.json configuration file parameter of the same name. Note one can only set either 'shareable' or 'private' IPC modes as a daemon default (i.e. in this context 'host', 'container', or 'none' do not make much sense). Some other changes this patch introduces are: 1. A mount for /dev/shm is added to default OCI Linux spec. 2. IpcMode.Valid() is simplified to remove duplicated code that parsed 'container:ID' form. Note the old version used to check that ID does not contain a semicolon -- this is no longer the case (tests are modified accordingly). The motivation is we should either do a proper check for container ID validity, or don't check it at all (since it is checked in other places anyway). I chose the latter. 3. IpcMode.Container() is modified to not return container ID if the mode value does not start with "container:", unifying the check to be the same as in IpcMode.IsContainer(). 3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified to add checks for newly added values. [v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997] [v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833] [v4: addressed the case of upgrading from older daemon, in this case container.HostConfig.IpcMode is unset and this is valid] [v5: document old and new IpcMode values in api/swagger.yaml] [v6: add the 'none' mode, changelog entry to docs/api/version-history.md] Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
// UnmountIpcMount uses the provided unmount function to unmount shm if it was mounted
func (container *Container) UnmountIpcMount(unmount func(pth string) error) error {
if container.HasMountFor("/dev/shm") {
return nil
}
Implement none, private, and shareable ipc modes Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and /dev/mqueue between containers") container's /dev/shm is mounted on the host first, then bind-mounted inside the container. This is done that way in order to be able to share this container's IPC namespace (and the /dev/shm mount point) with another container. Unfortunately, this functionality breaks container checkpoint/restore (even if IPC is not shared). Since /dev/shm is an external mount, its contents is not saved by `criu checkpoint`, and so upon restore any application that tries to access data under /dev/shm is severily disappointed (which usually results in a fatal crash). This commit solves the issue by introducing new IPC modes for containers (in addition to 'host' and 'container:ID'). The new modes are: - 'shareable': enables sharing this container's IPC with others (this used to be the implicit default); - 'private': disables sharing this container's IPC. In 'private' mode, container's /dev/shm is truly mounted inside the container, without any bind-mounting from the host, which solves the issue. While at it, let's also implement 'none' mode. The motivation, as eloquently put by Justin Cormack, is: > I wondered a while back about having a none shm mode, as currently it is > not possible to have a totally unwriteable container as there is always > a /dev/shm writeable mount. It is a bit of a niche case (and clearly > should never be allowed to be daemon default) but it would be trivial to > add now so maybe we should... ...so here's yet yet another mode: - 'none': no /dev/shm mount inside the container (though it still has its own private IPC namespace). Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd need to make 'private' the default mode, but unfortunately it breaks the backward compatibility. So, let's make the default container IPC mode per-daemon configurable (with the built-in default set to 'shareable' for now). The default can be changed either via a daemon CLI option (--default-shm-mode) or a daemon.json configuration file parameter of the same name. Note one can only set either 'shareable' or 'private' IPC modes as a daemon default (i.e. in this context 'host', 'container', or 'none' do not make much sense). Some other changes this patch introduces are: 1. A mount for /dev/shm is added to default OCI Linux spec. 2. IpcMode.Valid() is simplified to remove duplicated code that parsed 'container:ID' form. Note the old version used to check that ID does not contain a semicolon -- this is no longer the case (tests are modified accordingly). The motivation is we should either do a proper check for container ID validity, or don't check it at all (since it is checked in other places anyway). I chose the latter. 3. IpcMode.Container() is modified to not return container ID if the mode value does not start with "container:", unifying the check to be the same as in IpcMode.IsContainer(). 3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified to add checks for newly added values. [v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997] [v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833] [v4: addressed the case of upgrading from older daemon, in this case container.HostConfig.IpcMode is unset and this is valid] [v5: document old and new IpcMode values in api/swagger.yaml] [v6: add the 'none' mode, changelog entry to docs/api/version-history.md] Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
// container.ShmPath should not be used here as it may point
// to the host's or other container's /dev/shm
shmPath, err := container.ShmResourcePath()
if err != nil {
return err
}
Implement none, private, and shareable ipc modes Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and /dev/mqueue between containers") container's /dev/shm is mounted on the host first, then bind-mounted inside the container. This is done that way in order to be able to share this container's IPC namespace (and the /dev/shm mount point) with another container. Unfortunately, this functionality breaks container checkpoint/restore (even if IPC is not shared). Since /dev/shm is an external mount, its contents is not saved by `criu checkpoint`, and so upon restore any application that tries to access data under /dev/shm is severily disappointed (which usually results in a fatal crash). This commit solves the issue by introducing new IPC modes for containers (in addition to 'host' and 'container:ID'). The new modes are: - 'shareable': enables sharing this container's IPC with others (this used to be the implicit default); - 'private': disables sharing this container's IPC. In 'private' mode, container's /dev/shm is truly mounted inside the container, without any bind-mounting from the host, which solves the issue. While at it, let's also implement 'none' mode. The motivation, as eloquently put by Justin Cormack, is: > I wondered a while back about having a none shm mode, as currently it is > not possible to have a totally unwriteable container as there is always > a /dev/shm writeable mount. It is a bit of a niche case (and clearly > should never be allowed to be daemon default) but it would be trivial to > add now so maybe we should... ...so here's yet yet another mode: - 'none': no /dev/shm mount inside the container (though it still has its own private IPC namespace). Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd need to make 'private' the default mode, but unfortunately it breaks the backward compatibility. So, let's make the default container IPC mode per-daemon configurable (with the built-in default set to 'shareable' for now). The default can be changed either via a daemon CLI option (--default-shm-mode) or a daemon.json configuration file parameter of the same name. Note one can only set either 'shareable' or 'private' IPC modes as a daemon default (i.e. in this context 'host', 'container', or 'none' do not make much sense). Some other changes this patch introduces are: 1. A mount for /dev/shm is added to default OCI Linux spec. 2. IpcMode.Valid() is simplified to remove duplicated code that parsed 'container:ID' form. Note the old version used to check that ID does not contain a semicolon -- this is no longer the case (tests are modified accordingly). The motivation is we should either do a proper check for container ID validity, or don't check it at all (since it is checked in other places anyway). I chose the latter. 3. IpcMode.Container() is modified to not return container ID if the mode value does not start with "container:", unifying the check to be the same as in IpcMode.IsContainer(). 3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified to add checks for newly added values. [v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997] [v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833] [v4: addressed the case of upgrading from older daemon, in this case container.HostConfig.IpcMode is unset and this is valid] [v5: document old and new IpcMode values in api/swagger.yaml] [v6: add the 'none' mode, changelog entry to docs/api/version-history.md] Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
if shmPath == "" {
return nil
}
if err = unmount(shmPath); err != nil && !os.IsNotExist(err) {
if mounted, mErr := mount.Mounted(shmPath); mounted || mErr != nil {
return errors.Wrapf(err, "umount %s", shmPath)
}
}
Implement none, private, and shareable ipc modes Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and /dev/mqueue between containers") container's /dev/shm is mounted on the host first, then bind-mounted inside the container. This is done that way in order to be able to share this container's IPC namespace (and the /dev/shm mount point) with another container. Unfortunately, this functionality breaks container checkpoint/restore (even if IPC is not shared). Since /dev/shm is an external mount, its contents is not saved by `criu checkpoint`, and so upon restore any application that tries to access data under /dev/shm is severily disappointed (which usually results in a fatal crash). This commit solves the issue by introducing new IPC modes for containers (in addition to 'host' and 'container:ID'). The new modes are: - 'shareable': enables sharing this container's IPC with others (this used to be the implicit default); - 'private': disables sharing this container's IPC. In 'private' mode, container's /dev/shm is truly mounted inside the container, without any bind-mounting from the host, which solves the issue. While at it, let's also implement 'none' mode. The motivation, as eloquently put by Justin Cormack, is: > I wondered a while back about having a none shm mode, as currently it is > not possible to have a totally unwriteable container as there is always > a /dev/shm writeable mount. It is a bit of a niche case (and clearly > should never be allowed to be daemon default) but it would be trivial to > add now so maybe we should... ...so here's yet yet another mode: - 'none': no /dev/shm mount inside the container (though it still has its own private IPC namespace). Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd need to make 'private' the default mode, but unfortunately it breaks the backward compatibility. So, let's make the default container IPC mode per-daemon configurable (with the built-in default set to 'shareable' for now). The default can be changed either via a daemon CLI option (--default-shm-mode) or a daemon.json configuration file parameter of the same name. Note one can only set either 'shareable' or 'private' IPC modes as a daemon default (i.e. in this context 'host', 'container', or 'none' do not make much sense). Some other changes this patch introduces are: 1. A mount for /dev/shm is added to default OCI Linux spec. 2. IpcMode.Valid() is simplified to remove duplicated code that parsed 'container:ID' form. Note the old version used to check that ID does not contain a semicolon -- this is no longer the case (tests are modified accordingly). The motivation is we should either do a proper check for container ID validity, or don't check it at all (since it is checked in other places anyway). I chose the latter. 3. IpcMode.Container() is modified to not return container ID if the mode value does not start with "container:", unifying the check to be the same as in IpcMode.IsContainer(). 3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified to add checks for newly added values. [v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997] [v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833] [v4: addressed the case of upgrading from older daemon, in this case container.HostConfig.IpcMode is unset and this is valid] [v5: document old and new IpcMode values in api/swagger.yaml] [v6: add the 'none' mode, changelog entry to docs/api/version-history.md] Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
return nil
}
// IpcMounts returns the list of IPC mounts
func (container *Container) IpcMounts() []Mount {
var mounts []Mount
parser := volume.NewParser(container.OS)
Implement none, private, and shareable ipc modes Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and /dev/mqueue between containers") container's /dev/shm is mounted on the host first, then bind-mounted inside the container. This is done that way in order to be able to share this container's IPC namespace (and the /dev/shm mount point) with another container. Unfortunately, this functionality breaks container checkpoint/restore (even if IPC is not shared). Since /dev/shm is an external mount, its contents is not saved by `criu checkpoint`, and so upon restore any application that tries to access data under /dev/shm is severily disappointed (which usually results in a fatal crash). This commit solves the issue by introducing new IPC modes for containers (in addition to 'host' and 'container:ID'). The new modes are: - 'shareable': enables sharing this container's IPC with others (this used to be the implicit default); - 'private': disables sharing this container's IPC. In 'private' mode, container's /dev/shm is truly mounted inside the container, without any bind-mounting from the host, which solves the issue. While at it, let's also implement 'none' mode. The motivation, as eloquently put by Justin Cormack, is: > I wondered a while back about having a none shm mode, as currently it is > not possible to have a totally unwriteable container as there is always > a /dev/shm writeable mount. It is a bit of a niche case (and clearly > should never be allowed to be daemon default) but it would be trivial to > add now so maybe we should... ...so here's yet yet another mode: - 'none': no /dev/shm mount inside the container (though it still has its own private IPC namespace). Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd need to make 'private' the default mode, but unfortunately it breaks the backward compatibility. So, let's make the default container IPC mode per-daemon configurable (with the built-in default set to 'shareable' for now). The default can be changed either via a daemon CLI option (--default-shm-mode) or a daemon.json configuration file parameter of the same name. Note one can only set either 'shareable' or 'private' IPC modes as a daemon default (i.e. in this context 'host', 'container', or 'none' do not make much sense). Some other changes this patch introduces are: 1. A mount for /dev/shm is added to default OCI Linux spec. 2. IpcMode.Valid() is simplified to remove duplicated code that parsed 'container:ID' form. Note the old version used to check that ID does not contain a semicolon -- this is no longer the case (tests are modified accordingly). The motivation is we should either do a proper check for container ID validity, or don't check it at all (since it is checked in other places anyway). I chose the latter. 3. IpcMode.Container() is modified to not return container ID if the mode value does not start with "container:", unifying the check to be the same as in IpcMode.IsContainer(). 3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified to add checks for newly added values. [v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997] [v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833] [v4: addressed the case of upgrading from older daemon, in this case container.HostConfig.IpcMode is unset and this is valid] [v5: document old and new IpcMode values in api/swagger.yaml] [v6: add the 'none' mode, changelog entry to docs/api/version-history.md] Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
if container.HasMountFor("/dev/shm") {
return mounts
}
Implement none, private, and shareable ipc modes Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and /dev/mqueue between containers") container's /dev/shm is mounted on the host first, then bind-mounted inside the container. This is done that way in order to be able to share this container's IPC namespace (and the /dev/shm mount point) with another container. Unfortunately, this functionality breaks container checkpoint/restore (even if IPC is not shared). Since /dev/shm is an external mount, its contents is not saved by `criu checkpoint`, and so upon restore any application that tries to access data under /dev/shm is severily disappointed (which usually results in a fatal crash). This commit solves the issue by introducing new IPC modes for containers (in addition to 'host' and 'container:ID'). The new modes are: - 'shareable': enables sharing this container's IPC with others (this used to be the implicit default); - 'private': disables sharing this container's IPC. In 'private' mode, container's /dev/shm is truly mounted inside the container, without any bind-mounting from the host, which solves the issue. While at it, let's also implement 'none' mode. The motivation, as eloquently put by Justin Cormack, is: > I wondered a while back about having a none shm mode, as currently it is > not possible to have a totally unwriteable container as there is always > a /dev/shm writeable mount. It is a bit of a niche case (and clearly > should never be allowed to be daemon default) but it would be trivial to > add now so maybe we should... ...so here's yet yet another mode: - 'none': no /dev/shm mount inside the container (though it still has its own private IPC namespace). Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd need to make 'private' the default mode, but unfortunately it breaks the backward compatibility. So, let's make the default container IPC mode per-daemon configurable (with the built-in default set to 'shareable' for now). The default can be changed either via a daemon CLI option (--default-shm-mode) or a daemon.json configuration file parameter of the same name. Note one can only set either 'shareable' or 'private' IPC modes as a daemon default (i.e. in this context 'host', 'container', or 'none' do not make much sense). Some other changes this patch introduces are: 1. A mount for /dev/shm is added to default OCI Linux spec. 2. IpcMode.Valid() is simplified to remove duplicated code that parsed 'container:ID' form. Note the old version used to check that ID does not contain a semicolon -- this is no longer the case (tests are modified accordingly). The motivation is we should either do a proper check for container ID validity, or don't check it at all (since it is checked in other places anyway). I chose the latter. 3. IpcMode.Container() is modified to not return container ID if the mode value does not start with "container:", unifying the check to be the same as in IpcMode.IsContainer(). 3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified to add checks for newly added values. [v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997] [v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833] [v4: addressed the case of upgrading from older daemon, in this case container.HostConfig.IpcMode is unset and this is valid] [v5: document old and new IpcMode values in api/swagger.yaml] [v6: add the 'none' mode, changelog entry to docs/api/version-history.md] Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
if container.ShmPath == "" {
return mounts
}
label.SetFileLabel(container.ShmPath, container.MountLabel)
mounts = append(mounts, Mount{
Source: container.ShmPath,
Destination: "/dev/shm",
Writable: true,
Propagation: string(parser.DefaultPropagationMode()),
Implement none, private, and shareable ipc modes Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and /dev/mqueue between containers") container's /dev/shm is mounted on the host first, then bind-mounted inside the container. This is done that way in order to be able to share this container's IPC namespace (and the /dev/shm mount point) with another container. Unfortunately, this functionality breaks container checkpoint/restore (even if IPC is not shared). Since /dev/shm is an external mount, its contents is not saved by `criu checkpoint`, and so upon restore any application that tries to access data under /dev/shm is severily disappointed (which usually results in a fatal crash). This commit solves the issue by introducing new IPC modes for containers (in addition to 'host' and 'container:ID'). The new modes are: - 'shareable': enables sharing this container's IPC with others (this used to be the implicit default); - 'private': disables sharing this container's IPC. In 'private' mode, container's /dev/shm is truly mounted inside the container, without any bind-mounting from the host, which solves the issue. While at it, let's also implement 'none' mode. The motivation, as eloquently put by Justin Cormack, is: > I wondered a while back about having a none shm mode, as currently it is > not possible to have a totally unwriteable container as there is always > a /dev/shm writeable mount. It is a bit of a niche case (and clearly > should never be allowed to be daemon default) but it would be trivial to > add now so maybe we should... ...so here's yet yet another mode: - 'none': no /dev/shm mount inside the container (though it still has its own private IPC namespace). Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd need to make 'private' the default mode, but unfortunately it breaks the backward compatibility. So, let's make the default container IPC mode per-daemon configurable (with the built-in default set to 'shareable' for now). The default can be changed either via a daemon CLI option (--default-shm-mode) or a daemon.json configuration file parameter of the same name. Note one can only set either 'shareable' or 'private' IPC modes as a daemon default (i.e. in this context 'host', 'container', or 'none' do not make much sense). Some other changes this patch introduces are: 1. A mount for /dev/shm is added to default OCI Linux spec. 2. IpcMode.Valid() is simplified to remove duplicated code that parsed 'container:ID' form. Note the old version used to check that ID does not contain a semicolon -- this is no longer the case (tests are modified accordingly). The motivation is we should either do a proper check for container ID validity, or don't check it at all (since it is checked in other places anyway). I chose the latter. 3. IpcMode.Container() is modified to not return container ID if the mode value does not start with "container:", unifying the check to be the same as in IpcMode.IsContainer(). 3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified to add checks for newly added values. [v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997] [v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833] [v4: addressed the case of upgrading from older daemon, in this case container.HostConfig.IpcMode is unset and this is valid] [v5: document old and new IpcMode values in api/swagger.yaml] [v6: add the 'none' mode, changelog entry to docs/api/version-history.md] Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
})
return mounts
}
// SecretMounts returns the mounts for the secret path.
func (container *Container) SecretMounts() []Mount {
var mounts []Mount
for _, r := range container.SecretReferences {
if r.File == nil {
continue
}
mounts = append(mounts, Mount{
Source: container.SecretFilePath(*r),
Destination: getSecretTargetPath(r),
Writable: false,
})
}
return mounts
}
// UnmountSecrets unmounts the local tmpfs for secrets
func (container *Container) UnmountSecrets() error {
if _, err := os.Stat(container.SecretMountPath()); err != nil {
if os.IsNotExist(err) {
return nil
}
return err
}
return detachMounted(container.SecretMountPath())
}
// ConfigMounts returns the mounts for configs.
func (container *Container) ConfigMounts() []Mount {
var mounts []Mount
for _, configRef := range container.ConfigReferences {
if configRef.File == nil {
continue
}
mounts = append(mounts, Mount{
Source: container.ConfigFilePath(*configRef),
Destination: configRef.File.Name,
Writable: false,
})
}
return mounts
}
type conflictingUpdateOptions string
func (e conflictingUpdateOptions) Error() string {
return string(e)
}
func (e conflictingUpdateOptions) Conflict() {}
// UpdateContainer updates configuration of a container. Callers must hold a Lock on the Container.
func (container *Container) UpdateContainer(hostConfig *containertypes.HostConfig) error {
// update resources of container
resources := hostConfig.Resources
cResources := &container.HostConfig.Resources
// validate NanoCPUs, CPUPeriod, and CPUQuota
// Because NanoCPU effectively updates CPUPeriod/CPUQuota,
// once NanoCPU is already set, updating CPUPeriod/CPUQuota will be blocked, and vice versa.
// In the following we make sure the intended update (resources) does not conflict with the existing (cResource).
if resources.NanoCPUs > 0 && cResources.CPUPeriod > 0 {
return conflictingUpdateOptions("Conflicting options: Nano CPUs cannot be updated as CPU Period has already been set")
}
if resources.NanoCPUs > 0 && cResources.CPUQuota > 0 {
return conflictingUpdateOptions("Conflicting options: Nano CPUs cannot be updated as CPU Quota has already been set")
}
if resources.CPUPeriod > 0 && cResources.NanoCPUs > 0 {
return conflictingUpdateOptions("Conflicting options: CPU Period cannot be updated as NanoCPUs has already been set")
}
if resources.CPUQuota > 0 && cResources.NanoCPUs > 0 {
return conflictingUpdateOptions("Conflicting options: CPU Quota cannot be updated as NanoCPUs has already been set")
}
if resources.BlkioWeight != 0 {
cResources.BlkioWeight = resources.BlkioWeight
}
if resources.CPUShares != 0 {
cResources.CPUShares = resources.CPUShares
}
if resources.NanoCPUs != 0 {
cResources.NanoCPUs = resources.NanoCPUs
}
if resources.CPUPeriod != 0 {
cResources.CPUPeriod = resources.CPUPeriod
}
if resources.CPUQuota != 0 {
cResources.CPUQuota = resources.CPUQuota
}
if resources.CpusetCpus != "" {
cResources.CpusetCpus = resources.CpusetCpus
}
if resources.CpusetMems != "" {
cResources.CpusetMems = resources.CpusetMems
}
if resources.Memory != 0 {
// if memory limit smaller than already set memoryswap limit and doesn't
// update the memoryswap limit, then error out.
if resources.Memory > cResources.MemorySwap && resources.MemorySwap == 0 {
return conflictingUpdateOptions("Memory limit should be smaller than already set memoryswap limit, update the memoryswap at the same time")
}
cResources.Memory = resources.Memory
}
if resources.MemorySwap != 0 {
cResources.MemorySwap = resources.MemorySwap
}
if resources.MemoryReservation != 0 {
cResources.MemoryReservation = resources.MemoryReservation
}
if resources.KernelMemory != 0 {
cResources.KernelMemory = resources.KernelMemory
}
// update HostConfig of container
if hostConfig.RestartPolicy.Name != "" {
if container.HostConfig.AutoRemove && !hostConfig.RestartPolicy.IsNone() {
return conflictingUpdateOptions("Restart policy cannot be updated because AutoRemove is enabled for the container")
}
container.HostConfig.RestartPolicy = hostConfig.RestartPolicy
}
return nil
}
2016-10-03 13:53:06 -04:00
// DetachAndUnmount uses a detached mount on all mount destinations, then
// unmounts each volume normally.
// This is used from daemon/archive for `docker cp`
func (container *Container) DetachAndUnmount(volumeEventLog func(name, action string, attributes map[string]string)) error {
networkMounts := container.NetworkMounts()
mountPaths := make([]string, 0, len(container.MountPoints)+len(networkMounts))
for _, mntPoint := range container.MountPoints {
dest, err := container.GetResourcePath(mntPoint.Destination)
if err != nil {
2016-10-03 13:53:06 -04:00
logrus.Warnf("Failed to get volume destination path for container '%s' at '%s' while lazily unmounting: %v", container.ID, mntPoint.Destination, err)
continue
}
2016-10-03 13:53:06 -04:00
mountPaths = append(mountPaths, dest)
}
2016-10-03 13:53:06 -04:00
for _, m := range networkMounts {
dest, err := container.GetResourcePath(m.Destination)
if err != nil {
logrus.Warnf("Failed to get volume destination path for container '%s' at '%s' while lazily unmounting: %v", container.ID, m.Destination, err)
continue
}
2016-10-03 13:53:06 -04:00
mountPaths = append(mountPaths, dest)
}
2016-10-03 13:53:06 -04:00
for _, mountPath := range mountPaths {
if err := detachMounted(mountPath); err != nil {
logrus.Warnf("%s unmountVolumes: Failed to do lazy umount fo volume '%s': %v", container.ID, mountPath, err)
}
}
2016-10-03 13:53:06 -04:00
return container.UnmountVolumes(volumeEventLog)
}
// copyExistingContents copies from the source to the destination and
// ensures the ownership is appropriately set.
func copyExistingContents(source, destination string) error {
volList, err := ioutil.ReadDir(source)
if err != nil {
return err
}
if len(volList) > 0 {
srcList, err := ioutil.ReadDir(destination)
if err != nil {
return err
}
if len(srcList) == 0 {
// If the source volume is empty, copies files from the root into the volume
if err := chrootarchive.NewArchiver(nil).CopyWithTar(source, destination); err != nil {
return err
}
}
}
return copyOwnership(source, destination)
}
// copyOwnership copies the permissions and uid:gid of the source file
// to the destination file
func copyOwnership(source, destination string) error {
stat, err := system.Stat(source)
if err != nil {
return err
}
destStat, err := system.Stat(destination)
if err != nil {
return err
}
// In some cases, even though UID/GID match and it would effectively be a no-op,
// this can return a permission denied error... for example if this is an NFS
// mount.
// Since it's not really an error that we can't chown to the same UID/GID, don't
// even bother trying in such cases.
if stat.UID() != destStat.UID() || stat.GID() != destStat.GID() {
if err := os.Chown(destination, int(stat.UID()), int(stat.GID())); err != nil {
return err
}
}
if stat.Mode() != destStat.Mode() {
return os.Chmod(destination, os.FileMode(stat.Mode()))
}
return nil
}
// TmpfsMounts returns the list of tmpfs mounts
func (container *Container) TmpfsMounts() ([]Mount, error) {
parser := volume.NewParser(container.OS)
var mounts []Mount
for dest, data := range container.HostConfig.Tmpfs {
mounts = append(mounts, Mount{
Source: "tmpfs",
Destination: dest,
Data: data,
})
}
for dest, mnt := range container.MountPoints {
if mnt.Type == mounttypes.TypeTmpfs {
data, err := parser.ConvertTmpfsOptions(mnt.Spec.TmpfsOptions, mnt.Spec.ReadOnly)
if err != nil {
return nil, err
}
mounts = append(mounts, Mount{
Source: "tmpfs",
Destination: dest,
Data: data,
})
}
}
return mounts, nil
}
// EnableServiceDiscoveryOnDefaultNetwork Enable service discovery on default network
func (container *Container) EnableServiceDiscoveryOnDefaultNetwork() bool {
return false
}
// GetMountPoints gives a platform specific transformation to types.MountPoint. Callers must hold a Container lock.
func (container *Container) GetMountPoints() []types.MountPoint {
mountPoints := make([]types.MountPoint, 0, len(container.MountPoints))
for _, m := range container.MountPoints {
mountPoints = append(mountPoints, types.MountPoint{
Type: m.Type,
Name: m.Name,
Source: m.Path(),
Destination: m.Destination,
Driver: m.Driver,
Mode: m.Mode,
RW: m.RW,
Propagation: m.Propagation,
})
}
return mountPoints
}