mirror of
https://github.com/moby/moby.git
synced 2022-11-09 12:21:53 -05:00
15ff09395c
As soon as the initial executable in the container is executed as a non root user, permitted and effective capabilities are dropped. Drop them earlier than this, so that they are dropped before executing the file. The main effect of this is that if `CAP_DAC_OVERRIDE` is set (the default) the user will not be able to execute files they do not have permission to execute, which previously they could. The old behaviour was somewhat surprising and the new one is definitely correct, but it is not in any meaningful way exploitable, and I do not think it is necessary to backport this fix. It is unlikely to have any negative effects as almost all executables have world execute permission anyway. Use the bounding set not the effective set as the canonical set of capabilities, as effective will now vary. Signed-off-by: Justin Cormack <justin.cormack@docker.com>
953 lines
26 KiB
Go
953 lines
26 KiB
Go
package daemon // import "github.com/docker/docker/daemon"
|
|
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"os/exec"
|
|
"path/filepath"
|
|
"regexp"
|
|
"sort"
|
|
"strconv"
|
|
"strings"
|
|
|
|
containertypes "github.com/docker/docker/api/types/container"
|
|
"github.com/docker/docker/container"
|
|
"github.com/docker/docker/daemon/caps"
|
|
daemonconfig "github.com/docker/docker/daemon/config"
|
|
"github.com/docker/docker/oci"
|
|
"github.com/docker/docker/pkg/idtools"
|
|
"github.com/docker/docker/pkg/mount"
|
|
"github.com/docker/docker/volume"
|
|
"github.com/opencontainers/runc/libcontainer/apparmor"
|
|
"github.com/opencontainers/runc/libcontainer/cgroups"
|
|
"github.com/opencontainers/runc/libcontainer/devices"
|
|
"github.com/opencontainers/runc/libcontainer/user"
|
|
specs "github.com/opencontainers/runtime-spec/specs-go"
|
|
"github.com/pkg/errors"
|
|
"github.com/sirupsen/logrus"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// deviceCgroupRuleRegex matches a device cgroup rule written as
// "<type> <major>:<minor> <access>": type is one of a, c or b, major and minor
// are each a decimal number or "*", and access is one to three characters
// drawn from r, w and m. The four fields are captured as submatches 1-4.
// nolint: gosimple
var deviceCgroupRuleRegex = regexp.MustCompile("^([acb]) ([0-9]+|\\*):([0-9]+|\\*) ([rwm]{1,3})$")
|
|
|
|
func setResources(s *specs.Spec, r containertypes.Resources) error {
|
|
weightDevices, err := getBlkioWeightDevices(r)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
readBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadBps)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
writeBpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteBps)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
readIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceReadIOps)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
writeIOpsDevice, err := getBlkioThrottleDevices(r.BlkioDeviceWriteIOps)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
memoryRes := getMemoryResources(r)
|
|
cpuRes, err := getCPUResources(r)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
blkioWeight := r.BlkioWeight
|
|
|
|
specResources := &specs.LinuxResources{
|
|
Memory: memoryRes,
|
|
CPU: cpuRes,
|
|
BlockIO: &specs.LinuxBlockIO{
|
|
Weight: &blkioWeight,
|
|
WeightDevice: weightDevices,
|
|
ThrottleReadBpsDevice: readBpsDevice,
|
|
ThrottleWriteBpsDevice: writeBpsDevice,
|
|
ThrottleReadIOPSDevice: readIOpsDevice,
|
|
ThrottleWriteIOPSDevice: writeIOpsDevice,
|
|
},
|
|
Pids: &specs.LinuxPids{
|
|
Limit: r.PidsLimit,
|
|
},
|
|
}
|
|
|
|
if s.Linux.Resources != nil && len(s.Linux.Resources.Devices) > 0 {
|
|
specResources.Devices = s.Linux.Resources.Devices
|
|
}
|
|
|
|
s.Linux.Resources = specResources
|
|
return nil
|
|
}
|
|
|
|
func setDevices(s *specs.Spec, c *container.Container) error {
|
|
// Build lists of devices allowed and created within the container.
|
|
var devs []specs.LinuxDevice
|
|
devPermissions := s.Linux.Resources.Devices
|
|
if c.HostConfig.Privileged {
|
|
hostDevices, err := devices.HostDevices()
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for _, d := range hostDevices {
|
|
devs = append(devs, oci.Device(d))
|
|
}
|
|
devPermissions = []specs.LinuxDeviceCgroup{
|
|
{
|
|
Allow: true,
|
|
Access: "rwm",
|
|
},
|
|
}
|
|
} else {
|
|
for _, deviceMapping := range c.HostConfig.Devices {
|
|
d, dPermissions, err := oci.DevicesFromPath(deviceMapping.PathOnHost, deviceMapping.PathInContainer, deviceMapping.CgroupPermissions)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
devs = append(devs, d...)
|
|
devPermissions = append(devPermissions, dPermissions...)
|
|
}
|
|
|
|
for _, deviceCgroupRule := range c.HostConfig.DeviceCgroupRules {
|
|
ss := deviceCgroupRuleRegex.FindAllStringSubmatch(deviceCgroupRule, -1)
|
|
if len(ss[0]) != 5 {
|
|
return fmt.Errorf("invalid device cgroup rule format: '%s'", deviceCgroupRule)
|
|
}
|
|
matches := ss[0]
|
|
|
|
dPermissions := specs.LinuxDeviceCgroup{
|
|
Allow: true,
|
|
Type: matches[1],
|
|
Access: matches[4],
|
|
}
|
|
if matches[2] == "*" {
|
|
major := int64(-1)
|
|
dPermissions.Major = &major
|
|
} else {
|
|
major, err := strconv.ParseInt(matches[2], 10, 64)
|
|
if err != nil {
|
|
return fmt.Errorf("invalid major value in device cgroup rule format: '%s'", deviceCgroupRule)
|
|
}
|
|
dPermissions.Major = &major
|
|
}
|
|
if matches[3] == "*" {
|
|
minor := int64(-1)
|
|
dPermissions.Minor = &minor
|
|
} else {
|
|
minor, err := strconv.ParseInt(matches[3], 10, 64)
|
|
if err != nil {
|
|
return fmt.Errorf("invalid minor value in device cgroup rule format: '%s'", deviceCgroupRule)
|
|
}
|
|
dPermissions.Minor = &minor
|
|
}
|
|
devPermissions = append(devPermissions, dPermissions)
|
|
}
|
|
}
|
|
|
|
s.Linux.Devices = append(s.Linux.Devices, devs...)
|
|
s.Linux.Resources.Devices = devPermissions
|
|
return nil
|
|
}
|
|
|
|
func (daemon *Daemon) setRlimits(s *specs.Spec, c *container.Container) error {
|
|
var rlimits []specs.POSIXRlimit
|
|
|
|
// We want to leave the original HostConfig alone so make a copy here
|
|
hostConfig := *c.HostConfig
|
|
// Merge with the daemon defaults
|
|
daemon.mergeUlimits(&hostConfig)
|
|
for _, ul := range hostConfig.Ulimits {
|
|
rlimits = append(rlimits, specs.POSIXRlimit{
|
|
Type: "RLIMIT_" + strings.ToUpper(ul.Name),
|
|
Soft: uint64(ul.Soft),
|
|
Hard: uint64(ul.Hard),
|
|
})
|
|
}
|
|
|
|
s.Process.Rlimits = rlimits
|
|
return nil
|
|
}
|
|
|
|
func setUser(s *specs.Spec, c *container.Container) error {
|
|
uid, gid, additionalGids, err := getUser(c, c.Config.User)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
s.Process.User.UID = uid
|
|
s.Process.User.GID = gid
|
|
s.Process.User.AdditionalGids = additionalGids
|
|
return nil
|
|
}
|
|
|
|
func readUserFile(c *container.Container, p string) (io.ReadCloser, error) {
|
|
fp, err := c.GetResourcePath(p)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
return os.Open(fp)
|
|
}
|
|
|
|
func getUser(c *container.Container, username string) (uint32, uint32, []uint32, error) {
|
|
passwdPath, err := user.GetPasswdPath()
|
|
if err != nil {
|
|
return 0, 0, nil, err
|
|
}
|
|
groupPath, err := user.GetGroupPath()
|
|
if err != nil {
|
|
return 0, 0, nil, err
|
|
}
|
|
passwdFile, err := readUserFile(c, passwdPath)
|
|
if err == nil {
|
|
defer passwdFile.Close()
|
|
}
|
|
groupFile, err := readUserFile(c, groupPath)
|
|
if err == nil {
|
|
defer groupFile.Close()
|
|
}
|
|
|
|
execUser, err := user.GetExecUser(username, nil, passwdFile, groupFile)
|
|
if err != nil {
|
|
return 0, 0, nil, err
|
|
}
|
|
|
|
// todo: fix this double read by a change to libcontainer/user pkg
|
|
groupFile, err = readUserFile(c, groupPath)
|
|
if err == nil {
|
|
defer groupFile.Close()
|
|
}
|
|
var addGroups []int
|
|
if len(c.HostConfig.GroupAdd) > 0 {
|
|
addGroups, err = user.GetAdditionalGroups(c.HostConfig.GroupAdd, groupFile)
|
|
if err != nil {
|
|
return 0, 0, nil, err
|
|
}
|
|
}
|
|
uid := uint32(execUser.Uid)
|
|
gid := uint32(execUser.Gid)
|
|
sgids := append(execUser.Sgids, addGroups...)
|
|
var additionalGids []uint32
|
|
for _, g := range sgids {
|
|
additionalGids = append(additionalGids, uint32(g))
|
|
}
|
|
return uid, gid, additionalGids, nil
|
|
}
|
|
|
|
func setNamespace(s *specs.Spec, ns specs.LinuxNamespace) {
|
|
for i, n := range s.Linux.Namespaces {
|
|
if n.Type == ns.Type {
|
|
s.Linux.Namespaces[i] = ns
|
|
return
|
|
}
|
|
}
|
|
s.Linux.Namespaces = append(s.Linux.Namespaces, ns)
|
|
}
|
|
|
|
func setCapabilities(s *specs.Spec, c *container.Container) error {
|
|
var caplist []string
|
|
var err error
|
|
if c.HostConfig.Privileged {
|
|
caplist = caps.GetAllCapabilities()
|
|
} else {
|
|
caplist, err = caps.TweakCapabilities(s.Process.Capabilities.Bounding, c.HostConfig.CapAdd, c.HostConfig.CapDrop)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
s.Process.Capabilities.Effective = caplist
|
|
s.Process.Capabilities.Bounding = caplist
|
|
s.Process.Capabilities.Permitted = caplist
|
|
s.Process.Capabilities.Inheritable = caplist
|
|
// setUser has already been executed here
|
|
// if non root drop capabilities in the way execve does
|
|
if s.Process.User.UID != 0 {
|
|
s.Process.Capabilities.Effective = []string{}
|
|
s.Process.Capabilities.Permitted = []string{}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func setNamespaces(daemon *Daemon, s *specs.Spec, c *container.Container) error {
|
|
userNS := false
|
|
// user
|
|
if c.HostConfig.UsernsMode.IsPrivate() {
|
|
uidMap := daemon.idMappings.UIDs()
|
|
if uidMap != nil {
|
|
userNS = true
|
|
ns := specs.LinuxNamespace{Type: "user"}
|
|
setNamespace(s, ns)
|
|
s.Linux.UIDMappings = specMapping(uidMap)
|
|
s.Linux.GIDMappings = specMapping(daemon.idMappings.GIDs())
|
|
}
|
|
}
|
|
// network
|
|
if !c.Config.NetworkDisabled {
|
|
ns := specs.LinuxNamespace{Type: "network"}
|
|
parts := strings.SplitN(string(c.HostConfig.NetworkMode), ":", 2)
|
|
if parts[0] == "container" {
|
|
nc, err := daemon.getNetworkedContainer(c.ID, c.HostConfig.NetworkMode.ConnectedContainer())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
ns.Path = fmt.Sprintf("/proc/%d/ns/net", nc.State.GetPID())
|
|
if userNS {
|
|
// to share a net namespace, they must also share a user namespace
|
|
nsUser := specs.LinuxNamespace{Type: "user"}
|
|
nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", nc.State.GetPID())
|
|
setNamespace(s, nsUser)
|
|
}
|
|
} else if c.HostConfig.NetworkMode.IsHost() {
|
|
ns.Path = c.NetworkSettings.SandboxKey
|
|
}
|
|
setNamespace(s, ns)
|
|
}
|
|
|
|
// ipc
|
|
ipcMode := c.HostConfig.IpcMode
|
|
switch {
|
|
case ipcMode.IsContainer():
|
|
ns := specs.LinuxNamespace{Type: "ipc"}
|
|
ic, err := daemon.getIpcContainer(ipcMode.Container())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
ns.Path = fmt.Sprintf("/proc/%d/ns/ipc", ic.State.GetPID())
|
|
setNamespace(s, ns)
|
|
if userNS {
|
|
// to share an IPC namespace, they must also share a user namespace
|
|
nsUser := specs.LinuxNamespace{Type: "user"}
|
|
nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", ic.State.GetPID())
|
|
setNamespace(s, nsUser)
|
|
}
|
|
case ipcMode.IsHost():
|
|
oci.RemoveNamespace(s, specs.LinuxNamespaceType("ipc"))
|
|
case ipcMode.IsEmpty():
|
|
// A container was created by an older version of the daemon.
|
|
// The default behavior used to be what is now called "shareable".
|
|
fallthrough
|
|
case ipcMode.IsPrivate(), ipcMode.IsShareable(), ipcMode.IsNone():
|
|
ns := specs.LinuxNamespace{Type: "ipc"}
|
|
setNamespace(s, ns)
|
|
default:
|
|
return fmt.Errorf("Invalid IPC mode: %v", ipcMode)
|
|
}
|
|
|
|
// pid
|
|
if c.HostConfig.PidMode.IsContainer() {
|
|
ns := specs.LinuxNamespace{Type: "pid"}
|
|
pc, err := daemon.getPidContainer(c)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
ns.Path = fmt.Sprintf("/proc/%d/ns/pid", pc.State.GetPID())
|
|
setNamespace(s, ns)
|
|
if userNS {
|
|
// to share a PID namespace, they must also share a user namespace
|
|
nsUser := specs.LinuxNamespace{Type: "user"}
|
|
nsUser.Path = fmt.Sprintf("/proc/%d/ns/user", pc.State.GetPID())
|
|
setNamespace(s, nsUser)
|
|
}
|
|
} else if c.HostConfig.PidMode.IsHost() {
|
|
oci.RemoveNamespace(s, specs.LinuxNamespaceType("pid"))
|
|
} else {
|
|
ns := specs.LinuxNamespace{Type: "pid"}
|
|
setNamespace(s, ns)
|
|
}
|
|
// uts
|
|
if c.HostConfig.UTSMode.IsHost() {
|
|
oci.RemoveNamespace(s, specs.LinuxNamespaceType("uts"))
|
|
s.Hostname = ""
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func specMapping(s []idtools.IDMap) []specs.LinuxIDMapping {
|
|
var ids []specs.LinuxIDMapping
|
|
for _, item := range s {
|
|
ids = append(ids, specs.LinuxIDMapping{
|
|
HostID: uint32(item.HostID),
|
|
ContainerID: uint32(item.ContainerID),
|
|
Size: uint32(item.Size),
|
|
})
|
|
}
|
|
return ids
|
|
}
|
|
|
|
func getMountInfo(mountinfo []*mount.Info, dir string) *mount.Info {
|
|
for _, m := range mountinfo {
|
|
if m.Mountpoint == dir {
|
|
return m
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Get the source mount point of directory passed in as argument. Also return
|
|
// optional fields.
|
|
func getSourceMount(source string) (string, string, error) {
|
|
// Ensure any symlinks are resolved.
|
|
sourcePath, err := filepath.EvalSymlinks(source)
|
|
if err != nil {
|
|
return "", "", err
|
|
}
|
|
|
|
mountinfos, err := mount.GetMounts()
|
|
if err != nil {
|
|
return "", "", err
|
|
}
|
|
|
|
mountinfo := getMountInfo(mountinfos, sourcePath)
|
|
if mountinfo != nil {
|
|
return sourcePath, mountinfo.Optional, nil
|
|
}
|
|
|
|
path := sourcePath
|
|
for {
|
|
path = filepath.Dir(path)
|
|
|
|
mountinfo = getMountInfo(mountinfos, path)
|
|
if mountinfo != nil {
|
|
return path, mountinfo.Optional, nil
|
|
}
|
|
|
|
if path == "/" {
|
|
break
|
|
}
|
|
}
|
|
|
|
// If we are here, we did not find parent mount. Something is wrong.
|
|
return "", "", fmt.Errorf("Could not find source mount of %s", source)
|
|
}
|
|
|
|
// sharedPropagationOption is the prefix searched for in a mount's optional
// fields to decide that the mount is shared.
const sharedPropagationOption = "shared:"

// slavePropagationOption is the prefix searched for in a mount's optional
// fields to decide that the mount is a slave mount.
const slavePropagationOption = "master:"
|
|
|
|
// hasMountinfoOption reports whether any space-separated field of opts starts
// with one of the given prefixes in vals.
func hasMountinfoOption(opts string, vals ...string) bool {
	fields := strings.Split(opts, " ")
	for _, prefix := range vals {
		for _, field := range fields {
			if strings.HasPrefix(field, prefix) {
				return true
			}
		}
	}
	return false
}
|
|
|
|
// Ensure mount point on which path is mounted, is shared.
|
|
func ensureShared(path string) error {
|
|
sourceMount, optionalOpts, err := getSourceMount(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
// Make sure source mount point is shared.
|
|
if !hasMountinfoOption(optionalOpts, sharedPropagationOption) {
|
|
return errors.Errorf("path %s is mounted on %s but it is not a shared mount", path, sourceMount)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Ensure mount point on which path is mounted, is either shared or slave.
|
|
func ensureSharedOrSlave(path string) error {
|
|
sourceMount, optionalOpts, err := getSourceMount(path)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if !hasMountinfoOption(optionalOpts, sharedPropagationOption, slavePropagationOption) {
|
|
return errors.Errorf("path %s is mounted on %s but it is not a shared or slave mount", path, sourceMount)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// Get the set of mount flags that are set on the mount that contains the given
|
|
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
|
|
// bind-mounting "with options" will not fail with user namespaces, due to
|
|
// kernel restrictions that require user namespace mounts to preserve
|
|
// CL_UNPRIVILEGED locked flags.
|
|
func getUnprivilegedMountFlags(path string) ([]string, error) {
|
|
var statfs unix.Statfs_t
|
|
if err := unix.Statfs(path, &statfs); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
|
|
unprivilegedFlags := map[uint64]string{
|
|
unix.MS_RDONLY: "ro",
|
|
unix.MS_NODEV: "nodev",
|
|
unix.MS_NOEXEC: "noexec",
|
|
unix.MS_NOSUID: "nosuid",
|
|
unix.MS_NOATIME: "noatime",
|
|
unix.MS_RELATIME: "relatime",
|
|
unix.MS_NODIRATIME: "nodiratime",
|
|
}
|
|
|
|
var flags []string
|
|
for mask, flag := range unprivilegedFlags {
|
|
if uint64(statfs.Flags)&mask == mask {
|
|
flags = append(flags, flag)
|
|
}
|
|
}
|
|
|
|
return flags, nil
|
|
}
|
|
|
|
var (
|
|
mountPropagationMap = map[string]int{
|
|
"private": mount.PRIVATE,
|
|
"rprivate": mount.RPRIVATE,
|
|
"shared": mount.SHARED,
|
|
"rshared": mount.RSHARED,
|
|
"slave": mount.SLAVE,
|
|
"rslave": mount.RSLAVE,
|
|
}
|
|
|
|
mountPropagationReverseMap = map[int]string{
|
|
mount.PRIVATE: "private",
|
|
mount.RPRIVATE: "rprivate",
|
|
mount.SHARED: "shared",
|
|
mount.RSHARED: "rshared",
|
|
mount.SLAVE: "slave",
|
|
mount.RSLAVE: "rslave",
|
|
}
|
|
)
|
|
|
|
// inSlice reports whether s is an element of slice. Elements are compared
// with ==, so the match is case sensitive.
func inSlice(slice []string, s string) bool {
	for i := 0; i < len(slice); i++ {
		if slice[i] == s {
			return true
		}
	}
	return false
}
|
|
|
|
func setMounts(daemon *Daemon, s *specs.Spec, c *container.Container, mounts []container.Mount) error {
|
|
userMounts := make(map[string]struct{})
|
|
for _, m := range mounts {
|
|
userMounts[m.Destination] = struct{}{}
|
|
}
|
|
|
|
// Copy all mounts from spec to defaultMounts, except for
|
|
// - mounts overriden by a user supplied mount;
|
|
// - all mounts under /dev if a user supplied /dev is present;
|
|
// - /dev/shm, in case IpcMode is none.
|
|
// While at it, also
|
|
// - set size for /dev/shm from shmsize.
|
|
var defaultMounts []specs.Mount
|
|
_, mountDev := userMounts["/dev"]
|
|
for _, m := range s.Mounts {
|
|
if _, ok := userMounts[m.Destination]; ok {
|
|
// filter out mount overridden by a user supplied mount
|
|
continue
|
|
}
|
|
if mountDev && strings.HasPrefix(m.Destination, "/dev/") {
|
|
// filter out everything under /dev if /dev is user-mounted
|
|
continue
|
|
}
|
|
|
|
if m.Destination == "/dev/shm" {
|
|
if c.HostConfig.IpcMode.IsNone() {
|
|
// filter out /dev/shm for "none" IpcMode
|
|
continue
|
|
}
|
|
// set size for /dev/shm mount from spec
|
|
sizeOpt := "size=" + strconv.FormatInt(c.HostConfig.ShmSize, 10)
|
|
m.Options = append(m.Options, sizeOpt)
|
|
}
|
|
|
|
defaultMounts = append(defaultMounts, m)
|
|
}
|
|
|
|
s.Mounts = defaultMounts
|
|
for _, m := range mounts {
|
|
for _, cm := range s.Mounts {
|
|
if cm.Destination == m.Destination {
|
|
return duplicateMountPointError(m.Destination)
|
|
}
|
|
}
|
|
|
|
if m.Source == "tmpfs" {
|
|
data := m.Data
|
|
parser := volume.NewParser("linux")
|
|
options := []string{"noexec", "nosuid", "nodev", string(parser.DefaultPropagationMode())}
|
|
if data != "" {
|
|
options = append(options, strings.Split(data, ",")...)
|
|
}
|
|
|
|
merged, err := mount.MergeTmpfsOptions(options)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
s.Mounts = append(s.Mounts, specs.Mount{Destination: m.Destination, Source: m.Source, Type: "tmpfs", Options: merged})
|
|
continue
|
|
}
|
|
|
|
mt := specs.Mount{Destination: m.Destination, Source: m.Source, Type: "bind"}
|
|
|
|
// Determine property of RootPropagation based on volume
|
|
// properties. If a volume is shared, then keep root propagation
|
|
// shared. This should work for slave and private volumes too.
|
|
//
|
|
// For slave volumes, it can be either [r]shared/[r]slave.
|
|
//
|
|
// For private volumes any root propagation value should work.
|
|
pFlag := mountPropagationMap[m.Propagation]
|
|
switch pFlag {
|
|
case mount.SHARED, mount.RSHARED:
|
|
if err := ensureShared(m.Source); err != nil {
|
|
return err
|
|
}
|
|
rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
|
|
if rootpg != mount.SHARED && rootpg != mount.RSHARED {
|
|
s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.SHARED]
|
|
}
|
|
case mount.SLAVE, mount.RSLAVE:
|
|
var fallback bool
|
|
if err := ensureSharedOrSlave(m.Source); err != nil {
|
|
// For backwards compatability purposes, treat mounts from the daemon root
|
|
// as special since we automatically add rslave propagation to these mounts
|
|
// when the user did not set anything, so we should fallback to the old
|
|
// behavior which is to use private propagation which is normally the
|
|
// default.
|
|
if !strings.HasPrefix(m.Source, daemon.root) && !strings.HasPrefix(daemon.root, m.Source) {
|
|
return err
|
|
}
|
|
|
|
cm, ok := c.MountPoints[m.Destination]
|
|
if !ok {
|
|
return err
|
|
}
|
|
if cm.Spec.BindOptions != nil && cm.Spec.BindOptions.Propagation != "" {
|
|
// This means the user explicitly set a propagation, do not fallback in that case.
|
|
return err
|
|
}
|
|
fallback = true
|
|
logrus.WithField("container", c.ID).WithField("source", m.Source).Warn("Falling back to default propagation for bind source in daemon root")
|
|
}
|
|
if !fallback {
|
|
rootpg := mountPropagationMap[s.Linux.RootfsPropagation]
|
|
if rootpg != mount.SHARED && rootpg != mount.RSHARED && rootpg != mount.SLAVE && rootpg != mount.RSLAVE {
|
|
s.Linux.RootfsPropagation = mountPropagationReverseMap[mount.RSLAVE]
|
|
}
|
|
}
|
|
}
|
|
|
|
opts := []string{"rbind"}
|
|
if !m.Writable {
|
|
opts = append(opts, "ro")
|
|
}
|
|
if pFlag != 0 {
|
|
opts = append(opts, mountPropagationReverseMap[pFlag])
|
|
}
|
|
|
|
// If we are using user namespaces, then we must make sure that we
|
|
// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
|
|
// "mount" when we bind-mount. The reason for this is that at the point
|
|
// when runc sets up the root filesystem, it is already inside a user
|
|
// namespace, and thus cannot change any flags that are locked.
|
|
if daemon.configStore.RemappedRoot != "" {
|
|
unprivOpts, err := getUnprivilegedMountFlags(m.Source)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
opts = append(opts, unprivOpts...)
|
|
}
|
|
|
|
mt.Options = opts
|
|
s.Mounts = append(s.Mounts, mt)
|
|
}
|
|
|
|
if s.Root.Readonly {
|
|
for i, m := range s.Mounts {
|
|
switch m.Destination {
|
|
case "/proc", "/dev/pts", "/dev/shm", "/dev/mqueue", "/dev":
|
|
continue
|
|
}
|
|
if _, ok := userMounts[m.Destination]; !ok {
|
|
if !inSlice(m.Options, "ro") {
|
|
s.Mounts[i].Options = append(s.Mounts[i].Options, "ro")
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if c.HostConfig.Privileged {
|
|
if !s.Root.Readonly {
|
|
// clear readonly for /sys
|
|
for i := range s.Mounts {
|
|
if s.Mounts[i].Destination == "/sys" {
|
|
clearReadOnly(&s.Mounts[i])
|
|
}
|
|
}
|
|
}
|
|
s.Linux.ReadonlyPaths = nil
|
|
s.Linux.MaskedPaths = nil
|
|
}
|
|
|
|
// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
|
|
// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
|
|
if uidMap := daemon.idMappings.UIDs(); uidMap != nil || c.HostConfig.Privileged {
|
|
for i, m := range s.Mounts {
|
|
if m.Type == "cgroup" {
|
|
clearReadOnly(&s.Mounts[i])
|
|
}
|
|
}
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func (daemon *Daemon) populateCommonSpec(s *specs.Spec, c *container.Container) error {
|
|
if c.BaseFS == nil {
|
|
return errors.New("populateCommonSpec: BaseFS of container " + c.ID + " is unexpectedly nil")
|
|
}
|
|
linkedEnv, err := daemon.setupLinkedContainers(c)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
s.Root = &specs.Root{
|
|
Path: c.BaseFS.Path(),
|
|
Readonly: c.HostConfig.ReadonlyRootfs,
|
|
}
|
|
if err := c.SetupWorkingDirectory(daemon.idMappings.RootPair()); err != nil {
|
|
return err
|
|
}
|
|
cwd := c.Config.WorkingDir
|
|
if len(cwd) == 0 {
|
|
cwd = "/"
|
|
}
|
|
s.Process.Args = append([]string{c.Path}, c.Args...)
|
|
|
|
// only add the custom init if it is specified and the container is running in its
|
|
// own private pid namespace. It does not make sense to add if it is running in the
|
|
// host namespace or another container's pid namespace where we already have an init
|
|
if c.HostConfig.PidMode.IsPrivate() {
|
|
if (c.HostConfig.Init != nil && *c.HostConfig.Init) ||
|
|
(c.HostConfig.Init == nil && daemon.configStore.Init) {
|
|
s.Process.Args = append([]string{"/dev/init", "--", c.Path}, c.Args...)
|
|
var path string
|
|
if daemon.configStore.InitPath == "" {
|
|
path, err = exec.LookPath(daemonconfig.DefaultInitBinary)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
if daemon.configStore.InitPath != "" {
|
|
path = daemon.configStore.InitPath
|
|
}
|
|
s.Mounts = append(s.Mounts, specs.Mount{
|
|
Destination: "/dev/init",
|
|
Type: "bind",
|
|
Source: path,
|
|
Options: []string{"bind", "ro"},
|
|
})
|
|
}
|
|
}
|
|
s.Process.Cwd = cwd
|
|
s.Process.Env = c.CreateDaemonEnvironment(c.Config.Tty, linkedEnv)
|
|
s.Process.Terminal = c.Config.Tty
|
|
s.Hostname = c.FullHostname()
|
|
|
|
return nil
|
|
}
|
|
|
|
func (daemon *Daemon) createSpec(c *container.Container) (retSpec *specs.Spec, err error) {
|
|
s := oci.DefaultSpec()
|
|
if err := daemon.populateCommonSpec(&s, c); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
var cgroupsPath string
|
|
scopePrefix := "docker"
|
|
parent := "/docker"
|
|
useSystemd := UsingSystemd(daemon.configStore)
|
|
if useSystemd {
|
|
parent = "system.slice"
|
|
}
|
|
|
|
if c.HostConfig.CgroupParent != "" {
|
|
parent = c.HostConfig.CgroupParent
|
|
} else if daemon.configStore.CgroupParent != "" {
|
|
parent = daemon.configStore.CgroupParent
|
|
}
|
|
|
|
if useSystemd {
|
|
cgroupsPath = parent + ":" + scopePrefix + ":" + c.ID
|
|
logrus.Debugf("createSpec: cgroupsPath: %s", cgroupsPath)
|
|
} else {
|
|
cgroupsPath = filepath.Join(parent, c.ID)
|
|
}
|
|
s.Linux.CgroupsPath = cgroupsPath
|
|
|
|
if err := setResources(&s, c.HostConfig.Resources); err != nil {
|
|
return nil, fmt.Errorf("linux runtime spec resources: %v", err)
|
|
}
|
|
s.Linux.Sysctl = c.HostConfig.Sysctls
|
|
|
|
p := s.Linux.CgroupsPath
|
|
if useSystemd {
|
|
initPath, err := cgroups.GetInitCgroup("cpu")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
_, err = cgroups.GetOwnCgroup("cpu")
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
p = filepath.Join(initPath, s.Linux.CgroupsPath)
|
|
}
|
|
|
|
// Clean path to guard against things like ../../../BAD
|
|
parentPath := filepath.Dir(p)
|
|
if !filepath.IsAbs(parentPath) {
|
|
parentPath = filepath.Clean("/" + parentPath)
|
|
}
|
|
|
|
if err := daemon.initCgroupsPath(parentPath); err != nil {
|
|
return nil, fmt.Errorf("linux init cgroups path: %v", err)
|
|
}
|
|
if err := setDevices(&s, c); err != nil {
|
|
return nil, fmt.Errorf("linux runtime spec devices: %v", err)
|
|
}
|
|
if err := daemon.setRlimits(&s, c); err != nil {
|
|
return nil, fmt.Errorf("linux runtime spec rlimits: %v", err)
|
|
}
|
|
if err := setUser(&s, c); err != nil {
|
|
return nil, fmt.Errorf("linux spec user: %v", err)
|
|
}
|
|
if err := setNamespaces(daemon, &s, c); err != nil {
|
|
return nil, fmt.Errorf("linux spec namespaces: %v", err)
|
|
}
|
|
if err := setCapabilities(&s, c); err != nil {
|
|
return nil, fmt.Errorf("linux spec capabilities: %v", err)
|
|
}
|
|
if err := setSeccomp(daemon, &s, c); err != nil {
|
|
return nil, fmt.Errorf("linux seccomp: %v", err)
|
|
}
|
|
|
|
if err := daemon.setupContainerMountsRoot(c); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if err := daemon.setupIpcDirs(c); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
defer func() {
|
|
if err != nil {
|
|
daemon.cleanupSecretDir(c)
|
|
}
|
|
}()
|
|
|
|
if err := daemon.setupSecretDir(c); err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
ms, err := daemon.setupMounts(c)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if !c.HostConfig.IpcMode.IsPrivate() && !c.HostConfig.IpcMode.IsEmpty() {
|
|
ms = append(ms, c.IpcMounts()...)
|
|
}
|
|
|
|
tmpfsMounts, err := c.TmpfsMounts()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
ms = append(ms, tmpfsMounts...)
|
|
|
|
secretMounts, err := c.SecretMounts()
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
ms = append(ms, secretMounts...)
|
|
|
|
sort.Sort(mounts(ms))
|
|
if err := setMounts(daemon, &s, c, ms); err != nil {
|
|
return nil, fmt.Errorf("linux mounts: %v", err)
|
|
}
|
|
|
|
for _, ns := range s.Linux.Namespaces {
|
|
if ns.Type == "network" && ns.Path == "" && !c.Config.NetworkDisabled {
|
|
target := filepath.Join("/proc", strconv.Itoa(os.Getpid()), "exe")
|
|
s.Hooks = &specs.Hooks{
|
|
Prestart: []specs.Hook{{
|
|
Path: target,
|
|
Args: []string{"libnetwork-setkey", c.ID, daemon.netController.ID()},
|
|
}},
|
|
}
|
|
}
|
|
}
|
|
|
|
if apparmor.IsEnabled() {
|
|
var appArmorProfile string
|
|
if c.AppArmorProfile != "" {
|
|
appArmorProfile = c.AppArmorProfile
|
|
} else if c.HostConfig.Privileged {
|
|
appArmorProfile = "unconfined"
|
|
} else {
|
|
appArmorProfile = "docker-default"
|
|
}
|
|
|
|
if appArmorProfile == "docker-default" {
|
|
// Unattended upgrades and other fun services can unload AppArmor
|
|
// profiles inadvertently. Since we cannot store our profile in
|
|
// /etc/apparmor.d, nor can we practically add other ways of
|
|
// telling the system to keep our profile loaded, in order to make
|
|
// sure that we keep the default profile enabled we dynamically
|
|
// reload it if necessary.
|
|
if err := ensureDefaultAppArmorProfile(); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
|
|
s.Process.ApparmorProfile = appArmorProfile
|
|
}
|
|
s.Process.SelinuxLabel = c.GetProcessLabel()
|
|
s.Process.NoNewPrivileges = c.NoNewPrivileges
|
|
s.Process.OOMScoreAdj = &c.HostConfig.OomScoreAdj
|
|
s.Linux.MountLabel = c.MountLabel
|
|
|
|
return &s, nil
|
|
}
|
|
|
|
func clearReadOnly(m *specs.Mount) {
|
|
var opt []string
|
|
for _, o := range m.Options {
|
|
if o != "ro" {
|
|
opt = append(opt, o)
|
|
}
|
|
}
|
|
m.Options = opt
|
|
}
|
|
|
|
// mergeUlimits merge the Ulimits from HostConfig with daemon defaults, and update HostConfig
|
|
func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
|
|
ulimits := c.Ulimits
|
|
// Merge ulimits with daemon defaults
|
|
ulIdx := make(map[string]struct{})
|
|
for _, ul := range ulimits {
|
|
ulIdx[ul.Name] = struct{}{}
|
|
}
|
|
for name, ul := range daemon.configStore.Ulimits {
|
|
if _, exists := ulIdx[name]; !exists {
|
|
ulimits = append(ulimits, ul)
|
|
}
|
|
}
|
|
c.Ulimits = ulimits
|
|
}
|