2021-08-23 09:14:53 -04:00
//go:build linux || freebsd
2015-08-07 12:33:29 -04:00
// +build linux freebsd
2015-05-15 19:34:26 -04:00
2018-02-05 16:05:59 -05:00
package daemon // import "github.com/docker/docker/daemon"
2015-05-15 19:34:26 -04:00
import (
2016-10-04 15:35:56 -04:00
"bufio"
2017-09-22 09:52:41 -04:00
"context"
2015-05-15 19:34:26 -04:00
"fmt"
"net"
"os"
"path/filepath"
2016-01-07 22:43:11 -05:00
"runtime"
2015-12-02 05:26:30 -05:00
"runtime/debug"
2015-10-10 12:43:03 -04:00
"strconv"
2015-05-15 19:34:26 -04:00
"strings"
2020-11-09 09:21:27 -05:00
"sync"
2016-11-01 13:12:29 -04:00
"time"
2015-05-15 19:34:26 -04:00
2020-11-09 09:00:32 -05:00
"github.com/containerd/cgroups"
2019-11-01 11:18:06 -04:00
statsV1 "github.com/containerd/cgroups/stats/v1"
2020-03-09 17:40:34 -04:00
statsV2 "github.com/containerd/cgroups/v2/stats"
2021-06-18 05:01:24 -04:00
"github.com/containerd/containerd/pkg/userns"
2016-09-06 14:18:12 -04:00
"github.com/docker/docker/api/types"
"github.com/docker/docker/api/types/blkiodev"
pblkiodev "github.com/docker/docker/api/types/blkiodev"
containertypes "github.com/docker/docker/api/types/container"
2015-11-12 14:55:17 -05:00
"github.com/docker/docker/container"
2017-01-23 06:23:07 -05:00
"github.com/docker/docker/daemon/config"
2018-02-13 14:29:14 -05:00
"github.com/docker/docker/daemon/initlayer"
2019-09-02 17:39:24 -04:00
"github.com/docker/docker/errdefs"
2021-02-26 18:23:55 -05:00
"github.com/docker/docker/libcontainerd/remote"
2021-05-27 20:15:56 -04:00
"github.com/docker/docker/libnetwork"
nwconfig "github.com/docker/docker/libnetwork/config"
"github.com/docker/docker/libnetwork/drivers/bridge"
"github.com/docker/docker/libnetwork/netlabel"
"github.com/docker/docker/libnetwork/netutils"
"github.com/docker/docker/libnetwork/options"
lntypes "github.com/docker/docker/libnetwork/types"
2016-12-23 14:09:12 -05:00
"github.com/docker/docker/opts"
2015-10-08 11:51:41 -04:00
"github.com/docker/docker/pkg/idtools"
2016-01-22 21:15:09 -05:00
"github.com/docker/docker/pkg/parsers"
2015-05-15 19:34:26 -04:00
"github.com/docker/docker/pkg/parsers/kernel"
2015-08-06 07:54:48 -04:00
"github.com/docker/docker/pkg/sysinfo"
2015-05-15 19:34:26 -04:00
"github.com/docker/docker/runconfig"
2018-04-17 16:50:28 -04:00
volumemounts "github.com/docker/docker/volume/mounts"
2020-03-13 19:38:24 -04:00
"github.com/moby/sys/mount"
2019-08-09 12:34:35 -04:00
specs "github.com/opencontainers/runtime-spec/specs-go"
2020-12-14 05:46:58 -05:00
"github.com/opencontainers/selinux/go-selinux"
2017-04-18 09:26:36 -04:00
"github.com/opencontainers/selinux/go-selinux/label"
2016-09-17 01:46:20 -04:00
"github.com/pkg/errors"
2017-07-26 17:42:13 -04:00
"github.com/sirupsen/logrus"
2016-09-27 16:16:00 -04:00
"github.com/vishvananda/netlink"
2017-07-27 03:51:23 -04:00
"golang.org/x/sys/unix"
2015-05-15 19:34:26 -04:00
)
2015-08-05 20:15:14 -04:00
const (
	// Platform flags for this (non-Windows) build of the daemon.
	isWindows         = false
	platformSupported = true

	// Bounds applied to requested CPU shares.
	// See https://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/tree/kernel/sched/sched.h?id=8cd9234c64c584432f6992fe944ca9e46ca8ea76#n269
	linuxMinCPUShares = 2
	linuxMaxCPUShares = 1 << 18 // 262144

	// linuxMinMemory is the smallest memory limit accepted for a container
	// (6 MiB). It is not a kernel limit: it accounts for the runtime's memory
	// overhead during container startup and keeps the container reasonably
	// functional. Rejecting lower values up front yields a clear
	// "minimum memory limit" error instead of an obscure runtime failure.
	linuxMinMemory = 6 << 20 // 6291456

	// Constants for remapped root settings.
	defaultIDSpecifier = "default"
	defaultRemappedID  = "dockremap"

	// Names of the cgroup drivers.
	cgroupFsDriver      = "cgroupfs"
	cgroupSystemdDriver = "systemd"
	cgroupNoneDriver    = "none"
)
2017-08-01 15:04:37 -04:00
// containerGetter is the minimal lookup capability required by
// adaptSharedNamespaceContainer: given a container reference as a string, it
// returns the matching container or an error.
type containerGetter interface {
GetContainer ( string ) ( * container . Container , error )
}
2017-04-27 17:52:47 -04:00
func getMemoryResources ( config containertypes . Resources ) * specs . LinuxMemory {
memory := specs . LinuxMemory { }
2016-03-18 14:50:19 -04:00
if config . Memory > 0 {
2017-08-01 11:51:24 -04:00
memory . Limit = & config . Memory
2016-03-18 14:50:19 -04:00
}
if config . MemoryReservation > 0 {
2017-08-01 11:51:24 -04:00
memory . Reservation = & config . MemoryReservation
2016-03-18 14:50:19 -04:00
}
2017-04-27 17:52:47 -04:00
if config . MemorySwap > 0 {
2017-08-01 11:51:24 -04:00
memory . Swap = & config . MemorySwap
2016-03-18 14:50:19 -04:00
}
if config . MemorySwappiness != nil {
swappiness := uint64 ( * config . MemorySwappiness )
memory . Swappiness = & swappiness
}
2018-02-04 21:05:57 -05:00
if config . OomKillDisable != nil {
memory . DisableOOMKiller = config . OomKillDisable
}
2016-03-18 14:50:19 -04:00
if config . KernelMemory != 0 {
2017-08-01 11:51:24 -04:00
memory . Kernel = & config . KernelMemory
2016-03-18 14:50:19 -04:00
}
2018-05-11 15:46:11 -04:00
if config . KernelMemoryTCP != 0 {
memory . KernelTCP = & config . KernelMemoryTCP
}
2016-03-18 14:50:19 -04:00
return & memory
}
2017-04-11 07:28:13 -04:00
func getPidsLimit ( config containertypes . Resources ) * specs . LinuxPids {
2019-02-24 09:36:45 -05:00
if config . PidsLimit == nil {
return nil
}
if * config . PidsLimit <= 0 {
// docker API allows 0 and negative values to unset this to be consistent
// with default values. When updating values, runc requires -1 to unset
// the previous limit.
return & specs . LinuxPids { Limit : - 1 }
2017-04-11 07:28:13 -04:00
}
2019-02-24 09:36:45 -05:00
return & specs . LinuxPids { Limit : * config . PidsLimit }
2017-04-11 07:28:13 -04:00
}
2017-04-27 17:52:47 -04:00
func getCPUResources ( config containertypes . Resources ) ( * specs . LinuxCPU , error ) {
cpu := specs . LinuxCPU { }
2016-03-18 14:50:19 -04:00
2017-04-27 17:52:47 -04:00
if config . CPUShares < 0 {
return nil , fmt . Errorf ( "shares: invalid argument" )
}
if config . CPUShares >= 0 {
2016-03-18 14:50:19 -04:00
shares := uint64 ( config . CPUShares )
cpu . Shares = & shares
}
if config . CpusetCpus != "" {
2017-04-27 17:52:47 -04:00
cpu . Cpus = config . CpusetCpus
2016-03-18 14:50:19 -04:00
}
if config . CpusetMems != "" {
2017-04-27 17:52:47 -04:00
cpu . Mems = config . CpusetMems
2016-03-18 14:50:19 -04:00
}
2016-11-01 13:12:29 -04:00
if config . NanoCPUs > 0 {
// https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt
2016-11-17 19:39:47 -05:00
period := uint64 ( 100 * time . Millisecond / time . Microsecond )
2017-04-27 17:52:47 -04:00
quota := config . NanoCPUs * int64 ( period ) / 1e9
2016-11-01 13:12:29 -04:00
cpu . Period = & period
cpu . Quota = & quota
}
2016-03-18 14:50:19 -04:00
if config . CPUPeriod != 0 {
period := uint64 ( config . CPUPeriod )
cpu . Period = & period
}
if config . CPUQuota != 0 {
2017-04-27 17:52:47 -04:00
q := config . CPUQuota
cpu . Quota = & q
2016-03-18 14:50:19 -04:00
}
2016-06-07 15:05:43 -04:00
if config . CPURealtimePeriod != 0 {
period := uint64 ( config . CPURealtimePeriod )
cpu . RealtimePeriod = & period
}
if config . CPURealtimeRuntime != 0 {
2017-04-27 17:52:47 -04:00
c := config . CPURealtimeRuntime
cpu . RealtimeRuntime = & c
2016-06-07 15:05:43 -04:00
}
2017-04-27 17:52:47 -04:00
return & cpu , nil
2016-03-18 14:50:19 -04:00
}
2017-04-27 17:52:47 -04:00
func getBlkioWeightDevices ( config containertypes . Resources ) ( [ ] specs . LinuxWeightDevice , error ) {
2017-07-27 03:51:23 -04:00
var stat unix . Stat_t
2017-04-27 17:52:47 -04:00
var blkioWeightDevices [ ] specs . LinuxWeightDevice
2015-06-11 20:34:20 -04:00
for _ , weightDevice := range config . BlkioWeightDevice {
2017-07-27 03:51:23 -04:00
if err := unix . Stat ( weightDevice . Path , & stat ) ; err != nil {
2020-08-08 20:27:43 -04:00
return nil , errors . WithStack ( & os . PathError { Op : "stat" , Path : weightDevice . Path , Err : err } )
2015-06-11 20:34:20 -04:00
}
2016-03-18 14:50:19 -04:00
weight := weightDevice . Weight
2017-04-27 17:52:47 -04:00
d := specs . LinuxWeightDevice { Weight : & weight }
2019-08-01 04:48:48 -04:00
// The type is 32bit on mips.
2021-05-31 05:39:04 -04:00
d . Major = int64 ( unix . Major ( uint64 ( stat . Rdev ) ) ) //nolint: unconvert
d . Minor = int64 ( unix . Minor ( uint64 ( stat . Rdev ) ) ) //nolint: unconvert
2016-03-18 14:50:19 -04:00
blkioWeightDevices = append ( blkioWeightDevices , d )
2015-06-11 20:34:20 -04:00
}
2015-12-14 20:50:16 -05:00
return blkioWeightDevices , nil
2015-06-11 20:34:20 -04:00
}
2017-01-08 20:22:05 -05:00
// parseSecurityOpt seeds the container's NoNewPrivileges flag from the daemon
// configuration and then applies the security options found in hostConfig,
// which may override that default.
func (daemon *Daemon) parseSecurityOpt(container *container.Container, hostConfig *containertypes.HostConfig) error {
	// Daemon-wide default first; per-container options are applied on top.
	container.NoNewPrivileges = daemon.configStore.NoNewPrivileges

	return parseSecurityOpt(container, hostConfig)
}
2015-12-18 13:36:17 -05:00
func parseSecurityOpt ( container * container . Container , config * containertypes . HostConfig ) error {
2015-05-15 19:34:26 -04:00
var (
labelOpts [ ] string
err error
)
for _ , opt := range config . SecurityOpt {
2016-03-15 18:34:29 -04:00
if opt == "no-new-privileges" {
container . NoNewPrivileges = true
2016-10-31 22:35:18 -04:00
continue
}
2017-04-27 17:52:47 -04:00
if opt == "disable" {
labelOpts = append ( labelOpts , "disable" )
continue
}
2016-10-31 22:35:18 -04:00
var con [ ] string
if strings . Contains ( opt , "=" ) {
con = strings . SplitN ( opt , "=" , 2 )
} else if strings . Contains ( opt , ":" ) {
con = strings . SplitN ( opt , ":" , 2 )
2017-02-16 02:22:50 -05:00
logrus . Warn ( "Security options with `:` as a separator are deprecated and will be completely unsupported in 17.04, use `=` instead." )
2016-10-31 22:35:18 -04:00
}
if len ( con ) != 2 {
return fmt . Errorf ( "invalid --security-opt 1: %q" , opt )
}
switch con [ 0 ] {
case "label" :
labelOpts = append ( labelOpts , con [ 1 ] )
case "apparmor" :
container . AppArmorProfile = con [ 1 ]
case "seccomp" :
container . SeccompProfile = con [ 1 ]
2017-01-08 20:22:05 -05:00
case "no-new-privileges" :
noNewPrivileges , err := strconv . ParseBool ( con [ 1 ] )
if err != nil {
return fmt . Errorf ( "invalid --security-opt 2: %q" , opt )
}
container . NoNewPrivileges = noNewPrivileges
2016-10-31 22:35:18 -04:00
default :
return fmt . Errorf ( "invalid --security-opt 2: %q" , opt )
2015-05-15 19:34:26 -04:00
}
}
container . ProcessLabel , container . MountLabel , err = label . InitLabels ( labelOpts )
return err
}
2017-04-27 17:52:47 -04:00
func getBlkioThrottleDevices ( devs [ ] * blkiodev . ThrottleDevice ) ( [ ] specs . LinuxThrottleDevice , error ) {
var throttleDevices [ ] specs . LinuxThrottleDevice
2017-07-27 03:51:23 -04:00
var stat unix . Stat_t
2015-07-08 07:06:48 -04:00
2016-04-29 16:39:04 -04:00
for _ , d := range devs {
2017-07-27 03:51:23 -04:00
if err := unix . Stat ( d . Path , & stat ) ; err != nil {
2020-08-08 20:27:43 -04:00
return nil , errors . WithStack ( & os . PathError { Op : "stat" , Path : d . Path , Err : err } )
2015-07-08 07:06:48 -04:00
}
2017-04-27 17:52:47 -04:00
d := specs . LinuxThrottleDevice { Rate : d . Rate }
2019-08-01 04:48:48 -04:00
// the type is 32bit on mips
2021-05-31 05:39:04 -04:00
d . Major = int64 ( unix . Major ( uint64 ( stat . Rdev ) ) ) //nolint: unconvert
d . Minor = int64 ( unix . Minor ( uint64 ( stat . Rdev ) ) ) //nolint: unconvert
2016-04-29 16:39:04 -04:00
throttleDevices = append ( throttleDevices , d )
2015-07-08 07:06:48 -04:00
}
2016-04-29 16:39:04 -04:00
return throttleDevices , nil
2015-07-08 07:06:48 -04:00
}
2018-12-04 11:44:45 -05:00
// adjustParallelLimit takes a number of objects and a proposed limit and
// figures out if it's reasonable (and adjusts it accordingly). This is only
// used for daemon startup, which does a lot of parallel loading of containers
// (and if we exceed RLIMIT_NOFILE then we're in trouble).
//
// n is the number of objects to load and limit is the proposed parallelism.
// The return value is either limit itself or a smaller value derived from the
// soft RLIMIT_NOFILE. When the caller proposes a positive limit, the result is
// always at least 1, so the caller never ends up with a parallelism of zero.
func adjustParallelLimit(n int, limit int) int {
	// Rule-of-thumb overhead factor (how many files will each goroutine open
	// simultaneously). Yes, this is ugly but to be frank this whole thing is
	// ugly.
	const overhead = 2

	// On Linux, we need to ensure that parallelStartupJobs doesn't cause us to
	// exceed RLIMIT_NOFILE. If parallelStartupJobs is too large, we reduce it
	// and give a warning (since in theory the user should increase their
	// ulimits to the largest possible value for dockerd).
	var rlim unix.Rlimit
	if err := unix.Getrlimit(unix.RLIMIT_NOFILE, &rlim); err != nil {
		logrus.Warnf("Couldn't find dockerd's RLIMIT_NOFILE to double-check startup parallelism factor: %v", err)
		return limit
	}
	softRlimit := int(rlim.Cur)

	// Where rlim.Cur is unsigned, an "unlimited" soft limit (all bits set)
	// wraps to a negative int. That is effectively no limit, so there is
	// nothing to adjust.
	if softRlimit < 0 {
		return limit
	}

	// Much fewer containers than RLIMIT_NOFILE. No need to adjust anything.
	if softRlimit > overhead*n {
		return limit
	}

	// RLIMIT_NOFILE big enough, no need to adjust anything.
	if softRlimit > overhead*limit {
		return limit
	}

	logrus.Warnf("Found dockerd's open file ulimit (%v) is far too small -- consider increasing it significantly (at least %v)", softRlimit, overhead*limit)

	// A soft limit smaller than the overhead factor would yield zero here,
	// which is presumably unusable as a parallelism limit, so keep at least
	// one job while never exceeding what the caller proposed.
	adjusted := softRlimit / overhead
	if adjusted < 1 {
		adjusted = 1
	}
	if adjusted > limit {
		adjusted = limit
	}
	return adjusted
}
2015-07-30 18:28:11 -04:00
// adaptContainerSettings is called during container creation to modify any
// settings necessary in the HostConfig structure.
2015-12-18 13:36:17 -05:00
func ( daemon * Daemon ) adaptContainerSettings ( hostConfig * containertypes . HostConfig , adjustCPUShares bool ) error {
2015-08-05 20:15:14 -04:00
if adjustCPUShares && hostConfig . CPUShares > 0 {
// Handle unsupported CPUShares
if hostConfig . CPUShares < linuxMinCPUShares {
logrus . Warnf ( "Changing requested CPUShares of %d to minimum allowed of %d" , hostConfig . CPUShares , linuxMinCPUShares )
hostConfig . CPUShares = linuxMinCPUShares
} else if hostConfig . CPUShares > linuxMaxCPUShares {
logrus . Warnf ( "Changing requested CPUShares of %d to maximum allowed of %d" , hostConfig . CPUShares , linuxMaxCPUShares )
hostConfig . CPUShares = linuxMaxCPUShares
}
}
2015-07-13 03:17:43 -04:00
if hostConfig . Memory > 0 && hostConfig . MemorySwap == 0 {
// By default, MemorySwap is set to twice the size of Memory.
hostConfig . MemorySwap = hostConfig . Memory * 2
}
2015-12-29 15:49:17 -05:00
if hostConfig . ShmSize == 0 {
2017-01-23 06:23:07 -05:00
hostConfig . ShmSize = config . DefaultShmSize
2016-12-25 04:11:12 -05:00
if daemon . configStore != nil {
hostConfig . ShmSize = int64 ( daemon . configStore . ShmSize )
}
2015-11-26 07:14:09 -05:00
}
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
// Set default IPC mode, if unset for container
if hostConfig . IpcMode . IsEmpty ( ) {
m := config . DefaultIpcMode
if daemon . configStore != nil {
2019-10-12 20:07:36 -04:00
m = containertypes . IpcMode ( daemon . configStore . IpcMode )
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
}
2019-10-12 20:07:36 -04:00
hostConfig . IpcMode = m
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
}
2019-03-14 23:44:18 -04:00
// Set default cgroup namespace mode, if unset for container
if hostConfig . CgroupnsMode . IsEmpty ( ) {
2019-11-05 04:04:21 -05:00
// for cgroup v2: unshare cgroupns even for privileged containers
// https://github.com/containers/libpod/pull/4374#issuecomment-549776387
2020-11-09 09:00:32 -05:00
if hostConfig . Privileged && cgroups . Mode ( ) != cgroups . Unified {
2019-10-13 08:18:57 -04:00
hostConfig . CgroupnsMode = containertypes . CgroupnsModeHost
2019-07-29 18:33:18 -04:00
} else {
2019-10-13 08:18:57 -04:00
m := containertypes . CgroupnsModeHost
2020-11-09 09:00:32 -05:00
if cgroups . Mode ( ) == cgroups . Unified {
2019-10-13 08:18:57 -04:00
m = containertypes . CgroupnsModePrivate
2019-11-05 04:04:21 -05:00
}
2019-07-29 18:33:18 -04:00
if daemon . configStore != nil {
2019-10-13 08:18:57 -04:00
m = containertypes . CgroupnsMode ( daemon . configStore . CgroupNamespaceMode )
2019-07-29 18:33:18 -04:00
}
2019-10-13 08:18:57 -04:00
hostConfig . CgroupnsMode = m
2019-03-14 23:44:18 -04:00
}
}
2017-08-01 15:04:37 -04:00
adaptSharedNamespaceContainer ( daemon , hostConfig )
2015-11-30 00:10:18 -05:00
var err error
2019-08-09 08:10:07 -04:00
secOpts , err := daemon . generateSecurityOpt ( hostConfig )
2016-11-03 12:44:40 -04:00
if err != nil {
return err
2015-11-30 00:10:18 -05:00
}
2019-08-09 08:10:07 -04:00
hostConfig . SecurityOpt = append ( hostConfig . SecurityOpt , secOpts ... )
2015-12-31 01:17:18 -05:00
if hostConfig . OomKillDisable == nil {
defaultOomKillDisable := false
hostConfig . OomKillDisable = & defaultOomKillDisable
}
2015-11-30 00:10:18 -05:00
return nil
2015-07-13 03:17:43 -04:00
}
2017-08-01 15:04:37 -04:00
// adaptSharedNamespaceContainer replaces a container name with its ID in
// hostConfig. More precisely, it rewrites `container:name` to `container:ID`
// in PidMode, IpcMode and NetworkMode.
//
// Referring to the other container by ID keeps the namespace-sharing
// relationship intact even if that container is later renamed.
func adaptSharedNamespaceContainer(daemon containerGetter, hostConfig *containertypes.HostConfig) {
	const containerPrefix = "container:"

	// resolve maps a container reference to its "container:<ID>" form. Lookup
	// errors are deliberately ignored here (ok is false and the mode is left
	// untouched); they are left to be handled by later logic.
	resolve := func(ref string) (mode string, ok bool) {
		c, err := daemon.GetContainer(ref)
		if err != nil {
			return "", false
		}
		return containerPrefix + c.ID, true
	}

	if hostConfig.PidMode.IsContainer() {
		if mode, ok := resolve(hostConfig.PidMode.Container()); ok {
			hostConfig.PidMode = containertypes.PidMode(mode)
		}
	}
	if hostConfig.IpcMode.IsContainer() {
		if mode, ok := resolve(hostConfig.IpcMode.Container()); ok {
			hostConfig.IpcMode = containertypes.IpcMode(mode)
		}
	}
	if hostConfig.NetworkMode.IsContainer() {
		if mode, ok := resolve(hostConfig.NetworkMode.ConnectedContainer()); ok {
			hostConfig.NetworkMode = containertypes.NetworkMode(mode)
		}
	}
}
2018-12-18 17:41:52 -05:00
// verifyPlatformContainerResources performs platform-specific validation of the container's resource-configuration
func verifyPlatformContainerResources ( resources * containertypes . Resources , sysInfo * sysinfo . SysInfo , update bool ) ( warnings [ ] string , err error ) {
2017-06-30 13:34:40 -04:00
fixMemorySwappiness ( resources )
2015-05-15 19:34:26 -04:00
2015-08-06 07:55:56 -04:00
// memory subsystem checks and adjustments
2015-12-10 21:59:29 -05:00
if resources . Memory != 0 && resources . Memory < linuxMinMemory {
Set minimum memory limit to 6M, to account for higher startup memory use
For some time, we defined a minimum limit for `--memory` limits to account for
overhead during startup, and to supply a reasonable functional container.
Changes in the runtime (runc) introduced a higher memory footprint during container
startup, which now lead to obscure error-messages that are unfriendly for users:
run --rm --memory=4m alpine echo success
docker: Error response from daemon: OCI runtime create failed: container_linux.go:349: starting container process caused "process_linux.go:449: container init caused \"process_linux.go:415: setting cgroup config for procHooks process caused \\\"failed to write \\\\\\\"4194304\\\\\\\" to \\\\\\\"/sys/fs/cgroup/memory/docker/1254c8d63f85442e599b17dff895f4543c897755ee3bd9b56d5d3d17724b38d7/memory.limit_in_bytes\\\\\\\": write /sys/fs/cgroup/memory/docker/1254c8d63f85442e599b17dff895f4543c897755ee3bd9b56d5d3d17724b38d7/memory.limit_in_bytes: device or resource busy\\\"\"": unknown.
ERRO[0000] error waiting for container: context canceled
Containers that fail to start because of this limit, will not be marked as OOMKilled,
which makes it harder for users to find the cause of the failure.
Note that _after_ this memory is only required during startup of the container. After
the container was started, the container may not consume this memory, and limits
could (manually) be lowered, for example, an alpine container running only a shell
can run with 512k of memory;
echo 524288 > /sys/fs/cgroup/memory/docker/acdd326419f0898be63b0463cfc81cd17fb34d2dae6f8aa3768ee6a075ca5c86/memory.limit_in_bytes
However, restarting the container will reset that manual limit to the container's
configuration. While `docker container update` would allow for the updated limit to
be persisted, (re)starting the container after updating produces the same error message
again, so we cannot use different limits for `docker run` / `docker create` and `docker update`.
This patch raises the minimum memory limnit to 6M, so that a better error-message is
produced if a user tries to create a container with a memory-limit that is too low:
docker create --memory=4m alpine echo success
docker: Error response from daemon: Minimum memory limit allowed is 6MB.
Possibly, this constraint could be handled by runc, so that different runtimes
could set a best-matching limit (other runtimes may require less overhead).
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2020-07-01 06:04:23 -04:00
return warnings , fmt . Errorf ( "Minimum memory limit allowed is 6MB" )
2015-05-15 19:34:26 -04:00
}
2015-12-10 21:59:29 -05:00
if resources . Memory > 0 && ! sysInfo . MemoryLimit {
2016-07-18 16:56:41 -04:00
warnings = append ( warnings , "Your kernel does not support memory limit capabilities or the cgroup is not mounted. Limitation discarded." )
2015-12-10 21:59:29 -05:00
resources . Memory = 0
resources . MemorySwap = - 1
2015-05-15 19:34:26 -04:00
}
2015-12-10 21:59:29 -05:00
if resources . Memory > 0 && resources . MemorySwap != - 1 && ! sysInfo . SwapLimit {
2016-07-18 16:56:41 -04:00
warnings = append ( warnings , "Your kernel does not support swap limit capabilities or the cgroup is not mounted. Memory limited without swap." )
2015-12-10 21:59:29 -05:00
resources . MemorySwap = - 1
2015-05-15 19:34:26 -04:00
}
2015-12-10 21:59:29 -05:00
if resources . Memory > 0 && resources . MemorySwap > 0 && resources . MemorySwap < resources . Memory {
2016-03-21 20:53:57 -04:00
return warnings , fmt . Errorf ( "Minimum memoryswap limit should be larger than memory limit, see usage" )
2015-05-15 19:34:26 -04:00
}
2016-02-24 00:36:47 -05:00
if resources . Memory == 0 && resources . MemorySwap > 0 && ! update {
2016-03-21 20:53:57 -04:00
return warnings , fmt . Errorf ( "You should always set the Memory limit when using Memoryswap limit, see usage" )
2015-05-15 19:34:26 -04:00
}
2017-06-30 13:34:40 -04:00
if resources . MemorySwappiness != nil && ! sysInfo . MemorySwappiness {
2016-07-18 16:56:41 -04:00
warnings = append ( warnings , "Your kernel does not support memory swappiness capabilities or the cgroup is not mounted. Memory swappiness discarded." )
2015-12-10 21:59:29 -05:00
resources . MemorySwappiness = nil
2015-07-14 01:52:57 -04:00
}
2015-12-10 21:59:29 -05:00
if resources . MemorySwappiness != nil {
swappiness := * resources . MemorySwappiness
2017-06-30 13:34:40 -04:00
if swappiness < 0 || swappiness > 100 {
2016-03-21 20:53:57 -04:00
return warnings , fmt . Errorf ( "Invalid value: %v, valid memory swappiness range is 0-100" , swappiness )
2015-07-29 16:04:12 -04:00
}
2015-07-14 01:52:57 -04:00
}
2015-12-10 21:59:29 -05:00
if resources . MemoryReservation > 0 && ! sysInfo . MemoryReservation {
2016-07-18 16:56:41 -04:00
warnings = append ( warnings , "Your kernel does not support memory soft limit capabilities or the cgroup is not mounted. Limitation discarded." )
2015-12-10 21:59:29 -05:00
resources . MemoryReservation = 0
2015-09-23 02:02:45 -04:00
}
2016-04-05 21:37:51 -04:00
if resources . MemoryReservation > 0 && resources . MemoryReservation < linuxMinMemory {
2021-01-05 06:26:29 -05:00
return warnings , fmt . Errorf ( "Minimum memory reservation allowed is 6MB" )
2016-04-05 21:37:51 -04:00
}
2015-12-10 21:59:29 -05:00
if resources . Memory > 0 && resources . MemoryReservation > 0 && resources . Memory < resources . MemoryReservation {
2016-07-11 06:29:17 -04:00
return warnings , fmt . Errorf ( "Minimum memory limit can not be less than memory reservation limit, see usage" )
2015-09-23 02:02:45 -04:00
}
2020-07-24 04:20:56 -04:00
if resources . KernelMemory > 0 {
// Kernel memory limit is not supported on cgroup v2.
// Even on cgroup v1, kernel memory limit (`kmem.limit_in_bytes`) has been deprecated since kernel 5.4.
// https://github.com/torvalds/linux/commit/0158115f702b0ba208ab0b5adf44cae99b3ebcc7
2021-09-21 03:58:31 -04:00
if ! sysInfo . KernelMemory {
warnings = append ( warnings , "Your kernel does not support kernel memory limit capabilities or the cgroup is not mounted. Limitation discarded." )
resources . KernelMemory = 0
}
2022-02-07 11:09:23 -05:00
if resources . KernelMemory > 0 && resources . KernelMemory < linuxMinMemory {
2021-09-21 03:58:31 -04:00
return warnings , fmt . Errorf ( "Minimum kernel memory limit allowed is 6MB" )
}
if ! kernel . CheckKernelVersion ( 4 , 0 , 0 ) {
warnings = append ( warnings , "You specified a kernel memory limit on a kernel older than 4.0. Kernel memory limits are experimental on older kernels, it won't work as expected and can cause your system to be unstable." )
}
2015-08-19 11:56:55 -04:00
}
2015-12-31 01:17:18 -05:00
if resources . OomKillDisable != nil && ! sysInfo . OomKillDisable {
2016-01-13 14:53:44 -05:00
// only produce warnings if the setting wasn't to *disable* the OOM Kill; no point
// warning the caller if they already wanted the feature to be off
if * resources . OomKillDisable {
2016-07-11 06:29:17 -04:00
warnings = append ( warnings , "Your kernel does not support OomKillDisable. OomKillDisable discarded." )
2016-01-13 14:53:44 -05:00
}
2015-12-31 01:17:18 -05:00
resources . OomKillDisable = nil
2015-12-22 03:08:04 -05:00
}
2018-12-17 05:23:41 -05:00
if resources . OomKillDisable != nil && * resources . OomKillDisable && resources . Memory == 0 {
warnings = append ( warnings , "OOM killer is disabled for the container, but no memory limit is set, this can result in the system running out of resources." )
}
2019-02-24 09:36:45 -05:00
if resources . PidsLimit != nil && ! sysInfo . PidsLimit {
if * resources . PidsLimit > 0 {
warnings = append ( warnings , "Your kernel does not support PIDs limit capabilities or the cgroup is not mounted. PIDs limit discarded." )
}
resources . PidsLimit = nil
2015-12-15 14:15:43 -05:00
}
2015-12-10 21:59:29 -05:00
// cpu subsystem checks and adjustments
2016-11-01 13:12:29 -04:00
if resources . NanoCPUs > 0 && resources . CPUPeriod > 0 {
return warnings , fmt . Errorf ( "Conflicting options: Nano CPUs and CPU Period cannot both be set" )
}
if resources . NanoCPUs > 0 && resources . CPUQuota > 0 {
return warnings , fmt . Errorf ( "Conflicting options: Nano CPUs and CPU Quota cannot both be set" )
}
2020-05-22 17:18:06 -04:00
if resources . NanoCPUs > 0 && ! sysInfo . CPUCfs {
return warnings , fmt . Errorf ( "NanoCPUs can not be set, as your kernel does not support CPU CFS scheduler or the cgroup is not mounted" )
2016-11-01 13:12:29 -04:00
}
2016-11-15 18:48:46 -05:00
// The highest precision we could get on Linux is 0.001, by setting
// cpu.cfs_period_us=1000ms
// cpu.cfs_quota=1ms
// See the following link for details:
// https://www.kernel.org/doc/Documentation/scheduler/sched-bwc.txt
// Here we don't set the lower limit and it is up to the underlying platform (e.g., Linux) to return an error.
// The error message is 0.01 so that this is consistent with Windows
2016-11-01 13:12:29 -04:00
if resources . NanoCPUs < 0 || resources . NanoCPUs > int64 ( sysinfo . NumCPU ( ) ) * 1e9 {
2016-11-15 18:48:46 -05:00
return warnings , fmt . Errorf ( "Range of CPUs is from 0.01 to %d.00, as there are only %d CPUs available" , sysinfo . NumCPU ( ) , sysinfo . NumCPU ( ) )
2016-11-01 13:12:29 -04:00
}
2015-12-10 21:59:29 -05:00
if resources . CPUShares > 0 && ! sysInfo . CPUShares {
2016-07-18 16:56:41 -04:00
warnings = append ( warnings , "Your kernel does not support CPU shares or the cgroup is not mounted. Shares discarded." )
2015-12-10 21:59:29 -05:00
resources . CPUShares = 0
2015-08-05 10:35:18 -04:00
}
2020-05-22 17:18:06 -04:00
if ( resources . CPUPeriod != 0 || resources . CPUQuota != 0 ) && ! sysInfo . CPUCfs {
warnings = append ( warnings , "Your kernel does not support CPU CFS scheduler. CPU period/quota discarded." )
2015-12-10 21:59:29 -05:00
resources . CPUPeriod = 0
2020-05-22 17:18:06 -04:00
resources . CPUQuota = 0
2015-05-15 19:34:26 -04:00
}
2016-04-21 02:50:25 -04:00
if resources . CPUPeriod != 0 && ( resources . CPUPeriod < 1000 || resources . CPUPeriod > 1000000 ) {
2016-03-17 23:16:53 -04:00
return warnings , fmt . Errorf ( "CPU cfs period can not be less than 1ms (i.e. 1000) or larger than 1s (i.e. 1000000)" )
}
if resources . CPUQuota > 0 && resources . CPUQuota < 1000 {
return warnings , fmt . Errorf ( "CPU cfs quota can not be less than 1ms (i.e. 1000)" )
}
2016-03-04 20:24:09 -05:00
if resources . CPUPercent > 0 {
2016-10-16 10:57:44 -04:00
warnings = append ( warnings , fmt . Sprintf ( "%s does not support CPU percent. Percent discarded." , runtime . GOOS ) )
2016-03-04 20:24:09 -05:00
resources . CPUPercent = 0
}
2015-12-10 21:59:29 -05:00
// cpuset subsystem checks and adjustments
if ( resources . CpusetCpus != "" || resources . CpusetMems != "" ) && ! sysInfo . Cpuset {
2016-07-18 16:56:41 -04:00
warnings = append ( warnings , "Your kernel does not support cpuset or the cgroup is not mounted. Cpuset discarded." )
2015-12-10 21:59:29 -05:00
resources . CpusetCpus = ""
resources . CpusetMems = ""
2015-08-05 10:35:18 -04:00
}
2015-12-10 21:59:29 -05:00
cpusAvailable , err := sysInfo . IsCpusetCpusAvailable ( resources . CpusetCpus )
2015-09-08 14:40:55 -04:00
if err != nil {
2018-09-04 10:49:09 -04:00
return warnings , errors . Wrapf ( err , "Invalid value %s for cpuset cpus" , resources . CpusetCpus )
2015-09-08 14:40:55 -04:00
}
if ! cpusAvailable {
2016-03-21 20:53:57 -04:00
return warnings , fmt . Errorf ( "Requested CPUs are not available - requested %s, available: %s" , resources . CpusetCpus , sysInfo . Cpus )
2015-09-08 14:40:55 -04:00
}
2015-12-10 21:59:29 -05:00
memsAvailable , err := sysInfo . IsCpusetMemsAvailable ( resources . CpusetMems )
2015-09-08 14:40:55 -04:00
if err != nil {
2018-09-04 10:49:09 -04:00
return warnings , errors . Wrapf ( err , "Invalid value %s for cpuset mems" , resources . CpusetMems )
2015-09-08 14:40:55 -04:00
}
if ! memsAvailable {
2016-03-21 20:53:57 -04:00
return warnings , fmt . Errorf ( "Requested memory nodes are not available - requested %s, available: %s" , resources . CpusetMems , sysInfo . Mems )
2015-09-08 14:40:55 -04:00
}
2015-12-10 21:59:29 -05:00
// blkio subsystem checks and adjustments
if resources . BlkioWeight > 0 && ! sysInfo . BlkioWeight {
2016-07-18 16:56:41 -04:00
warnings = append ( warnings , "Your kernel does not support Block I/O weight or the cgroup is not mounted. Weight discarded." )
2015-12-10 21:59:29 -05:00
resources . BlkioWeight = 0
2015-08-05 10:35:18 -04:00
}
2015-12-10 21:59:29 -05:00
if resources . BlkioWeight > 0 && ( resources . BlkioWeight < 10 || resources . BlkioWeight > 1000 ) {
2016-03-21 20:53:57 -04:00
return warnings , fmt . Errorf ( "Range of blkio weight is from 10 to 1000" )
2015-05-15 19:34:26 -04:00
}
2016-02-24 20:51:46 -05:00
if resources . IOMaximumBandwidth != 0 || resources . IOMaximumIOps != 0 {
return warnings , fmt . Errorf ( "Invalid QoS settings: %s does not support Maximum IO Bandwidth or Maximum IO IOps" , runtime . GOOS )
}
2015-12-10 21:59:29 -05:00
if len ( resources . BlkioWeightDevice ) > 0 && ! sysInfo . BlkioWeightDevice {
2016-07-18 16:56:41 -04:00
warnings = append ( warnings , "Your kernel does not support Block I/O weight_device or the cgroup is not mounted. Weight-device discarded." )
2015-12-10 21:59:29 -05:00
resources . BlkioWeightDevice = [ ] * pblkiodev . WeightDevice { }
2015-06-11 20:34:20 -04:00
}
2015-12-10 21:59:29 -05:00
if len ( resources . BlkioDeviceReadBps ) > 0 && ! sysInfo . BlkioReadBpsDevice {
2016-07-18 16:56:41 -04:00
warnings = append ( warnings , "Your kernel does not support BPS Block I/O read limit or the cgroup is not mounted. Block I/O BPS read limit discarded." )
2015-12-10 21:59:29 -05:00
resources . BlkioDeviceReadBps = [ ] * pblkiodev . ThrottleDevice { }
2015-07-08 07:06:48 -04:00
}
2015-12-10 21:59:29 -05:00
if len ( resources . BlkioDeviceWriteBps ) > 0 && ! sysInfo . BlkioWriteBpsDevice {
2016-07-18 16:56:41 -04:00
warnings = append ( warnings , "Your kernel does not support BPS Block I/O write limit or the cgroup is not mounted. Block I/O BPS write limit discarded." )
2015-12-10 21:59:29 -05:00
resources . BlkioDeviceWriteBps = [ ] * pblkiodev . ThrottleDevice { }
2015-07-08 07:06:48 -04:00
}
2015-07-08 07:06:48 -04:00
if len ( resources . BlkioDeviceReadIOps ) > 0 && ! sysInfo . BlkioReadIOpsDevice {
2016-07-18 16:56:41 -04:00
warnings = append ( warnings , "Your kernel does not support IOPS Block read limit or the cgroup is not mounted. Block I/O IOPS read limit discarded." )
2015-07-08 07:06:48 -04:00
resources . BlkioDeviceReadIOps = [ ] * pblkiodev . ThrottleDevice { }
}
if len ( resources . BlkioDeviceWriteIOps ) > 0 && ! sysInfo . BlkioWriteIOpsDevice {
2016-07-18 16:56:41 -04:00
warnings = append ( warnings , "Your kernel does not support IOPS Block write limit or the cgroup is not mounted. Block I/O IOPS write limit discarded." )
2015-07-08 07:06:48 -04:00
resources . BlkioDeviceWriteIOps = [ ] * pblkiodev . ThrottleDevice { }
}
2015-12-10 21:59:29 -05:00
return warnings , nil
}
2016-02-18 05:10:31 -05:00
func ( daemon * Daemon ) getCgroupDriver ( ) string {
2020-02-10 00:37:22 -05:00
if UsingSystemd ( daemon . configStore ) {
return cgroupSystemdDriver
}
2019-06-02 11:03:27 -04:00
if daemon . Rootless ( ) {
return cgroupNoneDriver
}
2020-02-10 00:37:22 -05:00
return cgroupFsDriver
2016-03-24 12:18:03 -04:00
}
// getCD gets the raw value of the native.cgroupdriver option, if set.
2017-01-23 06:23:07 -05:00
func getCD ( config * config . Config ) string {
2016-03-24 12:18:03 -04:00
for _ , option := range config . ExecOptions {
2016-01-22 21:15:09 -05:00
key , val , err := parsers . ParseKeyValueOpt ( option )
if err != nil || ! strings . EqualFold ( key , "native.cgroupdriver" ) {
continue
}
2016-03-24 12:18:03 -04:00
return val
2016-01-22 21:15:09 -05:00
}
2016-03-24 12:18:03 -04:00
return ""
2016-03-18 15:43:17 -04:00
}
2020-11-09 09:26:24 -05:00
// verifyCgroupDriver validates native.cgroupdriver
func verifyCgroupDriver ( config * config . Config ) error {
2016-03-24 12:18:03 -04:00
cd := getCD ( config )
if cd == "" || cd == cgroupFsDriver || cd == cgroupSystemdDriver {
return nil
}
2019-06-02 11:03:27 -04:00
if cd == cgroupNoneDriver {
return fmt . Errorf ( "native.cgroupdriver option %s is internally used and cannot be specified manually" , cd )
}
2016-03-24 12:18:03 -04:00
return fmt . Errorf ( "native.cgroupdriver option %s not supported" , cd )
2016-01-22 21:15:09 -05:00
}
2016-03-24 12:18:03 -04:00
// UsingSystemd returns true if cli option includes native.cgroupdriver=systemd
2017-01-23 06:23:07 -05:00
func UsingSystemd ( config * config . Config ) bool {
2021-09-24 07:51:39 -04:00
cd := getCD ( config )
if cd == cgroupSystemdDriver {
2020-04-21 10:56:23 -04:00
return true
}
// On cgroup v2 hosts, default to systemd driver
2021-09-24 07:51:39 -04:00
if cd == "" && cgroups . Mode ( ) == cgroups . Unified && isRunningSystemd ( ) {
2020-04-21 10:56:23 -04:00
return true
}
return false
}
2020-11-09 09:21:27 -05:00
var (
// runningSystemd holds the result of the systemd detection performed by
// isRunningSystemd; it stays false until that detection has found the
// systemd runtime directory.
runningSystemd bool
// detectSystemd makes sure the detection in isRunningSystemd runs only once.
detectSystemd sync . Once
)
2020-11-09 09:15:45 -05:00
// isRunningSystemd checks whether the host was booted with systemd as its init
// system. This functions similarly to systemd's `sd_booted(3)`: internally, it
// checks whether /run/systemd/system/ exists and is a directory.
// http://www.freedesktop.org/software/systemd/man/sd_booted.html
//
// NOTE: This function comes from package github.com/coreos/go-systemd/util
// It was borrowed here to avoid a dependency on cgo.
func isRunningSystemd ( ) bool {
2020-11-09 09:21:27 -05:00
detectSystemd . Do ( func ( ) {
fi , err := os . Lstat ( "/run/systemd/system" )
if err != nil {
return
}
runningSystemd = fi . IsDir ( )
} )
return runningSystemd
2016-01-22 21:15:09 -05:00
}
2015-12-10 21:59:29 -05:00
// verifyPlatformContainerSettings performs platform-specific validation of the
// hostconfig and config structures.
2018-12-18 17:20:17 -05:00
func verifyPlatformContainerSettings ( daemon * Daemon , hostConfig * containertypes . HostConfig , update bool ) ( warnings [ ] string , err error ) {
2018-12-18 19:28:08 -05:00
if hostConfig == nil {
return nil , nil
}
2021-07-14 10:45:02 -04:00
sysInfo := daemon . RawSysInfo ( )
2015-12-10 21:59:29 -05:00
2018-12-18 17:41:52 -05:00
w , err := verifyPlatformContainerResources ( & hostConfig . Resources , sysInfo , update )
2016-08-31 12:23:56 -04:00
// no matter err is nil or not, w could have data in itself.
warnings = append ( warnings , w ... )
2015-12-10 21:59:29 -05:00
if err != nil {
return warnings , err
}
Fix validation of IpcMode, PidMode, UTSMode, CgroupnsMode
These HostConfig properties were not validated until the OCI spec for the container
was created, which meant that `container run` and `docker create` would accept
invalid values, and the invalid value would not be detected until `start` was
called, returning a 500 "internal server error", as well as errors from containerd
("cleanup: failed to delete container from containerd: no such container") in the
daemon logs.
As a result, a faulty container was created, and the container state remained
in the `created` state.
This patch:
- Updates `oci.WithNamespaces()` to return the correct `errdefs.InvalidParameter`
- Updates `verifyPlatformContainerSettings()` to validate these settings, so that
an error is returned when _creating_ the container.
Before this patch:
docker run -dit --ipc=shared --name foo busybox
2a00d74e9fbb7960c4718def8f6c74fa8ee754030eeb93ee26a516e27d4d029f
docker: Error response from daemon: Invalid IPC mode: shared.
docker ps -a --filter name=foo
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
2a00d74e9fbb busybox "sh" About a minute ago Created foo
After this patch:
docker run -dit --ipc=shared --name foo busybox
docker: Error response from daemon: invalid IPC mode: shared.
docker ps -a --filter name=foo
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
An integration test was added to verify the new validation, which can be run with:
make BIND_DIR=. TEST_FILTER=TestCreateInvalidHostConfig DOCKER_GRAPHDRIVER=vfs test-integration
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2022-05-25 07:17:16 -04:00
if ! hostConfig . IpcMode . Valid ( ) {
return warnings , errors . Errorf ( "invalid IPC mode: %v" , hostConfig . IpcMode )
}
if ! hostConfig . PidMode . Valid ( ) {
return warnings , errors . Errorf ( "invalid PID mode: %v" , hostConfig . PidMode )
}
2015-12-29 15:49:17 -05:00
if hostConfig . ShmSize < 0 {
2016-07-11 06:29:17 -04:00
return warnings , fmt . Errorf ( "SHM size can not be less than 0" )
2015-12-10 21:59:29 -05:00
}
Fix validation of IpcMode, PidMode, UTSMode, CgroupnsMode
These HostConfig properties were not validated until the OCI spec for the container
was created, which meant that `container run` and `docker create` would accept
invalid values, and the invalid value would not be detected until `start` was
called, returning a 500 "internal server error", as well as errors from containerd
("cleanup: failed to delete container from containerd: no such container") in the
daemon logs.
As a result, a faulty container was created, and the container state remained
in the `created` state.
This patch:
- Updates `oci.WithNamespaces()` to return the correct `errdefs.InvalidParameter`
- Updates `verifyPlatformContainerSettings()` to validate these settings, so that
an error is returned when _creating_ the container.
Before this patch:
docker run -dit --ipc=shared --name foo busybox
2a00d74e9fbb7960c4718def8f6c74fa8ee754030eeb93ee26a516e27d4d029f
docker: Error response from daemon: Invalid IPC mode: shared.
docker ps -a --filter name=foo
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
2a00d74e9fbb busybox "sh" About a minute ago Created foo
After this patch:
docker run -dit --ipc=shared --name foo busybox
docker: Error response from daemon: invalid IPC mode: shared.
docker ps -a --filter name=foo
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
An integration test was added to verify the new validation, which can be run with:
make BIND_DIR=. TEST_FILTER=TestCreateInvalidHostConfig DOCKER_GRAPHDRIVER=vfs test-integration
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2022-05-25 07:17:16 -04:00
if ! hostConfig . UTSMode . Valid ( ) {
return warnings , errors . Errorf ( "invalid UTS mode: %v" , hostConfig . UTSMode )
}
2015-12-10 21:59:29 -05:00
2015-10-13 05:26:27 -04:00
if hostConfig . OomScoreAdj < - 1000 || hostConfig . OomScoreAdj > 1000 {
2016-03-21 20:53:57 -04:00
return warnings , fmt . Errorf ( "Invalid value %d, range for oom score adj is [-1000, 1000]" , hostConfig . OomScoreAdj )
2015-10-13 05:26:27 -04:00
}
2016-05-18 14:10:31 -04:00
2016-05-20 11:39:05 -04:00
// ip-forwarding does not affect container with '--net=host' (or '--net=none')
if sysInfo . IPv4ForwardingDisabled && ! ( hostConfig . NetworkMode . IsHost ( ) || hostConfig . NetworkMode . IsNone ( ) ) {
2015-05-15 19:34:26 -04:00
warnings = append ( warnings , "IPv4 forwarding is disabled. Networking will not work." )
}
2018-12-18 16:42:57 -05:00
if hostConfig . NetworkMode . IsHost ( ) && len ( hostConfig . PortBindings ) > 0 {
warnings = append ( warnings , "Published ports are discarded when using host network mode" )
}
2016-01-08 09:03:17 -05:00
// check for various conflicting options with user namespaces
2016-02-08 09:23:24 -05:00
if daemon . configStore . RemappedRoot != "" && hostConfig . UsernsMode . IsPrivate ( ) {
2016-01-08 09:03:17 -05:00
if hostConfig . Privileged {
2017-08-17 15:16:30 -04:00
return warnings , fmt . Errorf ( "privileged mode is incompatible with user namespaces. You must run the container in the host namespace when running privileged mode" )
2016-01-08 09:03:17 -05:00
}
2016-08-16 17:16:14 -04:00
if hostConfig . NetworkMode . IsHost ( ) && ! hostConfig . UsernsMode . IsHost ( ) {
2017-08-17 15:16:30 -04:00
return warnings , fmt . Errorf ( "cannot share the host's network namespace when user namespaces are enabled" )
2016-01-08 09:03:17 -05:00
}
2016-08-16 17:16:14 -04:00
if hostConfig . PidMode . IsHost ( ) && ! hostConfig . UsernsMode . IsHost ( ) {
2017-08-17 15:16:30 -04:00
return warnings , fmt . Errorf ( "cannot share the host PID namespace when user namespaces are enabled" )
2016-01-08 09:03:17 -05:00
}
2016-01-07 22:43:11 -05:00
}
2016-03-24 12:18:03 -04:00
if hostConfig . CgroupParent != "" && UsingSystemd ( daemon . configStore ) {
2016-01-22 21:15:09 -05:00
// CgroupParent for systemd cgroup should be named as "xxx.slice"
if len ( hostConfig . CgroupParent ) <= 6 || ! strings . HasSuffix ( hostConfig . CgroupParent , ".slice" ) {
return warnings , fmt . Errorf ( "cgroup-parent for systemd cgroup should be a valid slice named as \"xxx.slice\"" )
}
}
2016-05-23 17:49:50 -04:00
if hostConfig . Runtime == "" {
hostConfig . Runtime = daemon . configStore . GetDefaultRuntimeName ( )
}
daemon: support other containerd runtimes (MVP)
Contrary to popular belief, the OCI Runtime specification does not
specify the command-line API for runtimes. Looking at containerd's
architecture from the lens of the OCI Runtime spec, the _shim_ is the
OCI Runtime and runC is "just" an implementation detail of the
io.containerd.runc.v2 runtime. When one configures a non-default runtime
in Docker, what they're really doing is instructing Docker to create
containers using the io.containerd.runc.v2 runtime with a configuration
option telling the runtime that the runC binary is at some non-default
path. Consequently, only OCI runtimes which are compatible with the
io.containerd.runc.v2 shim, such as crun, can be used in this manner.
Other OCI runtimes, including kata-containers v2, come with their own
containerd shim and are not compatible with io.containerd.runc.v2.
As Docker has not historically provided a way to select a non-default
runtime which requires its own shim, runtimes such as kata-containers v2
could not be used with Docker.
Allow other containerd shims to be used with Docker; no daemon
configuration required. If the daemon is instructed to create a
container with a runtime name which does not match any of the configured
or stock runtimes, it passes the name along to containerd verbatim. A
user can start a container with the kata-containers runtime, for
example, simply by calling
docker run --runtime io.containerd.kata.v2
Runtime names which containerd would interpret as a path to an arbitrary
binary are disallowed. While handy for development and testing it is not
strictly necessary and would allow anyone with Engine API access to
trivially execute any binary on the host as root, so we have decided it
would be safest for our users if it was not allowed.
It is not yet possible to set an alternative containerd shim as the
default runtime; it can only be configured per-container.
Signed-off-by: Cory Snider <csnider@mirantis.com>
2022-07-20 16:12:01 -04:00
if _ , err := daemon . getRuntime ( hostConfig . Runtime ) ; err != nil {
return warnings , err
2016-05-23 17:49:50 -04:00
}
2021-06-11 15:01:18 -04:00
parser := volumemounts . NewParser ( )
2017-01-16 04:52:43 -05:00
for dest := range hostConfig . Tmpfs {
2017-08-01 13:32:44 -04:00
if err := parser . ValidateTmpfsMountDestination ( dest ) ; err != nil {
2017-01-16 04:52:43 -05:00
return warnings , err
}
}
2019-03-14 23:44:18 -04:00
if ! hostConfig . CgroupnsMode . Valid ( ) {
return warnings , fmt . Errorf ( "invalid cgroup namespace mode: %v" , hostConfig . CgroupnsMode )
}
if hostConfig . CgroupnsMode . IsPrivate ( ) {
if ! sysInfo . CgroupNamespaces {
warnings = append ( warnings , "Your kernel does not support cgroup namespaces. Cgroup namespace setting discarded." )
}
}
2020-07-07 16:33:46 -04:00
return warnings , nil
2017-09-22 09:52:41 -04:00
}
2016-01-22 21:15:09 -05:00
// verifyDaemonSettings performs validation of daemon config struct
2017-01-23 06:23:07 -05:00
func verifyDaemonSettings ( conf * config . Config ) error {
2019-07-11 19:42:16 -04:00
if conf . ContainerdNamespace == conf . ContainerdPluginNamespace {
return errors . New ( "containers namespace and plugins namespace cannot be the same" )
}
2015-05-15 19:34:26 -04:00
// Check for mutually incompatible config options
2017-01-23 06:23:07 -05:00
if conf . BridgeConfig . Iface != "" && conf . BridgeConfig . IP != "" {
2016-02-03 09:56:34 -05:00
return fmt . Errorf ( "You specified -b & --bip, mutually exclusive options. Please specify only one" )
2015-05-15 19:34:26 -04:00
}
2017-01-23 06:23:07 -05:00
if ! conf . BridgeConfig . EnableIPTables && ! conf . BridgeConfig . InterContainerCommunication {
2016-02-03 09:56:34 -05:00
return fmt . Errorf ( "You specified --iptables=false with --icc=false. ICC=false uses iptables to function. Please set --icc or --iptables to true" )
2015-05-15 19:34:26 -04:00
}
2020-12-02 16:19:44 -05:00
if conf . BridgeConfig . EnableIP6Tables && ! conf . Experimental {
return fmt . Errorf ( "ip6tables rules are only available if experimental features are enabled" )
}
2017-01-23 06:23:07 -05:00
if ! conf . BridgeConfig . EnableIPTables && conf . BridgeConfig . EnableIPMasq {
conf . BridgeConfig . EnableIPMasq = false
2015-05-15 19:34:26 -04:00
}
2020-11-09 09:26:24 -05:00
if err := verifyCgroupDriver ( conf ) ; err != nil {
2016-03-24 12:18:03 -04:00
return err
}
2017-01-23 06:23:07 -05:00
if conf . CgroupParent != "" && UsingSystemd ( conf ) {
if len ( conf . CgroupParent ) <= 6 || ! strings . HasSuffix ( conf . CgroupParent , ".slice" ) {
2016-01-22 21:15:09 -05:00
return fmt . Errorf ( "cgroup-parent for systemd cgroup should be a valid slice named as \"xxx.slice\"" )
}
}
2016-05-23 17:49:50 -04:00
2020-11-09 09:00:32 -05:00
if conf . Rootless && UsingSystemd ( conf ) && cgroups . Mode ( ) != cgroups . Unified {
2020-03-10 23:49:03 -04:00
return fmt . Errorf ( "exec-opt native.cgroupdriver=systemd requires cgroup v2 for rootless mode" )
}
2020-07-07 16:33:46 -04:00
configureRuntimes ( conf )
if rtName := conf . GetDefaultRuntimeName ( ) ; rtName != "" {
if conf . GetRuntime ( rtName ) == nil {
2022-08-17 14:50:19 -04:00
if ! config . IsPermissibleC8dRuntimeName ( rtName ) {
return fmt . Errorf ( "specified default runtime '%s' does not exist" , rtName )
}
2020-07-07 16:33:46 -04:00
}
2016-05-23 17:49:50 -04:00
}
2015-05-15 19:34:26 -04:00
return nil
}
2015-07-11 15:32:08 -04:00
// checkSystem validates platform-specific requirements.
//
// There is currently nothing to check in this build, so it always returns nil.
func checkSystem() error {
	return nil
}
2015-12-02 05:26:30 -05:00
// configureMaxThreads sets the Go runtime max threads threshold
// which is 90% of the kernel setting from /proc/sys/kernel/threads-max
2017-01-23 06:23:07 -05:00
func configureMaxThreads ( config * config . Config ) error {
2021-08-24 06:10:50 -04:00
mt , err := os . ReadFile ( "/proc/sys/kernel/threads-max" )
2015-12-02 05:26:30 -05:00
if err != nil {
return err
}
mtint , err := strconv . Atoi ( strings . TrimSpace ( string ( mt ) ) )
if err != nil {
return err
}
maxThreads := ( mtint / 100 ) * 90
debug . SetMaxThreads ( maxThreads )
logrus . Debugf ( "Golang's threads limit set to %d" , maxThreads )
return nil
}
2016-10-04 15:35:56 -04:00
// overlaySupportsSelinux reports whether /proc/kallsyms contains a line ending
// in " security_inode_copy_up", which is used as the indicator that the kernel
// supports SELinux together with overlay.
//
// A missing /proc/kallsyms is not an error and yields false. Any other failure
// to open the file, or an error while scanning it, is returned.
func overlaySupportsSelinux() (bool, error) {
	f, err := os.Open("/proc/kallsyms")
	if err != nil {
		if os.IsNotExist(err) {
			return false, nil
		}
		return false, err
	}
	defer f.Close()

	s := bufio.NewScanner(f)
	for s.Scan() {
		if strings.HasSuffix(s.Text(), " security_inode_copy_up") {
			return true, nil
		}
	}
	// No match: surface a scan error, if any, rather than reporting a clean
	// "not supported".
	return false, s.Err()
}
2016-03-24 11:57:11 -04:00
// configureKernelSecuritySupport configures and validates security support for the kernel
2017-08-24 14:48:16 -04:00
func configureKernelSecuritySupport ( config * config . Config , driverName string ) error {
2015-05-15 19:34:26 -04:00
if config . EnableSelinuxSupport {
2020-12-14 05:46:58 -05:00
if ! selinux . GetEnabled ( ) {
2015-05-15 19:34:26 -04:00
logrus . Warn ( "Docker could not enable SELinux on the host system" )
2016-10-04 15:35:56 -04:00
return nil
}
2022-08-03 05:20:54 -04:00
if driverName == "overlay" || driverName == "overlay2" || driverName == "overlayfs" {
2016-10-04 15:35:56 -04:00
// If driver is overlay or overlay2, make sure kernel
// supports selinux with overlay.
supported , err := overlaySupportsSelinux ( )
if err != nil {
return err
}
if ! supported {
2017-08-24 14:48:16 -04:00
logrus . Warnf ( "SELinux is not supported with the %v graph driver on this kernel" , driverName )
2016-10-04 15:35:56 -04:00
}
2015-05-15 19:34:26 -04:00
}
} else {
2020-12-14 05:46:58 -05:00
selinux . SetDisabled ( )
2015-05-15 19:34:26 -04:00
}
return nil
}
2022-04-26 04:32:10 -04:00
// initNetworkController initializes the libnetwork controller and configures
// network settings. If there's active sandboxes, configuration changes will not
// take effect.
func ( daemon * Daemon ) initNetworkController ( activeSandboxes map [ string ] interface { } ) error {
2022-04-23 17:12:55 -04:00
netOptions , err := daemon . networkOptions ( daemon . PluginStore , activeSandboxes )
2015-05-20 08:20:19 -04:00
if err != nil {
2022-04-26 04:32:10 -04:00
return err
2015-05-20 08:20:19 -04:00
}
2022-04-26 04:32:10 -04:00
daemon . netController , err = libnetwork . New ( netOptions ... )
2015-05-15 19:34:26 -04:00
if err != nil {
2022-04-26 04:32:10 -04:00
return fmt . Errorf ( "error obtaining controller instance: %v" , err )
2015-05-15 19:34:26 -04:00
}
2016-06-14 12:13:53 -04:00
if len ( activeSandboxes ) > 0 {
2022-04-26 04:32:10 -04:00
logrus . Info ( "there are running containers, updated network configuration will not take affect" )
} else if err := configureNetworking ( daemon . netController , daemon . configStore ) ; err != nil {
return err
2016-06-14 12:13:53 -04:00
}
2022-04-26 04:32:10 -04:00
// Set HostGatewayIP to the default bridge's IP if it is empty
setHostGatewayIP ( daemon . netController , daemon . configStore )
return nil
}
func configureNetworking ( controller libnetwork . NetworkController , conf * config . Config ) error {
2015-05-15 19:34:26 -04:00
// Initialize default network on "null"
2016-06-14 12:13:53 -04:00
if n , _ := controller . NetworkByName ( "none" ) ; n == nil {
if _ , err := controller . NewNetwork ( "null" , "none" , "" , libnetwork . NetworkOptionPersist ( true ) ) ; err != nil {
2022-04-26 04:32:10 -04:00
return errors . Wrap ( err , ` error creating default "null" network ` )
2016-06-14 12:13:53 -04:00
}
2015-05-15 19:34:26 -04:00
}
// Initialize default network on "host"
2016-06-14 12:13:53 -04:00
if n , _ := controller . NetworkByName ( "host" ) ; n == nil {
if _ , err := controller . NewNetwork ( "host" , "host" , "" , libnetwork . NetworkOptionPersist ( true ) ) ; err != nil {
2022-04-26 04:32:10 -04:00
return errors . Wrap ( err , ` error creating default "host" network ` )
2016-06-14 12:13:53 -04:00
}
2015-05-15 19:34:26 -04:00
}
2016-09-27 16:16:00 -04:00
// Clear stale bridge network
if n , err := controller . NetworkByName ( "bridge" ) ; err == nil {
if err = n . Delete ( ) ; err != nil {
2022-04-26 04:32:10 -04:00
return errors . Wrap ( err , ` could not delete the default "bridge"" network ` )
2016-12-13 18:04:59 -05:00
}
2022-04-23 17:12:55 -04:00
if len ( conf . NetworkConfig . DefaultAddressPools . Value ( ) ) > 0 && ! conf . LiveRestoreEnabled {
2016-12-13 18:04:59 -05:00
removeDefaultBridgeInterface ( )
2016-09-27 16:16:00 -04:00
}
}
2022-04-23 17:12:55 -04:00
if ! conf . DisableBridge {
2015-06-30 13:34:15 -04:00
// Initialize default driver "bridge"
2022-04-23 17:12:55 -04:00
if err := initBridgeDriver ( controller , conf ) ; err != nil {
2022-04-26 04:32:10 -04:00
return err
2015-06-30 13:34:15 -04:00
}
2016-09-27 16:16:00 -04:00
} else {
removeDefaultBridgeInterface ( )
2015-06-30 13:34:15 -04:00
}
2022-04-26 04:32:10 -04:00
return nil
2021-08-25 15:51:59 -04:00
}
// setHostGatewayIP sets cfg.HostGatewayIP to the default bridge's IP if it is empty.
2022-04-26 04:32:10 -04:00
func setHostGatewayIP ( controller libnetwork . NetworkController , config * config . Config ) {
2021-08-25 15:51:59 -04:00
if config . HostGatewayIP != nil {
return
}
if n , err := controller . NetworkByName ( "bridge" ) ; err == nil {
v4Info , v6Info := n . Info ( ) . IpamInfo ( )
var gateway net . IP
if len ( v4Info ) > 0 {
gateway = v4Info [ 0 ] . Gateway . IP
} else if len ( v6Info ) > 0 {
gateway = v6Info [ 0 ] . Gateway . IP
2019-11-01 20:09:40 -04:00
}
2021-08-25 15:51:59 -04:00
config . HostGatewayIP = gateway
2019-11-01 20:09:40 -04:00
}
2015-06-30 13:34:15 -04:00
}
2021-07-27 06:12:11 -04:00
func driverOptions ( config * config . Config ) nwconfig . Option {
return nwconfig . OptionDriverConfig ( "bridge" , options . Generic {
netlabel . GenericData : options . Generic {
"EnableIPForwarding" : config . BridgeConfig . EnableIPForward ,
"EnableIPTables" : config . BridgeConfig . EnableIPTables ,
"EnableIP6Tables" : config . BridgeConfig . EnableIP6Tables ,
"EnableUserlandProxy" : config . BridgeConfig . EnableUserlandProxy ,
"UserlandProxyPath" : config . BridgeConfig . UserlandProxyPath ,
} ,
} )
2015-09-24 23:00:05 -04:00
}
2015-05-15 19:34:26 -04:00
2017-01-23 06:23:07 -05:00
func initBridgeDriver ( controller libnetwork . NetworkController , config * config . Config ) error {
2015-10-10 12:43:03 -04:00
bridgeName := bridge . DefaultBridgeName
2017-01-23 06:23:07 -05:00
if config . BridgeConfig . Iface != "" {
bridgeName = config . BridgeConfig . Iface
2015-10-10 12:43:03 -04:00
}
netOption := map [ string ] string {
bridge . BridgeName : bridgeName ,
bridge . DefaultBridge : strconv . FormatBool ( true ) ,
netlabel . DriverMTU : strconv . Itoa ( config . Mtu ) ,
2017-01-23 06:23:07 -05:00
bridge . EnableIPMasquerade : strconv . FormatBool ( config . BridgeConfig . EnableIPMasq ) ,
bridge . EnableICC : strconv . FormatBool ( config . BridgeConfig . InterContainerCommunication ) ,
2015-10-10 12:43:03 -04:00
}
// --ip processing
2017-01-23 06:23:07 -05:00
if config . BridgeConfig . DefaultIP != nil {
netOption [ bridge . DefaultBindingIP ] = config . BridgeConfig . DefaultIP . String ( )
2015-10-10 12:43:03 -04:00
}
2019-08-09 12:34:35 -04:00
ipamV4Conf := & libnetwork . IpamConf { AuxAddresses : make ( map [ string ] string ) }
2015-10-10 12:43:03 -04:00
2016-09-17 01:46:20 -04:00
nwList , nw6List , err := netutils . ElectInterfaceAddresses ( bridgeName )
if err != nil {
return errors . Wrap ( err , "list bridge addresses failed" )
}
nw := nwList [ 0 ]
2017-01-23 06:23:07 -05:00
if len ( nwList ) > 1 && config . BridgeConfig . FixedCIDR != "" {
_ , fCIDR , err := net . ParseCIDR ( config . BridgeConfig . FixedCIDR )
2016-09-17 01:46:20 -04:00
if err != nil {
return errors . Wrap ( err , "parse CIDR failed" )
2015-10-10 12:43:03 -04:00
}
2016-09-17 01:46:20 -04:00
// Iterate through in case there are multiple addresses for the bridge
for _ , entry := range nwList {
if fCIDR . Contains ( entry . IP ) {
nw = entry
break
}
}
}
ipamV4Conf . PreferredPool = lntypes . GetIPNetCanonical ( nw ) . String ( )
hip , _ := lntypes . GetHostPartIP ( nw . IP , nw . Mask )
if hip . IsGlobalUnicast ( ) {
ipamV4Conf . Gateway = nw . IP . String ( )
2015-05-15 19:34:26 -04:00
}
2017-01-23 06:23:07 -05:00
if config . BridgeConfig . IP != "" {
2020-02-10 20:34:30 -05:00
ip , ipNet , err := net . ParseCIDR ( config . BridgeConfig . IP )
2015-05-15 19:34:26 -04:00
if err != nil {
2015-06-30 13:34:15 -04:00
return err
2015-05-15 19:34:26 -04:00
}
2020-02-10 20:34:30 -05:00
ipamV4Conf . PreferredPool = ipNet . String ( )
2015-10-10 12:43:03 -04:00
ipamV4Conf . Gateway = ip . String ( )
2015-10-26 14:46:20 -04:00
} else if bridgeName == bridge . DefaultBridgeName && ipamV4Conf . PreferredPool != "" {
logrus . Infof ( "Default bridge (%s) is assigned with an IP address %s. Daemon option --bip can be used to set a preferred IP address" , bridgeName , ipamV4Conf . PreferredPool )
2015-05-15 19:34:26 -04:00
}
2017-01-23 06:23:07 -05:00
if config . BridgeConfig . FixedCIDR != "" {
_ , fCIDR , err := net . ParseCIDR ( config . BridgeConfig . FixedCIDR )
2015-05-15 19:34:26 -04:00
if err != nil {
2015-06-30 13:34:15 -04:00
return err
2015-05-15 19:34:26 -04:00
}
2015-10-10 12:43:03 -04:00
ipamV4Conf . SubPool = fCIDR . String ( )
2015-05-15 19:34:26 -04:00
}
2017-01-23 06:23:07 -05:00
if config . BridgeConfig . DefaultGatewayIPv4 != nil {
ipamV4Conf . AuxAddresses [ "DefaultGatewayIPv4" ] = config . BridgeConfig . DefaultGatewayIPv4 . String ( )
2015-10-10 12:43:03 -04:00
}
2019-08-09 12:34:35 -04:00
var (
deferIPv6Alloc bool
ipamV6Conf * libnetwork . IpamConf
)
2020-01-10 21:53:59 -05:00
if config . BridgeConfig . EnableIPv6 && config . BridgeConfig . FixedCIDRv6 == "" {
return errdefs . InvalidParameter ( errors . New ( "IPv6 is enabled for the default bridge, but no subnet is configured. Specify an IPv6 subnet using --fixed-cidr-v6" ) )
} else if config . BridgeConfig . FixedCIDRv6 != "" {
2017-01-23 06:23:07 -05:00
_ , fCIDRv6 , err := net . ParseCIDR ( config . BridgeConfig . FixedCIDRv6 )
2015-05-15 19:34:26 -04:00
if err != nil {
2015-06-30 13:34:15 -04:00
return err
2015-05-15 19:34:26 -04:00
}
2015-11-11 00:14:05 -05:00
// In case user has specified the daemon flag --fixed-cidr-v6 and the passed network has
// at least 48 host bits, we need to guarantee the current behavior where the containers'
// IPv6 addresses will be constructed based on the containers' interface MAC address.
// We do so by telling libnetwork to defer the IPv6 address allocation for the endpoints
// on this network until after the driver has created the endpoint and returned the
// constructed address. Libnetwork will then reserve this address with the ipam driver.
ones , _ := fCIDRv6 . Mask . Size ( )
deferIPv6Alloc = ones <= 80
2019-08-09 12:34:35 -04:00
ipamV6Conf = & libnetwork . IpamConf {
AuxAddresses : make ( map [ string ] string ) ,
PreferredPool : fCIDRv6 . String ( ) ,
2015-10-10 12:43:03 -04:00
}
2016-01-12 02:47:44 -05:00
// In case the --fixed-cidr-v6 is specified and the current docker0 bridge IPv6
// address belongs to the same network, we need to inform libnetwork about it, so
// that it can be reserved with IPAM and it will not be given away to somebody else
for _ , nw6 := range nw6List {
if fCIDRv6 . Contains ( nw6 . IP ) {
ipamV6Conf . Gateway = nw6 . IP . String ( )
break
}
}
2015-05-15 19:34:26 -04:00
}
2017-01-23 06:23:07 -05:00
if config . BridgeConfig . DefaultGatewayIPv6 != nil {
2015-10-10 12:43:03 -04:00
if ipamV6Conf == nil {
2015-12-30 17:51:51 -05:00
ipamV6Conf = & libnetwork . IpamConf { AuxAddresses : make ( map [ string ] string ) }
2015-10-10 12:43:03 -04:00
}
2017-01-23 06:23:07 -05:00
ipamV6Conf . AuxAddresses [ "DefaultGatewayIPv6" ] = config . BridgeConfig . DefaultGatewayIPv6 . String ( )
2015-05-15 19:34:26 -04:00
}
2016-01-12 02:47:44 -05:00
v4Conf := [ ] * libnetwork . IpamConf { ipamV4Conf }
2015-10-10 12:43:03 -04:00
v6Conf := [ ] * libnetwork . IpamConf { }
if ipamV6Conf != nil {
v6Conf = append ( v6Conf , ipamV6Conf )
2015-05-15 19:34:26 -04:00
}
// Initialize default network on "bridge" with the same name
2016-05-08 03:33:16 -04:00
_ , err = controller . NewNetwork ( "bridge" , "bridge" , "" ,
2017-01-23 06:23:07 -05:00
libnetwork . NetworkOptionEnableIPv6 ( config . BridgeConfig . EnableIPv6 ) ,
2015-12-10 09:02:50 -05:00
libnetwork . NetworkOptionDriverOpts ( netOption ) ,
2016-01-08 16:38:52 -05:00
libnetwork . NetworkOptionIpam ( "default" , "" , v4Conf , v6Conf , nil ) ,
2015-11-11 00:14:05 -05:00
libnetwork . NetworkOptionDeferIPv6Alloc ( deferIPv6Alloc ) )
2015-05-15 19:34:26 -04:00
if err != nil {
2015-06-30 13:34:15 -04:00
return fmt . Errorf ( "Error creating default \"bridge\" network: %v" , err )
2015-05-15 19:34:26 -04:00
}
2015-06-30 13:34:15 -04:00
return nil
2015-05-15 19:34:26 -04:00
}
2015-06-16 14:06:53 -04:00
2016-09-27 16:16:00 -04:00
// removeDefaultBridgeInterface deletes the host link named
// bridge.DefaultBridgeName if it exists (--bridge=none use case).
// A failed lookup is ignored; a failed deletion is only logged as a warning.
func removeDefaultBridgeInterface() {
	link, err := netlink.LinkByName(bridge.DefaultBridgeName)
	if err != nil {
		return
	}
	if delErr := netlink.LinkDel(link); delErr != nil {
		logrus.Warnf("Failed to remove bridge interface (%s): %v", bridge.DefaultBridgeName, delErr)
	}
}
2022-09-23 14:21:31 -04:00
func setupInitLayer ( idMapping idtools . IdentityMapping ) func ( string ) error {
return func ( initPath string ) error {
2017-11-16 01:20:33 -05:00
return initlayer . Setup ( initPath , idMapping . RootPair ( ) )
2018-02-13 14:29:14 -05:00
}
2016-09-21 14:45:25 -04:00
}
2016-01-07 22:43:11 -05:00
// Parse the remapped root (user namespace) option, which can be one of:
//
2022-07-08 12:27:07 -04:00
// - username - valid username from /etc/passwd
// - username:groupname - valid username; valid groupname from /etc/group
// - uid - 32-bit unsigned int valid Linux UID value
// - uid:gid - uid value; 32-bit unsigned int Linux GID value
2016-01-07 22:43:11 -05:00
//
2022-07-08 12:27:07 -04:00
// If no groupname is specified, and a username is specified, an attempt
// will be made to lookup a gid for that username as a groupname
//
// If names are used, they are verified to exist in passwd/group
2016-01-07 22:43:11 -05:00
func parseRemappedRoot ( usergrp string ) ( string , string , error ) {
var (
userID , groupID int
username , groupname string
)
idparts := strings . Split ( usergrp , ":" )
if len ( idparts ) > 2 {
return "" , "" , fmt . Errorf ( "Invalid user/group specification in --userns-remap: %q" , usergrp )
}
if uid , err := strconv . ParseInt ( idparts [ 0 ] , 10 , 32 ) ; err == nil {
// must be a uid; take it as valid
userID = int ( uid )
2016-10-20 15:43:42 -04:00
luser , err := idtools . LookupUID ( userID )
2016-01-07 22:43:11 -05:00
if err != nil {
return "" , "" , fmt . Errorf ( "Uid %d has no entry in /etc/passwd: %v" , userID , err )
}
username = luser . Name
if len ( idparts ) == 1 {
// if the uid was numeric and no gid was specified, take the uid as the gid
groupID = userID
2016-10-20 15:43:42 -04:00
lgrp , err := idtools . LookupGID ( groupID )
2016-01-07 22:43:11 -05:00
if err != nil {
return "" , "" , fmt . Errorf ( "Gid %d has no entry in /etc/group: %v" , groupID , err )
}
groupname = lgrp . Name
}
} else {
lookupName := idparts [ 0 ]
// special case: if the user specified "default", they want Docker to create or
// use (after creation) the "dockremap" user/group for root remapping
if lookupName == defaultIDSpecifier {
lookupName = defaultRemappedID
}
2016-10-20 15:43:42 -04:00
luser , err := idtools . LookupUser ( lookupName )
2016-01-07 22:43:11 -05:00
if err != nil && idparts [ 0 ] != defaultIDSpecifier {
// error if the name requested isn't the special "dockremap" ID
return "" , "" , fmt . Errorf ( "Error during uid lookup for %q: %v" , lookupName , err )
} else if err != nil {
// special case-- if the username == "default", then we have been asked
// to create a new entry pair in /etc/{passwd,group} for which the /etc/sub{uid,gid}
// ranges will be used for the user and group mappings in user namespaced containers
_ , _ , err := idtools . AddNamespaceRangesUser ( defaultRemappedID )
if err == nil {
return defaultRemappedID , defaultRemappedID , nil
}
return "" , "" , fmt . Errorf ( "Error during %q user creation: %v" , defaultRemappedID , err )
}
username = luser . Name
if len ( idparts ) == 1 {
// we only have a string username, and no group specified; look up gid from username as group
2016-10-20 15:43:42 -04:00
group , err := idtools . LookupGroup ( lookupName )
2016-01-07 22:43:11 -05:00
if err != nil {
return "" , "" , fmt . Errorf ( "Error during gid lookup for %q: %v" , lookupName , err )
}
groupname = group . Name
}
}
if len ( idparts ) == 2 {
// groupname or gid is separately specified and must be resolved
2016-03-24 11:57:11 -04:00
// to an unsigned 32-bit gid
2016-01-07 22:43:11 -05:00
if gid , err := strconv . ParseInt ( idparts [ 1 ] , 10 , 32 ) ; err == nil {
// must be a gid, take it as valid
groupID = int ( gid )
2016-10-20 15:43:42 -04:00
lgrp , err := idtools . LookupGID ( groupID )
2016-01-07 22:43:11 -05:00
if err != nil {
return "" , "" , fmt . Errorf ( "Gid %d has no entry in /etc/passwd: %v" , groupID , err )
}
groupname = lgrp . Name
} else {
// not a number; attempt a lookup
2016-10-20 15:43:42 -04:00
if _ , err := idtools . LookupGroup ( idparts [ 1 ] ) ; err != nil {
2016-03-16 22:43:26 -04:00
return "" , "" , fmt . Errorf ( "Error during groupname lookup for %q: %v" , idparts [ 1 ] , err )
2016-01-07 22:43:11 -05:00
}
groupname = idparts [ 1 ]
}
}
return username , groupname , nil
}
2022-03-14 15:24:29 -04:00
func setupRemappedRoot ( config * config . Config ) ( idtools . IdentityMapping , error ) {
2016-01-07 22:43:11 -05:00
if runtime . GOOS != "linux" && config . RemappedRoot != "" {
2022-03-14 15:24:29 -04:00
return idtools . IdentityMapping { } , fmt . Errorf ( "User namespaces are only supported on Linux" )
2016-01-07 22:43:11 -05:00
}
// if the daemon was started with remapped root option, parse
// the config option to the int uid,gid values
if config . RemappedRoot != "" {
username , groupname , err := parseRemappedRoot ( config . RemappedRoot )
if err != nil {
2022-03-14 15:24:29 -04:00
return idtools . IdentityMapping { } , err
2016-01-07 22:43:11 -05:00
}
if username == "root" {
// Cannot setup user namespaces with a 1-to-1 mapping; "--root=0:0" is a no-op
// effectively
2016-06-11 13:42:38 -04:00
logrus . Warn ( "User namespaces: root cannot be remapped with itself; user namespaces are OFF" )
2022-03-14 15:24:29 -04:00
return idtools . IdentityMapping { } , nil
2016-01-07 22:43:11 -05:00
}
2020-05-24 09:29:06 -04:00
logrus . Infof ( "User namespaces: ID ranges will be mapped to subuid/subgid ranges of: %s" , username )
2016-01-07 22:43:11 -05:00
// update remapped root setting now that we have resolved them to actual names
config . RemappedRoot = fmt . Sprintf ( "%s:%s" , username , groupname )
2022-03-14 15:24:29 -04:00
mappings , err := idtools . LoadIdentityMapping ( username )
2020-05-24 09:29:06 -04:00
if err != nil {
2022-03-14 15:24:29 -04:00
return idtools . IdentityMapping { } , errors . Wrap ( err , "Can't create ID mappings" )
2016-01-07 22:43:11 -05:00
}
2020-05-24 09:29:06 -04:00
return mappings , nil
2016-01-07 22:43:11 -05:00
}
2022-03-14 15:24:29 -04:00
return idtools . IdentityMapping { } , nil
2016-01-07 22:43:11 -05:00
}
2020-10-06 15:43:24 -04:00
func setupDaemonRoot ( config * config . Config , rootDir string , remappedRoot idtools . Identity ) error {
2016-01-07 22:43:11 -05:00
config . Root = rootDir
2016-03-16 04:24:03 -04:00
// the docker root metadata directory needs to have execute permissions for all users (g+x,o+x)
2016-01-07 22:43:11 -05:00
// so that syscalls executing as non-root, operating on subdirectories of the graph root
// (e.g. mounted layers of a container) can traverse this path.
// The user namespace support will create subdirectories for the remapped root host uid:gid
// pair owned by that same uid:gid pair for proper write access to those needed metadata and
// layer content subtrees.
if _ , err := os . Stat ( rootDir ) ; err == nil {
// root current exists; verify the access bits are correct by setting them
2016-03-16 04:24:03 -04:00
if err = os . Chmod ( rootDir , 0711 ) ; err != nil {
2016-01-07 22:43:11 -05:00
return err
}
} else if os . IsNotExist ( err ) {
2016-03-16 04:24:03 -04:00
// no root exists yet, create it 0711 with root:root ownership
if err := os . MkdirAll ( rootDir , 0711 ) ; err != nil {
2016-01-07 22:43:11 -05:00
return err
}
}
2021-07-02 13:27:45 -04:00
id := idtools . Identity { UID : idtools . CurrentIdentity ( ) . UID , GID : remappedRoot . GID }
// First make sure the current root dir has the correct perms.
if err := idtools . MkdirAllAndChown ( config . Root , 0710 , id ) ; err != nil {
return errors . Wrapf ( err , "could not create or set daemon root permissions: %s" , config . Root )
}
2016-01-07 22:43:11 -05:00
// if user namespaces are enabled we will create a subtree underneath the specified root
// with any/all specified remapped root uid/gid options on the daemon creating
// a new subdirectory with ownership set to the remapped uid/gid (so as to allow
// `chdir()` to work for containers namespaced to that uid/gid)
if config . RemappedRoot != "" {
2020-10-06 15:43:24 -04:00
config . Root = filepath . Join ( rootDir , fmt . Sprintf ( "%d.%d" , remappedRoot . UID , remappedRoot . GID ) )
2016-01-07 22:43:11 -05:00
logrus . Debugf ( "Creating user namespaced daemon root: %s" , config . Root )
2016-03-24 11:57:11 -04:00
// Create the root directory if it doesn't exist
2021-07-02 13:27:45 -04:00
if err := idtools . MkdirAllAndChown ( config . Root , 0710 , id ) ; err != nil {
2016-01-07 22:43:11 -05:00
return fmt . Errorf ( "Cannot create daemon root: %s: %v" , config . Root , err )
}
2016-08-23 12:49:13 -04:00
// we also need to verify that any pre-existing directories in the path to
// the graphroot won't block access to remapped root--if any pre-existing directory
// has strict permissions that don't allow "x", container start will fail, so
// better to warn and fail now
dirPath := config . Root
for {
dirPath = filepath . Dir ( dirPath )
if dirPath == "/" {
break
}
2020-10-06 15:43:24 -04:00
if ! idtools . CanAccess ( dirPath , remappedRoot ) {
2017-08-17 15:16:30 -04:00
return fmt . Errorf ( "a subdirectory in your graphroot path (%s) restricts access to the remapped root uid/gid; please fix by allowing 'o+x' permissions on existing directories" , config . Root )
2016-08-23 12:49:13 -04:00
}
}
2016-01-07 22:43:11 -05:00
}
2018-01-23 14:08:55 -05:00
2018-04-17 11:30:39 -04:00
if err := setupDaemonRootPropagation ( config ) ; err != nil {
logrus . WithError ( err ) . WithField ( "dir" , config . Root ) . Warn ( "Error while setting daemon root propagation, this is not generally critical but may cause some functionality to not work or fallback to less desirable behavior" )
}
return nil
}
func setupDaemonRootPropagation ( cfg * config . Config ) error {
2019-08-09 08:10:07 -04:00
rootParentMount , mountOptions , err := getSourceMount ( cfg . Root )
2018-04-17 11:30:39 -04:00
if err != nil {
return errors . Wrap ( err , "error getting daemon root's parent mount" )
}
var cleanupOldFile bool
cleanupFile := getUnmountOnShutdownPath ( cfg )
defer func ( ) {
if ! cleanupOldFile {
return
2018-01-23 14:08:55 -05:00
}
2018-04-17 11:30:39 -04:00
if err := os . Remove ( cleanupFile ) ; err != nil && ! os . IsNotExist ( err ) {
logrus . WithError ( err ) . WithField ( "file" , cleanupFile ) . Warn ( "could not clean up old root propagation unmount file" )
}
} ( )
2019-08-09 08:10:07 -04:00
if hasMountInfoOption ( mountOptions , sharedPropagationOption , slavePropagationOption ) {
2018-04-17 11:30:39 -04:00
cleanupOldFile = true
return nil
}
if err := mount . MakeShared ( cfg . Root ) ; err != nil {
return errors . Wrap ( err , "could not setup daemon root propagation to shared" )
}
// check the case where this may have already been a mount to itself.
// If so then the daemon only performed a remount and should not try to unmount this later.
if rootParentMount == cfg . Root {
cleanupOldFile = true
return nil
}
2019-07-11 16:30:36 -04:00
if err := os . MkdirAll ( filepath . Dir ( cleanupFile ) , 0700 ) ; err != nil {
return errors . Wrap ( err , "error creating dir to store mount cleanup file" )
}
2021-08-24 06:10:50 -04:00
if err := os . WriteFile ( cleanupFile , nil , 0600 ) ; err != nil {
2018-04-17 11:30:39 -04:00
return errors . Wrap ( err , "error writing file to signal mount cleanup on shutdown" )
2018-01-23 14:08:55 -05:00
}
2016-01-07 22:43:11 -05:00
return nil
}
2018-04-17 11:30:39 -04:00
// getUnmountOnShutdownPath returns the path of the file whose presence signals
// to the daemon that the daemon root should be unmounted on shutdown. The file
// lives directly under config.ExecRoot.
func getUnmountOnShutdownPath(config *config.Config) string {
	const markerName = "unmount-on-shutdown"
	return filepath.Join(config.ExecRoot, markerName)
}
2015-07-30 17:01:53 -04:00
// registerLinks writes the links to a file.
2015-12-18 13:36:17 -05:00
func ( daemon * Daemon ) registerLinks ( container * container . Container , hostConfig * containertypes . HostConfig ) error {
2016-01-05 14:20:47 -05:00
if hostConfig == nil || hostConfig . NetworkMode . IsUserDefined ( ) {
2015-06-23 13:13:42 -04:00
return nil
}
for _ , l := range hostConfig . Links {
2016-12-23 14:09:12 -05:00
name , alias , err := opts . ParseLink ( l )
2015-06-23 13:13:42 -04:00
if err != nil {
return err
}
2015-12-11 12:39:28 -05:00
child , err := daemon . GetContainer ( name )
2015-06-23 13:13:42 -04:00
if err != nil {
2019-09-02 17:39:24 -04:00
if errdefs . IsNotFound ( err ) {
// Trying to link to a non-existing container is not valid, and
// should return an "invalid parameter" error. Returning a "not
// found" error here would make the client report the container's
// image could not be found (see moby/moby#39823)
err = errdefs . InvalidParameter ( err )
}
2017-07-19 10:20:13 -04:00
return errors . Wrapf ( err , "could not get container for %s" , name )
2015-06-23 13:13:42 -04:00
}
2015-11-12 14:55:17 -05:00
for child . HostConfig . NetworkMode . IsContainer ( ) {
parts := strings . SplitN ( string ( child . HostConfig . NetworkMode ) , ":" , 2 )
2015-12-11 12:39:28 -05:00
child , err = daemon . GetContainer ( parts [ 1 ] )
2015-06-23 13:13:42 -04:00
if err != nil {
2019-09-02 17:39:24 -04:00
if errdefs . IsNotFound ( err ) {
// Trying to link to a non-existing container is not valid, and
// should return an "invalid parameter" error. Returning a "not
// found" error here would make the client report the container's
// image could not be found (see moby/moby#39823)
err = errdefs . InvalidParameter ( err )
}
2017-07-19 10:20:13 -04:00
return errors . Wrapf ( err , "Could not get container for %s" , parts [ 1 ] )
2015-06-23 13:13:42 -04:00
}
}
2015-11-12 14:55:17 -05:00
if child . HostConfig . NetworkMode . IsHost ( ) {
2015-06-23 13:13:42 -04:00
return runconfig . ErrConflictHostNetworkAndLinks
}
2015-07-30 17:01:53 -04:00
if err := daemon . registerLink ( container , child , alias ) ; err != nil {
2015-06-23 13:13:42 -04:00
return err
}
}
// After we load all the links into the daemon
// set them to nil on the hostconfig
2017-04-06 13:43:10 -04:00
_ , err := container . WriteHostConfig ( )
return err
2015-06-23 13:13:42 -04:00
}
2015-07-16 17:14:58 -04:00
2015-11-02 20:06:09 -05:00
// conditionalMountOnStart is a platform specific helper function during the
// container start to call mount.
2015-11-12 14:55:17 -05:00
func ( daemon * Daemon ) conditionalMountOnStart ( container * container . Container ) error {
2015-11-02 20:06:09 -05:00
return daemon . Mount ( container )
}
// conditionalUnmountOnCleanup is a platform specific helper function called
// during the cleanup of a container to unmount.
2016-03-18 14:50:19 -04:00
func ( daemon * Daemon ) conditionalUnmountOnCleanup ( container * container . Container ) error {
return daemon . Unmount ( container )
2015-11-02 20:06:09 -05:00
}
2019-11-01 11:18:06 -04:00
func copyBlkioEntry ( entries [ ] * statsV1 . BlkIOEntry ) [ ] types . BlkioStatEntry {
2017-09-22 09:52:41 -04:00
out := make ( [ ] types . BlkioStatEntry , len ( entries ) )
for i , re := range entries {
out [ i ] = types . BlkioStatEntry {
Major : re . Major ,
Minor : re . Minor ,
Op : re . Op ,
Value : re . Value ,
}
}
return out
}
2016-03-18 14:50:19 -04:00
func ( daemon * Daemon ) stats ( c * container . Container ) ( * types . StatsJSON , error ) {
2022-05-10 15:59:00 -04:00
c . Lock ( )
task , err := c . GetRunningTask ( )
c . Unlock ( )
if err != nil {
return nil , err
2016-03-18 14:50:19 -04:00
}
2022-05-10 15:59:00 -04:00
cs , err := task . Stats ( context . Background ( ) )
2016-03-18 14:50:19 -04:00
if err != nil {
2017-07-07 03:33:45 -04:00
if strings . Contains ( err . Error ( ) , "container not found" ) {
2017-07-19 10:20:13 -04:00
return nil , containerNotFound ( c . ID )
2017-07-07 03:33:45 -04:00
}
2016-03-18 14:50:19 -04:00
return nil , err
}
s := & types . StatsJSON { }
2017-09-22 09:52:41 -04:00
s . Read = cs . Read
stats := cs . Metrics
2020-03-09 17:40:34 -04:00
switch t := stats . ( type ) {
case * statsV1 . Metrics :
return daemon . statsV1 ( s , t )
case * statsV2 . Metrics :
return daemon . statsV2 ( s , t )
default :
return nil , errors . Errorf ( "unexpected type of metrics %+v" , t )
}
}
func ( daemon * Daemon ) statsV1 ( s * types . StatsJSON , stats * statsV1 . Metrics ) ( * types . StatsJSON , error ) {
2017-09-22 09:52:41 -04:00
if stats . Blkio != nil {
2016-03-18 14:50:19 -04:00
s . BlkioStats = types . BlkioStats {
2017-09-22 09:52:41 -04:00
IoServiceBytesRecursive : copyBlkioEntry ( stats . Blkio . IoServiceBytesRecursive ) ,
IoServicedRecursive : copyBlkioEntry ( stats . Blkio . IoServicedRecursive ) ,
IoQueuedRecursive : copyBlkioEntry ( stats . Blkio . IoQueuedRecursive ) ,
IoServiceTimeRecursive : copyBlkioEntry ( stats . Blkio . IoServiceTimeRecursive ) ,
IoWaitTimeRecursive : copyBlkioEntry ( stats . Blkio . IoWaitTimeRecursive ) ,
IoMergedRecursive : copyBlkioEntry ( stats . Blkio . IoMergedRecursive ) ,
IoTimeRecursive : copyBlkioEntry ( stats . Blkio . IoTimeRecursive ) ,
SectorsRecursive : copyBlkioEntry ( stats . Blkio . SectorsRecursive ) ,
}
}
if stats . CPU != nil {
2016-03-18 14:50:19 -04:00
s . CPUStats = types . CPUStats {
CPUUsage : types . CPUUsage {
2017-09-22 09:52:41 -04:00
TotalUsage : stats . CPU . Usage . Total ,
PercpuUsage : stats . CPU . Usage . PerCPU ,
UsageInKernelmode : stats . CPU . Usage . Kernel ,
UsageInUsermode : stats . CPU . Usage . User ,
2016-03-18 14:50:19 -04:00
} ,
ThrottlingData : types . ThrottlingData {
2017-09-22 09:52:41 -04:00
Periods : stats . CPU . Throttling . Periods ,
ThrottledPeriods : stats . CPU . Throttling . ThrottledPeriods ,
ThrottledTime : stats . CPU . Throttling . ThrottledTime ,
2016-03-18 14:50:19 -04:00
} ,
}
2017-09-22 09:52:41 -04:00
}
if stats . Memory != nil {
2021-07-11 08:16:13 -04:00
raw := map [ string ] uint64 {
"cache" : stats . Memory . Cache ,
"rss" : stats . Memory . RSS ,
"rss_huge" : stats . Memory . RSSHuge ,
"mapped_file" : stats . Memory . MappedFile ,
"dirty" : stats . Memory . Dirty ,
"writeback" : stats . Memory . Writeback ,
"pgpgin" : stats . Memory . PgPgIn ,
"pgpgout" : stats . Memory . PgPgOut ,
"pgfault" : stats . Memory . PgFault ,
"pgmajfault" : stats . Memory . PgMajFault ,
"inactive_anon" : stats . Memory . InactiveAnon ,
"active_anon" : stats . Memory . ActiveAnon ,
"inactive_file" : stats . Memory . InactiveFile ,
"active_file" : stats . Memory . ActiveFile ,
"unevictable" : stats . Memory . Unevictable ,
"hierarchical_memory_limit" : stats . Memory . HierarchicalMemoryLimit ,
"hierarchical_memsw_limit" : stats . Memory . HierarchicalSwapLimit ,
"total_cache" : stats . Memory . TotalCache ,
"total_rss" : stats . Memory . TotalRSS ,
"total_rss_huge" : stats . Memory . TotalRSSHuge ,
"total_mapped_file" : stats . Memory . TotalMappedFile ,
"total_dirty" : stats . Memory . TotalDirty ,
"total_writeback" : stats . Memory . TotalWriteback ,
"total_pgpgin" : stats . Memory . TotalPgPgIn ,
"total_pgpgout" : stats . Memory . TotalPgPgOut ,
"total_pgfault" : stats . Memory . TotalPgFault ,
"total_pgmajfault" : stats . Memory . TotalPgMajFault ,
"total_inactive_anon" : stats . Memory . TotalInactiveAnon ,
"total_active_anon" : stats . Memory . TotalActiveAnon ,
"total_inactive_file" : stats . Memory . TotalInactiveFile ,
"total_active_file" : stats . Memory . TotalActiveFile ,
"total_unevictable" : stats . Memory . TotalUnevictable ,
}
2017-09-22 09:52:41 -04:00
if stats . Memory . Usage != nil {
s . MemoryStats = types . MemoryStats {
Stats : raw ,
Usage : stats . Memory . Usage . Usage ,
MaxUsage : stats . Memory . Usage . Max ,
Limit : stats . Memory . Usage . Limit ,
Failcnt : stats . Memory . Usage . Failcnt ,
}
} else {
s . MemoryStats = types . MemoryStats {
Stats : raw ,
}
2016-04-07 22:09:07 -04:00
}
2017-09-22 09:52:41 -04:00
2016-04-07 22:09:07 -04:00
// if the container does not set memory limit, use the machineMemory
2017-09-22 09:52:41 -04:00
if s . MemoryStats . Limit > daemon . machineMemory && daemon . machineMemory > 0 {
2017-01-04 12:01:59 -05:00
s . MemoryStats . Limit = daemon . machineMemory
2016-03-18 14:50:19 -04:00
}
}
2017-09-22 09:52:41 -04:00
if stats . Pids != nil {
s . PidsStats = types . PidsStats {
Current : stats . Pids . Current ,
Limit : stats . Pids . Limit ,
}
2016-07-10 14:11:27 -04:00
}
2017-09-22 09:52:41 -04:00
2016-03-18 14:50:19 -04:00
return s , nil
}
2020-03-09 17:40:34 -04:00
func ( daemon * Daemon ) statsV2 ( s * types . StatsJSON , stats * statsV2 . Metrics ) ( * types . StatsJSON , error ) {
if stats . Io != nil {
var isbr [ ] types . BlkioStatEntry
for _ , re := range stats . Io . Usage {
isbr = append ( isbr ,
types . BlkioStatEntry {
Major : re . Major ,
Minor : re . Minor ,
Op : "read" ,
Value : re . Rbytes ,
} ,
types . BlkioStatEntry {
Major : re . Major ,
Minor : re . Minor ,
Op : "write" ,
Value : re . Wbytes ,
} ,
)
}
s . BlkioStats = types . BlkioStats {
IoServiceBytesRecursive : isbr ,
// Other fields are unsupported
}
}
if stats . CPU != nil {
s . CPUStats = types . CPUStats {
CPUUsage : types . CPUUsage {
TotalUsage : stats . CPU . UsageUsec * 1000 ,
// PercpuUsage is not supported
UsageInKernelmode : stats . CPU . SystemUsec * 1000 ,
UsageInUsermode : stats . CPU . UserUsec * 1000 ,
} ,
ThrottlingData : types . ThrottlingData {
Periods : stats . CPU . NrPeriods ,
ThrottledPeriods : stats . CPU . NrThrottled ,
ThrottledTime : stats . CPU . ThrottledUsec * 1000 ,
} ,
}
}
if stats . Memory != nil {
s . MemoryStats = types . MemoryStats {
// Stats is not compatible with v1
2021-07-11 08:16:13 -04:00
Stats : map [ string ] uint64 {
"anon" : stats . Memory . Anon ,
"file" : stats . Memory . File ,
"kernel_stack" : stats . Memory . KernelStack ,
"slab" : stats . Memory . Slab ,
"sock" : stats . Memory . Sock ,
"shmem" : stats . Memory . Shmem ,
"file_mapped" : stats . Memory . FileMapped ,
"file_dirty" : stats . Memory . FileDirty ,
"file_writeback" : stats . Memory . FileWriteback ,
"anon_thp" : stats . Memory . AnonThp ,
"inactive_anon" : stats . Memory . InactiveAnon ,
"active_anon" : stats . Memory . ActiveAnon ,
"inactive_file" : stats . Memory . InactiveFile ,
"active_file" : stats . Memory . ActiveFile ,
"unevictable" : stats . Memory . Unevictable ,
"slab_reclaimable" : stats . Memory . SlabReclaimable ,
"slab_unreclaimable" : stats . Memory . SlabUnreclaimable ,
"pgfault" : stats . Memory . Pgfault ,
"pgmajfault" : stats . Memory . Pgmajfault ,
"workingset_refault" : stats . Memory . WorkingsetRefault ,
"workingset_activate" : stats . Memory . WorkingsetActivate ,
"workingset_nodereclaim" : stats . Memory . WorkingsetNodereclaim ,
"pgrefill" : stats . Memory . Pgrefill ,
"pgscan" : stats . Memory . Pgscan ,
"pgsteal" : stats . Memory . Pgsteal ,
"pgactivate" : stats . Memory . Pgactivate ,
"pgdeactivate" : stats . Memory . Pgdeactivate ,
"pglazyfree" : stats . Memory . Pglazyfree ,
"pglazyfreed" : stats . Memory . Pglazyfreed ,
"thp_fault_alloc" : stats . Memory . ThpFaultAlloc ,
"thp_collapse_alloc" : stats . Memory . ThpCollapseAlloc ,
} ,
2020-03-09 17:40:34 -04:00
Usage : stats . Memory . Usage ,
// MaxUsage is not supported
Limit : stats . Memory . UsageLimit ,
}
// if the container does not set memory limit, use the machineMemory
if s . MemoryStats . Limit > daemon . machineMemory && daemon . machineMemory > 0 {
s . MemoryStats . Limit = daemon . machineMemory
}
2020-07-24 08:06:29 -04:00
if stats . MemoryEvents != nil {
// Failcnt is set to the "oom" field of the "memory.events" file.
// See https://www.kernel.org/doc/html/latest/admin-guide/cgroup-v2.html
s . MemoryStats . Failcnt = stats . MemoryEvents . Oom
}
2020-03-09 17:40:34 -04:00
}
if stats . Pids != nil {
s . PidsStats = types . PidsStats {
Current : stats . Pids . Current ,
Limit : stats . Pids . Limit ,
}
}
return s , nil
}
2016-03-24 11:57:11 -04:00
// setDefaultIsolation determines the default isolation mode for the
2016-03-18 14:50:19 -04:00
// daemon to run in. This is only applicable on Windows
func ( daemon * Daemon ) setDefaultIsolation ( ) error {
return nil
}
2016-03-21 12:56:51 -04:00
2016-07-11 18:26:23 -04:00
// setupDaemonProcess sets various settings for the daemon's process
2017-01-23 06:23:07 -05:00
func setupDaemonProcess ( config * config . Config ) error {
2016-07-11 18:26:23 -04:00
// setup the daemons oom_score_adj
2017-09-18 09:26:34 -04:00
if err := setupOOMScoreAdj ( config . OOMScoreAdjust ) ; err != nil {
return err
}
2017-10-11 13:27:08 -04:00
if err := setMayDetachMounts ( ) ; err != nil {
logrus . WithError ( err ) . Warn ( "Could not set may_detach_mounts kernel parameter" )
}
return nil
2017-09-18 09:26:34 -04:00
}
// This is used to allow removal of mountpoints that may be mounted in other
// namespaces on RHEL based kernels starting from RHEL 7.4.
// Without this setting, removals on these RHEL based kernels may fail with
// "device or resource busy".
// This setting is not available in upstream kernels as it is not configurable,
// but has been in the upstream kernels since 3.15.
func setMayDetachMounts ( ) error {
f , err := os . OpenFile ( "/proc/sys/fs/may_detach_mounts" , os . O_WRONLY , 0 )
if err != nil {
if os . IsNotExist ( err ) {
return nil
}
return errors . Wrap ( err , "error opening may_detach_mounts kernel config file" )
}
defer f . Close ( )
_ , err = f . WriteString ( "1" )
if os . IsPermission ( err ) {
// Setting may_detach_mounts does not work in an
// unprivileged container. Ignore the error, but log
// it if we appear not to be in that situation.
2021-06-18 05:01:24 -04:00
if ! userns . RunningInUserNS ( ) {
2017-09-18 09:26:34 -04:00
logrus . Debugf ( "Permission denied writing %q to /proc/sys/fs/may_detach_mounts" , "1" )
}
return nil
}
return err
2016-07-11 18:26:23 -04:00
}
func setupOOMScoreAdj ( score int ) error {
2020-10-05 11:50:23 -04:00
if score == 0 {
return nil
}
2016-07-11 18:26:23 -04:00
f , err := os . OpenFile ( "/proc/self/oom_score_adj" , os . O_WRONLY , 0 )
if err != nil {
return err
}
2016-11-18 02:56:52 -05:00
defer f . Close ( )
2016-09-21 03:36:36 -04:00
stringScore := strconv . Itoa ( score )
_ , err = f . WriteString ( stringScore )
2016-09-18 22:27:10 -04:00
if os . IsPermission ( err ) {
// Setting oom_score_adj does not work in an
2016-09-21 03:36:36 -04:00
// unprivileged container. Ignore the error, but log
// it if we appear not to be in that situation.
2021-06-18 05:01:24 -04:00
if ! userns . RunningInUserNS ( ) {
2016-09-21 03:36:36 -04:00
logrus . Debugf ( "Permission denied writing %q to /proc/self/oom_score_adj" , stringScore )
}
2016-09-18 22:27:10 -04:00
return nil
}
2016-11-18 02:56:52 -05:00
2016-07-11 18:26:23 -04:00
return err
}
2016-06-07 15:05:43 -04:00
2020-05-22 18:05:13 -04:00
func ( daemon * Daemon ) initCPURtController ( mnt , path string ) error {
2016-06-07 15:05:43 -04:00
if path == "/" || path == "." {
return nil
}
2017-01-03 08:54:30 -05:00
// Recursively create cgroup to ensure that the system and all parent cgroups have values set
// for the period and runtime as this limits what the children can be set to.
2020-05-22 18:05:13 -04:00
if err := daemon . initCPURtController ( mnt , filepath . Dir ( path ) ) ; err != nil {
2016-06-07 15:05:43 -04:00
return err
}
2020-05-22 18:05:13 -04:00
path = filepath . Join ( mnt , path )
if err := os . MkdirAll ( path , 0755 ) ; err != nil {
2017-02-28 05:12:06 -05:00
return err
2016-06-07 15:05:43 -04:00
}
2020-05-22 18:05:13 -04:00
if err := maybeCreateCPURealTimeFile ( daemon . configStore . CPURealtimePeriod , "cpu.rt_period_us" , path ) ; err != nil {
return err
}
return maybeCreateCPURealTimeFile ( daemon . configStore . CPURealtimeRuntime , "cpu.rt_runtime_us" , path )
2017-02-28 05:12:06 -05:00
}
2020-05-22 18:05:13 -04:00
// maybeCreateCPURealTimeFile writes configValue, formatted as a base-10
// integer, to the file named file inside the directory path (mode 0700 if the
// file has to be created). When configValue is 0 nothing is written and nil is
// returned.
func maybeCreateCPURealTimeFile(configValue int64, file string, path string) error {
	if configValue == 0 {
		return nil
	}

	target := filepath.Join(path, file)
	data := strconv.AppendInt(nil, configValue, 10)
	return os.WriteFile(target, data, 0700)
}
2016-06-07 15:05:43 -04:00
2016-09-02 09:20:54 -04:00
func ( daemon * Daemon ) setupSeccompProfile ( ) error {
2021-07-07 07:09:54 -04:00
switch profile := daemon . configStore . SeccompProfile ; profile {
case "" , config . SeccompProfileDefault :
daemon . seccompProfilePath = config . SeccompProfileDefault
case config . SeccompProfileUnconfined :
daemon . seccompProfilePath = config . SeccompProfileUnconfined
default :
daemon . seccompProfilePath = profile
2021-08-24 06:10:50 -04:00
b , err := os . ReadFile ( profile )
2021-07-07 07:09:54 -04:00
if err != nil {
return fmt . Errorf ( "opening seccomp profile (%s) failed: %v" , profile , err )
2016-09-02 09:20:54 -04:00
}
2021-07-07 07:09:54 -04:00
daemon . seccompProfile = b
2016-09-02 09:20:54 -04:00
}
2016-06-07 15:05:43 -04:00
return nil
}
2019-11-05 02:10:19 -05:00
2022-06-03 11:35:23 -04:00
func getSysInfo ( daemon * Daemon ) * sysinfo . SysInfo {
2021-06-05 15:09:59 -04:00
var siOpts [ ] sysinfo . Opt
2020-03-10 08:09:25 -04:00
if daemon . getCgroupDriver ( ) == cgroupSystemdDriver {
2021-06-05 15:09:59 -04:00
if euid := os . Getenv ( "ROOTLESSKIT_PARENT_EUID" ) ; euid != "" {
siOpts = append ( siOpts , sysinfo . WithCgroup2GroupPath ( "/user.slice/user-" + euid + ".slice" ) )
2020-03-10 08:09:25 -04:00
}
}
2022-06-03 11:35:23 -04:00
return sysinfo . New ( siOpts ... )
2020-09-19 12:45:41 -04:00
}
2021-02-26 18:23:55 -05:00
// initLibcontainerd creates the daemon's libcontainerd client with
// remote.NewClient and stores it in daemon.containerd. The client is given the
// "containerd" directory under the configured ExecRoot, the configured
// containerd namespace, and the daemon itself as the final argument.
//
// The value returned by remote.NewClient is stored even when it also returns
// an error; that error is passed back to the caller unchanged.
func (daemon *Daemon) initLibcontainerd(ctx context.Context) error {
	containerdDir := filepath.Join(daemon.configStore.ExecRoot, "containerd")
	namespace := daemon.configStore.ContainerdNamespace

	client, err := remote.NewClient(ctx, daemon.containerdCli, containerdDir, namespace, daemon)
	daemon.containerd = client
	return err
}
daemon: load and cache sysInfo on initialization
The `daemon.RawSysInfo()` function can be a heavy operation, as it collects
information about all cgroups on the host, networking, AppArmor, Seccomp, etc.
While looking at our code, I noticed that various parts in the code call this
function, potentially even _multiple times_ per container, for example, it is
called from:
- `verifyPlatformContainerSettings()`
- `oci.WithCgroups()` if the daemon has `cpu-rt-period` or `cpu-rt-runtime` configured
- in `ContainerDecoder.DecodeConfig()`, which is called on both `container create` and `container commit`
Given that this information is not expected to change during the daemon's
lifecycle, and various information coming from this (such as seccomp and
apparmor status) was already cached, we may as well load it once, and cache
the results in the daemon instance.
This patch updates `daemon.RawSysInfo()` to use a `sync.Once()` so that
it's only executed once for the daemon's lifecycle.
Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2022-01-07 06:54:47 -05:00
// recursiveUnmount passes target to mount.RecursiveUnmount and returns the
// error from that call unchanged.
func recursiveUnmount(target string) error {
	return mount.RecursiveUnmount(target)
}