2018-02-05 16:05:59 -05:00
package daemon // import "github.com/docker/docker/daemon"
2016-03-18 14:50:19 -04:00
import (
2019-04-09 16:51:40 -04:00
"context"
2016-03-18 14:50:19 -04:00
"fmt"
2020-02-10 00:37:22 -05:00
"io/ioutil"
2016-03-18 14:50:19 -04:00
"os"
2016-06-27 17:38:47 -04:00
"os/exec"
2016-03-18 14:50:19 -04:00
"path/filepath"
2016-04-26 04:20:17 -04:00
"sort"
2016-03-18 14:50:19 -04:00
"strconv"
"strings"
2020-11-09 09:00:32 -05:00
cdcgroups "github.com/containerd/cgroups"
2019-04-09 16:51:40 -04:00
"github.com/containerd/containerd/containers"
coci "github.com/containerd/containerd/oci"
2020-06-15 07:06:08 -04:00
"github.com/containerd/containerd/sys"
2016-09-06 14:18:12 -04:00
containertypes "github.com/docker/docker/api/types/container"
2016-03-18 14:50:19 -04:00
"github.com/docker/docker/container"
2017-04-10 05:25:15 -04:00
daemonconfig "github.com/docker/docker/daemon/config"
2016-03-18 14:50:19 -04:00
"github.com/docker/docker/oci"
2018-12-16 10:11:37 -05:00
"github.com/docker/docker/oci/caps"
2016-03-18 14:50:19 -04:00
"github.com/docker/docker/pkg/idtools"
2019-08-29 03:56:37 -04:00
"github.com/docker/docker/pkg/stringid"
2018-10-15 03:52:53 -04:00
"github.com/docker/docker/rootless/specconv"
2018-04-17 16:50:28 -04:00
volumemounts "github.com/docker/docker/volume/mounts"
2020-03-13 19:38:24 -04:00
"github.com/moby/sys/mount"
"github.com/moby/sys/mountinfo"
2016-03-18 14:50:19 -04:00
"github.com/opencontainers/runc/libcontainer/apparmor"
2016-06-07 15:05:43 -04:00
"github.com/opencontainers/runc/libcontainer/cgroups"
2016-03-18 14:50:19 -04:00
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runc/libcontainer/user"
2019-08-05 10:37:47 -04:00
specs "github.com/opencontainers/runtime-spec/specs-go"
2018-01-24 18:10:01 -05:00
"github.com/pkg/errors"
2017-07-26 17:42:13 -04:00
"github.com/sirupsen/logrus"
2017-10-15 02:06:20 -04:00
"golang.org/x/sys/unix"
2016-03-18 14:50:19 -04:00
)
2019-04-10 14:45:14 -04:00
// inContainerInitPath is "/sbin/" followed by the daemon's default init binary
// name (daemonconfig.DefaultInitBinary). Presumably this is the path at which
// the init binary is exposed inside the container — TODO confirm against the
// code that sets up the init mount.
const inContainerInitPath = "/sbin/" + daemonconfig . DefaultInitBinary
2018-08-22 16:05:12 -04:00
2019-04-10 14:45:14 -04:00
// WithRlimits sets the container's rlimits along with merging the daemon's rlimits
func WithRlimits ( daemon * Daemon , c * container . Container ) coci . SpecOpts {
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
var rlimits [ ] specs . POSIXRlimit
// We want to leave the original HostConfig alone so make a copy here
hostConfig := * c . HostConfig
// Merge with the daemon defaults
daemon . mergeUlimits ( & hostConfig )
for _ , ul := range hostConfig . Ulimits {
rlimits = append ( rlimits , specs . POSIXRlimit {
Type : "RLIMIT_" + strings . ToUpper ( ul . Name ) ,
Soft : uint64 ( ul . Soft ) ,
Hard : uint64 ( ul . Hard ) ,
} )
}
s . Process . Rlimits = rlimits
return nil
2016-03-18 14:50:19 -04:00
}
2019-04-10 14:45:14 -04:00
}
2016-03-18 14:50:19 -04:00
2019-04-10 14:45:14 -04:00
// WithLibnetwork sets the libnetwork hook
func WithLibnetwork ( daemon * Daemon , c * container . Container ) coci . SpecOpts {
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
if s . Hooks == nil {
s . Hooks = & specs . Hooks { }
}
for _ , ns := range s . Linux . Namespaces {
if ns . Type == "network" && ns . Path == "" && ! c . Config . NetworkDisabled {
target := filepath . Join ( "/proc" , strconv . Itoa ( os . Getpid ( ) ) , "exe" )
2019-08-29 03:56:37 -04:00
shortNetCtlrID := stringid . TruncateID ( daemon . netController . ID ( ) )
2019-04-10 14:45:14 -04:00
s . Hooks . Prestart = append ( s . Hooks . Prestart , specs . Hook {
Path : target ,
Args : [ ] string {
"libnetwork-setkey" ,
"-exec-root=" + daemon . configStore . GetExecRoot ( ) ,
c . ID ,
2019-08-29 03:56:37 -04:00
shortNetCtlrID ,
2019-04-10 14:45:14 -04:00
} ,
} )
}
}
return nil
}
}
// WithRootless sets the spec to the rootless configuration
2020-02-10 00:37:22 -05:00
func WithRootless ( daemon * Daemon ) coci . SpecOpts {
return func ( _ context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
var v2Controllers [ ] string
if daemon . getCgroupDriver ( ) == cgroupSystemdDriver {
2020-11-09 09:00:32 -05:00
if cdcgroups . Mode ( ) != cdcgroups . Unified {
2020-02-10 00:37:22 -05:00
return errors . New ( "rootless systemd driver doesn't support cgroup v1" )
}
rootlesskitParentEUID := os . Getenv ( "ROOTLESSKIT_PARENT_EUID" )
if rootlesskitParentEUID == "" {
return errors . New ( "$ROOTLESSKIT_PARENT_EUID is not set (requires RootlessKit v0.8.0)" )
}
controllersPath := fmt . Sprintf ( "/sys/fs/cgroup/user.slice/user-%s.slice/cgroup.controllers" , rootlesskitParentEUID )
controllersFile , err := ioutil . ReadFile ( controllersPath )
if err != nil {
return err
}
v2Controllers = strings . Fields ( string ( controllersFile ) )
}
return specconv . ToRootless ( s , v2Controllers )
}
2019-04-10 14:45:14 -04:00
}
// WithOOMScore returns a SpecOpts that stores the given pointer as the
// process's OOM score adjustment in the spec. The pointer is stored as-is
// (not copied), and the option itself never fails.
func WithOOMScore(score *int) coci.SpecOpts {
	applyScore := func(_ context.Context, _ coci.Client, _ *containers.Container, spec *coci.Spec) error {
		spec.Process.OOMScoreAdj = score
		return nil
	}
	return applyScore
}
// WithSelinux returns a SpecOpts that copies the container's SELinux labels
// into the spec: the process label (from c.GetProcessLabel) and the mount
// label (c.MountLabel). The option itself never fails.
func WithSelinux(c *container.Container) coci.SpecOpts {
	applyLabels := func(_ context.Context, _ coci.Client, _ *containers.Container, spec *coci.Spec) error {
		processLabel := c.GetProcessLabel()
		spec.Process.SelinuxLabel = processLabel
		spec.Linux.MountLabel = c.MountLabel
		return nil
	}
	return applyLabels
}
// WithApparmor sets the apparmor profile
func WithApparmor ( c * container . Container ) coci . SpecOpts {
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
if apparmor . IsEnabled ( ) {
var appArmorProfile string
if c . AppArmorProfile != "" {
appArmorProfile = c . AppArmorProfile
} else if c . HostConfig . Privileged {
2019-10-12 18:04:44 -04:00
appArmorProfile = unconfinedAppArmorProfile
2019-04-10 14:45:14 -04:00
} else {
2019-08-09 06:33:15 -04:00
appArmorProfile = defaultAppArmorProfile
2019-04-10 14:45:14 -04:00
}
2019-08-09 06:33:15 -04:00
if appArmorProfile == defaultAppArmorProfile {
2019-04-10 14:45:14 -04:00
// Unattended upgrades and other fun services can unload AppArmor
// profiles inadvertently. Since we cannot store our profile in
// /etc/apparmor.d, nor can we practically add other ways of
// telling the system to keep our profile loaded, in order to make
// sure that we keep the default profile enabled we dynamically
// reload it if necessary.
if err := ensureDefaultAppArmorProfile ( ) ; err != nil {
return err
}
}
s . Process . ApparmorProfile = appArmorProfile
}
return nil
}
}
// WithCapabilities sets the container's capabilties
func WithCapabilities ( c * container . Container ) coci . SpecOpts {
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
capabilities , err := caps . TweakCapabilities (
2019-11-14 12:53:52 -05:00
caps . DefaultCapabilities ( ) ,
2019-04-10 14:45:14 -04:00
c . HostConfig . CapAdd ,
c . HostConfig . CapDrop ,
c . HostConfig . Privileged ,
)
if err != nil {
return err
}
return oci . SetCapabilities ( s , capabilities )
}
2016-03-18 14:50:19 -04:00
}
2020-07-29 08:26:05 -04:00
func resourcePath ( c * container . Container , getPath func ( ) ( string , error ) ) ( string , error ) {
p , err := getPath ( )
2016-03-18 14:50:19 -04:00
if err != nil {
2020-07-29 08:26:05 -04:00
return "" , err
2020-07-28 22:43:43 -04:00
}
2020-07-29 08:26:05 -04:00
return c . GetResourcePath ( p )
2016-03-18 14:50:19 -04:00
}
2020-07-29 08:26:05 -04:00
func getUser ( c * container . Container , username string ) ( specs . User , error ) {
var usr specs . User
passwdPath , err := resourcePath ( c , user . GetPasswdPath )
2016-03-18 14:50:19 -04:00
if err != nil {
2020-07-29 08:26:05 -04:00
return usr , err
2016-03-18 14:50:19 -04:00
}
2020-07-29 08:26:05 -04:00
groupPath , err := resourcePath ( c , user . GetGroupPath )
2016-03-18 14:50:19 -04:00
if err != nil {
2020-07-29 08:26:05 -04:00
return usr , err
2016-03-18 14:50:19 -04:00
}
2020-07-29 08:26:05 -04:00
execUser , err := user . GetExecUserPath ( username , nil , passwdPath , groupPath )
2016-03-18 14:50:19 -04:00
if err != nil {
2020-07-29 08:26:05 -04:00
return usr , err
2016-03-18 14:50:19 -04:00
}
2020-07-29 08:26:05 -04:00
usr . UID = uint32 ( execUser . Uid )
usr . GID = uint32 ( execUser . Gid )
2022-06-02 05:30:15 -04:00
usr . AdditionalGids = [ ] uint32 { usr . GID }
2016-03-18 14:50:19 -04:00
var addGroups [ ] int
if len ( c . HostConfig . GroupAdd ) > 0 {
2020-07-29 08:26:05 -04:00
addGroups , err = user . GetAdditionalGroupsPath ( c . HostConfig . GroupAdd , groupPath )
2016-03-18 14:50:19 -04:00
if err != nil {
2020-07-29 08:26:05 -04:00
return usr , err
2016-03-18 14:50:19 -04:00
}
}
2020-07-29 08:26:05 -04:00
for _ , g := range append ( execUser . Sgids , addGroups ... ) {
usr . AdditionalGids = append ( usr . AdditionalGids , uint32 ( g ) )
2016-03-18 14:50:19 -04:00
}
2020-07-29 08:26:05 -04:00
return usr , nil
2016-03-18 14:50:19 -04:00
}
2017-04-27 17:52:47 -04:00
func setNamespace ( s * specs . Spec , ns specs . LinuxNamespace ) {
2016-03-18 14:50:19 -04:00
for i , n := range s . Linux . Namespaces {
if n . Type == ns . Type {
s . Linux . Namespaces [ i ] = ns
return
}
}
s . Linux . Namespaces = append ( s . Linux . Namespaces , ns )
}
2019-04-10 14:45:14 -04:00
// WithNamespaces sets the container's namespaces
func WithNamespaces ( daemon * Daemon , c * container . Container ) coci . SpecOpts {
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
userNS := false
// user
if c . HostConfig . UsernsMode . IsPrivate ( ) {
uidMap := daemon . idMapping . UIDs ( )
if uidMap != nil {
userNS = true
ns := specs . LinuxNamespace { Type : "user" }
setNamespace ( s , ns )
s . Linux . UIDMappings = specMapping ( uidMap )
s . Linux . GIDMappings = specMapping ( daemon . idMapping . GIDs ( ) )
}
}
// network
if ! c . Config . NetworkDisabled {
ns := specs . LinuxNamespace { Type : "network" }
parts := strings . SplitN ( string ( c . HostConfig . NetworkMode ) , ":" , 2 )
if parts [ 0 ] == "container" {
nc , err := daemon . getNetworkedContainer ( c . ID , c . HostConfig . NetworkMode . ConnectedContainer ( ) )
if err != nil {
return err
}
ns . Path = fmt . Sprintf ( "/proc/%d/ns/net" , nc . State . GetPID ( ) )
if userNS {
// to share a net namespace, they must also share a user namespace
nsUser := specs . LinuxNamespace { Type : "user" }
nsUser . Path = fmt . Sprintf ( "/proc/%d/ns/user" , nc . State . GetPID ( ) )
setNamespace ( s , nsUser )
}
} else if c . HostConfig . NetworkMode . IsHost ( ) {
ns . Path = c . NetworkSettings . SandboxKey
}
2016-03-21 21:30:21 -04:00
setNamespace ( s , ns )
}
2019-04-10 14:45:14 -04:00
// ipc
ipcMode := c . HostConfig . IpcMode
switch {
case ipcMode . IsContainer ( ) :
ns := specs . LinuxNamespace { Type : "ipc" }
ic , err := daemon . getIpcContainer ( ipcMode . Container ( ) )
2016-03-18 14:50:19 -04:00
if err != nil {
return err
}
2019-04-10 14:45:14 -04:00
ns . Path = fmt . Sprintf ( "/proc/%d/ns/ipc" , ic . State . GetPID ( ) )
setNamespace ( s , ns )
2016-03-21 21:30:21 -04:00
if userNS {
2019-04-10 14:45:14 -04:00
// to share an IPC namespace, they must also share a user namespace
2017-04-27 17:52:47 -04:00
nsUser := specs . LinuxNamespace { Type : "user" }
2019-04-10 14:45:14 -04:00
nsUser . Path = fmt . Sprintf ( "/proc/%d/ns/user" , ic . State . GetPID ( ) )
2016-03-21 21:30:21 -04:00
setNamespace ( s , nsUser )
}
2019-04-10 14:45:14 -04:00
case ipcMode . IsHost ( ) :
2019-08-09 06:30:18 -04:00
oci . RemoveNamespace ( s , "ipc" )
2019-04-10 14:45:14 -04:00
case ipcMode . IsEmpty ( ) :
// A container was created by an older version of the daemon.
// The default behavior used to be what is now called "shareable".
fallthrough
case ipcMode . IsPrivate ( ) , ipcMode . IsShareable ( ) , ipcMode . IsNone ( ) :
ns := specs . LinuxNamespace { Type : "ipc" }
setNamespace ( s , ns )
default :
return fmt . Errorf ( "Invalid IPC mode: %v" , ipcMode )
2016-03-18 14:50:19 -04:00
}
Implement none, private, and shareable ipc modes
Since the commit d88fe447df0e8 ("Add support for sharing /dev/shm/ and
/dev/mqueue between containers") container's /dev/shm is mounted on the
host first, then bind-mounted inside the container. This is done that
way in order to be able to share this container's IPC namespace
(and the /dev/shm mount point) with another container.
Unfortunately, this functionality breaks container checkpoint/restore
(even if IPC is not shared). Since /dev/shm is an external mount, its
contents is not saved by `criu checkpoint`, and so upon restore any
application that tries to access data under /dev/shm is severily
disappointed (which usually results in a fatal crash).
This commit solves the issue by introducing new IPC modes for containers
(in addition to 'host' and 'container:ID'). The new modes are:
- 'shareable': enables sharing this container's IPC with others
(this used to be the implicit default);
- 'private': disables sharing this container's IPC.
In 'private' mode, container's /dev/shm is truly mounted inside the
container, without any bind-mounting from the host, which solves the
issue.
While at it, let's also implement 'none' mode. The motivation, as
eloquently put by Justin Cormack, is:
> I wondered a while back about having a none shm mode, as currently it is
> not possible to have a totally unwriteable container as there is always
> a /dev/shm writeable mount. It is a bit of a niche case (and clearly
> should never be allowed to be daemon default) but it would be trivial to
> add now so maybe we should...
...so here's yet yet another mode:
- 'none': no /dev/shm mount inside the container (though it still
has its own private IPC namespace).
Now, to ultimately solve the abovementioned checkpoint/restore issue, we'd
need to make 'private' the default mode, but unfortunately it breaks the
backward compatibility. So, let's make the default container IPC mode
per-daemon configurable (with the built-in default set to 'shareable'
for now). The default can be changed either via a daemon CLI option
(--default-shm-mode) or a daemon.json configuration file parameter
of the same name.
Note one can only set either 'shareable' or 'private' IPC modes as a
daemon default (i.e. in this context 'host', 'container', or 'none'
do not make much sense).
Some other changes this patch introduces are:
1. A mount for /dev/shm is added to default OCI Linux spec.
2. IpcMode.Valid() is simplified to remove duplicated code that parsed
'container:ID' form. Note the old version used to check that ID does
not contain a semicolon -- this is no longer the case (tests are
modified accordingly). The motivation is we should either do a
proper check for container ID validity, or don't check it at all
(since it is checked in other places anyway). I chose the latter.
3. IpcMode.Container() is modified to not return container ID if the
mode value does not start with "container:", unifying the check to
be the same as in IpcMode.IsContainer().
3. IPC mode unit tests (runconfig/hostconfig_test.go) are modified
to add checks for newly added values.
[v2: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-51345997]
[v3: addressed review at https://github.com/moby/moby/pull/34087#pullrequestreview-53902833]
[v4: addressed the case of upgrading from older daemon, in this case
container.HostConfig.IpcMode is unset and this is valid]
[v5: document old and new IpcMode values in api/swagger.yaml]
[v6: add the 'none' mode, changelog entry to docs/api/version-history.md]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2017-06-27 17:58:50 -04:00
2019-04-10 14:45:14 -04:00
// pid
if c . HostConfig . PidMode . IsContainer ( ) {
pc , err := daemon . getPidContainer ( c )
if err != nil {
return err
}
2019-08-09 06:30:18 -04:00
ns := specs . LinuxNamespace {
Type : "pid" ,
Path : fmt . Sprintf ( "/proc/%d/ns/pid" , pc . State . GetPID ( ) ) ,
}
2019-04-10 14:45:14 -04:00
setNamespace ( s , ns )
if userNS {
// to share a PID namespace, they must also share a user namespace
2019-08-09 06:30:18 -04:00
nsUser := specs . LinuxNamespace {
Type : "user" ,
Path : fmt . Sprintf ( "/proc/%d/ns/user" , pc . State . GetPID ( ) ) ,
}
2019-04-10 14:45:14 -04:00
setNamespace ( s , nsUser )
}
} else if c . HostConfig . PidMode . IsHost ( ) {
2019-08-09 06:30:18 -04:00
oci . RemoveNamespace ( s , "pid" )
2019-04-10 14:45:14 -04:00
} else {
ns := specs . LinuxNamespace { Type : "pid" }
setNamespace ( s , ns )
2016-05-06 14:56:03 -04:00
}
2019-04-10 14:45:14 -04:00
// uts
if c . HostConfig . UTSMode . IsHost ( ) {
2019-08-09 06:30:18 -04:00
oci . RemoveNamespace ( s , "uts" )
2019-04-10 14:45:14 -04:00
s . Hostname = ""
2016-05-06 14:56:03 -04:00
}
2016-03-18 14:50:19 -04:00
2019-03-14 23:44:18 -04:00
// cgroup
if ! c . HostConfig . CgroupnsMode . IsEmpty ( ) {
cgroupNsMode := c . HostConfig . CgroupnsMode
if ! cgroupNsMode . Valid ( ) {
return fmt . Errorf ( "invalid cgroup namespace mode: %v" , cgroupNsMode )
}
2020-04-21 10:06:44 -04:00
if cgroupNsMode . IsPrivate ( ) {
2019-03-14 23:44:18 -04:00
nsCgroup := specs . LinuxNamespace { Type : "cgroup" }
setNamespace ( s , nsCgroup )
}
}
return nil
}
2016-03-18 14:50:19 -04:00
}
2017-04-27 17:52:47 -04:00
func specMapping ( s [ ] idtools . IDMap ) [ ] specs . LinuxIDMapping {
var ids [ ] specs . LinuxIDMapping
2016-03-18 14:50:19 -04:00
for _ , item := range s {
2017-04-27 17:52:47 -04:00
ids = append ( ids , specs . LinuxIDMapping {
2016-03-18 14:50:19 -04:00
HostID : uint32 ( item . HostID ) ,
ContainerID : uint32 ( item . ContainerID ) ,
Size : uint32 ( item . Size ) ,
} )
}
return ids
}
// Get the source mount point of directory passed in as argument. Also return
// optional fields.
func getSourceMount ( source string ) ( string , string , error ) {
// Ensure any symlinks are resolved.
sourcePath , err := filepath . EvalSymlinks ( source )
if err != nil {
return "" , "" , err
}
2020-03-13 19:38:24 -04:00
mi , err := mountinfo . GetMounts ( mountinfo . ParentsFilter ( sourcePath ) )
2016-03-18 14:50:19 -04:00
if err != nil {
return "" , "" , err
}
getSourceMount(): simplify
The flow of getSourceMount was:
1 get all entries from /proc/self/mountinfo
2 do a linear search for the `source` directory
3 if found, return its data
4 get the parent directory of `source`, goto 2
The repeated linear search through the whole mountinfo (which can have
thousands of records) is inefficient. Instead, let's just
1 collect all the relevant records (only those mount points
that can be a parent of `source`)
2 find the record with the longest mountpath, return its data
This was tested manually with something like
```go
func TestGetSourceMount(t *testing.T) {
mnt, flags, err := getSourceMount("/sys/devices/msr/")
assert.NoError(t, err)
t.Logf("mnt: %v, flags: %v", mnt, flags)
}
```
...but it relies on having a specific mount points on the system
being used for testing.
[v2: add unit tests for ParentsFilter]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2018-01-25 23:13:46 -05:00
if len ( mi ) < 1 {
return "" , "" , fmt . Errorf ( "Can't find mount point of %s" , source )
2016-03-18 14:50:19 -04:00
}
getSourceMount(): simplify
The flow of getSourceMount was:
1 get all entries from /proc/self/mountinfo
2 do a linear search for the `source` directory
3 if found, return its data
4 get the parent directory of `source`, goto 2
The repeated linear search through the whole mountinfo (which can have
thousands of records) is inefficient. Instead, let's just
1 collect all the relevant records (only those mount points
that can be a parent of `source`)
2 find the record with the longest mountpath, return its data
This was tested manually with something like
```go
func TestGetSourceMount(t *testing.T) {
mnt, flags, err := getSourceMount("/sys/devices/msr/")
assert.NoError(t, err)
t.Logf("mnt: %v, flags: %v", mnt, flags)
}
```
...but it relies on having a specific mount points on the system
being used for testing.
[v2: add unit tests for ParentsFilter]
Signed-off-by: Kir Kolyshkin <kolyshkin@gmail.com>
2018-01-25 23:13:46 -05:00
// find the longest mount point
var idx , maxlen int
for i := range mi {
if len ( mi [ i ] . Mountpoint ) > maxlen {
maxlen = len ( mi [ i ] . Mountpoint )
idx = i
2016-03-18 14:50:19 -04:00
}
}
2018-05-10 15:01:50 -04:00
return mi [ idx ] . Mountpoint , mi [ idx ] . Optional , nil
2016-03-18 14:50:19 -04:00
}
2018-01-24 18:10:01 -05:00
// Prefixes looked for in the optional fields returned by getSourceMount (see
// ensureShared and ensureSharedOrSlave); matching is done by prefix in
// hasMountInfoOption.
const (
// sharedPropagationOption is the prefix that marks a mount as shared.
sharedPropagationOption = "shared:"
// slavePropagationOption is the prefix that marks a mount as a slave mount.
slavePropagationOption = "master:"
)
2019-08-09 06:33:15 -04:00
// hasMountInfoOption reports whether any of the given option values is set in
// the passed-in option string.
//
// opts is split on single spaces, and an option counts as set when one of the
// resulting fields starts with one of vals (prefix match, so "shared:" matches
// "shared:12"). With no vals the result is always false.
func hasMountInfoOption(opts string, vals ...string) bool {
	for _, opt := range strings.Split(opts, " ") {
		for _, val := range vals {
			if strings.HasPrefix(opt, val) {
				return true
			}
		}
	}
	return false
}
2016-03-18 14:50:19 -04:00
// Ensure mount point on which path is mounted, is shared.
func ensureShared ( path string ) error {
sourceMount , optionalOpts , err := getSourceMount ( path )
if err != nil {
return err
}
// Make sure source mount point is shared.
2019-08-09 06:33:15 -04:00
if ! hasMountInfoOption ( optionalOpts , sharedPropagationOption ) {
2018-01-24 18:10:01 -05:00
return errors . Errorf ( "path %s is mounted on %s but it is not a shared mount" , path , sourceMount )
2016-03-18 14:50:19 -04:00
}
return nil
}
// Ensure mount point on which path is mounted, is either shared or slave.
func ensureSharedOrSlave ( path string ) error {
sourceMount , optionalOpts , err := getSourceMount ( path )
if err != nil {
return err
}
2019-08-09 06:33:15 -04:00
if ! hasMountInfoOption ( optionalOpts , sharedPropagationOption , slavePropagationOption ) {
2018-01-24 18:10:01 -05:00
return errors . Errorf ( "path %s is mounted on %s but it is not a shared or slave mount" , path , sourceMount )
2016-03-18 14:50:19 -04:00
}
return nil
}
2017-10-15 02:06:20 -04:00
// Get the set of mount flags that are set on the mount that contains the given
// path and are locked by CL_UNPRIVILEGED. This is necessary to ensure that
// bind-mounting "with options" will not fail with user namespaces, due to
// kernel restrictions that require user namespace mounts to preserve
// CL_UNPRIVILEGED locked flags.
//
// The flags are read with statfs on path and returned as mount option names in
// a fixed order (ro, nodev, noexec, nosuid, noatime, relatime, nodiratime), so
// repeated calls for the same mount yield identical slices. A nil slice is
// returned when none of the flags is set; a statfs failure is returned as-is.
func getUnprivilegedMountFlags(path string) ([]string, error) {
	var statfs unix.Statfs_t
	if err := unix.Statfs(path, &statfs); err != nil {
		return nil, err
	}

	// The set of keys come from https://github.com/torvalds/linux/blob/v4.13/fs/namespace.c#L1034-L1048.
	// An ordered table (rather than a map) keeps the output deterministic.
	//
	// NOTE(review): statfs(2) reports ST_* flags; ST_RELATIME appears to have a
	// different value than MS_RELATIME — confirm that "relatime" is actually
	// detected with the mask below.
	unprivilegedFlags := []struct {
		mask uint64
		name string
	}{
		{unix.MS_RDONLY, "ro"},
		{unix.MS_NODEV, "nodev"},
		{unix.MS_NOEXEC, "noexec"},
		{unix.MS_NOSUID, "nosuid"},
		{unix.MS_NOATIME, "noatime"},
		{unix.MS_RELATIME, "relatime"},
		{unix.MS_NODIRATIME, "nodiratime"},
	}

	var flags []string
	for _, f := range unprivilegedFlags {
		if uint64(statfs.Flags)&f.mask == f.mask {
			flags = append(flags, f.name)
		}
	}

	return flags, nil
}
2016-03-18 14:50:19 -04:00
// Lookup tables translating between mount propagation mode names and the
// corresponding flag constants of the mount package.
var (
// mountPropagationMap maps a propagation mode name to its mount flag.
// Looking up a name that is not listed (including "") yields 0.
mountPropagationMap = map [ string ] int {
"private" : mount . PRIVATE ,
"rprivate" : mount . RPRIVATE ,
"shared" : mount . SHARED ,
"rshared" : mount . RSHARED ,
"slave" : mount . SLAVE ,
"rslave" : mount . RSLAVE ,
}
// mountPropagationReverseMap is the inverse of mountPropagationMap: it maps
// a mount propagation flag back to its mode name.
mountPropagationReverseMap = map [ int ] string {
mount . PRIVATE : "private" ,
mount . RPRIVATE : "rprivate" ,
mount . SHARED : "shared" ,
mount . RSHARED : "rshared" ,
mount . SLAVE : "slave" ,
mount . RSLAVE : "rslave" ,
}
)
2017-11-10 00:18:48 -05:00
// inSlice reports whether s occurs in slice. Elements are compared with ==,
// so the match is exact and case sensitive; a nil or empty slice never
// contains anything.
func inSlice(slice []string, s string) bool {
	for i := 0; i < len(slice); i++ {
		if slice[i] == s {
			return true
		}
	}
	return false
}
2019-04-10 14:45:14 -04:00
// WithMounts sets the container's mounts
func WithMounts ( daemon * Daemon , c * container . Container ) coci . SpecOpts {
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) ( err error ) {
if err := daemon . setupContainerMountsRoot ( c ) ; err != nil {
return err
2016-03-18 14:50:19 -04:00
}
2017-10-27 03:21:41 -04:00
2019-04-10 14:45:14 -04:00
if err := daemon . setupIpcDirs ( c ) ; err != nil {
return err
}
2016-03-18 14:50:19 -04:00
2019-04-10 14:45:14 -04:00
defer func ( ) {
Inconsistent --tmpfs behavior
This fix tries to address the issue raised in #22420. When
`--tmpfs` is specified with `/tmp`, the default value is
`rw,nosuid,nodev,noexec,relatime,size=65536k`. When `--tmpfs`
is specified with `/tmp:rw`, then the value changed to
`rw,nosuid,nodev,noexec,relatime`.
The reason for such an inconsistency is because docker tries
to add `size=65536k` option only when user provides no option.
This fix tries to address this issue by always pre-populating
`size=65536k` along with `rw,nosuid,nodev,noexec,relatime`.
If user provides a different value (e.g., `size=8192k`), it
will override the `size=65536k` anyway since the combined
options will be parsed and merged to remove any duplicates.
Additional test cases have been added to cover the changes
in this fix.
This fix fixes #22420.
Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
2016-04-30 22:42:19 -04:00
if err != nil {
2019-04-10 14:45:14 -04:00
daemon . cleanupSecretDir ( c )
Inconsistent --tmpfs behavior
This fix tries to address the issue raised in #22420. When
`--tmpfs` is specified with `/tmp`, the default value is
`rw,nosuid,nodev,noexec,relatime,size=65536k`. When `--tmpfs`
is specified with `/tmp:rw`, then the value changed to
`rw,nosuid,nodev,noexec,relatime`.
The reason for such an inconsistency is because docker tries
to add `size=65536k` option only when user provides no option.
This fix tries to address this issue by always pre-populating
`size=65536k` along with `rw,nosuid,nodev,noexec,relatime`.
If user provides a different value (e.g., `size=8192k`), it
will override the `size=65536k` anyway since the combined
options will be parsed and merged to remove any duplicates.
Additional test cases have been added to cover the changes
in this fix.
This fix fixes #22420.
Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
2016-04-30 22:42:19 -04:00
}
2019-04-10 14:45:14 -04:00
} ( )
Inconsistent --tmpfs behavior
This fix tries to address the issue raised in #22420. When
`--tmpfs` is specified with `/tmp`, the default value is
`rw,nosuid,nodev,noexec,relatime,size=65536k`. When `--tmpfs`
is specified with `/tmp:rw`, then the value changed to
`rw,nosuid,nodev,noexec,relatime`.
The reason for such an inconsistency is because docker tries
to add `size=65536k` option only when user provides no option.
This fix tries to address this issue by always pre-populating
`size=65536k` along with `rw,nosuid,nodev,noexec,relatime`.
If user provides a different value (e.g., `size=8192k`), it
will override the `size=65536k` anyway since the combined
options will be parsed and merged to remove any duplicates.
Additional test cases have been added to cover the changes
in this fix.
This fix fixes #22420.
Signed-off-by: Yong Tang <yong.tang.github@outlook.com>
2016-04-30 22:42:19 -04:00
2019-04-10 14:45:14 -04:00
if err := daemon . setupSecretDir ( c ) ; err != nil {
return err
2016-03-18 14:50:19 -04:00
}
2019-04-10 14:45:14 -04:00
ms , err := daemon . setupMounts ( c )
if err != nil {
return err
}
2016-03-18 14:50:19 -04:00
2019-04-10 14:45:14 -04:00
if ! c . HostConfig . IpcMode . IsPrivate ( ) && ! c . HostConfig . IpcMode . IsEmpty ( ) {
ms = append ( ms , c . IpcMounts ( ) ... )
}
tmpfsMounts , err := c . TmpfsMounts ( )
if err != nil {
return err
}
ms = append ( ms , tmpfsMounts ... )
secretMounts , err := c . SecretMounts ( )
if err != nil {
return err
}
ms = append ( ms , secretMounts ... )
sort . Sort ( mounts ( ms ) )
mounts := ms
userMounts := make ( map [ string ] struct { } )
for _ , m := range mounts {
userMounts [ m . Destination ] = struct { } { }
}
// Copy all mounts from spec to defaultMounts, except for
// - mounts overridden by a user supplied mount;
// - all mounts under /dev if a user supplied /dev is present;
// - /dev/shm, in case IpcMode is none.
// While at it, also
// - set size for /dev/shm from shmsize.
defaultMounts := s . Mounts [ : 0 ]
_ , mountDev := userMounts [ "/dev" ]
for _ , m := range s . Mounts {
if _ , ok := userMounts [ m . Destination ] ; ok {
// filter out mount overridden by a user supplied mount
continue
2016-03-18 14:50:19 -04:00
}
2019-04-10 14:45:14 -04:00
if mountDev && strings . HasPrefix ( m . Destination , "/dev/" ) {
// filter out everything under /dev if /dev is user-mounted
continue
}
if m . Destination == "/dev/shm" {
if c . HostConfig . IpcMode . IsNone ( ) {
// filter out /dev/shm for "none" IpcMode
continue
2018-01-18 16:55:27 -05:00
}
2019-04-10 14:45:14 -04:00
// set size for /dev/shm mount from spec
sizeOpt := "size=" + strconv . FormatInt ( c . HostConfig . ShmSize , 10 )
m . Options = append ( m . Options , sizeOpt )
}
2018-01-18 16:55:27 -05:00
2019-04-10 14:45:14 -04:00
defaultMounts = append ( defaultMounts , m )
}
s . Mounts = defaultMounts
for _ , m := range mounts {
if m . Source == "tmpfs" {
data := m . Data
parser := volumemounts . NewParser ( "linux" )
options := [ ] string { "noexec" , "nosuid" , "nodev" , string ( parser . DefaultPropagationMode ( ) ) }
if data != "" {
options = append ( options , strings . Split ( data , "," ) ... )
2018-01-18 16:55:27 -05:00
}
2019-04-10 14:45:14 -04:00
merged , err := mount . MergeTmpfsOptions ( options )
if err != nil {
2018-01-18 16:55:27 -05:00
return err
}
2019-04-10 14:45:14 -04:00
s . Mounts = append ( s . Mounts , specs . Mount { Destination : m . Destination , Source : m . Source , Type : "tmpfs" , Options : merged } )
continue
2016-03-18 14:50:19 -04:00
}
2019-04-10 14:45:14 -04:00
mt := specs . Mount { Destination : m . Destination , Source : m . Source , Type : "bind" }
// Determine property of RootPropagation based on volume
// properties. If a volume is shared, then keep root propagation
// shared. This should work for slave and private volumes too.
//
// For slave volumes, it can be either [r]shared/[r]slave.
//
// For private volumes any root propagation value should work.
pFlag := mountPropagationMap [ m . Propagation ]
switch pFlag {
case mount . SHARED , mount . RSHARED :
if err := ensureShared ( m . Source ) ; err != nil {
return err
}
2018-01-18 16:55:27 -05:00
rootpg := mountPropagationMap [ s . Linux . RootfsPropagation ]
2019-04-10 14:45:14 -04:00
if rootpg != mount . SHARED && rootpg != mount . RSHARED {
s . Linux . RootfsPropagation = mountPropagationReverseMap [ mount . SHARED ]
}
case mount . SLAVE , mount . RSLAVE :
var fallback bool
if err := ensureSharedOrSlave ( m . Source ) ; err != nil {
// For backwards compatibility purposes, treat mounts from the daemon root
// as special since we automatically add rslave propagation to these mounts
// when the user did not set anything, so we should fallback to the old
// behavior which is to use private propagation which is normally the
// default.
if ! strings . HasPrefix ( m . Source , daemon . root ) && ! strings . HasPrefix ( daemon . root , m . Source ) {
return err
}
cm , ok := c . MountPoints [ m . Destination ]
if ! ok {
return err
}
if cm . Spec . BindOptions != nil && cm . Spec . BindOptions . Propagation != "" {
// This means the user explicitly set a propagation, do not fallback in that case.
return err
}
fallback = true
logrus . WithField ( "container" , c . ID ) . WithField ( "source" , m . Source ) . Warn ( "Falling back to default propagation for bind source in daemon root" )
}
if ! fallback {
rootpg := mountPropagationMap [ s . Linux . RootfsPropagation ]
if rootpg != mount . SHARED && rootpg != mount . RSHARED && rootpg != mount . SLAVE && rootpg != mount . RSLAVE {
s . Linux . RootfsPropagation = mountPropagationReverseMap [ mount . RSLAVE ]
}
2018-01-18 16:55:27 -05:00
}
2016-03-18 14:50:19 -04:00
}
2019-04-10 14:45:14 -04:00
bindMode := "rbind"
if m . NonRecursive {
bindMode = "bind"
}
opts := [ ] string { bindMode }
if ! m . Writable {
opts = append ( opts , "ro" )
}
if pFlag != 0 {
opts = append ( opts , mountPropagationReverseMap [ pFlag ] )
}
2016-03-18 14:50:19 -04:00
2019-04-10 14:45:14 -04:00
// If we are using user namespaces, then we must make sure that we
// don't drop any of the CL_UNPRIVILEGED "locked" flags of the source
// "mount" when we bind-mount. The reason for this is that at the point
// when runc sets up the root filesystem, it is already inside a user
// namespace, and thus cannot change any flags that are locked.
2021-04-01 01:58:11 -04:00
if daemon . configStore . RemappedRoot != "" || sys . RunningInUserNS ( ) {
2019-04-10 14:45:14 -04:00
unprivOpts , err := getUnprivilegedMountFlags ( m . Source )
if err != nil {
return err
}
opts = append ( opts , unprivOpts ... )
2017-10-15 02:06:20 -04:00
}
2019-04-10 14:45:14 -04:00
mt . Options = opts
s . Mounts = append ( s . Mounts , mt )
}
2016-03-18 14:50:19 -04:00
2019-04-10 14:45:14 -04:00
if s . Root . Readonly {
for i , m := range s . Mounts {
switch m . Destination {
case "/proc" , "/dev/pts" , "/dev/shm" , "/dev/mqueue" , "/dev" :
continue
}
if _ , ok := userMounts [ m . Destination ] ; ! ok {
if ! inSlice ( m . Options , "ro" ) {
s . Mounts [ i ] . Options = append ( s . Mounts [ i ] . Options , "ro" )
}
2016-03-18 14:50:19 -04:00
}
}
}
2019-04-10 14:45:14 -04:00
if c . HostConfig . Privileged {
// clear readonly for /sys
for i := range s . Mounts {
if s . Mounts [ i ] . Destination == "/sys" {
clearReadOnly ( & s . Mounts [ i ] )
}
2016-03-18 14:50:19 -04:00
}
2019-04-10 14:45:14 -04:00
s . Linux . ReadonlyPaths = nil
s . Linux . MaskedPaths = nil
2016-03-18 14:50:19 -04:00
}
2019-04-10 14:45:14 -04:00
// TODO: until a kernel/mount solution exists for handling remount in a user namespace,
// we must clear the readonly flag for the cgroups mount (@mrunalp concurs)
if uidMap := daemon . idMapping . UIDs ( ) ; uidMap != nil || c . HostConfig . Privileged {
for i , m := range s . Mounts {
if m . Type == "cgroup" {
clearReadOnly ( & s . Mounts [ i ] )
}
2016-03-18 14:50:19 -04:00
}
}
2019-04-10 14:45:14 -04:00
return nil
2016-03-18 14:50:19 -04:00
}
2019-04-10 14:45:14 -04:00
}
2020-05-26 10:58:24 -04:00
// sysctlExists reports whether the named sysctl is available on this host.
// The dotted name is translated to a path below /proc/sys and stat-ed; any
// stat failure is treated as "does not exist". runc will error if asked to set
// a sysctl that does not actually exist, so callers use this to avoid adding
// default sysctls when running on an old kernel.
func sysctlExists(s string) bool {
	procPath := filepath.Join("/proc", "sys", strings.ReplaceAll(s, ".", "/"))
	if _, err := os.Stat(procPath); err != nil {
		return false
	}
	return true
}
2019-04-10 14:45:14 -04:00
// WithCommonOptions sets common docker options
func WithCommonOptions ( daemon * Daemon , c * container . Container ) coci . SpecOpts {
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
if c . BaseFS == nil {
return errors . New ( "populateCommonSpec: BaseFS of container " + c . ID + " is unexpectedly nil" )
}
linkedEnv , err := daemon . setupLinkedContainers ( c )
if err != nil {
return err
}
s . Root = & specs . Root {
Path : c . BaseFS . Path ( ) ,
Readonly : c . HostConfig . ReadonlyRootfs ,
}
if err := c . SetupWorkingDirectory ( daemon . idMapping . RootPair ( ) ) ; err != nil {
return err
}
cwd := c . Config . WorkingDir
if len ( cwd ) == 0 {
cwd = "/"
}
s . Process . Args = append ( [ ] string { c . Path } , c . Args ... )
// only add the custom init if it is specified and the container is running in its
// own private pid namespace. It does not make sense to add if it is running in the
// host namespace or another container's pid namespace where we already have an init
if c . HostConfig . PidMode . IsPrivate ( ) {
if ( c . HostConfig . Init != nil && * c . HostConfig . Init ) ||
( c . HostConfig . Init == nil && daemon . configStore . Init ) {
s . Process . Args = append ( [ ] string { inContainerInitPath , "--" , c . Path } , c . Args ... )
path := daemon . configStore . InitPath
if path == "" {
path , err = exec . LookPath ( daemonconfig . DefaultInitBinary )
if err != nil {
return err
}
2016-09-27 06:51:42 -04:00
}
2019-04-10 14:45:14 -04:00
s . Mounts = append ( s . Mounts , specs . Mount {
Destination : inContainerInitPath ,
Type : "bind" ,
Source : path ,
Options : [ ] string { "bind" , "ro" } ,
} )
2016-09-27 06:51:42 -04:00
}
2016-06-27 17:38:47 -04:00
}
2019-04-10 14:45:14 -04:00
s . Process . Cwd = cwd
s . Process . Env = c . CreateDaemonEnvironment ( c . Config . Tty , linkedEnv )
s . Process . Terminal = c . Config . Tty
2018-06-17 03:05:54 -04:00
2019-04-10 14:45:14 -04:00
s . Hostname = c . Config . Hostname
setLinuxDomainname ( c , s )
2016-03-18 14:50:19 -04:00
2020-05-26 10:58:24 -04:00
// Add default sysctls that are generally safe and useful; currently we
// grant the capabilities to allow these anyway. You can override if
// you want to restore the original behaviour.
// We do not set network sysctls if network namespace is host, or if we are
// joining an existing namespace, only if we create a new net namespace.
if c . HostConfig . NetworkMode . IsPrivate ( ) {
// We cannot set up ping socket support in a user namespace
2021-08-11 14:43:30 -04:00
userNS := daemon . configStore . RemappedRoot != "" && c . HostConfig . UsernsMode . IsPrivate ( )
if ! userNS && ! sys . RunningInUserNS ( ) && sysctlExists ( "net.ipv4.ping_group_range" ) {
2020-05-26 10:58:24 -04:00
// allow unprivileged ICMP echo sockets without CAP_NET_RAW
s . Linux . Sysctl [ "net.ipv4.ping_group_range" ] = "0 2147483647"
}
// allow opening any port less than 1024 without CAP_NET_BIND_SERVICE
if sysctlExists ( "net.ipv4.ip_unprivileged_port_start" ) {
s . Linux . Sysctl [ "net.ipv4.ip_unprivileged_port_start" ] = "0"
}
}
2019-04-10 14:45:14 -04:00
return nil
}
2016-03-18 14:50:19 -04:00
}
2019-04-10 14:45:14 -04:00
// WithCgroups sets the container's cgroups
func WithCgroups ( daemon * Daemon , c * container . Container ) coci . SpecOpts {
2019-04-09 16:51:40 -04:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
var cgroupsPath string
scopePrefix := "docker"
parent := "/docker"
useSystemd := UsingSystemd ( daemon . configStore )
if useSystemd {
parent = "system.slice"
2020-02-10 00:37:22 -05:00
if daemon . configStore . Rootless {
parent = "user.slice"
}
2019-04-09 16:51:40 -04:00
}
2016-03-18 14:50:19 -04:00
2019-04-09 16:51:40 -04:00
if c . HostConfig . CgroupParent != "" {
parent = c . HostConfig . CgroupParent
} else if daemon . configStore . CgroupParent != "" {
parent = daemon . configStore . CgroupParent
}
2016-03-24 12:18:03 -04:00
2019-04-09 16:51:40 -04:00
if useSystemd {
cgroupsPath = parent + ":" + scopePrefix + ":" + c . ID
logrus . Debugf ( "createSpec: cgroupsPath: %s" , cgroupsPath )
} else {
cgroupsPath = filepath . Join ( parent , c . ID )
}
s . Linux . CgroupsPath = cgroupsPath
2020-05-22 18:05:13 -04:00
// the rest is only needed for CPU RT controller
if daemon . configStore . CPURealtimePeriod == 0 && daemon . configStore . CPURealtimeRuntime == 0 {
return nil
}
2020-11-09 09:00:32 -05:00
if cdcgroups . Mode ( ) == cdcgroups . Unified {
2020-05-22 18:05:13 -04:00
return errors . New ( "daemon-scoped cpu-rt-period and cpu-rt-runtime are not implemented for cgroup v2" )
}
// FIXME this is very expensive way to check if cpu rt is supported
sysInfo := daemon . RawSysInfo ( true )
if ! sysInfo . CPURealtime {
return errors . New ( "daemon-scoped cpu-rt-period and cpu-rt-runtime are not supported by the kernel" )
}
2019-04-09 16:51:40 -04:00
p := cgroupsPath
if useSystemd {
initPath , err := cgroups . GetInitCgroup ( "cpu" )
if err != nil {
2020-05-22 18:05:13 -04:00
return errors . Wrap ( err , "unable to init CPU RT controller" )
2019-04-09 16:51:40 -04:00
}
_ , err = cgroups . GetOwnCgroup ( "cpu" )
if err != nil {
2020-05-22 18:05:13 -04:00
return errors . Wrap ( err , "unable to init CPU RT controller" )
2019-04-09 16:51:40 -04:00
}
p = filepath . Join ( initPath , s . Linux . CgroupsPath )
}
2016-03-24 12:18:03 -04:00
2019-04-09 16:51:40 -04:00
// Clean path to guard against things like ../../../BAD
parentPath := filepath . Dir ( p )
if ! filepath . IsAbs ( parentPath ) {
parentPath = filepath . Clean ( "/" + parentPath )
}
2016-03-18 14:50:19 -04:00
2020-05-22 18:05:13 -04:00
mnt , root , err := cgroups . FindCgroupMountpointAndRoot ( "" , "cpu" )
if err != nil {
return errors . Wrap ( err , "unable to init CPU RT controller" )
}
// When docker is run inside docker, the root is based of the host cgroup.
// Should this be handled in runc/libcontainer/cgroups ?
if strings . HasPrefix ( root , "/docker/" ) {
root = "/"
}
mnt = filepath . Join ( mnt , root )
if err := daemon . initCPURtController ( mnt , parentPath ) ; err != nil {
return errors . Wrap ( err , "unable to init CPU RT controller" )
2019-04-09 16:51:40 -04:00
}
return nil
2016-03-18 14:50:19 -04:00
}
2019-04-09 16:51:40 -04:00
}
2019-04-10 14:45:14 -04:00
// WithDevices sets the container's devices
func WithDevices ( daemon * Daemon , c * container . Container ) coci . SpecOpts {
2019-04-09 16:51:40 -04:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
// Build lists of devices allowed and created within the container.
var devs [ ] specs . LinuxDevice
devPermissions := s . Linux . Resources . Devices
2019-12-06 07:49:55 -05:00
2020-06-15 07:06:08 -04:00
if c . HostConfig . Privileged && ! sys . RunningInUserNS ( ) {
2019-04-09 16:51:40 -04:00
hostDevices , err := devices . HostDevices ( )
if err != nil {
return err
}
for _ , d := range hostDevices {
devs = append ( devs , oci . Device ( d ) )
}
2019-12-06 07:49:55 -05:00
// adding device mappings in privileged containers
for _ , deviceMapping := range c . HostConfig . Devices {
// issue a warning that custom cgroup permissions are ignored in privileged mode
if deviceMapping . CgroupPermissions != "rwm" {
logrus . WithField ( "container" , c . ID ) . Warnf ( "custom %s permissions for device %s are ignored in privileged mode" , deviceMapping . CgroupPermissions , deviceMapping . PathOnHost )
}
// issue a warning that the device path already exists via /dev mounting in privileged mode
if deviceMapping . PathOnHost == deviceMapping . PathInContainer {
logrus . WithField ( "container" , c . ID ) . Warnf ( "path in container %s already exists in privileged mode" , deviceMapping . PathInContainer )
continue
}
d , _ , err := oci . DevicesFromPath ( deviceMapping . PathOnHost , deviceMapping . PathInContainer , "rwm" )
if err != nil {
return err
}
devs = append ( devs , d ... )
}
2019-04-09 16:51:40 -04:00
devPermissions = [ ] specs . LinuxDeviceCgroup {
{
Allow : true ,
Access : "rwm" ,
} ,
}
} else {
for _ , deviceMapping := range c . HostConfig . Devices {
d , dPermissions , err := oci . DevicesFromPath ( deviceMapping . PathOnHost , deviceMapping . PathInContainer , deviceMapping . CgroupPermissions )
if err != nil {
return err
}
devs = append ( devs , d ... )
devPermissions = append ( devPermissions , dPermissions ... )
}
var err error
devPermissions , err = oci . AppendDevicePermissionsFromCgroupRules ( devPermissions , c . HostConfig . DeviceCgroupRules )
if err != nil {
return err
}
}
s . Linux . Devices = append ( s . Linux . Devices , devs ... )
s . Linux . Resources . Devices = devPermissions
for _ , req := range c . HostConfig . DeviceRequests {
if err := daemon . handleDevice ( req , s ) ; err != nil {
return err
}
}
return nil
2018-06-17 03:05:54 -04:00
}
2019-04-09 16:51:40 -04:00
}
2016-06-07 15:05:43 -04:00
2019-04-10 14:45:14 -04:00
// WithResources applies the container resources
func WithResources ( c * container . Container ) coci . SpecOpts {
2019-04-09 16:51:40 -04:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
r := c . HostConfig . Resources
weightDevices , err := getBlkioWeightDevices ( r )
2016-06-07 15:05:43 -04:00
if err != nil {
2019-04-09 16:51:40 -04:00
return err
2016-06-07 15:05:43 -04:00
}
2019-04-09 16:51:40 -04:00
readBpsDevice , err := getBlkioThrottleDevices ( r . BlkioDeviceReadBps )
2016-06-07 15:05:43 -04:00
if err != nil {
2019-04-09 16:51:40 -04:00
return err
}
writeBpsDevice , err := getBlkioThrottleDevices ( r . BlkioDeviceWriteBps )
if err != nil {
return err
}
readIOpsDevice , err := getBlkioThrottleDevices ( r . BlkioDeviceReadIOps )
if err != nil {
return err
}
writeIOpsDevice , err := getBlkioThrottleDevices ( r . BlkioDeviceWriteIOps )
if err != nil {
return err
}
memoryRes := getMemoryResources ( r )
cpuRes , err := getCPUResources ( r )
if err != nil {
return err
}
blkioWeight := r . BlkioWeight
specResources := & specs . LinuxResources {
Memory : memoryRes ,
CPU : cpuRes ,
BlockIO : & specs . LinuxBlockIO {
Weight : & blkioWeight ,
WeightDevice : weightDevices ,
ThrottleReadBpsDevice : readBpsDevice ,
ThrottleWriteBpsDevice : writeBpsDevice ,
ThrottleReadIOPSDevice : readIOpsDevice ,
ThrottleWriteIOPSDevice : writeIOpsDevice ,
} ,
Pids : getPidsLimit ( r ) ,
2016-06-07 15:05:43 -04:00
}
2019-04-09 16:51:40 -04:00
if s . Linux . Resources != nil && len ( s . Linux . Resources . Devices ) > 0 {
specResources . Devices = s . Linux . Resources . Devices
}
s . Linux . Resources = specResources
return nil
2016-06-07 15:05:43 -04:00
}
2019-04-09 16:51:40 -04:00
}
2016-06-07 15:05:43 -04:00
2019-04-10 14:45:14 -04:00
// WithSysctls sets the container's sysctls
func WithSysctls ( c * container . Container ) coci . SpecOpts {
2019-04-09 16:51:40 -04:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
// We merge the sysctls injected above with the HostConfig (latter takes
// precedence for backwards-compatibility reasons).
for k , v := range c . HostConfig . Sysctls {
s . Linux . Sysctl [ k ] = v
}
return nil
2016-06-07 15:05:43 -04:00
}
2019-04-09 16:51:40 -04:00
}
2016-06-07 15:05:43 -04:00
2019-04-10 14:45:14 -04:00
// WithUser sets the container's user
func WithUser ( c * container . Container ) coci . SpecOpts {
2019-04-09 16:51:40 -04:00
return func ( ctx context . Context , _ coci . Client , _ * containers . Container , s * coci . Spec ) error {
2020-07-29 08:26:05 -04:00
var err error
s . Process . User , err = getUser ( c , c . Config . User )
return err
2016-06-07 15:05:43 -04:00
}
2019-04-09 16:51:40 -04:00
}
func ( daemon * Daemon ) createSpec ( c * container . Container ) ( retSpec * specs . Spec , err error ) {
var (
opts [ ] coci . SpecOpts
s = oci . DefaultSpec ( )
)
opts = append ( opts ,
2019-04-10 14:45:14 -04:00
WithCommonOptions ( daemon , c ) ,
WithCgroups ( daemon , c ) ,
WithResources ( c ) ,
WithSysctls ( c ) ,
WithDevices ( daemon , c ) ,
WithUser ( c ) ,
WithRlimits ( daemon , c ) ,
WithNamespaces ( daemon , c ) ,
WithCapabilities ( c ) ,
WithSeccomp ( daemon , c ) ,
WithMounts ( daemon , c ) ,
WithLibnetwork ( daemon , c ) ,
WithApparmor ( c ) ,
WithSelinux ( c ) ,
WithOOMScore ( & c . HostConfig . OomScoreAdj ) ,
2019-04-09 16:51:40 -04:00
)
2019-04-10 14:45:14 -04:00
if c . NoNewPrivileges {
opts = append ( opts , coci . WithNoNewPrivileges )
2016-03-18 14:50:19 -04:00
}
2018-03-20 13:29:18 -04:00
// Set the masked and readonly paths with regard to the host config options if they are set.
if c . HostConfig . MaskedPaths != nil {
2019-04-10 14:45:14 -04:00
opts = append ( opts , coci . WithMaskedPaths ( c . HostConfig . MaskedPaths ) )
2018-03-20 13:29:18 -04:00
}
if c . HostConfig . ReadonlyPaths != nil {
2019-04-10 14:45:14 -04:00
opts = append ( opts , coci . WithReadonlyPaths ( c . HostConfig . ReadonlyPaths ) )
2018-03-20 13:29:18 -04:00
}
2018-10-15 03:52:53 -04:00
if daemon . configStore . Rootless {
2020-02-10 00:37:22 -05:00
opts = append ( opts , WithRootless ( daemon ) )
2018-10-15 03:52:53 -04:00
}
2019-04-09 16:51:40 -04:00
return & s , coci . ApplyOpts ( context . Background ( ) , nil , & containers . Container {
ID : c . ID ,
} , & s , opts ... )
2016-03-18 14:50:19 -04:00
}
// clearReadOnly removes every "ro" entry from the mount's options, keeping
// the remaining options in their original order. If nothing remains, the
// options are left as a nil slice.
func clearReadOnly(m *specs.Mount) {
	var kept []string
	for i := range m.Options {
		if m.Options[i] == "ro" {
			continue
		}
		kept = append(kept, m.Options[i])
	}
	m.Options = kept
}
2016-09-08 00:23:56 -04:00
// mergeUlimits merges the ulimits from the HostConfig with the daemon
// defaults and updates the HostConfig in place. A daemon default is appended
// only when the HostConfig does not already define a ulimit with that name.
func (daemon *Daemon) mergeUlimits(c *containertypes.HostConfig) {
	merged := c.Ulimits

	// Names the container already sets explicitly; these override the defaults.
	explicit := make(map[string]struct{}, len(merged))
	for _, limit := range merged {
		explicit[limit.Name] = struct{}{}
	}

	for name, def := range daemon.configStore.Ulimits {
		if _, overridden := explicit[name]; overridden {
			continue
		}
		merged = append(merged, def)
	}

	c.Ulimits = merged
}