1
0
Fork 0
mirror of https://github.com/moby/moby.git synced 2022-11-09 12:21:53 -05:00

vendor: github.com/opencontainers/runc v1.0.0-rc95

full diff: https://github.com/opencontainers/runc/compare/v1.0.0-rc92...v1.0.0-rc95

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
This commit is contained in:
Sebastiaan van Stijn 2021-03-13 15:37:07 +01:00
parent f0d3e905b6
commit a927fc7831
No known key found for this signature in database
GPG key ID: 76698F39D527CE8C
34 changed files with 1339 additions and 599 deletions

View file

@ -91,7 +91,7 @@ google.golang.org/grpc f495f5b15ae7ccda3b38c53a1bfc
# the containerd project first, and update both after that is merged.
# This commit does not need to match RUNC_COMMIT as it is used for helper
# packages but should be newer or equal.
github.com/opencontainers/runc ff819c7e9184c13b7c2607fe6c30ae19403a7aff # v1.0.0-rc92
github.com/opencontainers/runc b9ee9c6314599f1b4a7f497e1f1f856fe433d3b7 # v1.0.0-rc95
github.com/opencontainers/runtime-spec 1c3f411f041711bbeecf35ff7e93461ea6789220 # v1.0.3-0.20210326190908-1c3f411f0417
github.com/opencontainers/image-spec d60099175f88c47cd379c4738d158884749ed235 # v1.0.1
github.com/cyphar/filepath-securejoin a261ee33d7a517f054effbf451841abaafe3e0fd # v0.2.2

View file

@ -1,9 +1,10 @@
# runc
[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc)
[![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
[![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588)
[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate)
[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci)
## Introduction
@ -17,10 +18,6 @@ This means that `runc` 1.0.0 should implement the 1.0 version of the specificati
You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.
Currently, the following features are not considered to be production-ready:
* [Support for cgroup v2](./docs/cgroup-v2.md)
## Security
The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
@ -64,19 +61,20 @@ sudo make install
with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`).
To change build tags from the default, set the `BUILDTAGS` variable for make,
e.g.
e.g. to disable seccomp:
```bash
make BUILDTAGS='seccomp apparmor'
make BUILDTAGS=""
```
| Build Tag | Feature | Enabled by default | Dependency |
|-----------|------------------------------------|--------------------|------------|
| seccomp | Syscall filtering | yes | libseccomp |
| selinux | selinux process and mount labeling | yes | <none> |
| apparmor | apparmor profile support | yes | <none> |
| nokmem | disable kernel memory accounting | no | <none> |
The following build tags were used earlier, but are now obsoleted:
- **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
- **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
- **selinux** (since runc v1.0.0-rc93 the feature is always enabled)
### Running the test suite
@ -128,6 +126,14 @@ make verify-dependencies
## Using runc
Please note that runc is a low level tool not designed with an end user
in mind. It is mostly employed by other higher level container software.
Therefore, unless there is some specific use case that prevents the use
of tools like Docker or Podman, it is not recommended to use runc directly.
If you still want to use runc, here's how.
### Creating an OCI Bundle
In order to use runc you must have your container in the format of an OCI bundle.
@ -169,7 +175,9 @@ If you used the unmodified `runc spec` template this should give you a `sh` sess
The second way to start a container is using the specs lifecycle operations.
This gives you more power over how the container is created and managed while it is running.
This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here.
This will also launch the container in the background so you will have to edit
the `config.json` to remove the `terminal` setting for the simple examples
below (see more details about [runc terminal handling](docs/terminals.md)).
Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`.
@ -292,8 +300,12 @@ PIDFile=/run/mycontainerid.pid
WantedBy=multi-user.target
```
#### cgroup v2
See [`./docs/cgroup-v2.md`](./docs/cgroup-v2.md).
## More documentation
* [cgroup v2](./docs/cgroup-v2.md)
* [Checkpoint and restore](./docs/checkpoint-restore.md)
* [systemd cgroup driver](./docs/systemd.md)
* [Terminals and standard IO](./docs/terminals.md)
## License

View file

@ -1,26 +1,28 @@
module github.com/opencontainers/runc
go 1.14
go 1.13
require (
github.com/checkpoint-restore/go-criu/v4 v4.1.0
github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775
github.com/containerd/console v1.0.0
github.com/coreos/go-systemd/v22 v22.1.0
github.com/checkpoint-restore/go-criu/v5 v5.0.0
github.com/cilium/ebpf v0.5.0
github.com/containerd/console v1.0.2
github.com/coreos/go-systemd/v22 v22.3.1
github.com/cyphar/filepath-securejoin v0.2.2
github.com/docker/go-units v0.4.0
github.com/godbus/dbus/v5 v5.0.3
github.com/golang/protobuf v1.4.2
github.com/moby/sys/mountinfo v0.1.3
github.com/mrunalp/fileutils v0.0.0-20200520151820-abd8a0e76976
github.com/opencontainers/runtime-spec v1.0.3-0.20200728170252-4d89ac9fbff6
github.com/opencontainers/selinux v1.6.0
github.com/godbus/dbus/v5 v5.0.4
github.com/moby/sys/mountinfo v0.4.1
github.com/mrunalp/fileutils v0.5.0
github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
github.com/opencontainers/selinux v1.8.0
github.com/pkg/errors v0.9.1
github.com/seccomp/libseccomp-golang v0.9.1
github.com/sirupsen/logrus v1.6.0
github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
github.com/sirupsen/logrus v1.7.0
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
// NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092
github.com/urfave/cli v1.22.1
github.com/vishvananda/netlink v1.1.0
golang.org/x/sys v0.0.0-20200728102440-3e129f6d46b1
github.com/willf/bitset v1.1.11
golang.org/x/net v0.0.0-20201224014010-6772e930b67b
golang.org/x/sys v0.0.0-20210426230700-d19ff857e887
google.golang.org/protobuf v1.25.0
)

View file

@ -57,90 +57,94 @@ struct describing how the container is to be created. A sample would look simila
```go
defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
var devices []*configs.DeviceRule
for _, device := range specconv.AllowedDevices {
devices = append(devices, &device.Rule)
}
config := &configs.Config{
Rootfs: "/your/path/to/rootfs",
Capabilities: &configs.Capabilities{
Bounding: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Effective: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Inheritable: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Permitted: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Ambient: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
},
Bounding: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Effective: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Inheritable: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Permitted: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
Ambient: []string{
"CAP_CHOWN",
"CAP_DAC_OVERRIDE",
"CAP_FSETID",
"CAP_FOWNER",
"CAP_MKNOD",
"CAP_NET_RAW",
"CAP_SETGID",
"CAP_SETUID",
"CAP_SETFCAP",
"CAP_SETPCAP",
"CAP_NET_BIND_SERVICE",
"CAP_SYS_CHROOT",
"CAP_KILL",
"CAP_AUDIT_WRITE",
},
},
Namespaces: configs.Namespaces([]configs.Namespace{
{Type: configs.NEWNS},
{Type: configs.NEWUTS},
@ -155,7 +159,7 @@ config := &configs.Config{
Parent: "system",
Resources: &configs.Resources{
MemorySwappiness: nil,
Devices: specconv.AllowedDevices,
Devices: devices,
},
},
MaskPaths: []string{
@ -313,7 +317,7 @@ state, err := container.State()
#### Checkpoint & Restore
libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
This let's you save the state of a process running inside a container to disk, and then restore
This lets you save the state of a process running inside a container to disk, and then restore
that state into a new process, on the same machine or on another machine.
`criu` version 1.5.2 or higher is required to use checkpoint and restore.

View file

@ -7,37 +7,44 @@ import (
)
type Manager interface {
// Applies cgroup configuration to the process with the specified pid
// Apply creates a cgroup, if not yet created, and adds a process
// with the specified pid into that cgroup. A special value of -1
// can be used to merely create a cgroup.
Apply(pid int) error
// Returns the PIDs inside the cgroup set
// GetPids returns the PIDs of all processes inside the cgroup.
GetPids() ([]int, error)
// Returns the PIDs inside the cgroup set & all sub-cgroups
// GetAllPids returns the PIDs of all processes inside the cgroup
// any all its sub-cgroups.
GetAllPids() ([]int, error)
// Returns statistics for the cgroup set
// GetStats returns cgroups statistics.
GetStats() (*Stats, error)
// Toggles the freezer cgroup according with specified state
// Freeze sets the freezer cgroup to the specified state.
Freeze(state configs.FreezerState) error
// Destroys the cgroup set
// Destroy removes cgroup.
Destroy() error
// Path returns a cgroup path to the specified controller/subsystem.
// For cgroupv2, the argument is unused and can be empty.
Path(string) string
// Sets the cgroup as configured.
Set(container *configs.Config) error
// Set sets cgroup resources parameters/limits. If the argument is nil,
// the resources specified during Manager creation (or the previous call
// to Set) are used.
Set(r *configs.Resources) error
// GetPaths returns cgroup path(s) to save in a state file in order to restore later.
// GetPaths returns cgroup path(s) to save in a state file in order to
// restore later.
//
// For cgroup v1, a key is cgroup subsystem name, and the value is the path
// to the cgroup for this subsystem.
// For cgroup v1, a key is cgroup subsystem name, and the value is the
// path to the cgroup for this subsystem.
//
// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
// For cgroup v2 unified hierarchy, a key is "", and the value is the
// unified path.
GetPaths() map[string]string
// GetCgroups returns the cgroup data as configured.
@ -46,6 +53,9 @@ type Manager interface {
// GetFreezerState retrieves the current FreezerState of the cgroup.
GetFreezerState() (configs.FreezerState, error)
// Whether the cgroup path exists or not
// Exists returns whether the cgroup path exists or not.
Exists() bool
// OOMKillCount reports OOM kill count for the cgroup.
OOMKillCount() (uint64, error)
}

View file

@ -0,0 +1,51 @@
// +build linux
package fscommon
import (
"bytes"
"os"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
// WriteFile writes data to a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func WriteFile(dir, file, data string) error {
fd, err := OpenFile(dir, file, unix.O_WRONLY)
if err != nil {
return err
}
defer fd.Close()
if err := retryingWriteFile(fd, data); err != nil {
return errors.Wrapf(err, "failed to write %q", data)
}
return nil
}
// ReadFile reads data from a cgroup file in dir.
// It is supposed to be used for cgroup files only.
func ReadFile(dir, file string) (string, error) {
fd, err := OpenFile(dir, file, unix.O_RDONLY)
if err != nil {
return "", err
}
defer fd.Close()
var buf bytes.Buffer
_, err = buf.ReadFrom(fd)
return buf.String(), err
}
func retryingWriteFile(fd *os.File, data string) error {
for {
_, err := fd.Write([]byte(data))
if errors.Is(err, unix.EINTR) {
logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
continue
}
return err
}
}

View file

@ -0,0 +1,120 @@
package fscommon
import (
"os"
"strings"
"sync"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
const (
cgroupfsDir = "/sys/fs/cgroup"
cgroupfsPrefix = cgroupfsDir + "/"
)
var (
// TestMode is set to true by unit tests that need "fake" cgroupfs.
TestMode bool
cgroupFd int = -1
prepOnce sync.Once
prepErr error
resolveFlags uint64
)
func prepareOpenat2() error {
prepOnce.Do(func() {
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
Flags: unix.O_DIRECTORY | unix.O_PATH})
if err != nil {
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
if err != unix.ENOSYS {
logrus.Warnf("falling back to securejoin: %s", prepErr)
} else {
logrus.Debug("openat2 not available, falling back to securejoin")
}
return
}
var st unix.Statfs_t
if err = unix.Fstatfs(fd, &st); err != nil {
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
logrus.Warnf("falling back to securejoin: %s", prepErr)
return
}
cgroupFd = fd
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
if st.Type == unix.CGROUP2_SUPER_MAGIC {
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
}
})
return prepErr
}
// OpenFile opens a cgroup file in a given dir with given flags.
// It is supposed to be used for cgroup files only.
func OpenFile(dir, file string, flags int) (*os.File, error) {
if dir == "" {
return nil, errors.Errorf("no directory specified for %s", file)
}
mode := os.FileMode(0)
if TestMode && flags&os.O_WRONLY != 0 {
// "emulate" cgroup fs for unit tests
flags |= os.O_TRUNC | os.O_CREATE
mode = 0o600
}
if prepareOpenat2() != nil {
return openFallback(dir, file, flags, mode)
}
reldir := strings.TrimPrefix(dir, cgroupfsPrefix)
if len(reldir) == len(dir) { // non-standard path, old system?
return openFallback(dir, file, flags, mode)
}
relname := reldir + "/" + file
fd, err := unix.Openat2(cgroupFd, relname,
&unix.OpenHow{
Resolve: resolveFlags,
Flags: uint64(flags) | unix.O_CLOEXEC,
Mode: uint64(mode),
})
if err != nil {
return nil, &os.PathError{Op: "openat2", Path: dir + "/" + file, Err: err}
}
return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil
}
var errNotCgroupfs = errors.New("not a cgroup file")
// openFallback is used when openat2(2) is not available. It checks the opened
// file is on cgroupfs, returning an error otherwise.
func openFallback(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
path := dir + "/" + file
fd, err := os.OpenFile(path, flags, mode)
if err != nil {
return nil, err
}
if TestMode {
return fd, nil
}
// Check this is a cgroupfs file.
var st unix.Statfs_t
if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
_ = fd.Close()
return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
}
if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
_ = fd.Close()
return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
}
return fd, nil
}

View file

@ -0,0 +1,122 @@
// +build linux
package fscommon
import (
"errors"
"fmt"
"math"
"strconv"
"strings"
)
var (
ErrNotValidFormat = errors.New("line is not a valid key value format")
)
// ParseUint converts a string to an uint64 integer.
// Negative values are returned at zero as, due to kernel bugs,
// some of the memory cgroup stats can be negative.
func ParseUint(s string, base, bitSize int) (uint64, error) {
value, err := strconv.ParseUint(s, base, bitSize)
if err != nil {
intValue, intErr := strconv.ParseInt(s, base, bitSize)
// 1. Handle negative values greater than MinInt64 (and)
// 2. Handle negative values lesser than MinInt64
if intErr == nil && intValue < 0 {
return 0, nil
} else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 {
return 0, nil
}
return value, err
}
return value, nil
}
// ParseKeyValue parses a space-separated "name value" kind of cgroup
// parameter and returns its key as a string, and its value as uint64
// (ParseUint is used to convert the value). For example,
// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
func ParseKeyValue(t string) (string, uint64, error) {
parts := strings.SplitN(t, " ", 3)
if len(parts) != 2 {
return "", 0, fmt.Errorf("line %q is not in key value format", t)
}
value, err := ParseUint(parts[1], 10, 64)
if err != nil {
return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
}
return parts[0], value, nil
}
// GetValueByKey reads a key-value pairs from the specified cgroup file,
// and returns a value of the specified key. ParseUint is used for value
// conversion.
func GetValueByKey(path, file, key string) (uint64, error) {
content, err := ReadFile(path, file)
if err != nil {
return 0, err
}
lines := strings.Split(string(content), "\n")
for _, line := range lines {
arr := strings.Split(line, " ")
if len(arr) == 2 && arr[0] == key {
return ParseUint(arr[1], 10, 64)
}
}
return 0, nil
}
// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.
// If the value read is "max", the math.MaxUint64 is returned.
func GetCgroupParamUint(path, file string) (uint64, error) {
contents, err := GetCgroupParamString(path, file)
if err != nil {
return 0, err
}
contents = strings.TrimSpace(contents)
if contents == "max" {
return math.MaxUint64, nil
}
res, err := ParseUint(contents, 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse file %q", path+"/"+file)
}
return res, nil
}
// GetCgroupParamInt reads a single int64 value from specified cgroup file.
// If the value read is "max", the math.MaxInt64 is returned.
func GetCgroupParamInt(path, file string) (int64, error) {
contents, err := ReadFile(path, file)
if err != nil {
return 0, err
}
contents = strings.TrimSpace(contents)
if contents == "max" {
return math.MaxInt64, nil
}
res, err := strconv.ParseInt(contents, 10, 64)
if err != nil {
return res, fmt.Errorf("unable to parse %q as a int from Cgroup file %q", contents, path+"/"+file)
}
return res, nil
}
// GetCgroupParamString reads a string from the specified cgroup file.
func GetCgroupParamString(path, file string) (string, error) {
contents, err := ReadFile(path, file)
if err != nil {
return "", err
}
return strings.TrimSpace(contents), nil
}

View file

@ -39,6 +39,33 @@ type CpuStats struct {
ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
}
type CPUSetStats struct {
// List of the physical numbers of the CPUs on which processes
// in that cpuset are allowed to execute
CPUs []uint16 `json:"cpus,omitempty"`
// cpu_exclusive flag
CPUExclusive uint64 `json:"cpu_exclusive"`
// List of memory nodes on which processes in that cpuset
// are allowed to allocate memory
Mems []uint16 `json:"mems,omitempty"`
// mem_hardwall flag
MemHardwall uint64 `json:"mem_hardwall"`
// mem_exclusive flag
MemExclusive uint64 `json:"mem_exclusive"`
// memory_migrate flag
MemoryMigrate uint64 `json:"memory_migrate"`
// memory_spread page flag
MemorySpreadPage uint64 `json:"memory_spread_page"`
// memory_spread slab flag
MemorySpreadSlab uint64 `json:"memory_spread_slab"`
// memory_pressure
MemoryPressure uint64 `json:"memory_pressure"`
// sched_load balance flag
SchedLoadBalance uint64 `json:"sched_load_balance"`
// sched_relax_domain_level
SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
}
type MemoryData struct {
Usage uint64 `json:"usage,omitempty"`
MaxUsage uint64 `json:"max_usage,omitempty"`
@ -121,6 +148,7 @@ type HugetlbStats struct {
type Stats struct {
CpuStats CpuStats `json:"cpu_stats,omitempty"`
CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
MemoryStats MemoryStats `json:"memory_stats,omitempty"`
PidsStats PidsStats `json:"pids_stats,omitempty"`
BlkioStats BlkioStats `json:"blkio_stats,omitempty"`

View file

@ -15,7 +15,9 @@ import (
"sync"
"time"
units "github.com/docker/go-units"
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
"github.com/opencontainers/runc/libcontainer/userns"
"github.com/sirupsen/logrus"
"golang.org/x/sys/unix"
)
@ -29,19 +31,19 @@ var (
isUnified bool
)
// HugePageSizeUnitList is a list of the units used by the linux kernel when
// naming the HugePage control files.
// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed,
// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
func IsCgroup2UnifiedMode() bool {
isUnifiedOnce.Do(func() {
var st unix.Statfs_t
if err := unix.Statfs(unifiedMountpoint, &st); err != nil {
panic("cannot statfs cgroup root")
err := unix.Statfs(unifiedMountpoint, &st)
if err != nil {
if os.IsNotExist(err) && userns.RunningInUserNS() {
// ignore the "not found" error if running in userns
logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
isUnified = false
return
}
panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
}
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
})
@ -86,11 +88,11 @@ func GetAllSubsystems() ([]string, error) {
// - freezer: implemented in kernel 5.2
// We assume these are always available, as it is hard to detect availability.
pseudo := []string{"devices", "freezer"}
data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
data, err := fscommon.ReadFile("/sys/fs/cgroup", "cgroup.controllers")
if err != nil {
return nil, err
}
subsystems := append(pseudo, strings.Fields(string(data))...)
subsystems := append(pseudo, strings.Fields(data)...)
return subsystems, nil
}
f, err := os.Open("/proc/cgroups")
@ -207,20 +209,66 @@ func EnterPid(cgroupPaths map[string]string, pid int) error {
return nil
}
func rmdir(path string) error {
err := unix.Rmdir(path)
if err == nil || err == unix.ENOENT {
return nil
}
return &os.PathError{Op: "rmdir", Path: path, Err: err}
}
// RemovePath aims to remove cgroup path. It does so recursively,
// by removing any subdirectories (sub-cgroups) first.
func RemovePath(path string) error {
// try the fast path first
if err := rmdir(path); err == nil {
return nil
}
infos, err := ioutil.ReadDir(path)
if err != nil {
if os.IsNotExist(err) {
err = nil
}
return err
}
for _, info := range infos {
if info.IsDir() {
// We should remove subcgroups dir first
if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
break
}
}
}
if err == nil {
err = rmdir(path)
}
return err
}
// RemovePaths iterates over the provided paths removing them.
// We trying to remove all paths five times with increasing delay between tries.
// If after all there are not removed cgroups - appropriate error will be
// returned.
func RemovePaths(paths map[string]string) (err error) {
const retries = 5
delay := 10 * time.Millisecond
for i := 0; i < 5; i++ {
for i := 0; i < retries; i++ {
if i != 0 {
time.Sleep(delay)
delay *= 2
}
for s, p := range paths {
os.RemoveAll(p)
// TODO: here probably should be logging
if err := RemovePath(p); err != nil {
// do not log intermediate iterations
switch i {
case 0:
logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
case retries - 1:
logrus.WithError(err).Error("Failed to remove cgroup")
}
}
_, err := os.Stat(p)
// We need this strange way of checking cgroups existence because
// RemoveAll almost always returns error, even on already removed
@ -230,6 +278,8 @@ func RemovePaths(paths map[string]string) (err error) {
}
}
if len(paths) == 0 {
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
paths = make(map[string]string)
return nil
}
}
@ -237,27 +287,50 @@ func RemovePaths(paths map[string]string) (err error) {
}
func GetHugePageSize() ([]string, error) {
files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return []string{}, err
return nil, err
}
var fileNames []string
for _, st := range files {
fileNames = append(fileNames, st.Name())
files, err := dir.Readdirnames(0)
dir.Close()
if err != nil {
return nil, err
}
return getHugePageSizeFromFilenames(fileNames)
return getHugePageSizeFromFilenames(files)
}
func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
var pageSizes []string
for _, fileName := range fileNames {
nameArray := strings.Split(fileName, "-")
pageSize, err := units.RAMInBytes(nameArray[1])
if err != nil {
return []string{}, err
pageSizes := make([]string, 0, len(fileNames))
for _, file := range fileNames {
// example: hugepages-1048576kB
val := strings.TrimPrefix(file, "hugepages-")
if len(val) == len(file) {
// unexpected file name: no prefix found
continue
}
sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList)
pageSizes = append(pageSizes, sizeString)
// The suffix is always "kB" (as of Linux 5.9)
eLen := len(val) - 2
val = strings.TrimSuffix(val, "kB")
if len(val) != eLen {
logrus.Warnf("GetHugePageSize: %s: invalid filename suffix (expected \"kB\")", file)
continue
}
size, err := strconv.Atoi(val)
if err != nil {
return nil, err
}
// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
// but in our case the size is in KB already.
if size >= (1 << 20) {
val = strconv.Itoa(size>>20) + "GB"
} else if size >= (1 << 10) {
val = strconv.Itoa(size>>10) + "MB"
} else {
val += "KB"
}
pageSizes = append(pageSizes, val)
}
return pageSizes, nil
@ -303,14 +376,14 @@ func WriteCgroupProc(dir string, pid int) error {
return nil
}
cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
file, err := fscommon.OpenFile(dir, CgroupProcesses, os.O_WRONLY)
if err != nil {
return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
}
defer cgroupProcessesFile.Close()
defer file.Close()
for i := 0; i < 5; i++ {
_, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid))
_, err = file.WriteString(strconv.Itoa(pid))
if err == nil {
return nil
}
@ -327,17 +400,6 @@ func WriteCgroupProc(dir string, pid int) error {
return err
}
// Since the OCI spec is designed for cgroup v1, in some cases
// there is need to convert from the cgroup v1 configuration to cgroup v2
// the formula for BlkIOWeight is y = (1 + (x - 10) * 9999 / 990)
// convert linearly from [10-1000] to [1-10000]
func ConvertBlkIOToCgroupV2Value(blkIoWeight uint16) uint64 {
if blkIoWeight == 0 {
return 0
}
return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
}
// Since the OCI spec is designed for cgroup v1, in some cases
// there is need to convert from the cgroup v1 configuration to cgroup v2
// the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
@ -377,3 +439,14 @@ func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
return memorySwap - memory, nil
}
// Since the OCI spec is designed for cgroup v1, in some cases
// there is need to convert from the cgroup v1 configuration to cgroup v2
// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990)
// convert linearly from [10-1000] to [1-10000]
func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
if blkIoWeight == 0 {
return 0
}
return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
}

View file

@ -1,16 +1,16 @@
package cgroups
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"path/filepath"
"strings"
"sync"
"syscall"
securejoin "github.com/cyphar/filepath-securejoin"
"github.com/moby/sys/mountinfo"
"golang.org/x/sys/unix"
)
@ -23,7 +23,12 @@ const (
)
var (
errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1")
readMountinfoOnce sync.Once
readMountinfoErr error
cgroupMountinfo []*mountinfo.Info
)
type NotFoundError struct {
@ -90,6 +95,21 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
return path
}
// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones
// with fstype of "cgroup") for the current running process.
//
// The results are cached (to avoid re-reading mountinfo which is relatively
// expensive), so it is assumed that cgroup mounts are not being changed.
func readCgroupMountinfo() ([]*mountinfo.Info, error) {
readMountinfoOnce.Do(func() {
cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
mountinfo.FSTypeFilter("cgroup"),
)
})
return cgroupMountinfo, readMountinfoErr
}
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
if IsCgroup2UnifiedMode() {
@ -110,56 +130,28 @@ func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string,
return "", "", errUnified
}
// Avoid parsing mountinfo by checking if subsystem is valid/available.
if !isSubsystemAvailable(subsystem) {
return "", "", NewNotFoundError(subsystem)
}
f, err := os.Open("/proc/self/mountinfo")
mi, err := readCgroupMountinfo()
if err != nil {
return "", "", err
}
defer f.Close()
return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem)
}
func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
scanner := bufio.NewScanner(reader)
for scanner.Scan() {
txt := scanner.Text()
fields := strings.Fields(txt)
if len(fields) < 9 {
continue
}
if strings.HasPrefix(fields[4], cgroupPath) {
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) {
for _, mi := range mounts {
if strings.HasPrefix(mi.Mountpoint, cgroupPath) {
for _, opt := range strings.Split(mi.VFSOptions, ",") {
if opt == subsystem {
return fields[4], fields[3], nil
return mi.Mountpoint, mi.Root, nil
}
}
}
}
if err := scanner.Err(); err != nil {
return "", "", err
}
return "", "", NewNotFoundError(subsystem)
}
func isSubsystemAvailable(subsystem string) bool {
if IsCgroup2UnifiedMode() {
panic("don't call isSubsystemAvailable from cgroupv2 code")
}
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
return false
}
_, avail := cgroups[subsystem]
return avail
}
func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
if len(m.Subsystems) == 0 {
return "", fmt.Errorf("no subsystem for mount")
@ -168,25 +160,15 @@ func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
return getControllerPath(m.Subsystems[0], cgroups)
}
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) {
res := make([]Mount, 0, len(ss))
scanner := bufio.NewScanner(mi)
numFound := 0
for scanner.Scan() && numFound < len(ss) {
txt := scanner.Text()
sepIdx := strings.Index(txt, " - ")
if sepIdx == -1 {
return nil, fmt.Errorf("invalid mountinfo format")
}
if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
continue
}
fields := strings.Split(txt, " ")
for _, mi := range mounts {
m := Mount{
Mountpoint: fields[4],
Root: fields[3],
Mountpoint: mi.Mountpoint,
Root: mi.Root,
}
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
for _, opt := range strings.Split(mi.VFSOptions, ",") {
seen, known := ss[opt]
if !known || (!all && seen) {
continue
@ -199,19 +181,18 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
if len(m.Subsystems) > 0 || all {
res = append(res, m)
}
}
if err := scanner.Err(); err != nil {
return nil, err
if !all && numFound >= len(ss) {
break
}
}
return res, nil
}
func getCgroupMountsV1(all bool) ([]Mount, error) {
f, err := os.Open("/proc/self/mountinfo")
mi, err := readCgroupMountinfo()
if err != nil {
return nil, err
}
defer f.Close()
allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
if err != nil {
@ -222,7 +203,8 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
for s := range allSubsystems {
allMap[s] = false
}
return getCgroupMountsHelper(allMap, f, all)
return getCgroupMountsHelper(allMap, mi, all)
}
// GetOwnCgroup returns the relative path to the cgroup docker is running in.

View file

@ -2,6 +2,7 @@ package configs
import (
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
"github.com/opencontainers/runc/libcontainer/devices"
)
type FreezerState string
@ -42,7 +43,7 @@ type Cgroup struct {
type Resources struct {
// Devices is the set of access rules for devices in the container.
Devices []*DeviceRule `json:"devices"`
Devices []*devices.Rule `json:"devices"`
// Memory limit (in bytes)
Memory int64 `json:"memory"`
@ -53,12 +54,6 @@ type Resources struct {
// Total memory usage (memory + swap); set `-1` to enable unlimited swap
MemorySwap int64 `json:"memory_swap"`
// Kernel memory limit (in bytes)
KernelMemory int64 `json:"kernel_memory"`
// Kernel memory limit for TCP use (in bytes)
KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
// CPU shares (relative weight vs. other containers)
CpuShares uint64 `json:"cpu_shares"`
@ -127,6 +122,9 @@ type Resources struct {
// CpuWeight sets a proportional bandwidth limit.
CpuWeight uint64 `json:"cpu_weight"`
// Unified is cgroupv2-only key-value map.
Unified map[string]string `json:"unified"`
// SkipDevices allows to skip configuring device permissions.
// Used by e.g. kubelet while creating a parent cgroup (kubepods)
// common for many containers.

View file

@ -7,6 +7,7 @@ import (
"os/exec"
"time"
"github.com/opencontainers/runc/libcontainer/devices"
"github.com/opencontainers/runtime-spec/specs-go"
"github.com/pkg/errors"
"github.com/sirupsen/logrus"
@ -30,9 +31,10 @@ type IDMap struct {
// for syscalls. Additional architectures can be added by specifying them in
// Architectures.
type Seccomp struct {
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Syscalls []*Syscall `json:"syscalls"`
DefaultAction Action `json:"default_action"`
Architectures []string `json:"architectures"`
Syscalls []*Syscall `json:"syscalls"`
DefaultErrnoRet *uint `json:"default_errno_ret"`
}
// Action is taken upon rule match in Seccomp
@ -92,6 +94,9 @@ type Config struct {
// Path to a directory containing the container's root filesystem.
Rootfs string `json:"rootfs"`
// Umask is the umask to use inside of the container.
Umask *uint32 `json:"umask"`
// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
// bind mounts are writtable.
Readonlyfs bool `json:"readonlyfs"`
@ -104,7 +109,7 @@ type Config struct {
Mounts []*Mount `json:"mounts"`
// The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well!
Devices []*Device `json:"devices"`
Devices []*devices.Device `json:"devices"`
MountLabel string `json:"mount_label"`
@ -218,25 +223,25 @@ const (
// the runtime environment has been created but before the pivot_root has been executed.
// CreateRuntime is called immediately after the deprecated Prestart hook.
// CreateRuntime commands are called in the Runtime Namespace.
CreateRuntime = "createRuntime"
CreateRuntime HookName = "createRuntime"
// CreateContainer commands MUST be called as part of the create operation after
// the runtime environment has been created but before the pivot_root has been executed.
// CreateContainer commands are called in the Container namespace.
CreateContainer = "createContainer"
CreateContainer HookName = "createContainer"
// StartContainer commands MUST be called as part of the start operation and before
// the container process is started.
// StartContainer commands are called in the Container namespace.
StartContainer = "startContainer"
StartContainer HookName = "startContainer"
// Poststart commands are executed after the container init process starts.
// Poststart commands are called in the Runtime Namespace.
Poststart = "poststart"
Poststart HookName = "poststart"
// Poststop commands are executed after the container init process exits.
// Poststop commands are called in the Runtime Namespace.
Poststop = "poststop"
Poststop HookName = "poststop"
)
type Capabilities struct {
@ -383,7 +388,7 @@ func (c Command) Run(s *specs.State) error {
return err
case <-timerCh:
cmd.Process.Kill()
cmd.Wait()
<-errC
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
}
}

View file

@ -0,0 +1,9 @@
// +build gofuzz
package configs
func FuzzUnmarshalJSON(data []byte) int {
hooks := Hooks{}
_ = hooks.UnmarshalJSON(data)
return 1
}

View file

@ -1,16 +0,0 @@
// +build !windows
package configs
import (
"errors"
"golang.org/x/sys/unix"
)
func (d *DeviceRule) Mkdev() (uint64, error) {
if d.Major == Wildcard || d.Minor == Wildcard {
return 0, errors.New("cannot mkdev() device with wildcards")
}
return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
}

View file

@ -1,5 +0,0 @@
package configs
func (d *DeviceRule) Mkdev() (uint64, error) {
return 0, nil
}

View file

@ -0,0 +1,17 @@
package configs
import "github.com/opencontainers/runc/libcontainer/devices"
type (
// Deprecated: use libcontainer/devices.Device
Device = devices.Device
// Deprecated: use libcontainer/devices.Rule
DeviceRule = devices.Rule
// Deprecated: use libcontainer/devices.Type
DeviceType = devices.Type
// Deprecated: use libcontainer/devices.Permissions
DevicePermissions = devices.Permissions
)

View file

@ -56,7 +56,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
if nsFile == "" {
return false
}
_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
_, err := os.Stat("/proc/self/ns/" + nsFile)
// a namespace is supported if it exists and we have permissions to read it
supported = err == nil
supportedNamespaces[ns] = supported

View file

@ -1,4 +1,4 @@
package configs
package devices
import (
"fmt"
@ -11,7 +11,7 @@ const (
)
type Device struct {
DeviceRule
Rule
// Path to the device.
Path string `json:"path"`
@ -26,10 +26,10 @@ type Device struct {
Gid uint32 `json:"gid"`
}
// DevicePermissions is a cgroupv1-style string to represent device access. It
// Permissions is a cgroupv1-style string to represent device access. It
// has to be a string for backward compatibility reasons, hence why it has
// methods to do set operations.
type DevicePermissions string
type Permissions string
const (
deviceRead uint = (1 << iota)
@ -37,7 +37,7 @@ const (
deviceMknod
)
func (p DevicePermissions) toSet() uint {
func (p Permissions) toSet() uint {
var set uint
for _, perm := range p {
switch perm {
@ -52,7 +52,7 @@ func (p DevicePermissions) toSet() uint {
return set
}
func fromSet(set uint) DevicePermissions {
func fromSet(set uint) Permissions {
var perm string
if set&deviceRead == deviceRead {
perm += "r"
@ -63,53 +63,53 @@ func fromSet(set uint) DevicePermissions {
if set&deviceMknod == deviceMknod {
perm += "m"
}
return DevicePermissions(perm)
return Permissions(perm)
}
// Union returns the union of the two sets of DevicePermissions.
func (p DevicePermissions) Union(o DevicePermissions) DevicePermissions {
// Union returns the union of the two sets of Permissions.
func (p Permissions) Union(o Permissions) Permissions {
lhs := p.toSet()
rhs := o.toSet()
return fromSet(lhs | rhs)
}
// Difference returns the set difference of the two sets of DevicePermissions.
// Difference returns the set difference of the two sets of Permissions.
// In set notation, A.Difference(B) gives you A\B.
func (p DevicePermissions) Difference(o DevicePermissions) DevicePermissions {
func (p Permissions) Difference(o Permissions) Permissions {
lhs := p.toSet()
rhs := o.toSet()
return fromSet(lhs &^ rhs)
}
// Intersection computes the intersection of the two sets of DevicePermissions.
func (p DevicePermissions) Intersection(o DevicePermissions) DevicePermissions {
// Intersection computes the intersection of the two sets of Permissions.
func (p Permissions) Intersection(o Permissions) Permissions {
lhs := p.toSet()
rhs := o.toSet()
return fromSet(lhs & rhs)
}
// IsEmpty returns whether the set of permissions in a DevicePermissions is
// IsEmpty returns whether the set of permissions in a Permissions is
// empty.
func (p DevicePermissions) IsEmpty() bool {
return p == DevicePermissions("")
func (p Permissions) IsEmpty() bool {
return p == Permissions("")
}
// IsValid returns whether the set of permissions is a subset of valid
// permissions (namely, {r,w,m}).
func (p DevicePermissions) IsValid() bool {
func (p Permissions) IsValid() bool {
return p == fromSet(p.toSet())
}
type DeviceType rune
type Type rune
const (
WildcardDevice DeviceType = 'a'
BlockDevice DeviceType = 'b'
CharDevice DeviceType = 'c' // or 'u'
FifoDevice DeviceType = 'p'
WildcardDevice Type = 'a'
BlockDevice Type = 'b'
CharDevice Type = 'c' // or 'u'
FifoDevice Type = 'p'
)
func (t DeviceType) IsValid() bool {
func (t Type) IsValid() bool {
switch t {
case WildcardDevice, BlockDevice, CharDevice, FifoDevice:
return true
@ -118,7 +118,7 @@ func (t DeviceType) IsValid() bool {
}
}
func (t DeviceType) CanMknod() bool {
func (t Type) CanMknod() bool {
switch t {
case BlockDevice, CharDevice, FifoDevice:
return true
@ -127,7 +127,7 @@ func (t DeviceType) CanMknod() bool {
}
}
func (t DeviceType) CanCgroup() bool {
func (t Type) CanCgroup() bool {
switch t {
case WildcardDevice, BlockDevice, CharDevice:
return true
@ -136,10 +136,10 @@ func (t DeviceType) CanCgroup() bool {
}
}
type DeviceRule struct {
type Rule struct {
// Type of device ('c' for char, 'b' for block). If set to 'a', this rule
// acts as a wildcard and all fields other than Allow are ignored.
Type DeviceType `json:"type"`
Type Type `json:"type"`
// Major is the device's major number.
Major int64 `json:"major"`
@ -149,13 +149,13 @@ type DeviceRule struct {
// Permissions is the set of permissions that this rule applies to (in the
// cgroupv1 format -- any combination of "rwm").
Permissions DevicePermissions `json:"permissions"`
Permissions Permissions `json:"permissions"`
// Allow specifies whether this rule is allowed.
Allow bool `json:"allow"`
}
func (d *DeviceRule) CgroupString() string {
func (d *Rule) CgroupString() string {
var (
major = strconv.FormatInt(d.Major, 10)
minor = strconv.FormatInt(d.Minor, 10)
@ -168,3 +168,7 @@ func (d *DeviceRule) CgroupString() string {
}
return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
}
func (d *Rule) Mkdev() (uint64, error) {
return mkDev(d)
}

View file

@ -1,3 +1,5 @@
// +build !windows
package devices
import (
@ -6,7 +8,6 @@ import (
"os"
"path/filepath"
"github.com/opencontainers/runc/libcontainer/configs"
"golang.org/x/sys/unix"
)
@ -21,9 +22,16 @@ var (
ioutilReadDir = ioutil.ReadDir
)
func mkDev(d *Rule) (uint64, error) {
if d.Major == Wildcard || d.Minor == Wildcard {
return 0, errors.New("cannot mkdev() device with wildcards")
}
return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
}
// Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the
// information about a linux device and return that information as a Device struct.
func DeviceFromPath(path, permissions string) (*configs.Device, error) {
func DeviceFromPath(path, permissions string) (*Device, error) {
var stat unix.Stat_t
err := unixLstat(path, &stat)
if err != nil {
@ -31,7 +39,7 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
}
var (
devType configs.DeviceType
devType Type
mode = stat.Mode
devNumber = uint64(stat.Rdev)
major = unix.Major(devNumber)
@ -39,41 +47,41 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
)
switch mode & unix.S_IFMT {
case unix.S_IFBLK:
devType = configs.BlockDevice
devType = BlockDevice
case unix.S_IFCHR:
devType = configs.CharDevice
devType = CharDevice
case unix.S_IFIFO:
devType = configs.FifoDevice
devType = FifoDevice
default:
return nil, ErrNotADevice
}
return &configs.Device{
DeviceRule: configs.DeviceRule{
return &Device{
Rule: Rule{
Type: devType,
Major: int64(major),
Minor: int64(minor),
Permissions: configs.DevicePermissions(permissions),
Permissions: Permissions(permissions),
},
Path: path,
FileMode: os.FileMode(mode),
FileMode: os.FileMode(mode &^ unix.S_IFMT),
Uid: stat.Uid,
Gid: stat.Gid,
}, nil
}
// HostDevices returns all devices that can be found under /dev directory.
func HostDevices() ([]*configs.Device, error) {
func HostDevices() ([]*Device, error) {
return GetDevices("/dev")
}
// GetDevices recursively traverses a directory specified by path
// and returns all devices found there.
func GetDevices(path string) ([]*configs.Device, error) {
func GetDevices(path string) ([]*Device, error) {
files, err := ioutilReadDir(path)
if err != nil {
return nil, err
}
var out []*configs.Device
var out []*Device
for _, f := range files {
switch {
case f.IsDir():
@ -104,7 +112,7 @@ func GetDevices(path string) ([]*configs.Device, error) {
}
return nil, err
}
if device.Type == configs.FifoDevice {
if device.Type == FifoDevice {
continue
}
out = append(out, device)

View file

@ -59,14 +59,38 @@
#include <sys/syscall.h>
/* Use our own wrapper for memfd_create. */
#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
# define SYS_memfd_create __NR_memfd_create
#ifndef SYS_memfd_create
# ifdef __NR_memfd_create
# define SYS_memfd_create __NR_memfd_create
# else
/* These values come from <https://fedora.juszkiewicz.com.pl/syscalls.html>. */
# warning "libc is outdated -- using hard-coded SYS_memfd_create"
# if defined(__x86_64__)
# define SYS_memfd_create 319
# elif defined(__i386__)
# define SYS_memfd_create 356
# elif defined(__ia64__)
# define SYS_memfd_create 1340
# elif defined(__arm__)
# define SYS_memfd_create 385
# elif defined(__aarch64__)
# define SYS_memfd_create 279
# elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__)
# define SYS_memfd_create 360
# elif defined(__s390__) || defined(__s390x__)
# define SYS_memfd_create 350
# else
# warning "unknown architecture -- cannot hard-code SYS_memfd_create"
# endif
# endif
#endif
/* memfd_create(2) flags -- copied from <linux/memfd.h>. */
#ifndef MFD_CLOEXEC
# define MFD_CLOEXEC 0x0001U
# define MFD_ALLOW_SEALING 0x0002U
#endif
int memfd_create(const char *name, unsigned int flags)
{
#ifdef SYS_memfd_create
@ -77,7 +101,6 @@ int memfd_create(const char *name, unsigned int flags)
#endif
}
/* This comes directly from <linux/fcntl.h>. */
#ifndef F_LINUX_SPECIFIC_BASE
# define F_LINUX_SPECIFIC_BASE 1024
@ -103,7 +126,7 @@ static void *must_realloc(void *ptr, size_t size)
void *old = ptr;
do {
ptr = realloc(old, size);
} while(!ptr);
} while (!ptr);
return ptr;
}
@ -115,10 +138,10 @@ static void *must_realloc(void *ptr, size_t size)
static int is_self_cloned(void)
{
int fd, ret, is_cloned = 0;
struct stat statbuf = {};
struct statfs fsbuf = {};
struct stat statbuf = { };
struct statfs fsbuf = { };
fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
if (fd < 0) {
fprintf(stderr, "you have no read access to runc binary file\n");
return -ENOTRECOVERABLE;
@ -274,7 +297,7 @@ enum {
static int make_execfd(int *fdtype)
{
int fd = -1;
char template[PATH_MAX] = {0};
char template[PATH_MAX] = { 0 };
char *prefix = getenv("_LIBCONTAINER_STATEDIR");
if (!prefix || *prefix != '/')
@ -303,7 +326,7 @@ static int make_execfd(int *fdtype)
*fdtype = EFD_FILE;
fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
if (fd >= 0) {
struct stat statbuf = {};
struct stat statbuf = { };
bool working_otmpfile = false;
/*
@ -348,27 +371,27 @@ static int seal_execfd(int *fd, int fdtype)
switch (fdtype) {
case EFD_MEMFD:
return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
case EFD_FILE: {
/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
int newfd;
char fdpath[PATH_MAX] = {0};
case EFD_FILE:{
/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
int newfd;
char fdpath[PATH_MAX] = { 0 };
if (fchmod(*fd, 0100) < 0)
return -1;
if (fchmod(*fd, 0100) < 0)
return -1;
if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
return -1;
if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
return -1;
newfd = open(fdpath, O_PATH | O_CLOEXEC);
if (newfd < 0)
return -1;
newfd = open(fdpath, O_PATH | O_CLOEXEC);
if (newfd < 0)
return -1;
close(*fd);
*fd = newfd;
return 0;
}
close(*fd);
*fd = newfd;
return 0;
}
default:
break;
break;
}
return -1;
}
@ -376,7 +399,7 @@ static int seal_execfd(int *fd, int fdtype)
static int try_bindfd(void)
{
int fd, ret = -1;
char template[PATH_MAX] = {0};
char template[PATH_MAX] = { 0 };
char *prefix = getenv("_LIBCONTAINER_STATEDIR");
if (!prefix || *prefix != '/')
@ -404,7 +427,6 @@ static int try_bindfd(void)
if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
goto out_umount;
/* Get read-only handle that we're sure can't be made read-write. */
ret = open(template, O_PATH | O_CLOEXEC);
@ -448,7 +470,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
if (n < 0)
return -1;
nwritten += n;
} while(nwritten < nread);
} while (nwritten < nread);
total += nwritten;
}
@ -459,7 +481,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
static int clone_binary(void)
{
int binfd, execfd;
struct stat statbuf = {};
struct stat statbuf = { };
size_t sent = 0;
int fdtype = EFD_NONE;

View file

@ -0,0 +1,142 @@
#include <stdlib.h>
#include <string.h>
#ifdef ESCAPE_TEST
# include <assert.h>
# define test_assert(arg) assert(arg)
#else
# define test_assert(arg)
#endif
#define DEL '\x7f'
/*
* Poor man version of itoa with base=16 and input number from 0 to 15,
* represented by a char. Converts it to a single hex digit ('0' to 'f').
*/
static char hex(char i)
{
test_assert(i >= 0 && i < 16);
if (i >= 0 && i < 10) {
return '0' + i;
}
if (i >= 10 && i < 16) {
return 'a' + i - 10;
}
return '?';
}
/*
* Given the character, tells how many _extra_ characters are needed
* to JSON-escape it. If 0 is returned, the character does not need to
* be escaped.
*/
static int need_escape(char c)
{
switch (c) {
case '\\':
case '"':
case '\b':
case '\n':
case '\r':
case '\t':
case '\f':
return 1;
case DEL: // -> \u007f
return 5;
default:
if (c > 0 && c < ' ') {
// ASCII decimal 01 to 31 -> \u00xx
return 5;
}
return 0;
}
}
/*
* Escape the string so it can be used as a JSON string (per RFC4627,
* section 2.5 minimal requirements, plus the DEL (0x7f) character).
*
* It is expected that the argument is a string allocated via malloc.
* In case no escaping is needed, the original string is returned as is;
* otherwise, the original string is free'd, and the newly allocated
* escaped string is returned. Thus, in any case, the value returned
* need to be free'd by the caller.
*/
char *escape_json_string(char *s)
{
int i, j, len;
char *c, *out;
/*
* First, check if escaping is at all needed -- if not, we can avoid
* malloc and return the argument as is. While at it, count how much
* extra space is required.
*
* XXX: the counting code must be in sync with the escaping code
* (checked by test_assert()s below).
*/
for (i = j = 0; s[i] != '\0'; i++) {
j += need_escape(s[i]);
}
if (j == 0) {
// nothing to escape
return s;
}
len = i + j + 1;
out = malloc(len);
if (!out) {
free(s);
// As malloc failed, strdup can fail, too, so in the worst case
// scenario NULL will be returned from here.
return strdup("escape_json_string: out of memory");
}
for (c = s, j = 0; *c != '\0'; c++) {
switch (*c) {
case '"':
case '\\':
test_assert(need_escape(*c) == 1);
out[j++] = '\\';
out[j++] = *c;
continue;
}
if ((*c < 0 || *c >= ' ') && (*c != DEL)) {
// no escape needed
test_assert(need_escape(*c) == 0);
out[j++] = *c;
continue;
}
out[j++] = '\\';
switch (*c) {
case '\b':
out[j++] = 'b';
break;
case '\n':
out[j++] = 'n';
break;
case '\r':
out[j++] = 'r';
break;
case '\t':
out[j++] = 't';
break;
case '\f':
out[j++] = 'f';
break;
default:
test_assert(need_escape(*c) == 5);
out[j++] = 'u';
out[j++] = '0';
out[j++] = '0';
out[j++] = hex(*c >> 4);
out[j++] = hex(*c & 0x0f);
}
}
test_assert(j + 1 == len);
out[j] = '\0';
free(s);
return out;
}

View file

@ -29,6 +29,8 @@
/* Get all of the CLONE_NEW* flags. */
#include "namespace.h"
extern char *escape_json_string(char *str);
/* Synchronisation values. */
enum sync_t {
SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
@ -36,7 +38,7 @@ enum sync_t {
SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */
};
/*
@ -45,10 +47,14 @@ enum sync_t {
*/
#define CREATECGROUPNS 0x80
#define STAGE_SETUP -1
/* longjmp() arguments. */
#define JUMP_PARENT 0x00
#define JUMP_CHILD 0xA0
#define JUMP_INIT 0xA1
#define STAGE_PARENT 0
#define STAGE_CHILD 1
#define STAGE_INIT 2
/* Stores the current stage of nsexec. */
int current_stage = STAGE_SETUP;
/* Assume the stack grows down, so arguments should be above it. */
struct clone_t {
@ -56,7 +62,7 @@ struct clone_t {
* Reserve some space for clone() to locate arguments
* and retcode in this place
*/
char stack[4096] __attribute__ ((aligned(16)));
char stack[4096] __attribute__((aligned(16)));
char stack_ptr[0];
/* There's two children. This is used to execute the different code. */
@ -102,31 +108,31 @@ static int logfd = -1;
* List of netlink message types sent to us as part of bootstrapping the init.
* These constants are defined in libcontainer/message_linux.go.
*/
#define INIT_MSG 62000
#define INIT_MSG 62000
#define CLONE_FLAGS_ATTR 27281
#define NS_PATHS_ATTR 27282
#define UIDMAP_ATTR 27283
#define GIDMAP_ATTR 27284
#define UIDMAP_ATTR 27283
#define GIDMAP_ATTR 27284
#define SETGROUP_ATTR 27285
#define OOM_SCORE_ADJ_ATTR 27286
#define ROOTLESS_EUID_ATTR 27287
#define UIDMAPPATH_ATTR 27288
#define GIDMAPPATH_ATTR 27289
#define UIDMAPPATH_ATTR 27288
#define GIDMAPPATH_ATTR 27289
/*
* Use the raw syscall for versions of glibc which don't include a function for
* it, namely (glibc 2.12).
*/
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
# define _GNU_SOURCE
# include "syscall.h"
# if !defined(SYS_setns) && defined(__NR_setns)
# define SYS_setns __NR_setns
# endif
# define _GNU_SOURCE
# include "syscall.h"
# if !defined(SYS_setns) && defined(__NR_setns)
# define SYS_setns __NR_setns
# endif
#ifndef SYS_setns
# error "setns(2) syscall not supported by glibc version"
#endif
# ifndef SYS_setns
# error "setns(2) syscall not supported by glibc version"
# endif
int setns(int fd, int nstype)
{
@ -134,33 +140,43 @@ int setns(int fd, int nstype)
}
#endif
static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...)
static void write_log(const char *level, const char *format, ...)
{
char message[1024] = {};
char *message = NULL, *stage = NULL;
va_list args;
int ret;
if (logfd < 0 || level == NULL)
return;
goto out;
va_start(args, format);
if (vsnprintf(message, sizeof(message), format, args) < 0)
goto done;
dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message);
done:
ret = vasprintf(&message, format, args);
va_end(args);
}
if (ret < 0)
goto out;
#define write_log(level, fmt, ...) \
write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__)
message = escape_json_string(message);
if (current_stage == STAGE_SETUP)
stage = strdup("nsexec");
else
ret = asprintf(&stage, "nsexec-%d", current_stage);
if (ret < 0)
goto out;
dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message);
out:
free(message);
free(stage);
}
/* XXX: This is ugly. */
static int syncfd = -1;
#define bail(fmt, ...) \
do { \
write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \
write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \
exit(1); \
} while(0)
@ -187,7 +203,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...)
goto out;
}
out:
out:
close(fd);
return ret;
}
@ -297,9 +313,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
if (map == NULL || map_len <= 0)
return;
write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map);
if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
if (errno != EPERM)
bail("failed to update /proc/%d/uid_map", pid);
write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path);
if (try_mapping_tool(path, pid, map, map_len))
bail("failed to use newuid map on %d", pid);
}
@ -310,9 +328,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
if (map == NULL || map_len <= 0)
return;
write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map);
if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
if (errno != EPERM)
bail("failed to update /proc/%d/gid_map", pid);
write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path);
if (try_mapping_tool(path, pid, map, map_len))
bail("failed to use newgid map on %d", pid);
}
@ -323,19 +343,20 @@ static void update_oom_score_adj(char *data, size_t len)
if (data == NULL || len <= 0)
return;
write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data);
if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
bail("failed to update /proc/self/oom_score_adj");
}
/* A dummy function that just jumps to the given jumpval. */
static int child_func(void *arg) __attribute__ ((noinline));
static int child_func(void *arg) __attribute__((noinline));
static int child_func(void *arg)
{
struct clone_t *ca = (struct clone_t *)arg;
longjmp(*ca->env, ca->jmpval);
}
static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
static int clone_parent(jmp_buf *env, int jmpval)
{
struct clone_t ca = {
@ -507,7 +528,6 @@ void join_namespaces(char *nslist)
char *namespace = strtok_r(nslist, ",", &saveptr);
struct namespace_t {
int fd;
int ns;
char type[PATH_MAX];
char path[PATH_MAX];
} *namespaces = NULL;
@ -542,7 +562,7 @@ void join_namespaces(char *nslist)
bail("failed to open %s", path);
ns->fd = fd;
ns->ns = nsflag(namespace);
strncpy(ns->type, namespace, PATH_MAX - 1);
strncpy(ns->path, path, PATH_MAX - 1);
ns->path[PATH_MAX - 1] = '\0';
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
@ -555,12 +575,14 @@ void join_namespaces(char *nslist)
*/
for (i = 0; i < num; i++) {
struct namespace_t ns = namespaces[i];
struct namespace_t *ns = &namespaces[i];
int flag = nsflag(ns->type);
if (setns(ns.fd, ns.ns) < 0)
bail("failed to setns to %s", ns.path);
write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
if (setns(ns->fd, flag) < 0)
bail("failed to setns into %s namespace", ns->type);
close(ns.fd);
close(ns->fd);
}
free(namespaces);
@ -569,6 +591,14 @@ void join_namespaces(char *nslist)
/* Defined in cloned_binary.c. */
extern int ensure_cloned_binary(void);
static inline int sane_kill(pid_t pid, int signum)
{
if (pid > 0)
return kill(pid, signum);
else
return 0;
}
void nsexec(void)
{
int pipenum;
@ -598,7 +628,14 @@ void nsexec(void)
if (ensure_cloned_binary() < 0)
bail("could not ensure we are a cloned binary");
write_log(DEBUG, "nsexec started");
/*
* Inform the parent we're past initial setup.
* For the other side of this, see initWaiter.
*/
if (write(pipenum, "", 1) != 1)
bail("could not inform the parent we are past initial setup");
write_log(DEBUG, "=> nsexec container setup");
/* Parse all of the netlink configuration. */
nl_parse(pipenum, &config);
@ -622,6 +659,7 @@ void nsexec(void)
* containers), which is the recommendation from the kernel folks.
*/
if (config.namespaces) {
write_log(DEBUG, "set process as non-dumpable");
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to set process as non-dumpable");
}
@ -686,45 +724,49 @@ void nsexec(void)
* -- Aleksa "what has my life come to?" Sarai
*/
switch (setjmp(env)) {
current_stage = setjmp(env);
switch (current_stage) {
/*
* Stage 0: We're in the parent. Our job is just to create a new child
* (stage 1: JUMP_CHILD) process and write its uid_map and
* (stage 1: STAGE_CHILD) process and write its uid_map and
* gid_map. That process will go on to create a new process, then
* it will send us its PID which we will send to the bootstrap
* process.
*/
case JUMP_PARENT:{
case STAGE_PARENT:{
int len;
pid_t child, first_child = -1;
bool ready = false;
pid_t stage1_pid = -1, stage2_pid = -1;
bool stage1_complete, stage2_complete;
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
write_log(DEBUG, "~> nsexec stage-0");
/* Start the process of getting a container. */
child = clone_parent(&env, JUMP_CHILD);
if (child < 0)
bail("unable to fork: child_func");
write_log(DEBUG, "spawn stage-1");
stage1_pid = clone_parent(&env, STAGE_CHILD);
if (stage1_pid < 0)
bail("unable to spawn stage-1");
/*
* State machine for synchronisation with the children.
*
* Father only return when both child and grandchild are
* ready, so we can receive all possible error codes
* generated by children.
*/
syncfd = sync_child_pipe[1];
close(sync_child_pipe[0]);
while (!ready) {
/*
* State machine for synchronisation with the children. We only
* return once both the child and grandchild are ready.
*/
write_log(DEBUG, "-> stage-1 synchronisation loop");
stage1_complete = false;
while (!stage1_complete) {
enum sync_t s;
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with child: next state");
bail("failed to sync with stage-1: next state");
switch (s) {
case SYNC_USERMAP_PLS:
write_log(DEBUG, "stage-1 requested userns mappings");
/*
* Enable setgroups(2) if we've been asked to. But we also
* have to explicitly disable setgroups(2) if we're
@ -735,70 +777,78 @@ void nsexec(void)
* For rootless multi-entry mapping, config.is_setgroup shall be true and
* newuidmap/newgidmap shall be used.
*/
if (config.is_rootless_euid && !config.is_setgroup)
update_setgroups(child, SETGROUPS_DENY);
update_setgroups(stage1_pid, SETGROUPS_DENY);
/* Set up mappings. */
update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
s = SYNC_USERMAP_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
sane_kill(stage1_pid, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
}
break;
case SYNC_RECVPID_PLS:{
first_child = child;
case SYNC_RECVPID_PLS:
write_log(DEBUG, "stage-1 requested pid to be forwarded");
/* Get the init_func pid. */
if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
kill(first_child, SIGKILL);
bail("failed to sync with child: read(childpid)");
}
/* Get the stage-2 pid. */
if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
sane_kill(stage1_pid, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with stage-1: read(stage2_pid)");
}
/* Send ACK. */
s = SYNC_RECVPID_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(first_child, SIGKILL);
kill(child, SIGKILL);
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
}
/* Send ACK. */
s = SYNC_RECVPID_ACK;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
sane_kill(stage1_pid, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
}
/* Send the init_func pid back to our parent.
*
* Send the init_func pid and the pid of the first child back to our parent.
* We need to send both back because we can't reap the first child we created (CLONE_PARENT).
* It becomes the responsibility of our parent to reap the first child.
*/
len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
if (len < 0) {
kill(child, SIGKILL);
bail("unable to generate JSON for child pid");
}
/*
* Send both the stage-1 and stage-2 pids back to runc.
* runc needs the stage-2 to continue process management,
* but because stage-1 was spawned with CLONE_PARENT we
* cannot reap it within stage-0 and thus we need to ask
* runc to reap the zombie for us.
*/
write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
stage1_pid, stage2_pid);
len =
dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
stage2_pid);
if (len < 0) {
sane_kill(stage1_pid, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with runc: write(pid-JSON)");
}
break;
case SYNC_CHILD_READY:
ready = true;
case SYNC_CHILD_FINISH:
write_log(DEBUG, "stage-1 complete");
stage1_complete = true;
break;
default:
bail("unexpected sync value: %u", s);
}
}
write_log(DEBUG, "<- stage-1 synchronisation loop");
/* Now sync with grandchild. */
syncfd = sync_grandchild_pipe[1];
close(sync_grandchild_pipe[0]);
ready = false;
while (!ready) {
write_log(DEBUG, "-> stage-2 synchronisation loop");
stage2_complete = false;
while (!stage2_complete) {
enum sync_t s;
write_log(DEBUG, "signalling stage-2 to run");
s = SYNC_GRANDCHILD;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with child: write(SYNC_GRANDCHILD)");
}
@ -806,27 +856,31 @@ void nsexec(void)
bail("failed to sync with child: next state");
switch (s) {
case SYNC_CHILD_READY:
ready = true;
case SYNC_CHILD_FINISH:
write_log(DEBUG, "stage-2 complete");
stage2_complete = true;
break;
default:
bail("unexpected sync value: %u", s);
}
}
write_log(DEBUG, "<- stage-2 synchronisation loop");
write_log(DEBUG, "<~ nsexec stage-0");
exit(0);
}
break;
/*
* Stage 1: We're in the first child process. Our job is to join any
* provided namespaces in the netlink payload and unshare all
* of the requested namespaces. If we've been asked to
* CLONE_NEWUSER, we will ask our parent (stage 0) to set up
* our user mappings for us. Then, we create a new child
* (stage 2: JUMP_INIT) for PID namespace. We then send the
* child's PID to our parent (stage 0).
* provided namespaces in the netlink payload and unshare all of
* the requested namespaces. If we've been asked to CLONE_NEWUSER,
* we will ask our parent (stage 0) to set up our user mappings
* for us. Then, we create a new child (stage 2: STAGE_INIT) for
* PID namespace. We then send the child's PID to our parent
* (stage 0).
*/
case JUMP_CHILD:{
pid_t child;
case STAGE_CHILD:{
pid_t stage2_pid = -1;
enum sync_t s;
/* We're in a child and thus need to tell the parent if we die. */
@ -835,11 +889,12 @@ void nsexec(void)
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
write_log(DEBUG, "~> nsexec stage-1");
/*
* We need to setns first. We cannot do this earlier (in stage 0)
* because of the fact that we forked to get here (the PID of
* [stage 2: JUMP_INIT]) would be meaningless). We could send it
* [stage 2: STAGE_INIT]) would be meaningless). We could send it
* using cmsg(3) but that's just annoying.
*/
if (config.namespaces)
@ -865,40 +920,50 @@ void nsexec(void)
* problem.
*/
if (config.cloneflags & CLONE_NEWUSER) {
write_log(DEBUG, "unshare user namespace");
if (unshare(CLONE_NEWUSER) < 0)
bail("failed to unshare user namespace");
config.cloneflags &= ~CLONE_NEWUSER;
/*
* We don't have the privileges to do any mapping here (see the
* clone_parent rant). So signal our parent to hook us up.
* We need to set ourselves as dumpable temporarily so that the
* parent process can write to our procfs files.
*/
/* Switching is only necessary if we joined namespaces. */
if (config.namespaces) {
write_log(DEBUG, "temporarily set process as dumpable");
if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
bail("failed to set process as dumpable");
bail("failed to temporarily set process as dumpable");
}
/*
* We don't have the privileges to do any mapping here (see the
* clone_parent rant). So signal stage-0 to do the mapping for
* us.
*/
write_log(DEBUG, "request stage-0 to map user namespace");
s = SYNC_USERMAP_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
/* ... wait for mapping ... */
write_log(DEBUG, "request stage-0 to map user namespace");
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
if (s != SYNC_USERMAP_ACK)
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
/* Switching is only necessary if we joined namespaces. */
/* Revert temporary re-dumpable setting. */
if (config.namespaces) {
write_log(DEBUG, "re-set process as non-dumpable");
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
bail("failed to set process as dumpable");
bail("failed to re-set process as non-dumpable");
}
/* Become root in the namespace proper. */
if (setresuid(0, 0, 0) < 0)
bail("failed to become root in user namespace");
}
/*
* Unshare all of the namespaces. Now, it should be noted that this
* ordering might break in the future (especially with rootless
@ -909,8 +974,9 @@ void nsexec(void)
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
* was broken, so we'll just do it the long way anyway.
*/
write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
bail("failed to unshare namespaces");
bail("failed to unshare remaining namespaces (except cgroupns)");
/*
* TODO: What about non-namespace clone flags that we're dropping here?
@ -921,41 +987,45 @@ void nsexec(void)
* which would break many applications and libraries, so we must fork
* to actually enter the new PID namespace.
*/
child = clone_parent(&env, JUMP_INIT);
if (child < 0)
bail("unable to fork: init_func");
write_log(DEBUG, "spawn stage-2");
stage2_pid = clone_parent(&env, STAGE_INIT);
if (stage2_pid < 0)
bail("unable to spawn stage-2");
/* Send the child to our parent, which knows what it's doing. */
write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
s = SYNC_RECVPID_PLS;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
}
if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
kill(child, SIGKILL);
bail("failed to sync with parent: write(childpid)");
if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(stage2_pid)");
}
/* ... wait for parent to get the pid ... */
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
}
if (s != SYNC_RECVPID_ACK) {
kill(child, SIGKILL);
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
}
s = SYNC_CHILD_READY;
write_log(DEBUG, "signal completion to stage-0");
s = SYNC_CHILD_FINISH;
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
kill(child, SIGKILL);
bail("failed to sync with parent: write(SYNC_CHILD_READY)");
sane_kill(stage2_pid, SIGKILL);
bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
}
/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
/* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
write_log(DEBUG, "<~ nsexec stage-1");
exit(0);
}
break;
/*
* Stage 2: We're the final child process, and the only process that will
@ -963,7 +1033,7 @@ void nsexec(void)
* final cleanup steps and then return to the Go runtime to allow
* init_linux.go to run.
*/
case JUMP_INIT:{
case STAGE_INIT:{
/*
* We're inside the child now, having jumped from the
* start_child() code after forking in the parent.
@ -978,6 +1048,7 @@ void nsexec(void)
/* For debugging. */
prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
write_log(DEBUG, "~> nsexec stage-2");
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
@ -998,21 +1069,30 @@ void nsexec(void)
bail("setgroups failed");
}
/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
/*
* Wait until our topmost parent has finished cgroup setup in
* p.manager.Apply().
*
* TODO(cyphar): Check if this code is actually needed because we
* should be in the cgroup even from stage-0, so
* waiting until now might not make sense.
*/
if (config.cloneflags & CLONE_NEWCGROUP) {
uint8_t value;
if (read(pipenum, &value, sizeof(value)) != sizeof(value))
bail("read synchronisation value failed");
if (value == CREATECGROUPNS) {
write_log(DEBUG, "unshare cgroup namespace");
if (unshare(CLONE_NEWCGROUP) < 0)
bail("failed to unshare cgroup namespace");
} else
bail("received unknown synchronisation value");
}
s = SYNC_CHILD_READY;
write_log(DEBUG, "signal completion to stage-0");
s = SYNC_CHILD_FINISH;
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
bail("failed to sync with patent: write(SYNC_CHILD_READY)");
bail("failed to sync with patent: write(SYNC_CHILD_FINISH)");
/* Close sync pipes. */
close(sync_grandchild_pipe[0]);
@ -1021,10 +1101,13 @@ void nsexec(void)
nl_free(&config);
/* Finish executing, let the Go runtime take over. */
write_log(DEBUG, "<= nsexec container setup");
write_log(DEBUG, "booting up go runtime ...");
return;
}
break;
default:
bail("unexpected jump value");
bail("unknown stage '%d' for jump value", current_stage);
}
/* Should never be reached. */

View file

@ -0,0 +1 @@
../escape.c

View file

@ -0,0 +1,53 @@
package escapetest
// This file is part of escape_json_string unit test.
// It is in a separate package so cgo can be used together
// with go test.
// #include <stdlib.h>
// extern char *escape_json_string(char *str);
// #cgo CFLAGS: -DESCAPE_TEST=1
import "C"
import (
"testing"
"unsafe"
)
func testEscapeJsonString(t *testing.T, input, want string) {
in := C.CString(input)
out := C.escape_json_string(in)
got := C.GoString(out)
C.free(unsafe.Pointer(out))
t.Logf("input: %q, output: %q", input, got)
if got != want {
t.Errorf("Failed on input: %q, want %q, got %q", input, want, got)
}
}
func testEscapeJson(t *testing.T) {
testCases := []struct {
input, output string
}{
{"", ""},
{"abcdef", "abcdef"},
{`\\\\\\`, `\\\\\\\\\\\\`},
{`with"quote`, `with\"quote`},
{"\n\r\b\t\f\\", `\n\r\b\t\f\\`},
{"\007", "\\u0007"},
{"\017 \020 \037", "\\u000f \\u0010 \\u001f"},
{"\033", "\\u001b"},
{`<->`, `<->`},
{"\176\177\200", "~\\u007f\200"},
{"\000", ""},
{"a\x7fxc", "a\\u007fxc"},
{"a\033xc", "a\\u001bxc"},
{"a\nxc", "a\\nxc"},
{"a\\xc", "a\\\\xc"},
{"Barney B\303\244r", "Barney B\303\244r"},
}
for _, tc := range testCases {
testEscapeJsonString(t, tc.input, tc.output)
}
}

View file

@ -1,41 +0,0 @@
package user
import (
"errors"
)
var (
// The current operating system does not provide the required data for user lookups.
ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
// No matching entries found in file.
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
ErrNoGroupEntries = errors.New("no matching entries in group file")
)
// LookupUser looks up a user by their username in /etc/passwd. If the user
// cannot be found (or there is no /etc/passwd file on the filesystem), then
// LookupUser returns an error.
func LookupUser(username string) (User, error) {
return lookupUser(username)
}
// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
// be found (or there is no /etc/passwd file on the filesystem), then LookupId
// returns an error.
func LookupUid(uid int) (User, error) {
return lookupUid(uid)
}
// LookupGroup looks up a group by its name in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGroup
// returns an error.
func LookupGroup(groupname string) (Group, error) {
return lookupGroup(groupname)
}
// LookupGid looks up a group by its group id in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGid
// returns an error.
func LookupGid(gid int) (Group, error) {
return lookupGid(gid)
}

View file

@ -16,13 +16,19 @@ const (
unixGroupPath = "/etc/group"
)
func lookupUser(username string) (User, error) {
// LookupUser looks up a user by their username in /etc/passwd. If the user
// cannot be found (or there is no /etc/passwd file on the filesystem), then
// LookupUser returns an error.
func LookupUser(username string) (User, error) {
return lookupUserFunc(func(u User) bool {
return u.Name == username
})
}
func lookupUid(uid int) (User, error) {
// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
// be found (or there is no /etc/passwd file on the filesystem), then LookupId
// returns an error.
func LookupUid(uid int) (User, error) {
return lookupUserFunc(func(u User) bool {
return u.Uid == uid
})
@ -51,13 +57,19 @@ func lookupUserFunc(filter func(u User) bool) (User, error) {
return users[0], nil
}
func lookupGroup(groupname string) (Group, error) {
// LookupGroup looks up a group by its name in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGroup
// returns an error.
func LookupGroup(groupname string) (Group, error) {
return lookupGroupFunc(func(g Group) bool {
return g.Name == groupname
})
}
func lookupGid(gid int) (Group, error) {
// LookupGid looks up a group by its group id in /etc/group. If the group cannot
// be found (or there is no /etc/group file on the filesystem), then LookupGid
// returns an error.
func LookupGid(gid int) (Group, error) {
return lookupGroupFunc(func(g Group) bool {
return g.Gid == gid
})

View file

@ -1,40 +0,0 @@
// +build windows
package user
import (
"fmt"
"os/user"
)
func lookupUser(username string) (User, error) {
u, err := user.Lookup(username)
if err != nil {
return User{}, err
}
return userFromOS(u)
}
func lookupUid(uid int) (User, error) {
u, err := user.LookupId(fmt.Sprintf("%d", uid))
if err != nil {
return User{}, err
}
return userFromOS(u)
}
func lookupGroup(groupname string) (Group, error) {
g, err := user.LookupGroup(groupname)
if err != nil {
return Group{}, err
}
return groupFromOS(g)
}
func lookupGid(gid int) (Group, error) {
g, err := user.LookupGroupId(fmt.Sprintf("%d", gid))
if err != nil {
return Group{}, err
}
return groupFromOS(g)
}

View file

@ -2,10 +2,10 @@ package user
import (
"bufio"
"errors"
"fmt"
"io"
"os"
"os/user"
"strconv"
"strings"
)
@ -16,6 +16,13 @@ const (
)
var (
// The current operating system does not provide the required data for user lookups.
ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
// No matching entries found in file.
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
ErrNoGroupEntries = errors.New("no matching entries in group file")
ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
)
@ -29,28 +36,6 @@ type User struct {
Shell string
}
// userFromOS converts an os/user.(*User) to local User
//
// (This does not include Pass, Shell or Gecos)
func userFromOS(u *user.User) (User, error) {
newUser := User{
Name: u.Username,
Home: u.HomeDir,
}
id, err := strconv.Atoi(u.Uid)
if err != nil {
return newUser, err
}
newUser.Uid = id
id, err = strconv.Atoi(u.Gid)
if err != nil {
return newUser, err
}
newUser.Gid = id
return newUser, nil
}
type Group struct {
Name string
Pass string
@ -58,23 +43,6 @@ type Group struct {
List []string
}
// groupFromOS converts an os/user.(*Group) to local Group
//
// (This does not include Pass or List)
func groupFromOS(g *user.Group) (Group, error) {
newGroup := Group{
Name: g.Name,
}
id, err := strconv.Atoi(g.Gid)
if err != nil {
return newGroup, err
}
newGroup.Gid = id
return newGroup, nil
}
// SubID represents an entry in /etc/sub{u,g}id
type SubID struct {
Name string
@ -466,7 +434,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
// we asked for a group but didn't find it. let's check to see
// if we wanted a numeric group
if !found {
gid, err := strconv.Atoi(ag)
gid, err := strconv.ParseInt(ag, 10, 64)
if err != nil {
return nil, fmt.Errorf("Unable to find group %s", ag)
}
@ -474,7 +442,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
if gid < minId || gid > maxId {
return nil, ErrRange
}
gidMap[gid] = struct{}{}
gidMap[int(gid)] = struct{}{}
}
}
gids := []int{}

View file

@ -0,0 +1,42 @@
// +build gofuzz
package user
import (
"io"
"strings"
)
func IsDivisbleBy(n int, divisibleby int) bool {
return (n % divisibleby) == 0
}
func FuzzUser(data []byte) int {
if len(data) == 0 {
return -1
}
if !IsDivisbleBy(len(data), 5) {
return -1
}
var divided [][]byte
chunkSize := len(data) / 5
for i := 0; i < len(data); i += chunkSize {
end := i + chunkSize
divided = append(divided, data[i:end])
}
_, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil)
var passwd, group io.Reader
group = strings.NewReader(string(divided[1]))
_, _ = GetAdditionalGroups([]string{string(divided[2])}, group)
passwd = strings.NewReader(string(divided[3]))
_, _ = GetExecUser(string(divided[4]), nil, passwd, group)
return 1
}

View file

@ -0,0 +1,5 @@
package userns
// RunningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
var RunningInUserNS = runningInUserNS

View file

@ -0,0 +1,15 @@
// +build gofuzz
package userns
import (
"strings"
"github.com/opencontainers/runc/libcontainer/user"
)
func FuzzUIDMap(data []byte) int {
uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
_ = uidMapInUserNS(uidmap)
return 1
}

View file

@ -0,0 +1,37 @@
package userns
import (
"sync"
"github.com/opencontainers/runc/libcontainer/user"
)
var (
inUserNS bool
nsOnce sync.Once
)
// runningInUserNS detects whether we are currently running in a user namespace.
// Originally copied from github.com/lxc/lxd/shared/util.go
func runningInUserNS() bool {
nsOnce.Do(func() {
uidmap, err := user.CurrentProcessUIDMap()
if err != nil {
// This kernel-provided file only exists if user namespaces are supported
return
}
inUserNS = uidMapInUserNS(uidmap)
})
return inUserNS
}
func uidMapInUserNS(uidmap []user.IDMap) bool {
/*
* We assume we are in the initial user namespace if we have a full
* range - 4294967295 uids starting at uid 0.
*/
if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
return false
}
return true
}

View file

@ -0,0 +1,17 @@
// +build !linux
package userns
import "github.com/opencontainers/runc/libcontainer/user"
// runningInUserNS is a stub for non-Linux systems
// Always returns false
func runningInUserNS() bool {
return false
}
// uidMapInUserNS is a stub for non-Linux systems
// Always returns false
func uidMapInUserNS(uidmap []user.IDMap) bool {
return false
}