diff --git a/vendor.conf b/vendor.conf index 9d509b0b27..149f828200 100644 --- a/vendor.conf +++ b/vendor.conf @@ -91,7 +91,7 @@ google.golang.org/grpc f495f5b15ae7ccda3b38c53a1bfc # the containerd project first, and update both after that is merged. # This commit does not need to match RUNC_COMMIT as it is used for helper # packages but should be newer or equal. -github.com/opencontainers/runc ff819c7e9184c13b7c2607fe6c30ae19403a7aff # v1.0.0-rc92 +github.com/opencontainers/runc b9ee9c6314599f1b4a7f497e1f1f856fe433d3b7 # v1.0.0-rc95 github.com/opencontainers/runtime-spec 1c3f411f041711bbeecf35ff7e93461ea6789220 # v1.0.3-0.20210326190908-1c3f411f0417 github.com/opencontainers/image-spec d60099175f88c47cd379c4738d158884749ed235 # v1.0.1 github.com/cyphar/filepath-securejoin a261ee33d7a517f054effbf451841abaafe3e0fd # v0.2.2 diff --git a/vendor/github.com/opencontainers/runc/README.md b/vendor/github.com/opencontainers/runc/README.md index aeef59895b..23106d303c 100644 --- a/vendor/github.com/opencontainers/runc/README.md +++ b/vendor/github.com/opencontainers/runc/README.md @@ -1,9 +1,10 @@ # runc -[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc) [![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc) [![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc) [![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588) +[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate) +[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci) ## Introduction @@ -17,10 +18,6 @@ This means that `runc` 1.0.0 should implement the 1.0 version of the specificati You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page. -Currently, the following features are not considered to be production-ready: - -* [Support for cgroup v2](./docs/cgroup-v2.md) - ## Security The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md). @@ -64,19 +61,20 @@ sudo make install with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`). To change build tags from the default, set the `BUILDTAGS` variable for make, -e.g. +e.g. 
to disable seccomp: ```bash -make BUILDTAGS='seccomp apparmor' +make BUILDTAGS="" ``` | Build Tag | Feature | Enabled by default | Dependency | |-----------|------------------------------------|--------------------|------------| | seccomp | Syscall filtering | yes | libseccomp | -| selinux | selinux process and mount labeling | yes | | -| apparmor | apparmor profile support | yes | | -| nokmem | disable kernel memory accounting | no | | +The following build tags were used earlier, but are now obsoleted: + - **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored) + - **apparmor** (since runc v1.0.0-rc93 the feature is always enabled) + - **selinux** (since runc v1.0.0-rc93 the feature is always enabled) ### Running the test suite @@ -128,6 +126,14 @@ make verify-dependencies ## Using runc +Please note that runc is a low level tool not designed with an end user +in mind. It is mostly employed by other higher level container software. + +Therefore, unless there is some specific use case that prevents the use +of tools like Docker or Podman, it is not recommended to use runc directly. + +If you still want to use runc, here's how. + ### Creating an OCI Bundle In order to use runc you must have your container in the format of an OCI bundle. @@ -169,7 +175,9 @@ If you used the unmodified `runc spec` template this should give you a `sh` sess The second way to start a container is using the specs lifecycle operations. This gives you more power over how the container is created and managed while it is running. -This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here. +This will also launch the container in the background so you will have to edit +the `config.json` to remove the `terminal` setting for the simple examples +below (see more details about [runc terminal handling](docs/terminals.md)). Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`. @@ -292,8 +300,12 @@ PIDFile=/run/mycontainerid.pid WantedBy=multi-user.target ``` -#### cgroup v2 -See [`./docs/cgroup-v2.md`](./docs/cgroup-v2.md). 
+## More documentation + +* [cgroup v2](./docs/cgroup-v2.md) +* [Checkpoint and restore](./docs/checkpoint-restore.md) +* [systemd cgroup driver](./docs/systemd.md) +* [Terminals and standard IO](./docs/terminals.md) ## License diff --git a/vendor/github.com/opencontainers/runc/go.mod b/vendor/github.com/opencontainers/runc/go.mod index fcf068dfae..41cd5aa385 100644 --- a/vendor/github.com/opencontainers/runc/go.mod +++ b/vendor/github.com/opencontainers/runc/go.mod @@ -1,26 +1,28 @@ module github.com/opencontainers/runc -go 1.14 +go 1.13 require ( - github.com/checkpoint-restore/go-criu/v4 v4.1.0 - github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775 - github.com/containerd/console v1.0.0 - github.com/coreos/go-systemd/v22 v22.1.0 + github.com/checkpoint-restore/go-criu/v5 v5.0.0 + github.com/cilium/ebpf v0.5.0 + github.com/containerd/console v1.0.2 + github.com/coreos/go-systemd/v22 v22.3.1 github.com/cyphar/filepath-securejoin v0.2.2 github.com/docker/go-units v0.4.0 - github.com/godbus/dbus/v5 v5.0.3 - github.com/golang/protobuf v1.4.2 - github.com/moby/sys/mountinfo v0.1.3 - github.com/mrunalp/fileutils v0.0.0-20200520151820-abd8a0e76976 - github.com/opencontainers/runtime-spec v1.0.3-0.20200728170252-4d89ac9fbff6 - github.com/opencontainers/selinux v1.6.0 + github.com/godbus/dbus/v5 v5.0.4 + github.com/moby/sys/mountinfo v0.4.1 + github.com/mrunalp/fileutils v0.5.0 + github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417 + github.com/opencontainers/selinux v1.8.0 github.com/pkg/errors v0.9.1 github.com/seccomp/libseccomp-golang v0.9.1 - github.com/sirupsen/logrus v1.6.0 - github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2 + github.com/sirupsen/logrus v1.7.0 + github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635 // NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092 github.com/urfave/cli v1.22.1 github.com/vishvananda/netlink v1.1.0 - golang.org/x/sys v0.0.0-20200728102440-3e129f6d46b1 + github.com/willf/bitset v1.1.11 + golang.org/x/net v0.0.0-20201224014010-6772e930b67b + golang.org/x/sys v0.0.0-20210426230700-d19ff857e887 + google.golang.org/protobuf v1.25.0 ) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/README.md b/vendor/github.com/opencontainers/runc/libcontainer/README.md index 6803ef56c5..13eee49d4b 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/README.md +++ b/vendor/github.com/opencontainers/runc/libcontainer/README.md @@ -57,90 +57,94 @@ struct describing how the container is to be created. 
A sample would look simila ```go defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV +var devices []*configs.DeviceRule +for _, device := range specconv.AllowedDevices { + devices = append(devices, &device.Rule) +} config := &configs.Config{ Rootfs: "/your/path/to/rootfs", Capabilities: &configs.Capabilities{ - Bounding: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Effective: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Inheritable: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Permitted: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - Ambient: []string{ - "CAP_CHOWN", - "CAP_DAC_OVERRIDE", - "CAP_FSETID", - "CAP_FOWNER", - "CAP_MKNOD", - "CAP_NET_RAW", - "CAP_SETGID", - "CAP_SETUID", - "CAP_SETFCAP", - "CAP_SETPCAP", - "CAP_NET_BIND_SERVICE", - "CAP_SYS_CHROOT", - "CAP_KILL", - "CAP_AUDIT_WRITE", - }, - }, + Bounding: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Effective: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Inheritable: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Permitted: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + Ambient: []string{ + "CAP_CHOWN", + "CAP_DAC_OVERRIDE", + "CAP_FSETID", + "CAP_FOWNER", + "CAP_MKNOD", + "CAP_NET_RAW", + "CAP_SETGID", + "CAP_SETUID", + "CAP_SETFCAP", + "CAP_SETPCAP", + "CAP_NET_BIND_SERVICE", + "CAP_SYS_CHROOT", + "CAP_KILL", + "CAP_AUDIT_WRITE", + }, + }, Namespaces: configs.Namespaces([]configs.Namespace{ {Type: configs.NEWNS}, {Type: configs.NEWUTS}, @@ -155,7 +159,7 @@ config := &configs.Config{ Parent: "system", Resources: &configs.Resources{ MemorySwappiness: nil, - Devices: specconv.AllowedDevices, + Devices: devices, }, }, MaskPaths: []string{ @@ -313,7 +317,7 @@ state, err := container.State() #### Checkpoint & Restore libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and 
restoring containers. -This let's you save the state of a process running inside a container to disk, and then restore +This lets you save the state of a process running inside a container to disk, and then restore that state into a new process, on the same machine or on another machine. `criu` version 1.5.2 or higher is required to use checkpoint and restore. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go index a16a68e979..68a346ca53 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go @@ -7,37 +7,44 @@ import ( ) type Manager interface { - // Applies cgroup configuration to the process with the specified pid + // Apply creates a cgroup, if not yet created, and adds a process + // with the specified pid into that cgroup. A special value of -1 + // can be used to merely create a cgroup. Apply(pid int) error - // Returns the PIDs inside the cgroup set + // GetPids returns the PIDs of all processes inside the cgroup. GetPids() ([]int, error) - // Returns the PIDs inside the cgroup set & all sub-cgroups + // GetAllPids returns the PIDs of all processes inside the cgroup + // any all its sub-cgroups. GetAllPids() ([]int, error) - // Returns statistics for the cgroup set + // GetStats returns cgroups statistics. GetStats() (*Stats, error) - // Toggles the freezer cgroup according with specified state + // Freeze sets the freezer cgroup to the specified state. Freeze(state configs.FreezerState) error - // Destroys the cgroup set + // Destroy removes cgroup. Destroy() error // Path returns a cgroup path to the specified controller/subsystem. // For cgroupv2, the argument is unused and can be empty. Path(string) string - // Sets the cgroup as configured. - Set(container *configs.Config) error + // Set sets cgroup resources parameters/limits. If the argument is nil, + // the resources specified during Manager creation (or the previous call + // to Set) are used. + Set(r *configs.Resources) error - // GetPaths returns cgroup path(s) to save in a state file in order to restore later. + // GetPaths returns cgroup path(s) to save in a state file in order to + // restore later. // - // For cgroup v1, a key is cgroup subsystem name, and the value is the path - // to the cgroup for this subsystem. + // For cgroup v1, a key is cgroup subsystem name, and the value is the + // path to the cgroup for this subsystem. // - // For cgroup v2 unified hierarchy, a key is "", and the value is the unified path. + // For cgroup v2 unified hierarchy, a key is "", and the value is the + // unified path. GetPaths() map[string]string // GetCgroups returns the cgroup data as configured. @@ -46,6 +53,9 @@ type Manager interface { // GetFreezerState retrieves the current FreezerState of the cgroup. GetFreezerState() (configs.FreezerState, error) - // Whether the cgroup path exists or not + // Exists returns whether the cgroup path exists or not. Exists() bool + + // OOMKillCount reports OOM kill count for the cgroup. 
+ OOMKillCount() (uint64, error) } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go new file mode 100644 index 0000000000..ae2613cdbd --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go @@ -0,0 +1,51 @@ +// +build linux + +package fscommon + +import ( + "bytes" + "os" + + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +// WriteFile writes data to a cgroup file in dir. +// It is supposed to be used for cgroup files only. +func WriteFile(dir, file, data string) error { + fd, err := OpenFile(dir, file, unix.O_WRONLY) + if err != nil { + return err + } + defer fd.Close() + if err := retryingWriteFile(fd, data); err != nil { + return errors.Wrapf(err, "failed to write %q", data) + } + return nil +} + +// ReadFile reads data from a cgroup file in dir. +// It is supposed to be used for cgroup files only. +func ReadFile(dir, file string) (string, error) { + fd, err := OpenFile(dir, file, unix.O_RDONLY) + if err != nil { + return "", err + } + defer fd.Close() + var buf bytes.Buffer + + _, err = buf.ReadFrom(fd) + return buf.String(), err +} + +func retryingWriteFile(fd *os.File, data string) error { + for { + _, err := fd.Write([]byte(data)) + if errors.Is(err, unix.EINTR) { + logrus.Infof("interrupted while writing %s to %s", data, fd.Name()) + continue + } + return err + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go new file mode 100644 index 0000000000..49af83b3c0 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go @@ -0,0 +1,120 @@ +package fscommon + +import ( + "os" + "strings" + "sync" + + "github.com/pkg/errors" + "github.com/sirupsen/logrus" + "golang.org/x/sys/unix" +) + +const ( + cgroupfsDir = "/sys/fs/cgroup" + cgroupfsPrefix = cgroupfsDir + "/" +) + +var ( + // TestMode is set to true by unit tests that need "fake" cgroupfs. + TestMode bool + + cgroupFd int = -1 + prepOnce sync.Once + prepErr error + resolveFlags uint64 +) + +func prepareOpenat2() error { + prepOnce.Do(func() { + fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{ + Flags: unix.O_DIRECTORY | unix.O_PATH}) + if err != nil { + prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err} + if err != unix.ENOSYS { + logrus.Warnf("falling back to securejoin: %s", prepErr) + } else { + logrus.Debug("openat2 not available, falling back to securejoin") + } + return + } + var st unix.Statfs_t + if err = unix.Fstatfs(fd, &st); err != nil { + prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err} + logrus.Warnf("falling back to securejoin: %s", prepErr) + return + } + + cgroupFd = fd + + resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS + if st.Type == unix.CGROUP2_SUPER_MAGIC { + // cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks + resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS + } + + }) + + return prepErr +} + +// OpenFile opens a cgroup file in a given dir with given flags. +// It is supposed to be used for cgroup files only. 
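As a rough illustration of how the `WriteFile`/`ReadFile` helpers added in `fscommon.go` above might be used by a cgroup manager — the controller path below is hypothetical and not part of this diff:

```go
package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)

func main() {
	// Hypothetical cgroup v1 memory controller directory for a container.
	dir := "/sys/fs/cgroup/memory/mycontainer"

	// Writes go through OpenFile (openat2 where available) and retry on EINTR.
	if err := fscommon.WriteFile(dir, "memory.limit_in_bytes", "268435456"); err != nil {
		fmt.Println("write:", err)
		return
	}

	// Reads return the raw file contents as a string.
	limit, err := fscommon.ReadFile(dir, "memory.limit_in_bytes")
	if err != nil {
		fmt.Println("read:", err)
		return
	}
	fmt.Print(limit)
}
```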
+func OpenFile(dir, file string, flags int) (*os.File, error) { + if dir == "" { + return nil, errors.Errorf("no directory specified for %s", file) + } + mode := os.FileMode(0) + if TestMode && flags&os.O_WRONLY != 0 { + // "emulate" cgroup fs for unit tests + flags |= os.O_TRUNC | os.O_CREATE + mode = 0o600 + } + if prepareOpenat2() != nil { + return openFallback(dir, file, flags, mode) + } + reldir := strings.TrimPrefix(dir, cgroupfsPrefix) + if len(reldir) == len(dir) { // non-standard path, old system? + return openFallback(dir, file, flags, mode) + } + + relname := reldir + "/" + file + fd, err := unix.Openat2(cgroupFd, relname, + &unix.OpenHow{ + Resolve: resolveFlags, + Flags: uint64(flags) | unix.O_CLOEXEC, + Mode: uint64(mode), + }) + if err != nil { + return nil, &os.PathError{Op: "openat2", Path: dir + "/" + file, Err: err} + } + + return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil +} + +var errNotCgroupfs = errors.New("not a cgroup file") + +// openFallback is used when openat2(2) is not available. It checks the opened +// file is on cgroupfs, returning an error otherwise. +func openFallback(dir, file string, flags int, mode os.FileMode) (*os.File, error) { + path := dir + "/" + file + fd, err := os.OpenFile(path, flags, mode) + if err != nil { + return nil, err + } + if TestMode { + return fd, nil + } + // Check this is a cgroupfs file. + var st unix.Statfs_t + if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil { + _ = fd.Close() + return nil, &os.PathError{Op: "statfs", Path: path, Err: err} + } + if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC { + _ = fd.Close() + return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs} + } + + return fd, nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go new file mode 100644 index 0000000000..db0caded15 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go @@ -0,0 +1,122 @@ +// +build linux + +package fscommon + +import ( + "errors" + "fmt" + "math" + "strconv" + "strings" +) + +var ( + ErrNotValidFormat = errors.New("line is not a valid key value format") +) + +// ParseUint converts a string to an uint64 integer. +// Negative values are returned at zero as, due to kernel bugs, +// some of the memory cgroup stats can be negative. +func ParseUint(s string, base, bitSize int) (uint64, error) { + value, err := strconv.ParseUint(s, base, bitSize) + if err != nil { + intValue, intErr := strconv.ParseInt(s, base, bitSize) + // 1. Handle negative values greater than MinInt64 (and) + // 2. Handle negative values lesser than MinInt64 + if intErr == nil && intValue < 0 { + return 0, nil + } else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 { + return 0, nil + } + + return value, err + } + + return value, nil +} + +// ParseKeyValue parses a space-separated "name value" kind of cgroup +// parameter and returns its key as a string, and its value as uint64 +// (ParseUint is used to convert the value). For example, +// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234. 
+func ParseKeyValue(t string) (string, uint64, error) { + parts := strings.SplitN(t, " ", 3) + if len(parts) != 2 { + return "", 0, fmt.Errorf("line %q is not in key value format", t) + } + + value, err := ParseUint(parts[1], 10, 64) + if err != nil { + return "", 0, fmt.Errorf("unable to convert to uint64: %v", err) + } + + return parts[0], value, nil +} + +// GetValueByKey reads a key-value pairs from the specified cgroup file, +// and returns a value of the specified key. ParseUint is used for value +// conversion. +func GetValueByKey(path, file, key string) (uint64, error) { + content, err := ReadFile(path, file) + if err != nil { + return 0, err + } + + lines := strings.Split(string(content), "\n") + for _, line := range lines { + arr := strings.Split(line, " ") + if len(arr) == 2 && arr[0] == key { + return ParseUint(arr[1], 10, 64) + } + } + + return 0, nil +} + +// GetCgroupParamUint reads a single uint64 value from the specified cgroup file. +// If the value read is "max", the math.MaxUint64 is returned. +func GetCgroupParamUint(path, file string) (uint64, error) { + contents, err := GetCgroupParamString(path, file) + if err != nil { + return 0, err + } + contents = strings.TrimSpace(contents) + if contents == "max" { + return math.MaxUint64, nil + } + + res, err := ParseUint(contents, 10, 64) + if err != nil { + return res, fmt.Errorf("unable to parse file %q", path+"/"+file) + } + return res, nil +} + +// GetCgroupParamInt reads a single int64 value from specified cgroup file. +// If the value read is "max", the math.MaxInt64 is returned. +func GetCgroupParamInt(path, file string) (int64, error) { + contents, err := ReadFile(path, file) + if err != nil { + return 0, err + } + contents = strings.TrimSpace(contents) + if contents == "max" { + return math.MaxInt64, nil + } + + res, err := strconv.ParseInt(contents, 10, 64) + if err != nil { + return res, fmt.Errorf("unable to parse %q as a int from Cgroup file %q", contents, path+"/"+file) + } + return res, nil +} + +// GetCgroupParamString reads a string from the specified cgroup file. 
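A small usage sketch for the parsing helpers in `utils.go` above (the cgroup path is hypothetical; the functions and signatures are the ones added in this file):

```go
package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
)

func main() {
	// ParseKeyValue splits a "name value" cgroup line and parses the value as uint64.
	key, val, err := fscommon.ParseKeyValue("io_service_bytes 1234")
	if err == nil {
		fmt.Println(key, val) // io_service_bytes 1234
	}

	// GetCgroupParamUint reads a single numeric value; "max" maps to math.MaxUint64.
	limit, err := fscommon.GetCgroupParamUint("/sys/fs/cgroup/pids/mycontainer", "pids.max")
	if err == nil {
		fmt.Println("pids.max:", limit)
	}
}
```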
+func GetCgroupParamString(path, file string) (string, error) { + contents, err := ReadFile(path, file) + if err != nil { + return "", err + } + + return strings.TrimSpace(contents), nil +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go index 7ac8166059..e7f9c46263 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go @@ -39,6 +39,33 @@ type CpuStats struct { ThrottlingData ThrottlingData `json:"throttling_data,omitempty"` } +type CPUSetStats struct { + // List of the physical numbers of the CPUs on which processes + // in that cpuset are allowed to execute + CPUs []uint16 `json:"cpus,omitempty"` + // cpu_exclusive flag + CPUExclusive uint64 `json:"cpu_exclusive"` + // List of memory nodes on which processes in that cpuset + // are allowed to allocate memory + Mems []uint16 `json:"mems,omitempty"` + // mem_hardwall flag + MemHardwall uint64 `json:"mem_hardwall"` + // mem_exclusive flag + MemExclusive uint64 `json:"mem_exclusive"` + // memory_migrate flag + MemoryMigrate uint64 `json:"memory_migrate"` + // memory_spread page flag + MemorySpreadPage uint64 `json:"memory_spread_page"` + // memory_spread slab flag + MemorySpreadSlab uint64 `json:"memory_spread_slab"` + // memory_pressure + MemoryPressure uint64 `json:"memory_pressure"` + // sched_load balance flag + SchedLoadBalance uint64 `json:"sched_load_balance"` + // sched_relax_domain_level + SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"` +} + type MemoryData struct { Usage uint64 `json:"usage,omitempty"` MaxUsage uint64 `json:"max_usage,omitempty"` @@ -121,6 +148,7 @@ type HugetlbStats struct { type Stats struct { CpuStats CpuStats `json:"cpu_stats,omitempty"` + CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"` MemoryStats MemoryStats `json:"memory_stats,omitempty"` PidsStats PidsStats `json:"pids_stats,omitempty"` BlkioStats BlkioStats `json:"blkio_stats,omitempty"` diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go index 6e88b5dff6..35ce2c1c2d 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go @@ -15,7 +15,9 @@ import ( "sync" "time" - units "github.com/docker/go-units" + "github.com/opencontainers/runc/libcontainer/cgroups/fscommon" + "github.com/opencontainers/runc/libcontainer/userns" + "github.com/sirupsen/logrus" "golang.org/x/sys/unix" ) @@ -29,19 +31,19 @@ var ( isUnified bool ) -// HugePageSizeUnitList is a list of the units used by the linux kernel when -// naming the HugePage control files. -// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt -// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed, -// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393 -var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"} - // IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode. 
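The function defined next caches its answer via `sync.Once`; callers typically just branch on the result. An illustrative sketch:

```go
package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups"
)

func main() {
	if cgroups.IsCgroup2UnifiedMode() {
		// cgroup v2: a single unified hierarchy mounted at /sys/fs/cgroup.
		fmt.Println("cgroup v2 (unified)")
	} else {
		// cgroup v1 (or hybrid): one mount per controller.
		fmt.Println("cgroup v1")
	}
}
```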
func IsCgroup2UnifiedMode() bool { isUnifiedOnce.Do(func() { var st unix.Statfs_t - if err := unix.Statfs(unifiedMountpoint, &st); err != nil { - panic("cannot statfs cgroup root") + err := unix.Statfs(unifiedMountpoint, &st) + if err != nil { + if os.IsNotExist(err) && userns.RunningInUserNS() { + // ignore the "not found" error if running in userns + logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint) + isUnified = false + return + } + panic(fmt.Sprintf("cannot statfs cgroup root: %s", err)) } isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC }) @@ -86,11 +88,11 @@ func GetAllSubsystems() ([]string, error) { // - freezer: implemented in kernel 5.2 // We assume these are always available, as it is hard to detect availability. pseudo := []string{"devices", "freezer"} - data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers") + data, err := fscommon.ReadFile("/sys/fs/cgroup", "cgroup.controllers") if err != nil { return nil, err } - subsystems := append(pseudo, strings.Fields(string(data))...) + subsystems := append(pseudo, strings.Fields(data)...) return subsystems, nil } f, err := os.Open("/proc/cgroups") @@ -207,20 +209,66 @@ func EnterPid(cgroupPaths map[string]string, pid int) error { return nil } +func rmdir(path string) error { + err := unix.Rmdir(path) + if err == nil || err == unix.ENOENT { + return nil + } + return &os.PathError{Op: "rmdir", Path: path, Err: err} +} + +// RemovePath aims to remove cgroup path. It does so recursively, +// by removing any subdirectories (sub-cgroups) first. +func RemovePath(path string) error { + // try the fast path first + if err := rmdir(path); err == nil { + return nil + } + + infos, err := ioutil.ReadDir(path) + if err != nil { + if os.IsNotExist(err) { + err = nil + } + return err + } + for _, info := range infos { + if info.IsDir() { + // We should remove subcgroups dir first + if err = RemovePath(filepath.Join(path, info.Name())); err != nil { + break + } + } + } + if err == nil { + err = rmdir(path) + } + return err +} + // RemovePaths iterates over the provided paths removing them. // We trying to remove all paths five times with increasing delay between tries. // If after all there are not removed cgroups - appropriate error will be // returned. 
func RemovePaths(paths map[string]string) (err error) { + const retries = 5 delay := 10 * time.Millisecond - for i := 0; i < 5; i++ { + for i := 0; i < retries; i++ { if i != 0 { time.Sleep(delay) delay *= 2 } for s, p := range paths { - os.RemoveAll(p) - // TODO: here probably should be logging + if err := RemovePath(p); err != nil { + // do not log intermediate iterations + switch i { + case 0: + logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)") + case retries - 1: + logrus.WithError(err).Error("Failed to remove cgroup") + } + + } _, err := os.Stat(p) // We need this strange way of checking cgroups existence because // RemoveAll almost always returns error, even on already removed @@ -230,6 +278,8 @@ func RemovePaths(paths map[string]string) (err error) { } } if len(paths) == 0 { + //nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506 + paths = make(map[string]string) return nil } } @@ -237,27 +287,50 @@ func RemovePaths(paths map[string]string) (err error) { } func GetHugePageSize() ([]string, error) { - files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages") + dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0) if err != nil { - return []string{}, err + return nil, err } - var fileNames []string - for _, st := range files { - fileNames = append(fileNames, st.Name()) + files, err := dir.Readdirnames(0) + dir.Close() + if err != nil { + return nil, err } - return getHugePageSizeFromFilenames(fileNames) + + return getHugePageSizeFromFilenames(files) } func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) { - var pageSizes []string - for _, fileName := range fileNames { - nameArray := strings.Split(fileName, "-") - pageSize, err := units.RAMInBytes(nameArray[1]) - if err != nil { - return []string{}, err + pageSizes := make([]string, 0, len(fileNames)) + + for _, file := range fileNames { + // example: hugepages-1048576kB + val := strings.TrimPrefix(file, "hugepages-") + if len(val) == len(file) { + // unexpected file name: no prefix found + continue } - sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList) - pageSizes = append(pageSizes, sizeString) + // The suffix is always "kB" (as of Linux 5.9) + eLen := len(val) - 2 + val = strings.TrimSuffix(val, "kB") + if len(val) != eLen { + logrus.Warnf("GetHugePageSize: %s: invalid filename suffix (expected \"kB\")", file) + continue + } + size, err := strconv.Atoi(val) + if err != nil { + return nil, err + } + // Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574 + // but in our case the size is in KB already. 
+ if size >= (1 << 20) { + val = strconv.Itoa(size>>20) + "GB" + } else if size >= (1 << 10) { + val = strconv.Itoa(size>>10) + "MB" + } else { + val += "KB" + } + pageSizes = append(pageSizes, val) } return pageSizes, nil @@ -303,14 +376,14 @@ func WriteCgroupProc(dir string, pid int) error { return nil } - cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700) + file, err := fscommon.OpenFile(dir, CgroupProcesses, os.O_WRONLY) if err != nil { return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err) } - defer cgroupProcessesFile.Close() + defer file.Close() for i := 0; i < 5; i++ { - _, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid)) + _, err = file.WriteString(strconv.Itoa(pid)) if err == nil { return nil } @@ -327,17 +400,6 @@ func WriteCgroupProc(dir string, pid int) error { return err } -// Since the OCI spec is designed for cgroup v1, in some cases -// there is need to convert from the cgroup v1 configuration to cgroup v2 -// the formula for BlkIOWeight is y = (1 + (x - 10) * 9999 / 990) -// convert linearly from [10-1000] to [1-10000] -func ConvertBlkIOToCgroupV2Value(blkIoWeight uint16) uint64 { - if blkIoWeight == 0 { - return 0 - } - return uint64(1 + (uint64(blkIoWeight)-10)*9999/990) -} - // Since the OCI spec is designed for cgroup v1, in some cases // there is need to convert from the cgroup v1 configuration to cgroup v2 // the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142) @@ -377,3 +439,14 @@ func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) { return memorySwap - memory, nil } + +// Since the OCI spec is designed for cgroup v1, in some cases +// there is need to convert from the cgroup v1 configuration to cgroup v2 +// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990) +// convert linearly from [10-1000] to [1-10000] +func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 { + if blkIoWeight == 0 { + return 0 + } + return uint64(1 + (uint64(blkIoWeight)-10)*9999/990) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go index a94f208616..95ec9dff02 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go @@ -1,16 +1,16 @@ package cgroups import ( - "bufio" "errors" "fmt" - "io" "os" "path/filepath" "strings" + "sync" "syscall" securejoin "github.com/cyphar/filepath-securejoin" + "github.com/moby/sys/mountinfo" "golang.org/x/sys/unix" ) @@ -23,7 +23,12 @@ const ( ) var ( - errUnified = errors.New("not implemented for cgroup v2 unified hierarchy") + errUnified = errors.New("not implemented for cgroup v2 unified hierarchy") + ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1") + + readMountinfoOnce sync.Once + readMountinfoErr error + cgroupMountinfo []*mountinfo.Info ) type NotFoundError struct { @@ -90,6 +95,21 @@ func tryDefaultPath(cgroupPath, subsystem string) string { return path } +// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones +// with fstype of "cgroup") for the current running process. +// +// The results are cached (to avoid re-reading mountinfo which is relatively +// expensive), so it is assumed that cgroup mounts are not being changed. 
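Returning briefly to the renamed `ConvertBlkIOToIOWeightValue` helper in `utils.go` above: the linear mapping from v1 blkio weights [10..1000] to v2 io weights [1..10000] can be sanity-checked with a few values (illustrative sketch only):

```go
package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/cgroups"
)

func main() {
	// y = 1 + (x - 10) * 9999 / 990, with 0 passed through as 0.
	for _, w := range []uint16{0, 10, 500, 1000} {
		fmt.Println(w, "->", cgroups.ConvertBlkIOToIOWeightValue(w))
	}
	// Prints (one per line): 0 -> 0, 10 -> 1, 500 -> 4950, 1000 -> 10000
}
```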
+func readCgroupMountinfo() ([]*mountinfo.Info, error) { + readMountinfoOnce.Do(func() { + cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts( + mountinfo.FSTypeFilter("cgroup"), + ) + }) + + return cgroupMountinfo, readMountinfoErr +} + // https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) { if IsCgroup2UnifiedMode() { @@ -110,56 +130,28 @@ func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string, return "", "", errUnified } - // Avoid parsing mountinfo by checking if subsystem is valid/available. - if !isSubsystemAvailable(subsystem) { - return "", "", NewNotFoundError(subsystem) - } - - f, err := os.Open("/proc/self/mountinfo") + mi, err := readCgroupMountinfo() if err != nil { return "", "", err } - defer f.Close() - return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem) + return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem) } -func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) { - scanner := bufio.NewScanner(reader) - for scanner.Scan() { - txt := scanner.Text() - fields := strings.Fields(txt) - if len(fields) < 9 { - continue - } - if strings.HasPrefix(fields[4], cgroupPath) { - for _, opt := range strings.Split(fields[len(fields)-1], ",") { +func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) { + for _, mi := range mounts { + if strings.HasPrefix(mi.Mountpoint, cgroupPath) { + for _, opt := range strings.Split(mi.VFSOptions, ",") { if opt == subsystem { - return fields[4], fields[3], nil + return mi.Mountpoint, mi.Root, nil } } } } - if err := scanner.Err(); err != nil { - return "", "", err - } return "", "", NewNotFoundError(subsystem) } -func isSubsystemAvailable(subsystem string) bool { - if IsCgroup2UnifiedMode() { - panic("don't call isSubsystemAvailable from cgroupv2 code") - } - - cgroups, err := ParseCgroupFile("/proc/self/cgroup") - if err != nil { - return false - } - _, avail := cgroups[subsystem] - return avail -} - func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { if len(m.Subsystems) == 0 { return "", fmt.Errorf("no subsystem for mount") @@ -168,25 +160,15 @@ func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) { return getControllerPath(m.Subsystems[0], cgroups) } -func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) { +func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) { res := make([]Mount, 0, len(ss)) - scanner := bufio.NewScanner(mi) numFound := 0 - for scanner.Scan() && numFound < len(ss) { - txt := scanner.Text() - sepIdx := strings.Index(txt, " - ") - if sepIdx == -1 { - return nil, fmt.Errorf("invalid mountinfo format") - } - if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" { - continue - } - fields := strings.Split(txt, " ") + for _, mi := range mounts { m := Mount{ - Mountpoint: fields[4], - Root: fields[3], + Mountpoint: mi.Mountpoint, + Root: mi.Root, } - for _, opt := range strings.Split(fields[len(fields)-1], ",") { + for _, opt := range strings.Split(mi.VFSOptions, ",") { seen, known := ss[opt] if !known || (!all && seen) { continue @@ -199,19 +181,18 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, if len(m.Subsystems) > 0 || all { res = append(res, m) } - } - if err := scanner.Err(); err != nil { 
- return nil, err + if !all && numFound >= len(ss) { + break + } } return res, nil } func getCgroupMountsV1(all bool) ([]Mount, error) { - f, err := os.Open("/proc/self/mountinfo") + mi, err := readCgroupMountinfo() if err != nil { return nil, err } - defer f.Close() allSubsystems, err := ParseCgroupFile("/proc/self/cgroup") if err != nil { @@ -222,7 +203,8 @@ func getCgroupMountsV1(all bool) ([]Mount, error) { for s := range allSubsystems { allMap[s] = false } - return getCgroupMountsHelper(allMap, f, all) + + return getCgroupMountsHelper(allMap, mi, all) } // GetOwnCgroup returns the relative path to the cgroup docker is running in. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go index 6e90ae16b5..87d0da8428 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go @@ -2,6 +2,7 @@ package configs import ( systemdDbus "github.com/coreos/go-systemd/v22/dbus" + "github.com/opencontainers/runc/libcontainer/devices" ) type FreezerState string @@ -42,7 +43,7 @@ type Cgroup struct { type Resources struct { // Devices is the set of access rules for devices in the container. - Devices []*DeviceRule `json:"devices"` + Devices []*devices.Rule `json:"devices"` // Memory limit (in bytes) Memory int64 `json:"memory"` @@ -53,12 +54,6 @@ type Resources struct { // Total memory usage (memory + swap); set `-1` to enable unlimited swap MemorySwap int64 `json:"memory_swap"` - // Kernel memory limit (in bytes) - KernelMemory int64 `json:"kernel_memory"` - - // Kernel memory limit for TCP use (in bytes) - KernelMemoryTCP int64 `json:"kernel_memory_tcp"` - // CPU shares (relative weight vs. other containers) CpuShares uint64 `json:"cpu_shares"` @@ -127,6 +122,9 @@ type Resources struct { // CpuWeight sets a proportional bandwidth limit. CpuWeight uint64 `json:"cpu_weight"` + // Unified is cgroupv2-only key-value map. + Unified map[string]string `json:"unified"` + // SkipDevices allows to skip configuring device permissions. // Used by e.g. kubelet while creating a parent cgroup (kubepods) // common for many containers. diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go index 540f0f85d2..14a0960389 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/config.go @@ -7,6 +7,7 @@ import ( "os/exec" "time" + "github.com/opencontainers/runc/libcontainer/devices" "github.com/opencontainers/runtime-spec/specs-go" "github.com/pkg/errors" "github.com/sirupsen/logrus" @@ -30,9 +31,10 @@ type IDMap struct { // for syscalls. Additional architectures can be added by specifying them in // Architectures. type Seccomp struct { - DefaultAction Action `json:"default_action"` - Architectures []string `json:"architectures"` - Syscalls []*Syscall `json:"syscalls"` + DefaultAction Action `json:"default_action"` + Architectures []string `json:"architectures"` + Syscalls []*Syscall `json:"syscalls"` + DefaultErrnoRet *uint `json:"default_errno_ret"` } // Action is taken upon rule match in Seccomp @@ -92,6 +94,9 @@ type Config struct { // Path to a directory containing the container's root filesystem. Rootfs string `json:"rootfs"` + // Umask is the umask to use inside of the container. 
+ Umask *uint32 `json:"umask"` + // Readonlyfs will remount the container's rootfs as readonly where only externally mounted // bind mounts are writtable. Readonlyfs bool `json:"readonlyfs"` @@ -104,7 +109,7 @@ type Config struct { Mounts []*Mount `json:"mounts"` // The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well! - Devices []*Device `json:"devices"` + Devices []*devices.Device `json:"devices"` MountLabel string `json:"mount_label"` @@ -218,25 +223,25 @@ const ( // the runtime environment has been created but before the pivot_root has been executed. // CreateRuntime is called immediately after the deprecated Prestart hook. // CreateRuntime commands are called in the Runtime Namespace. - CreateRuntime = "createRuntime" + CreateRuntime HookName = "createRuntime" // CreateContainer commands MUST be called as part of the create operation after // the runtime environment has been created but before the pivot_root has been executed. // CreateContainer commands are called in the Container namespace. - CreateContainer = "createContainer" + CreateContainer HookName = "createContainer" // StartContainer commands MUST be called as part of the start operation and before // the container process is started. // StartContainer commands are called in the Container namespace. - StartContainer = "startContainer" + StartContainer HookName = "startContainer" // Poststart commands are executed after the container init process starts. // Poststart commands are called in the Runtime Namespace. - Poststart = "poststart" + Poststart HookName = "poststart" // Poststop commands are executed after the container init process exits. // Poststop commands are called in the Runtime Namespace. 
- Poststop = "poststop" + Poststop HookName = "poststop" ) type Capabilities struct { @@ -383,7 +388,7 @@ func (c Command) Run(s *specs.State) error { return err case <-timerCh: cmd.Process.Kill() - cmd.Wait() + <-errC return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds()) } } diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go new file mode 100644 index 0000000000..93bf41c8de --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go @@ -0,0 +1,9 @@ +// +build gofuzz + +package configs + +func FuzzUnmarshalJSON(data []byte) int { + hooks := Hooks{} + _ = hooks.UnmarshalJSON(data) + return 1 +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go deleted file mode 100644 index 650c46848a..0000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go +++ /dev/null @@ -1,16 +0,0 @@ -// +build !windows - -package configs - -import ( - "errors" - - "golang.org/x/sys/unix" -) - -func (d *DeviceRule) Mkdev() (uint64, error) { - if d.Major == Wildcard || d.Minor == Wildcard { - return 0, errors.New("cannot mkdev() device with wildcards") - } - return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go deleted file mode 100644 index 729289393f..0000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go +++ /dev/null @@ -1,5 +0,0 @@ -package configs - -func (d *DeviceRule) Mkdev() (uint64, error) { - return 0, nil -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go new file mode 100644 index 0000000000..b9e3664cea --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go @@ -0,0 +1,17 @@ +package configs + +import "github.com/opencontainers/runc/libcontainer/devices" + +type ( + // Deprecated: use libcontainer/devices.Device + Device = devices.Device + + // Deprecated: use libcontainer/devices.Rule + DeviceRule = devices.Rule + + // Deprecated: use libcontainer/devices.Type + DeviceType = devices.Type + + // Deprecated: use libcontainer/devices.Permissions + DevicePermissions = devices.Permissions +) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go index 1bbaef9bd9..d52d6fcd14 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go @@ -56,7 +56,7 @@ func IsNamespaceSupported(ns NamespaceType) bool { if nsFile == "" { return false } - _, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile)) + _, err := os.Stat("/proc/self/ns/" + nsFile) // a namespace is supported if it exists and we have permissions to read it supported = err == nil supportedNamespaces[ns] = supported diff --git a/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go b/vendor/github.com/opencontainers/runc/libcontainer/devices/device.go similarity index 63% rename from 
vendor/github.com/opencontainers/runc/libcontainer/configs/device.go rename to vendor/github.com/opencontainers/runc/libcontainer/devices/device.go index 632bf6ac49..c2c2b3bb7c 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/configs/device.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/devices/device.go @@ -1,4 +1,4 @@ -package configs +package devices import ( "fmt" @@ -11,7 +11,7 @@ const ( ) type Device struct { - DeviceRule + Rule // Path to the device. Path string `json:"path"` @@ -26,10 +26,10 @@ type Device struct { Gid uint32 `json:"gid"` } -// DevicePermissions is a cgroupv1-style string to represent device access. It +// Permissions is a cgroupv1-style string to represent device access. It // has to be a string for backward compatibility reasons, hence why it has // methods to do set operations. -type DevicePermissions string +type Permissions string const ( deviceRead uint = (1 << iota) @@ -37,7 +37,7 @@ const ( deviceMknod ) -func (p DevicePermissions) toSet() uint { +func (p Permissions) toSet() uint { var set uint for _, perm := range p { switch perm { @@ -52,7 +52,7 @@ func (p DevicePermissions) toSet() uint { return set } -func fromSet(set uint) DevicePermissions { +func fromSet(set uint) Permissions { var perm string if set&deviceRead == deviceRead { perm += "r" @@ -63,53 +63,53 @@ func fromSet(set uint) DevicePermissions { if set&deviceMknod == deviceMknod { perm += "m" } - return DevicePermissions(perm) + return Permissions(perm) } -// Union returns the union of the two sets of DevicePermissions. -func (p DevicePermissions) Union(o DevicePermissions) DevicePermissions { +// Union returns the union of the two sets of Permissions. +func (p Permissions) Union(o Permissions) Permissions { lhs := p.toSet() rhs := o.toSet() return fromSet(lhs | rhs) } -// Difference returns the set difference of the two sets of DevicePermissions. +// Difference returns the set difference of the two sets of Permissions. // In set notation, A.Difference(B) gives you A\B. -func (p DevicePermissions) Difference(o DevicePermissions) DevicePermissions { +func (p Permissions) Difference(o Permissions) Permissions { lhs := p.toSet() rhs := o.toSet() return fromSet(lhs &^ rhs) } -// Intersection computes the intersection of the two sets of DevicePermissions. -func (p DevicePermissions) Intersection(o DevicePermissions) DevicePermissions { +// Intersection computes the intersection of the two sets of Permissions. +func (p Permissions) Intersection(o Permissions) Permissions { lhs := p.toSet() rhs := o.toSet() return fromSet(lhs & rhs) } -// IsEmpty returns whether the set of permissions in a DevicePermissions is +// IsEmpty returns whether the set of permissions in a Permissions is // empty. -func (p DevicePermissions) IsEmpty() bool { - return p == DevicePermissions("") +func (p Permissions) IsEmpty() bool { + return p == Permissions("") } // IsValid returns whether the set of permissions is a subset of valid // permissions (namely, {r,w,m}). 
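The renamed `Permissions` type keeps its cgroupv1-style string form but behaves like a set; a quick sketch of the set operations shown above (values chosen for illustration):

```go
package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/devices"
)

func main() {
	p := devices.Permissions("rw")
	q := devices.Permissions("wm")

	fmt.Println(p.Union(q))        // rwm
	fmt.Println(p.Difference(q))   // r
	fmt.Println(p.Intersection(q)) // w
	fmt.Println(p.IsValid())       // true
}
```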
-func (p DevicePermissions) IsValid() bool { +func (p Permissions) IsValid() bool { return p == fromSet(p.toSet()) } -type DeviceType rune +type Type rune const ( - WildcardDevice DeviceType = 'a' - BlockDevice DeviceType = 'b' - CharDevice DeviceType = 'c' // or 'u' - FifoDevice DeviceType = 'p' + WildcardDevice Type = 'a' + BlockDevice Type = 'b' + CharDevice Type = 'c' // or 'u' + FifoDevice Type = 'p' ) -func (t DeviceType) IsValid() bool { +func (t Type) IsValid() bool { switch t { case WildcardDevice, BlockDevice, CharDevice, FifoDevice: return true @@ -118,7 +118,7 @@ func (t DeviceType) IsValid() bool { } } -func (t DeviceType) CanMknod() bool { +func (t Type) CanMknod() bool { switch t { case BlockDevice, CharDevice, FifoDevice: return true @@ -127,7 +127,7 @@ func (t DeviceType) CanMknod() bool { } } -func (t DeviceType) CanCgroup() bool { +func (t Type) CanCgroup() bool { switch t { case WildcardDevice, BlockDevice, CharDevice: return true @@ -136,10 +136,10 @@ func (t DeviceType) CanCgroup() bool { } } -type DeviceRule struct { +type Rule struct { // Type of device ('c' for char, 'b' for block). If set to 'a', this rule // acts as a wildcard and all fields other than Allow are ignored. - Type DeviceType `json:"type"` + Type Type `json:"type"` // Major is the device's major number. Major int64 `json:"major"` @@ -149,13 +149,13 @@ type DeviceRule struct { // Permissions is the set of permissions that this rule applies to (in the // cgroupv1 format -- any combination of "rwm"). - Permissions DevicePermissions `json:"permissions"` + Permissions Permissions `json:"permissions"` // Allow specifies whether this rule is allowed. Allow bool `json:"allow"` } -func (d *DeviceRule) CgroupString() string { +func (d *Rule) CgroupString() string { var ( major = strconv.FormatInt(d.Major, 10) minor = strconv.FormatInt(d.Minor, 10) @@ -168,3 +168,7 @@ func (d *DeviceRule) CgroupString() string { } return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions) } + +func (d *Rule) Mkdev() (uint64, error) { + return mkDev(d) +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go b/vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go similarity index 76% rename from vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go rename to vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go index 79f89c2d77..acb816998c 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/devices/devices.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/devices/device_unix.go @@ -1,3 +1,5 @@ +// +build !windows + package devices import ( @@ -6,7 +8,6 @@ import ( "os" "path/filepath" - "github.com/opencontainers/runc/libcontainer/configs" "golang.org/x/sys/unix" ) @@ -21,9 +22,16 @@ var ( ioutilReadDir = ioutil.ReadDir ) +func mkDev(d *Rule) (uint64, error) { + if d.Major == Wildcard || d.Minor == Wildcard { + return 0, errors.New("cannot mkdev() device with wildcards") + } + return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil +} + // Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the // information about a linux device and return that information as a Device struct. 
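A minimal sketch of building a `devices.Rule` and rendering it with the `CgroupString` and `Mkdev` methods shown above (the device numbers are illustrative):

```go
package main

import (
	"fmt"

	"github.com/opencontainers/runc/libcontainer/devices"
)

func main() {
	// Allow read/write/mknod access to char device 1:3 (conventionally /dev/null).
	r := devices.Rule{
		Type:        devices.CharDevice,
		Major:       1,
		Minor:       3,
		Permissions: devices.Permissions("rwm"),
		Allow:       true,
	}

	fmt.Println(r.CgroupString()) // c 1:3 rwm

	// Mkdev fails for wildcard major/minor; here both are concrete.
	if dev, err := r.Mkdev(); err == nil {
		fmt.Printf("dev_t: %#x\n", dev)
	}
}
```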
-func DeviceFromPath(path, permissions string) (*configs.Device, error) { +func DeviceFromPath(path, permissions string) (*Device, error) { var stat unix.Stat_t err := unixLstat(path, &stat) if err != nil { @@ -31,7 +39,7 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) { } var ( - devType configs.DeviceType + devType Type mode = stat.Mode devNumber = uint64(stat.Rdev) major = unix.Major(devNumber) @@ -39,41 +47,41 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) { ) switch mode & unix.S_IFMT { case unix.S_IFBLK: - devType = configs.BlockDevice + devType = BlockDevice case unix.S_IFCHR: - devType = configs.CharDevice + devType = CharDevice case unix.S_IFIFO: - devType = configs.FifoDevice + devType = FifoDevice default: return nil, ErrNotADevice } - return &configs.Device{ - DeviceRule: configs.DeviceRule{ + return &Device{ + Rule: Rule{ Type: devType, Major: int64(major), Minor: int64(minor), - Permissions: configs.DevicePermissions(permissions), + Permissions: Permissions(permissions), }, Path: path, - FileMode: os.FileMode(mode), + FileMode: os.FileMode(mode &^ unix.S_IFMT), Uid: stat.Uid, Gid: stat.Gid, }, nil } // HostDevices returns all devices that can be found under /dev directory. -func HostDevices() ([]*configs.Device, error) { +func HostDevices() ([]*Device, error) { return GetDevices("/dev") } // GetDevices recursively traverses a directory specified by path // and returns all devices found there. -func GetDevices(path string) ([]*configs.Device, error) { +func GetDevices(path string) ([]*Device, error) { files, err := ioutilReadDir(path) if err != nil { return nil, err } - var out []*configs.Device + var out []*Device for _, f := range files { switch { case f.IsDir(): @@ -104,7 +112,7 @@ func GetDevices(path string) ([]*configs.Device, error) { } return nil, err } - if device.Type == configs.FifoDevice { + if device.Type == FifoDevice { continue } out = append(out, device) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c index 24cc8c6e16..4268ebda9e 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c @@ -59,14 +59,38 @@ #include /* Use our own wrapper for memfd_create. */ -#if !defined(SYS_memfd_create) && defined(__NR_memfd_create) -# define SYS_memfd_create __NR_memfd_create +#ifndef SYS_memfd_create +# ifdef __NR_memfd_create +# define SYS_memfd_create __NR_memfd_create +# else +/* These values come from . */ +# warning "libc is outdated -- using hard-coded SYS_memfd_create" +# if defined(__x86_64__) +# define SYS_memfd_create 319 +# elif defined(__i386__) +# define SYS_memfd_create 356 +# elif defined(__ia64__) +# define SYS_memfd_create 1340 +# elif defined(__arm__) +# define SYS_memfd_create 385 +# elif defined(__aarch64__) +# define SYS_memfd_create 279 +# elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__) +# define SYS_memfd_create 360 +# elif defined(__s390__) || defined(__s390x__) +# define SYS_memfd_create 350 +# else +# warning "unknown architecture -- cannot hard-code SYS_memfd_create" +# endif +# endif #endif + /* memfd_create(2) flags -- copied from . 
*/ #ifndef MFD_CLOEXEC # define MFD_CLOEXEC 0x0001U # define MFD_ALLOW_SEALING 0x0002U #endif + int memfd_create(const char *name, unsigned int flags) { #ifdef SYS_memfd_create @@ -77,7 +101,6 @@ int memfd_create(const char *name, unsigned int flags) #endif } - /* This comes directly from . */ #ifndef F_LINUX_SPECIFIC_BASE # define F_LINUX_SPECIFIC_BASE 1024 @@ -103,7 +126,7 @@ static void *must_realloc(void *ptr, size_t size) void *old = ptr; do { ptr = realloc(old, size); - } while(!ptr); + } while (!ptr); return ptr; } @@ -115,10 +138,10 @@ static void *must_realloc(void *ptr, size_t size) static int is_self_cloned(void) { int fd, ret, is_cloned = 0; - struct stat statbuf = {}; - struct statfs fsbuf = {}; + struct stat statbuf = { }; + struct statfs fsbuf = { }; - fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC); + fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC); if (fd < 0) { fprintf(stderr, "you have no read access to runc binary file\n"); return -ENOTRECOVERABLE; @@ -274,7 +297,7 @@ enum { static int make_execfd(int *fdtype) { int fd = -1; - char template[PATH_MAX] = {0}; + char template[PATH_MAX] = { 0 }; char *prefix = getenv("_LIBCONTAINER_STATEDIR"); if (!prefix || *prefix != '/') @@ -303,7 +326,7 @@ static int make_execfd(int *fdtype) *fdtype = EFD_FILE; fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700); if (fd >= 0) { - struct stat statbuf = {}; + struct stat statbuf = { }; bool working_otmpfile = false; /* @@ -348,27 +371,27 @@ static int seal_execfd(int *fd, int fdtype) switch (fdtype) { case EFD_MEMFD: return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS); - case EFD_FILE: { - /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ - int newfd; - char fdpath[PATH_MAX] = {0}; + case EFD_FILE:{ + /* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */ + int newfd; + char fdpath[PATH_MAX] = { 0 }; - if (fchmod(*fd, 0100) < 0) - return -1; + if (fchmod(*fd, 0100) < 0) + return -1; - if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) - return -1; + if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0) + return -1; - newfd = open(fdpath, O_PATH | O_CLOEXEC); - if (newfd < 0) - return -1; + newfd = open(fdpath, O_PATH | O_CLOEXEC); + if (newfd < 0) + return -1; - close(*fd); - *fd = newfd; - return 0; - } + close(*fd); + *fd = newfd; + return 0; + } default: - break; + break; } return -1; } @@ -376,7 +399,7 @@ static int seal_execfd(int *fd, int fdtype) static int try_bindfd(void) { int fd, ret = -1; - char template[PATH_MAX] = {0}; + char template[PATH_MAX] = { 0 }; char *prefix = getenv("_LIBCONTAINER_STATEDIR"); if (!prefix || *prefix != '/') @@ -404,7 +427,6 @@ static int try_bindfd(void) if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0) goto out_umount; - /* Get read-only handle that we're sure can't be made read-write. 
*/ ret = open(template, O_PATH | O_CLOEXEC); @@ -448,7 +470,7 @@ static ssize_t fd_to_fd(int outfd, int infd) if (n < 0) return -1; nwritten += n; - } while(nwritten < nread); + } while (nwritten < nread); total += nwritten; } @@ -459,7 +481,7 @@ static ssize_t fd_to_fd(int outfd, int infd) static int clone_binary(void) { int binfd, execfd; - struct stat statbuf = {}; + struct stat statbuf = { }; size_t sent = 0; int fdtype = EFD_NONE; diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c new file mode 100644 index 0000000000..78e7e9f489 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c @@ -0,0 +1,142 @@ +#include +#include + +#ifdef ESCAPE_TEST +# include +# define test_assert(arg) assert(arg) +#else +# define test_assert(arg) +#endif + +#define DEL '\x7f' + +/* + * Poor man version of itoa with base=16 and input number from 0 to 15, + * represented by a char. Converts it to a single hex digit ('0' to 'f'). + */ +static char hex(char i) +{ + test_assert(i >= 0 && i < 16); + + if (i >= 0 && i < 10) { + return '0' + i; + } + if (i >= 10 && i < 16) { + return 'a' + i - 10; + } + return '?'; +} + +/* + * Given the character, tells how many _extra_ characters are needed + * to JSON-escape it. If 0 is returned, the character does not need to + * be escaped. + */ +static int need_escape(char c) +{ + switch (c) { + case '\\': + case '"': + case '\b': + case '\n': + case '\r': + case '\t': + case '\f': + return 1; + case DEL: // -> \u007f + return 5; + default: + if (c > 0 && c < ' ') { + // ASCII decimal 01 to 31 -> \u00xx + return 5; + } + return 0; + } +} + +/* + * Escape the string so it can be used as a JSON string (per RFC4627, + * section 2.5 minimal requirements, plus the DEL (0x7f) character). + * + * It is expected that the argument is a string allocated via malloc. + * In case no escaping is needed, the original string is returned as is; + * otherwise, the original string is free'd, and the newly allocated + * escaped string is returned. Thus, in any case, the value returned + * need to be free'd by the caller. + */ +char *escape_json_string(char *s) +{ + int i, j, len; + char *c, *out; + + /* + * First, check if escaping is at all needed -- if not, we can avoid + * malloc and return the argument as is. While at it, count how much + * extra space is required. + * + * XXX: the counting code must be in sync with the escaping code + * (checked by test_assert()s below). + */ + for (i = j = 0; s[i] != '\0'; i++) { + j += need_escape(s[i]); + } + if (j == 0) { + // nothing to escape + return s; + } + + len = i + j + 1; + out = malloc(len); + if (!out) { + free(s); + // As malloc failed, strdup can fail, too, so in the worst case + // scenario NULL will be returned from here. 
+ return strdup("escape_json_string: out of memory"); + } + for (c = s, j = 0; *c != '\0'; c++) { + switch (*c) { + case '"': + case '\\': + test_assert(need_escape(*c) == 1); + out[j++] = '\\'; + out[j++] = *c; + continue; + } + if ((*c < 0 || *c >= ' ') && (*c != DEL)) { + // no escape needed + test_assert(need_escape(*c) == 0); + out[j++] = *c; + continue; + } + out[j++] = '\\'; + switch (*c) { + case '\b': + out[j++] = 'b'; + break; + case '\n': + out[j++] = 'n'; + break; + case '\r': + out[j++] = 'r'; + break; + case '\t': + out[j++] = 't'; + break; + case '\f': + out[j++] = 'f'; + break; + default: + test_assert(need_escape(*c) == 5); + out[j++] = 'u'; + out[j++] = '0'; + out[j++] = '0'; + out[j++] = hex(*c >> 4); + out[j++] = hex(*c & 0x0f); + } + } + test_assert(j + 1 == len); + out[j] = '\0'; + + free(s); + return out; +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c index a33f2fcc37..bee0042942 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c @@ -29,6 +29,8 @@ /* Get all of the CLONE_NEW* flags. */ #include "namespace.h" +extern char *escape_json_string(char *str); + /* Synchronisation values. */ enum sync_t { SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */ @@ -36,7 +38,7 @@ enum sync_t { SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */ SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */ SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */ - SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */ + SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */ }; /* @@ -45,10 +47,14 @@ enum sync_t { */ #define CREATECGROUPNS 0x80 +#define STAGE_SETUP -1 /* longjmp() arguments. */ -#define JUMP_PARENT 0x00 -#define JUMP_CHILD 0xA0 -#define JUMP_INIT 0xA1 +#define STAGE_PARENT 0 +#define STAGE_CHILD 1 +#define STAGE_INIT 2 + +/* Stores the current stage of nsexec. */ +int current_stage = STAGE_SETUP; /* Assume the stack grows down, so arguments should be above it. */ struct clone_t { @@ -56,7 +62,7 @@ struct clone_t { * Reserve some space for clone() to locate arguments * and retcode in this place */ - char stack[4096] __attribute__ ((aligned(16))); + char stack[4096] __attribute__((aligned(16))); char stack_ptr[0]; /* There's two children. This is used to execute the different code. */ @@ -102,31 +108,31 @@ static int logfd = -1; * List of netlink message types sent to us as part of bootstrapping the init. * These constants are defined in libcontainer/message_linux.go. */ -#define INIT_MSG 62000 +#define INIT_MSG 62000 #define CLONE_FLAGS_ATTR 27281 #define NS_PATHS_ATTR 27282 -#define UIDMAP_ATTR 27283 -#define GIDMAP_ATTR 27284 +#define UIDMAP_ATTR 27283 +#define GIDMAP_ATTR 27284 #define SETGROUP_ATTR 27285 #define OOM_SCORE_ADJ_ATTR 27286 #define ROOTLESS_EUID_ATTR 27287 -#define UIDMAPPATH_ATTR 27288 -#define GIDMAPPATH_ATTR 27289 +#define UIDMAPPATH_ATTR 27288 +#define GIDMAPPATH_ATTR 27289 /* * Use the raw syscall for versions of glibc which don't include a function for * it, namely (glibc 2.12). 
*/ #if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14 -# define _GNU_SOURCE -# include "syscall.h" -# if !defined(SYS_setns) && defined(__NR_setns) -# define SYS_setns __NR_setns -# endif +# define _GNU_SOURCE +# include "syscall.h" +# if !defined(SYS_setns) && defined(__NR_setns) +# define SYS_setns __NR_setns +# endif -#ifndef SYS_setns -# error "setns(2) syscall not supported by glibc version" -#endif +# ifndef SYS_setns +# error "setns(2) syscall not supported by glibc version" +# endif int setns(int fd, int nstype) { @@ -134,33 +140,43 @@ int setns(int fd, int nstype) } #endif -static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...) +static void write_log(const char *level, const char *format, ...) { - char message[1024] = {}; - + char *message = NULL, *stage = NULL; va_list args; + int ret; if (logfd < 0 || level == NULL) - return; + goto out; va_start(args, format); - if (vsnprintf(message, sizeof(message), format, args) < 0) - goto done; - - dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message); -done: + ret = vasprintf(&message, format, args); va_end(args); -} + if (ret < 0) + goto out; -#define write_log(level, fmt, ...) \ - write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__) + message = escape_json_string(message); + + if (current_stage == STAGE_SETUP) + stage = strdup("nsexec"); + else + ret = asprintf(&stage, "nsexec-%d", current_stage); + if (ret < 0) + goto out; + + dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message); + +out: + free(message); + free(stage); +} /* XXX: This is ugly. */ static int syncfd = -1; #define bail(fmt, ...) \ do { \ - write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \ + write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \ exit(1); \ } while(0) @@ -187,7 +203,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...) goto out; } - out: +out: close(fd); return ret; } @@ -297,9 +313,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len) if (map == NULL || map_len <= 0) return; + write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map); if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) { if (errno != EPERM) bail("failed to update /proc/%d/uid_map", pid); + write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path); if (try_mapping_tool(path, pid, map, map_len)) bail("failed to use newuid map on %d", pid); } @@ -310,9 +328,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len) if (map == NULL || map_len <= 0) return; + write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map); if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) { if (errno != EPERM) bail("failed to update /proc/%d/gid_map", pid); + write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path); if (try_mapping_tool(path, pid, map, map_len)) bail("failed to use newgid map on %d", pid); } @@ -323,19 +343,20 @@ static void update_oom_score_adj(char *data, size_t len) if (data == NULL || len <= 0) return; + write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data); if (write_file(data, len, "/proc/self/oom_score_adj") < 0) bail("failed to update /proc/self/oom_score_adj"); } /* A dummy function that just jumps to the given jumpval. 
*/ -static int child_func(void *arg) __attribute__ ((noinline)); +static int child_func(void *arg) __attribute__((noinline)); static int child_func(void *arg) { struct clone_t *ca = (struct clone_t *)arg; longjmp(*ca->env, ca->jmpval); } -static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline)); +static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline)); static int clone_parent(jmp_buf *env, int jmpval) { struct clone_t ca = { @@ -507,7 +528,6 @@ void join_namespaces(char *nslist) char *namespace = strtok_r(nslist, ",", &saveptr); struct namespace_t { int fd; - int ns; char type[PATH_MAX]; char path[PATH_MAX]; } *namespaces = NULL; @@ -542,7 +562,7 @@ void join_namespaces(char *nslist) bail("failed to open %s", path); ns->fd = fd; - ns->ns = nsflag(namespace); + strncpy(ns->type, namespace, PATH_MAX - 1); strncpy(ns->path, path, PATH_MAX - 1); ns->path[PATH_MAX - 1] = '\0'; } while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL); @@ -555,12 +575,14 @@ void join_namespaces(char *nslist) */ for (i = 0; i < num; i++) { - struct namespace_t ns = namespaces[i]; + struct namespace_t *ns = &namespaces[i]; + int flag = nsflag(ns->type); - if (setns(ns.fd, ns.ns) < 0) - bail("failed to setns to %s", ns.path); + write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path); + if (setns(ns->fd, flag) < 0) + bail("failed to setns into %s namespace", ns->type); - close(ns.fd); + close(ns->fd); } free(namespaces); @@ -569,6 +591,14 @@ void join_namespaces(char *nslist) /* Defined in cloned_binary.c. */ extern int ensure_cloned_binary(void); +static inline int sane_kill(pid_t pid, int signum) +{ + if (pid > 0) + return kill(pid, signum); + else + return 0; +} + void nsexec(void) { int pipenum; @@ -598,7 +628,14 @@ void nsexec(void) if (ensure_cloned_binary() < 0) bail("could not ensure we are a cloned binary"); - write_log(DEBUG, "nsexec started"); + /* + * Inform the parent we're past initial setup. + * For the other side of this, see initWaiter. + */ + if (write(pipenum, "", 1) != 1) + bail("could not inform the parent we are past initial setup"); + + write_log(DEBUG, "=> nsexec container setup"); /* Parse all of the netlink configuration. */ nl_parse(pipenum, &config); @@ -622,6 +659,7 @@ void nsexec(void) * containers), which is the recommendation from the kernel folks. */ if (config.namespaces) { + write_log(DEBUG, "set process as non-dumpable"); if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) bail("failed to set process as non-dumpable"); } @@ -686,45 +724,49 @@ void nsexec(void) * -- Aleksa "what has my life come to?" Sarai */ - switch (setjmp(env)) { + current_stage = setjmp(env); + switch (current_stage) { /* * Stage 0: We're in the parent. Our job is just to create a new child - * (stage 1: JUMP_CHILD) process and write its uid_map and + * (stage 1: STAGE_CHILD) process and write its uid_map and * gid_map. That process will go on to create a new process, then * it will send us its PID which we will send to the bootstrap * process. */ - case JUMP_PARENT:{ + case STAGE_PARENT:{ int len; - pid_t child, first_child = -1; - bool ready = false; + pid_t stage1_pid = -1, stage2_pid = -1; + bool stage1_complete, stage2_complete; /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0); + write_log(DEBUG, "~> nsexec stage-0"); /* Start the process of getting a container. 
*/ - child = clone_parent(&env, JUMP_CHILD); - if (child < 0) - bail("unable to fork: child_func"); + write_log(DEBUG, "spawn stage-1"); + stage1_pid = clone_parent(&env, STAGE_CHILD); + if (stage1_pid < 0) + bail("unable to spawn stage-1"); - /* - * State machine for synchronisation with the children. - * - * Father only return when both child and grandchild are - * ready, so we can receive all possible error codes - * generated by children. - */ syncfd = sync_child_pipe[1]; close(sync_child_pipe[0]); - while (!ready) { + /* + * State machine for synchronisation with the children. We only + * return once both the child and grandchild are ready. + */ + write_log(DEBUG, "-> stage-1 synchronisation loop"); + stage1_complete = false; + while (!stage1_complete) { enum sync_t s; if (read(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with child: next state"); + bail("failed to sync with stage-1: next state"); switch (s) { case SYNC_USERMAP_PLS: + write_log(DEBUG, "stage-1 requested userns mappings"); + /* * Enable setgroups(2) if we've been asked to. But we also * have to explicitly disable setgroups(2) if we're @@ -735,70 +777,78 @@ void nsexec(void) * For rootless multi-entry mapping, config.is_setgroup shall be true and * newuidmap/newgidmap shall be used. */ - if (config.is_rootless_euid && !config.is_setgroup) - update_setgroups(child, SETGROUPS_DENY); + update_setgroups(stage1_pid, SETGROUPS_DENY); /* Set up mappings. */ - update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len); - update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len); + update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len); + update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len); s = SYNC_USERMAP_ACK; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with child: write(SYNC_USERMAP_ACK)"); + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)"); } break; - case SYNC_RECVPID_PLS:{ - first_child = child; + case SYNC_RECVPID_PLS: + write_log(DEBUG, "stage-1 requested pid to be forwarded"); - /* Get the init_func pid. */ - if (read(syncfd, &child, sizeof(child)) != sizeof(child)) { - kill(first_child, SIGKILL); - bail("failed to sync with child: read(childpid)"); - } + /* Get the stage-2 pid. */ + if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with stage-1: read(stage2_pid)"); + } - /* Send ACK. */ - s = SYNC_RECVPID_ACK; - if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(first_child, SIGKILL); - kill(child, SIGKILL); - bail("failed to sync with child: write(SYNC_RECVPID_ACK)"); - } + /* Send ACK. */ + s = SYNC_RECVPID_ACK; + if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)"); + } - /* Send the init_func pid back to our parent. - * - * Send the init_func pid and the pid of the first child back to our parent. - * We need to send both back because we can't reap the first child we created (CLONE_PARENT). - * It becomes the responsibility of our parent to reap the first child. 
- */ - len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child); - if (len < 0) { - kill(child, SIGKILL); - bail("unable to generate JSON for child pid"); - } + /* + * Send both the stage-1 and stage-2 pids back to runc. + * runc needs the stage-2 to continue process management, + * but because stage-1 was spawned with CLONE_PARENT we + * cannot reap it within stage-0 and thus we need to ask + * runc to reap the zombie for us. + */ + write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc", + stage1_pid, stage2_pid); + len = + dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid, + stage2_pid); + if (len < 0) { + sane_kill(stage1_pid, SIGKILL); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with runc: write(pid-JSON)"); } break; - case SYNC_CHILD_READY: - ready = true; + case SYNC_CHILD_FINISH: + write_log(DEBUG, "stage-1 complete"); + stage1_complete = true; break; default: bail("unexpected sync value: %u", s); } } + write_log(DEBUG, "<- stage-1 synchronisation loop"); /* Now sync with grandchild. */ - syncfd = sync_grandchild_pipe[1]; close(sync_grandchild_pipe[0]); - - ready = false; - while (!ready) { + write_log(DEBUG, "-> stage-2 synchronisation loop"); + stage2_complete = false; + while (!stage2_complete) { enum sync_t s; + write_log(DEBUG, "signalling stage-2 to run"); s = SYNC_GRANDCHILD; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with child: write(SYNC_GRANDCHILD)"); } @@ -806,27 +856,31 @@ void nsexec(void) bail("failed to sync with child: next state"); switch (s) { - case SYNC_CHILD_READY: - ready = true; + case SYNC_CHILD_FINISH: + write_log(DEBUG, "stage-2 complete"); + stage2_complete = true; break; default: bail("unexpected sync value: %u", s); } } + write_log(DEBUG, "<- stage-2 synchronisation loop"); + write_log(DEBUG, "<~ nsexec stage-0"); exit(0); } + break; /* * Stage 1: We're in the first child process. Our job is to join any - * provided namespaces in the netlink payload and unshare all - * of the requested namespaces. If we've been asked to - * CLONE_NEWUSER, we will ask our parent (stage 0) to set up - * our user mappings for us. Then, we create a new child - * (stage 2: JUMP_INIT) for PID namespace. We then send the - * child's PID to our parent (stage 0). + * provided namespaces in the netlink payload and unshare all of + * the requested namespaces. If we've been asked to CLONE_NEWUSER, + * we will ask our parent (stage 0) to set up our user mappings + * for us. Then, we create a new child (stage 2: STAGE_INIT) for + * PID namespace. We then send the child's PID to our parent + * (stage 0). */ - case JUMP_CHILD:{ - pid_t child; + case STAGE_CHILD:{ + pid_t stage2_pid = -1; enum sync_t s; /* We're in a child and thus need to tell the parent if we die. */ @@ -835,11 +889,12 @@ void nsexec(void) /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0); + write_log(DEBUG, "~> nsexec stage-1"); /* * We need to setns first. We cannot do this earlier (in stage 0) * because of the fact that we forked to get here (the PID of - * [stage 2: JUMP_INIT]) would be meaningless). We could send it + * [stage 2: STAGE_INIT]) would be meaningless). We could send it * using cmsg(3) but that's just annoying. */ if (config.namespaces) @@ -865,40 +920,50 @@ void nsexec(void) * problem. 
*/ if (config.cloneflags & CLONE_NEWUSER) { + write_log(DEBUG, "unshare user namespace"); if (unshare(CLONE_NEWUSER) < 0) bail("failed to unshare user namespace"); config.cloneflags &= ~CLONE_NEWUSER; /* - * We don't have the privileges to do any mapping here (see the - * clone_parent rant). So signal our parent to hook us up. + * We need to set ourselves as dumpable temporarily so that the + * parent process can write to our procfs files. */ - - /* Switching is only necessary if we joined namespaces. */ if (config.namespaces) { + write_log(DEBUG, "temporarily set process as dumpable"); if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0) - bail("failed to set process as dumpable"); + bail("failed to temporarily set process as dumpable"); } + + /* + * We don't have the privileges to do any mapping here (see the + * clone_parent rant). So signal stage-0 to do the mapping for + * us. + */ + write_log(DEBUG, "request stage-0 to map user namespace"); s = SYNC_USERMAP_PLS; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: write(SYNC_USERMAP_PLS)"); /* ... wait for mapping ... */ - + write_log(DEBUG, "request stage-0 to map user namespace"); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: read(SYNC_USERMAP_ACK)"); if (s != SYNC_USERMAP_ACK) bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s); - /* Switching is only necessary if we joined namespaces. */ + + /* Revert temporary re-dumpable setting. */ if (config.namespaces) { + write_log(DEBUG, "re-set process as non-dumpable"); if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0) - bail("failed to set process as dumpable"); + bail("failed to re-set process as non-dumpable"); } /* Become root in the namespace proper. */ if (setresuid(0, 0, 0) < 0) bail("failed to become root in user namespace"); } + /* * Unshare all of the namespaces. Now, it should be noted that this * ordering might break in the future (especially with rootless @@ -909,8 +974,9 @@ void nsexec(void) * some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID) * was broken, so we'll just do it the long way anyway. */ + write_log(DEBUG, "unshare remaining namespace (except cgroupns)"); if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0) - bail("failed to unshare namespaces"); + bail("failed to unshare remaining namespaces (except cgroupns)"); /* * TODO: What about non-namespace clone flags that we're dropping here? @@ -921,41 +987,45 @@ void nsexec(void) * which would break many applications and libraries, so we must fork * to actually enter the new PID namespace. */ - child = clone_parent(&env, JUMP_INIT); - if (child < 0) - bail("unable to fork: init_func"); + write_log(DEBUG, "spawn stage-2"); + stage2_pid = clone_parent(&env, STAGE_INIT); + if (stage2_pid < 0) + bail("unable to spawn stage-2"); /* Send the child to our parent, which knows what it's doing. */ + write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid); s = SYNC_RECVPID_PLS; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: write(SYNC_RECVPID_PLS)"); } - if (write(syncfd, &child, sizeof(child)) != sizeof(child)) { - kill(child, SIGKILL); - bail("failed to sync with parent: write(childpid)"); + if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) { + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with parent: write(stage2_pid)"); } /* ... wait for parent to get the pid ... 
*/ - if (read(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: read(SYNC_RECVPID_ACK)"); } if (s != SYNC_RECVPID_ACK) { - kill(child, SIGKILL); + sane_kill(stage2_pid, SIGKILL); bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s); } - s = SYNC_CHILD_READY; + write_log(DEBUG, "signal completion to stage-0"); + s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) { - kill(child, SIGKILL); - bail("failed to sync with parent: write(SYNC_CHILD_READY)"); + sane_kill(stage2_pid, SIGKILL); + bail("failed to sync with parent: write(SYNC_CHILD_FINISH)"); } - /* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */ + /* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */ + write_log(DEBUG, "<~ nsexec stage-1"); exit(0); } + break; /* * Stage 2: We're the final child process, and the only process that will @@ -963,7 +1033,7 @@ void nsexec(void) * final cleanup steps and then return to the Go runtime to allow * init_linux.go to run. */ - case JUMP_INIT:{ + case STAGE_INIT:{ /* * We're inside the child now, having jumped from the * start_child() code after forking in the parent. @@ -978,6 +1048,7 @@ void nsexec(void) /* For debugging. */ prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0); + write_log(DEBUG, "~> nsexec stage-2"); if (read(syncfd, &s, sizeof(s)) != sizeof(s)) bail("failed to sync with parent: read(SYNC_GRANDCHILD)"); @@ -998,21 +1069,30 @@ void nsexec(void) bail("setgroups failed"); } - /* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */ + /* + * Wait until our topmost parent has finished cgroup setup in + * p.manager.Apply(). + * + * TODO(cyphar): Check if this code is actually needed because we + * should be in the cgroup even from stage-0, so + * waiting until now might not make sense. + */ if (config.cloneflags & CLONE_NEWCGROUP) { uint8_t value; if (read(pipenum, &value, sizeof(value)) != sizeof(value)) bail("read synchronisation value failed"); if (value == CREATECGROUPNS) { + write_log(DEBUG, "unshare cgroup namespace"); if (unshare(CLONE_NEWCGROUP) < 0) bail("failed to unshare cgroup namespace"); } else bail("received unknown synchronisation value"); } - s = SYNC_CHILD_READY; + write_log(DEBUG, "signal completion to stage-0"); + s = SYNC_CHILD_FINISH; if (write(syncfd, &s, sizeof(s)) != sizeof(s)) - bail("failed to sync with patent: write(SYNC_CHILD_READY)"); + bail("failed to sync with patent: write(SYNC_CHILD_FINISH)"); /* Close sync pipes. */ close(sync_grandchild_pipe[0]); @@ -1021,10 +1101,13 @@ void nsexec(void) nl_free(&config); /* Finish executing, let the Go runtime take over. */ + write_log(DEBUG, "<= nsexec container setup"); + write_log(DEBUG, "booting up go runtime ..."); return; } + break; default: - bail("unexpected jump value"); + bail("unknown stage '%d' for jump value", current_stage); } /* Should never be reached. 
*/ diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c new file mode 120000 index 0000000000..c53e316a22 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c @@ -0,0 +1 @@ +../escape.c \ No newline at end of file diff --git a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go new file mode 100644 index 0000000000..4accf967a4 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go @@ -0,0 +1,53 @@ +package escapetest + +// This file is part of escape_json_string unit test. +// It is in a separate package so cgo can be used together +// with go test. + +// #include +// extern char *escape_json_string(char *str); +// #cgo CFLAGS: -DESCAPE_TEST=1 +import "C" + +import ( + "testing" + "unsafe" +) + +func testEscapeJsonString(t *testing.T, input, want string) { + in := C.CString(input) + out := C.escape_json_string(in) + got := C.GoString(out) + C.free(unsafe.Pointer(out)) + t.Logf("input: %q, output: %q", input, got) + if got != want { + t.Errorf("Failed on input: %q, want %q, got %q", input, want, got) + } +} + +func testEscapeJson(t *testing.T) { + testCases := []struct { + input, output string + }{ + {"", ""}, + {"abcdef", "abcdef"}, + {`\\\\\\`, `\\\\\\\\\\\\`}, + {`with"quote`, `with\"quote`}, + {"\n\r\b\t\f\\", `\n\r\b\t\f\\`}, + {"\007", "\\u0007"}, + {"\017 \020 \037", "\\u000f \\u0010 \\u001f"}, + {"\033", "\\u001b"}, + {`<->`, `<->`}, + {"\176\177\200", "~\\u007f\200"}, + {"\000", ""}, + {"a\x7fxc", "a\\u007fxc"}, + {"a\033xc", "a\\u001bxc"}, + {"a\nxc", "a\\nxc"}, + {"a\\xc", "a\\\\xc"}, + {"Barney B\303\244r", "Barney B\303\244r"}, + } + + for _, tc := range testCases { + testEscapeJsonString(t, tc.input, tc.output) + } +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go deleted file mode 100644 index 6fd8dd0d44..0000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go +++ /dev/null @@ -1,41 +0,0 @@ -package user - -import ( - "errors" -) - -var ( - // The current operating system does not provide the required data for user lookups. - ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data") - // No matching entries found in file. - ErrNoPasswdEntries = errors.New("no matching entries in passwd file") - ErrNoGroupEntries = errors.New("no matching entries in group file") -) - -// LookupUser looks up a user by their username in /etc/passwd. If the user -// cannot be found (or there is no /etc/passwd file on the filesystem), then -// LookupUser returns an error. -func LookupUser(username string) (User, error) { - return lookupUser(username) -} - -// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot -// be found (or there is no /etc/passwd file on the filesystem), then LookupId -// returns an error. -func LookupUid(uid int) (User, error) { - return lookupUid(uid) -} - -// LookupGroup looks up a group by its name in /etc/group. If the group cannot -// be found (or there is no /etc/group file on the filesystem), then LookupGroup -// returns an error. -func LookupGroup(groupname string) (Group, error) { - return lookupGroup(groupname) -} - -// LookupGid looks up a group by its group id in /etc/group. 
If the group cannot -// be found (or there is no /etc/group file on the filesystem), then LookupGid -// returns an error. -func LookupGid(gid int) (Group, error) { - return lookupGid(gid) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go index 92b5ae8de0..967717a1b1 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go @@ -16,13 +16,19 @@ const ( unixGroupPath = "/etc/group" ) -func lookupUser(username string) (User, error) { +// LookupUser looks up a user by their username in /etc/passwd. If the user +// cannot be found (or there is no /etc/passwd file on the filesystem), then +// LookupUser returns an error. +func LookupUser(username string) (User, error) { return lookupUserFunc(func(u User) bool { return u.Name == username }) } -func lookupUid(uid int) (User, error) { +// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot +// be found (or there is no /etc/passwd file on the filesystem), then LookupId +// returns an error. +func LookupUid(uid int) (User, error) { return lookupUserFunc(func(u User) bool { return u.Uid == uid }) @@ -51,13 +57,19 @@ func lookupUserFunc(filter func(u User) bool) (User, error) { return users[0], nil } -func lookupGroup(groupname string) (Group, error) { +// LookupGroup looks up a group by its name in /etc/group. If the group cannot +// be found (or there is no /etc/group file on the filesystem), then LookupGroup +// returns an error. +func LookupGroup(groupname string) (Group, error) { return lookupGroupFunc(func(g Group) bool { return g.Name == groupname }) } -func lookupGid(gid int) (Group, error) { +// LookupGid looks up a group by its group id in /etc/group. If the group cannot +// be found (or there is no /etc/group file on the filesystem), then LookupGid +// returns an error. 
+func LookupGid(gid int) (Group, error) { return lookupGroupFunc(func(g Group) bool { return g.Gid == gid }) diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go b/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go deleted file mode 100644 index 65cd40e928..0000000000 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go +++ /dev/null @@ -1,40 +0,0 @@ -// +build windows - -package user - -import ( - "fmt" - "os/user" -) - -func lookupUser(username string) (User, error) { - u, err := user.Lookup(username) - if err != nil { - return User{}, err - } - return userFromOS(u) -} - -func lookupUid(uid int) (User, error) { - u, err := user.LookupId(fmt.Sprintf("%d", uid)) - if err != nil { - return User{}, err - } - return userFromOS(u) -} - -func lookupGroup(groupname string) (Group, error) { - g, err := user.LookupGroup(groupname) - if err != nil { - return Group{}, err - } - return groupFromOS(g) -} - -func lookupGid(gid int) (Group, error) { - g, err := user.LookupGroupId(fmt.Sprintf("%d", gid)) - if err != nil { - return Group{}, err - } - return groupFromOS(g) -} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go index 4b89dad737..68da4400d4 100644 --- a/vendor/github.com/opencontainers/runc/libcontainer/user/user.go +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user.go @@ -2,10 +2,10 @@ package user import ( "bufio" + "errors" "fmt" "io" "os" - "os/user" "strconv" "strings" ) @@ -16,6 +16,13 @@ const ( ) var ( + // The current operating system does not provide the required data for user lookups. + ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data") + + // No matching entries found in file. + ErrNoPasswdEntries = errors.New("no matching entries in passwd file") + ErrNoGroupEntries = errors.New("no matching entries in group file") + ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId) ) @@ -29,28 +36,6 @@ type User struct { Shell string } -// userFromOS converts an os/user.(*User) to local User -// -// (This does not include Pass, Shell or Gecos) -func userFromOS(u *user.User) (User, error) { - newUser := User{ - Name: u.Username, - Home: u.HomeDir, - } - id, err := strconv.Atoi(u.Uid) - if err != nil { - return newUser, err - } - newUser.Uid = id - - id, err = strconv.Atoi(u.Gid) - if err != nil { - return newUser, err - } - newUser.Gid = id - return newUser, nil -} - type Group struct { Name string Pass string @@ -58,23 +43,6 @@ type Group struct { List []string } -// groupFromOS converts an os/user.(*Group) to local Group -// -// (This does not include Pass or List) -func groupFromOS(g *user.Group) (Group, error) { - newGroup := Group{ - Name: g.Name, - } - - id, err := strconv.Atoi(g.Gid) - if err != nil { - return newGroup, err - } - newGroup.Gid = id - - return newGroup, nil -} - // SubID represents an entry in /etc/sub{u,g}id type SubID struct { Name string @@ -466,7 +434,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err // we asked for a group but didn't find it. 
let's check to see // if we wanted a numeric group if !found { - gid, err := strconv.Atoi(ag) + gid, err := strconv.ParseInt(ag, 10, 64) if err != nil { return nil, fmt.Errorf("Unable to find group %s", ag) } @@ -474,7 +442,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err if gid < minId || gid > maxId { return nil, ErrRange } - gidMap[gid] = struct{}{} + gidMap[int(gid)] = struct{}{} } } gids := []int{} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go b/vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go new file mode 100644 index 0000000000..8c9bb5df39 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go @@ -0,0 +1,42 @@ +// +build gofuzz + +package user + +import ( + "io" + "strings" +) + +func IsDivisbleBy(n int, divisibleby int) bool { + return (n % divisibleby) == 0 +} + +func FuzzUser(data []byte) int { + if len(data) == 0 { + return -1 + } + if !IsDivisbleBy(len(data), 5) { + return -1 + } + + var divided [][]byte + + chunkSize := len(data) / 5 + + for i := 0; i < len(data); i += chunkSize { + end := i + chunkSize + + divided = append(divided, data[i:end]) + } + + _, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil) + + var passwd, group io.Reader + + group = strings.NewReader(string(divided[1])) + _, _ = GetAdditionalGroups([]string{string(divided[2])}, group) + + passwd = strings.NewReader(string(divided[3])) + _, _ = GetExecUser(string(divided[4]), nil, passwd, group) + return 1 +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go new file mode 100644 index 0000000000..f6cb98e5e4 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go @@ -0,0 +1,5 @@ +package userns + +// RunningInUserNS detects whether we are currently running in a user namespace. +// Originally copied from github.com/lxc/lxd/shared/util.go +var RunningInUserNS = runningInUserNS diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go new file mode 100644 index 0000000000..529f8eaea2 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go @@ -0,0 +1,15 @@ +// +build gofuzz + +package userns + +import ( + "strings" + + "github.com/opencontainers/runc/libcontainer/user" +) + +func FuzzUIDMap(data []byte) int { + uidmap, _ := user.ParseIDMap(strings.NewReader(string(data))) + _ = uidMapInUserNS(uidmap) + return 1 +} diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go new file mode 100644 index 0000000000..724e6df012 --- /dev/null +++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go @@ -0,0 +1,37 @@ +package userns + +import ( + "sync" + + "github.com/opencontainers/runc/libcontainer/user" +) + +var ( + inUserNS bool + nsOnce sync.Once +) + +// runningInUserNS detects whether we are currently running in a user namespace. 
+// Originally copied from github.com/lxc/lxd/shared/util.go
+func runningInUserNS() bool {
+	nsOnce.Do(func() {
+		uidmap, err := user.CurrentProcessUIDMap()
+		if err != nil {
+			// This kernel-provided file only exists if user namespaces are supported
+			return
+		}
+		inUserNS = uidMapInUserNS(uidmap)
+	})
+	return inUserNS
+}
+
+func uidMapInUserNS(uidmap []user.IDMap) bool {
+	/*
+	 * We assume we are in the initial user namespace if we have a full
+	 * range - 4294967295 uids starting at uid 0.
+	 */
+	if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
+		return false
+	}
+	return true
+}
diff --git a/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go
new file mode 100644
index 0000000000..f45bb0c315
--- /dev/null
+++ b/vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go
@@ -0,0 +1,17 @@
+// +build !linux
+
+package userns
+
+import "github.com/opencontainers/runc/libcontainer/user"
+
+// runningInUserNS is a stub for non-Linux systems
+// Always returns false
+func runningInUserNS() bool {
+	return false
+}
+
+// uidMapInUserNS is a stub for non-Linux systems
+// Always returns false
+func uidMapInUserNS(uidmap []user.IDMap) bool {
+	return false
+}
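
For reference, the heuristic behind the new `userns` package vendored above is small enough to show in isolation: `RunningInUserNS` reads `/proc/self/uid_map` once and treats a single `0 0 4294967295` entry as the initial (host) user namespace, and anything else as a user namespace. Below is a minimal, self-contained sketch of that same check; the file and function names are illustrative only (not part of runc), and code that already vendors runc should call `userns.RunningInUserNS` rather than reimplementing it.

```go
package main

import (
	"bufio"
	"fmt"
	"os"
	"strings"
)

// inUserNS applies the same heuristic as the vendored uidMapInUserNS: a single
// "0 0 4294967295" entry in /proc/self/uid_map means the initial user
// namespace; any other mapping means we are inside a user namespace.
func inUserNS() (bool, error) {
	f, err := os.Open("/proc/self/uid_map")
	if err != nil {
		// The file only exists when the kernel supports user namespaces.
		return false, err
	}
	defer f.Close()

	var (
		entries             int
		id, parentID, count uint64
	)
	scanner := bufio.NewScanner(f)
	for scanner.Scan() {
		entries++
		if entries > 1 {
			// More than one mapping entry cannot be the initial namespace.
			return true, nil
		}
		if _, err := fmt.Sscanf(strings.TrimSpace(scanner.Text()), "%d %d %d", &id, &parentID, &count); err != nil {
			return false, err
		}
	}
	if err := scanner.Err(); err != nil {
		return false, err
	}
	initial := entries == 1 && id == 0 && parentID == 0 && count == 4294967295
	return !initial, nil
}

func main() {
	in, err := inUserNS()
	if err != nil {
		fmt.Fprintln(os.Stderr, "cannot determine user namespace state:", err)
		os.Exit(1)
	}
	fmt.Println("running in a user namespace:", in)
}
```

Note that the vendored implementation wraps this check in a `sync.Once` so the procfs file is read at most once per process, and the `!linux` stub simply reports false; the sketch omits that caching for brevity.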