mirror of
https://github.com/moby/moby.git
synced 2022-11-09 12:21:53 -05:00
vendor: github.com/opencontainers/runc v1.0.0-rc95
full diff: https://github.com/opencontainers/runc/compare/v1.0.0-rc92...v1.0.0-rc95 Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
This commit is contained in:
parent
f0d3e905b6
commit
a927fc7831
34 changed files with 1339 additions and 599 deletions
|
@ -91,7 +91,7 @@ google.golang.org/grpc f495f5b15ae7ccda3b38c53a1bfc
|
|||
# the containerd project first, and update both after that is merged.
|
||||
# This commit does not need to match RUNC_COMMIT as it is used for helper
|
||||
# packages but should be newer or equal.
|
||||
github.com/opencontainers/runc ff819c7e9184c13b7c2607fe6c30ae19403a7aff # v1.0.0-rc92
|
||||
github.com/opencontainers/runc b9ee9c6314599f1b4a7f497e1f1f856fe433d3b7 # v1.0.0-rc95
|
||||
github.com/opencontainers/runtime-spec 1c3f411f041711bbeecf35ff7e93461ea6789220 # v1.0.3-0.20210326190908-1c3f411f0417
|
||||
github.com/opencontainers/image-spec d60099175f88c47cd379c4738d158884749ed235 # v1.0.1
|
||||
github.com/cyphar/filepath-securejoin a261ee33d7a517f054effbf451841abaafe3e0fd # v0.2.2
|
||||
|
|
38
vendor/github.com/opencontainers/runc/README.md
generated
vendored
38
vendor/github.com/opencontainers/runc/README.md
generated
vendored
|
@ -1,9 +1,10 @@
|
|||
# runc
|
||||
|
||||
[![Build Status](https://travis-ci.org/opencontainers/runc.svg?branch=master)](https://travis-ci.org/opencontainers/runc)
|
||||
[![Go Report Card](https://goreportcard.com/badge/github.com/opencontainers/runc)](https://goreportcard.com/report/github.com/opencontainers/runc)
|
||||
[![GoDoc](https://godoc.org/github.com/opencontainers/runc?status.svg)](https://godoc.org/github.com/opencontainers/runc)
|
||||
[![CII Best Practices](https://bestpractices.coreinfrastructure.org/projects/588/badge)](https://bestpractices.coreinfrastructure.org/projects/588)
|
||||
[![gha/validate](https://github.com/opencontainers/runc/workflows/validate/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Avalidate)
|
||||
[![gha/ci](https://github.com/opencontainers/runc/workflows/ci/badge.svg)](https://github.com/opencontainers/runc/actions?query=workflow%3Aci)
|
||||
|
||||
## Introduction
|
||||
|
||||
|
@ -17,10 +18,6 @@ This means that `runc` 1.0.0 should implement the 1.0 version of the specificati
|
|||
|
||||
You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.
|
||||
|
||||
Currently, the following features are not considered to be production-ready:
|
||||
|
||||
* [Support for cgroup v2](./docs/cgroup-v2.md)
|
||||
|
||||
## Security
|
||||
|
||||
The reporting process and disclosure communications are outlined [here](https://github.com/opencontainers/org/blob/master/SECURITY.md).
|
||||
|
@ -64,19 +61,20 @@ sudo make install
|
|||
with some of them enabled by default (see `BUILDTAGS` in top-level `Makefile`).
|
||||
|
||||
To change build tags from the default, set the `BUILDTAGS` variable for make,
|
||||
e.g.
|
||||
e.g. to disable seccomp:
|
||||
|
||||
```bash
|
||||
make BUILDTAGS='seccomp apparmor'
|
||||
make BUILDTAGS=""
|
||||
```
|
||||
|
||||
| Build Tag | Feature | Enabled by default | Dependency |
|
||||
|-----------|------------------------------------|--------------------|------------|
|
||||
| seccomp | Syscall filtering | yes | libseccomp |
|
||||
| selinux | selinux process and mount labeling | yes | <none> |
|
||||
| apparmor | apparmor profile support | yes | <none> |
|
||||
| nokmem | disable kernel memory accounting | no | <none> |
|
||||
|
||||
The following build tags were used earlier, but are now obsoleted:
|
||||
- **nokmem** (since runc v1.0.0-rc94 kernel memory settings are ignored)
|
||||
- **apparmor** (since runc v1.0.0-rc93 the feature is always enabled)
|
||||
- **selinux** (since runc v1.0.0-rc93 the feature is always enabled)
|
||||
|
||||
### Running the test suite
|
||||
|
||||
|
@ -128,6 +126,14 @@ make verify-dependencies
|
|||
|
||||
## Using runc
|
||||
|
||||
Please note that runc is a low level tool not designed with an end user
|
||||
in mind. It is mostly employed by other higher level container software.
|
||||
|
||||
Therefore, unless there is some specific use case that prevents the use
|
||||
of tools like Docker or Podman, it is not recommended to use runc directly.
|
||||
|
||||
If you still want to use runc, here's how.
|
||||
|
||||
### Creating an OCI Bundle
|
||||
|
||||
In order to use runc you must have your container in the format of an OCI bundle.
|
||||
|
@ -169,7 +175,9 @@ If you used the unmodified `runc spec` template this should give you a `sh` sess
|
|||
|
||||
The second way to start a container is using the specs lifecycle operations.
|
||||
This gives you more power over how the container is created and managed while it is running.
|
||||
This will also launch the container in the background so you will have to edit the `config.json` to remove the `terminal` setting for the simple examples here.
|
||||
This will also launch the container in the background so you will have to edit
|
||||
the `config.json` to remove the `terminal` setting for the simple examples
|
||||
below (see more details about [runc terminal handling](docs/terminals.md)).
|
||||
Your process field in the `config.json` should look like this below with `"terminal": false` and `"args": ["sleep", "5"]`.
|
||||
|
||||
|
||||
|
@ -292,8 +300,12 @@ PIDFile=/run/mycontainerid.pid
|
|||
WantedBy=multi-user.target
|
||||
```
|
||||
|
||||
#### cgroup v2
|
||||
See [`./docs/cgroup-v2.md`](./docs/cgroup-v2.md).
|
||||
## More documentation
|
||||
|
||||
* [cgroup v2](./docs/cgroup-v2.md)
|
||||
* [Checkpoint and restore](./docs/checkpoint-restore.md)
|
||||
* [systemd cgroup driver](./docs/systemd.md)
|
||||
* [Terminals and standard IO](./docs/terminals.md)
|
||||
|
||||
## License
|
||||
|
||||
|
|
30
vendor/github.com/opencontainers/runc/go.mod
generated
vendored
30
vendor/github.com/opencontainers/runc/go.mod
generated
vendored
|
@ -1,26 +1,28 @@
|
|||
module github.com/opencontainers/runc
|
||||
|
||||
go 1.14
|
||||
go 1.13
|
||||
|
||||
require (
|
||||
github.com/checkpoint-restore/go-criu/v4 v4.1.0
|
||||
github.com/cilium/ebpf v0.0.0-20200702112145-1c8d4c9ef775
|
||||
github.com/containerd/console v1.0.0
|
||||
github.com/coreos/go-systemd/v22 v22.1.0
|
||||
github.com/checkpoint-restore/go-criu/v5 v5.0.0
|
||||
github.com/cilium/ebpf v0.5.0
|
||||
github.com/containerd/console v1.0.2
|
||||
github.com/coreos/go-systemd/v22 v22.3.1
|
||||
github.com/cyphar/filepath-securejoin v0.2.2
|
||||
github.com/docker/go-units v0.4.0
|
||||
github.com/godbus/dbus/v5 v5.0.3
|
||||
github.com/golang/protobuf v1.4.2
|
||||
github.com/moby/sys/mountinfo v0.1.3
|
||||
github.com/mrunalp/fileutils v0.0.0-20200520151820-abd8a0e76976
|
||||
github.com/opencontainers/runtime-spec v1.0.3-0.20200728170252-4d89ac9fbff6
|
||||
github.com/opencontainers/selinux v1.6.0
|
||||
github.com/godbus/dbus/v5 v5.0.4
|
||||
github.com/moby/sys/mountinfo v0.4.1
|
||||
github.com/mrunalp/fileutils v0.5.0
|
||||
github.com/opencontainers/runtime-spec v1.0.3-0.20210326190908-1c3f411f0417
|
||||
github.com/opencontainers/selinux v1.8.0
|
||||
github.com/pkg/errors v0.9.1
|
||||
github.com/seccomp/libseccomp-golang v0.9.1
|
||||
github.com/sirupsen/logrus v1.6.0
|
||||
github.com/syndtr/gocapability v0.0.0-20180916011248-d98352740cb2
|
||||
github.com/sirupsen/logrus v1.7.0
|
||||
github.com/syndtr/gocapability v0.0.0-20200815063812-42c35b437635
|
||||
// NOTE: urfave/cli must be <= v1.22.1 due to a regression: https://github.com/urfave/cli/issues/1092
|
||||
github.com/urfave/cli v1.22.1
|
||||
github.com/vishvananda/netlink v1.1.0
|
||||
golang.org/x/sys v0.0.0-20200728102440-3e129f6d46b1
|
||||
github.com/willf/bitset v1.1.11
|
||||
golang.org/x/net v0.0.0-20201224014010-6772e930b67b
|
||||
golang.org/x/sys v0.0.0-20210426230700-d19ff857e887
|
||||
google.golang.org/protobuf v1.25.0
|
||||
)
|
||||
|
|
170
vendor/github.com/opencontainers/runc/libcontainer/README.md
generated
vendored
170
vendor/github.com/opencontainers/runc/libcontainer/README.md
generated
vendored
|
@ -57,90 +57,94 @@ struct describing how the container is to be created. A sample would look simila
|
|||
|
||||
```go
|
||||
defaultMountFlags := unix.MS_NOEXEC | unix.MS_NOSUID | unix.MS_NODEV
|
||||
var devices []*configs.DeviceRule
|
||||
for _, device := range specconv.AllowedDevices {
|
||||
devices = append(devices, &device.Rule)
|
||||
}
|
||||
config := &configs.Config{
|
||||
Rootfs: "/your/path/to/rootfs",
|
||||
Capabilities: &configs.Capabilities{
|
||||
Bounding: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
Effective: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
Inheritable: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
Permitted: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
Ambient: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
},
|
||||
Bounding: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
Effective: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
Inheritable: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
Permitted: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
Ambient: []string{
|
||||
"CAP_CHOWN",
|
||||
"CAP_DAC_OVERRIDE",
|
||||
"CAP_FSETID",
|
||||
"CAP_FOWNER",
|
||||
"CAP_MKNOD",
|
||||
"CAP_NET_RAW",
|
||||
"CAP_SETGID",
|
||||
"CAP_SETUID",
|
||||
"CAP_SETFCAP",
|
||||
"CAP_SETPCAP",
|
||||
"CAP_NET_BIND_SERVICE",
|
||||
"CAP_SYS_CHROOT",
|
||||
"CAP_KILL",
|
||||
"CAP_AUDIT_WRITE",
|
||||
},
|
||||
},
|
||||
Namespaces: configs.Namespaces([]configs.Namespace{
|
||||
{Type: configs.NEWNS},
|
||||
{Type: configs.NEWUTS},
|
||||
|
@ -155,7 +159,7 @@ config := &configs.Config{
|
|||
Parent: "system",
|
||||
Resources: &configs.Resources{
|
||||
MemorySwappiness: nil,
|
||||
Devices: specconv.AllowedDevices,
|
||||
Devices: devices,
|
||||
},
|
||||
},
|
||||
MaskPaths: []string{
|
||||
|
@ -313,7 +317,7 @@ state, err := container.State()
|
|||
#### Checkpoint & Restore
|
||||
|
||||
libcontainer now integrates [CRIU](http://criu.org/) for checkpointing and restoring containers.
|
||||
This let's you save the state of a process running inside a container to disk, and then restore
|
||||
This lets you save the state of a process running inside a container to disk, and then restore
|
||||
that state into a new process, on the same machine or on another machine.
|
||||
|
||||
`criu` version 1.5.2 or higher is required to use checkpoint and restore.
|
||||
|
|
36
vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go
generated
vendored
36
vendor/github.com/opencontainers/runc/libcontainer/cgroups/cgroups.go
generated
vendored
|
@ -7,37 +7,44 @@ import (
|
|||
)
|
||||
|
||||
type Manager interface {
|
||||
// Applies cgroup configuration to the process with the specified pid
|
||||
// Apply creates a cgroup, if not yet created, and adds a process
|
||||
// with the specified pid into that cgroup. A special value of -1
|
||||
// can be used to merely create a cgroup.
|
||||
Apply(pid int) error
|
||||
|
||||
// Returns the PIDs inside the cgroup set
|
||||
// GetPids returns the PIDs of all processes inside the cgroup.
|
||||
GetPids() ([]int, error)
|
||||
|
||||
// Returns the PIDs inside the cgroup set & all sub-cgroups
|
||||
// GetAllPids returns the PIDs of all processes inside the cgroup
|
||||
// any all its sub-cgroups.
|
||||
GetAllPids() ([]int, error)
|
||||
|
||||
// Returns statistics for the cgroup set
|
||||
// GetStats returns cgroups statistics.
|
||||
GetStats() (*Stats, error)
|
||||
|
||||
// Toggles the freezer cgroup according with specified state
|
||||
// Freeze sets the freezer cgroup to the specified state.
|
||||
Freeze(state configs.FreezerState) error
|
||||
|
||||
// Destroys the cgroup set
|
||||
// Destroy removes cgroup.
|
||||
Destroy() error
|
||||
|
||||
// Path returns a cgroup path to the specified controller/subsystem.
|
||||
// For cgroupv2, the argument is unused and can be empty.
|
||||
Path(string) string
|
||||
|
||||
// Sets the cgroup as configured.
|
||||
Set(container *configs.Config) error
|
||||
// Set sets cgroup resources parameters/limits. If the argument is nil,
|
||||
// the resources specified during Manager creation (or the previous call
|
||||
// to Set) are used.
|
||||
Set(r *configs.Resources) error
|
||||
|
||||
// GetPaths returns cgroup path(s) to save in a state file in order to restore later.
|
||||
// GetPaths returns cgroup path(s) to save in a state file in order to
|
||||
// restore later.
|
||||
//
|
||||
// For cgroup v1, a key is cgroup subsystem name, and the value is the path
|
||||
// to the cgroup for this subsystem.
|
||||
// For cgroup v1, a key is cgroup subsystem name, and the value is the
|
||||
// path to the cgroup for this subsystem.
|
||||
//
|
||||
// For cgroup v2 unified hierarchy, a key is "", and the value is the unified path.
|
||||
// For cgroup v2 unified hierarchy, a key is "", and the value is the
|
||||
// unified path.
|
||||
GetPaths() map[string]string
|
||||
|
||||
// GetCgroups returns the cgroup data as configured.
|
||||
|
@ -46,6 +53,9 @@ type Manager interface {
|
|||
// GetFreezerState retrieves the current FreezerState of the cgroup.
|
||||
GetFreezerState() (configs.FreezerState, error)
|
||||
|
||||
// Whether the cgroup path exists or not
|
||||
// Exists returns whether the cgroup path exists or not.
|
||||
Exists() bool
|
||||
|
||||
// OOMKillCount reports OOM kill count for the cgroup.
|
||||
OOMKillCount() (uint64, error)
|
||||
}
|
||||
|
|
51
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go
generated
vendored
Normal file
51
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/fscommon.go
generated
vendored
Normal file
|
@ -0,0 +1,51 @@
|
|||
// +build linux
|
||||
|
||||
package fscommon
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"os"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
// WriteFile writes data to a cgroup file in dir.
|
||||
// It is supposed to be used for cgroup files only.
|
||||
func WriteFile(dir, file, data string) error {
|
||||
fd, err := OpenFile(dir, file, unix.O_WRONLY)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer fd.Close()
|
||||
if err := retryingWriteFile(fd, data); err != nil {
|
||||
return errors.Wrapf(err, "failed to write %q", data)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// ReadFile reads data from a cgroup file in dir.
|
||||
// It is supposed to be used for cgroup files only.
|
||||
func ReadFile(dir, file string) (string, error) {
|
||||
fd, err := OpenFile(dir, file, unix.O_RDONLY)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer fd.Close()
|
||||
var buf bytes.Buffer
|
||||
|
||||
_, err = buf.ReadFrom(fd)
|
||||
return buf.String(), err
|
||||
}
|
||||
|
||||
func retryingWriteFile(fd *os.File, data string) error {
|
||||
for {
|
||||
_, err := fd.Write([]byte(data))
|
||||
if errors.Is(err, unix.EINTR) {
|
||||
logrus.Infof("interrupted while writing %s to %s", data, fd.Name())
|
||||
continue
|
||||
}
|
||||
return err
|
||||
}
|
||||
}
|
120
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go
generated
vendored
Normal file
120
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/open.go
generated
vendored
Normal file
|
@ -0,0 +1,120 @@
|
|||
package fscommon
|
||||
|
||||
import (
|
||||
"os"
|
||||
"strings"
|
||||
"sync"
|
||||
|
||||
"github.com/pkg/errors"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
const (
|
||||
cgroupfsDir = "/sys/fs/cgroup"
|
||||
cgroupfsPrefix = cgroupfsDir + "/"
|
||||
)
|
||||
|
||||
var (
|
||||
// TestMode is set to true by unit tests that need "fake" cgroupfs.
|
||||
TestMode bool
|
||||
|
||||
cgroupFd int = -1
|
||||
prepOnce sync.Once
|
||||
prepErr error
|
||||
resolveFlags uint64
|
||||
)
|
||||
|
||||
func prepareOpenat2() error {
|
||||
prepOnce.Do(func() {
|
||||
fd, err := unix.Openat2(-1, cgroupfsDir, &unix.OpenHow{
|
||||
Flags: unix.O_DIRECTORY | unix.O_PATH})
|
||||
if err != nil {
|
||||
prepErr = &os.PathError{Op: "openat2", Path: cgroupfsDir, Err: err}
|
||||
if err != unix.ENOSYS {
|
||||
logrus.Warnf("falling back to securejoin: %s", prepErr)
|
||||
} else {
|
||||
logrus.Debug("openat2 not available, falling back to securejoin")
|
||||
}
|
||||
return
|
||||
}
|
||||
var st unix.Statfs_t
|
||||
if err = unix.Fstatfs(fd, &st); err != nil {
|
||||
prepErr = &os.PathError{Op: "statfs", Path: cgroupfsDir, Err: err}
|
||||
logrus.Warnf("falling back to securejoin: %s", prepErr)
|
||||
return
|
||||
}
|
||||
|
||||
cgroupFd = fd
|
||||
|
||||
resolveFlags = unix.RESOLVE_BENEATH | unix.RESOLVE_NO_MAGICLINKS
|
||||
if st.Type == unix.CGROUP2_SUPER_MAGIC {
|
||||
// cgroupv2 has a single mountpoint and no "cpu,cpuacct" symlinks
|
||||
resolveFlags |= unix.RESOLVE_NO_XDEV | unix.RESOLVE_NO_SYMLINKS
|
||||
}
|
||||
|
||||
})
|
||||
|
||||
return prepErr
|
||||
}
|
||||
|
||||
// OpenFile opens a cgroup file in a given dir with given flags.
|
||||
// It is supposed to be used for cgroup files only.
|
||||
func OpenFile(dir, file string, flags int) (*os.File, error) {
|
||||
if dir == "" {
|
||||
return nil, errors.Errorf("no directory specified for %s", file)
|
||||
}
|
||||
mode := os.FileMode(0)
|
||||
if TestMode && flags&os.O_WRONLY != 0 {
|
||||
// "emulate" cgroup fs for unit tests
|
||||
flags |= os.O_TRUNC | os.O_CREATE
|
||||
mode = 0o600
|
||||
}
|
||||
if prepareOpenat2() != nil {
|
||||
return openFallback(dir, file, flags, mode)
|
||||
}
|
||||
reldir := strings.TrimPrefix(dir, cgroupfsPrefix)
|
||||
if len(reldir) == len(dir) { // non-standard path, old system?
|
||||
return openFallback(dir, file, flags, mode)
|
||||
}
|
||||
|
||||
relname := reldir + "/" + file
|
||||
fd, err := unix.Openat2(cgroupFd, relname,
|
||||
&unix.OpenHow{
|
||||
Resolve: resolveFlags,
|
||||
Flags: uint64(flags) | unix.O_CLOEXEC,
|
||||
Mode: uint64(mode),
|
||||
})
|
||||
if err != nil {
|
||||
return nil, &os.PathError{Op: "openat2", Path: dir + "/" + file, Err: err}
|
||||
}
|
||||
|
||||
return os.NewFile(uintptr(fd), cgroupfsPrefix+relname), nil
|
||||
}
|
||||
|
||||
var errNotCgroupfs = errors.New("not a cgroup file")
|
||||
|
||||
// openFallback is used when openat2(2) is not available. It checks the opened
|
||||
// file is on cgroupfs, returning an error otherwise.
|
||||
func openFallback(dir, file string, flags int, mode os.FileMode) (*os.File, error) {
|
||||
path := dir + "/" + file
|
||||
fd, err := os.OpenFile(path, flags, mode)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
if TestMode {
|
||||
return fd, nil
|
||||
}
|
||||
// Check this is a cgroupfs file.
|
||||
var st unix.Statfs_t
|
||||
if err := unix.Fstatfs(int(fd.Fd()), &st); err != nil {
|
||||
_ = fd.Close()
|
||||
return nil, &os.PathError{Op: "statfs", Path: path, Err: err}
|
||||
}
|
||||
if st.Type != unix.CGROUP_SUPER_MAGIC && st.Type != unix.CGROUP2_SUPER_MAGIC {
|
||||
_ = fd.Close()
|
||||
return nil, &os.PathError{Op: "open", Path: path, Err: errNotCgroupfs}
|
||||
}
|
||||
|
||||
return fd, nil
|
||||
}
|
122
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go
generated
vendored
Normal file
122
vendor/github.com/opencontainers/runc/libcontainer/cgroups/fscommon/utils.go
generated
vendored
Normal file
|
@ -0,0 +1,122 @@
|
|||
// +build linux
|
||||
|
||||
package fscommon
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"math"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var (
|
||||
ErrNotValidFormat = errors.New("line is not a valid key value format")
|
||||
)
|
||||
|
||||
// ParseUint converts a string to an uint64 integer.
|
||||
// Negative values are returned at zero as, due to kernel bugs,
|
||||
// some of the memory cgroup stats can be negative.
|
||||
func ParseUint(s string, base, bitSize int) (uint64, error) {
|
||||
value, err := strconv.ParseUint(s, base, bitSize)
|
||||
if err != nil {
|
||||
intValue, intErr := strconv.ParseInt(s, base, bitSize)
|
||||
// 1. Handle negative values greater than MinInt64 (and)
|
||||
// 2. Handle negative values lesser than MinInt64
|
||||
if intErr == nil && intValue < 0 {
|
||||
return 0, nil
|
||||
} else if intErr != nil && intErr.(*strconv.NumError).Err == strconv.ErrRange && intValue < 0 {
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
return value, err
|
||||
}
|
||||
|
||||
return value, nil
|
||||
}
|
||||
|
||||
// ParseKeyValue parses a space-separated "name value" kind of cgroup
|
||||
// parameter and returns its key as a string, and its value as uint64
|
||||
// (ParseUint is used to convert the value). For example,
|
||||
// "io_service_bytes 1234" will be returned as "io_service_bytes", 1234.
|
||||
func ParseKeyValue(t string) (string, uint64, error) {
|
||||
parts := strings.SplitN(t, " ", 3)
|
||||
if len(parts) != 2 {
|
||||
return "", 0, fmt.Errorf("line %q is not in key value format", t)
|
||||
}
|
||||
|
||||
value, err := ParseUint(parts[1], 10, 64)
|
||||
if err != nil {
|
||||
return "", 0, fmt.Errorf("unable to convert to uint64: %v", err)
|
||||
}
|
||||
|
||||
return parts[0], value, nil
|
||||
}
|
||||
|
||||
// GetValueByKey reads a key-value pairs from the specified cgroup file,
|
||||
// and returns a value of the specified key. ParseUint is used for value
|
||||
// conversion.
|
||||
func GetValueByKey(path, file, key string) (uint64, error) {
|
||||
content, err := ReadFile(path, file)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
|
||||
lines := strings.Split(string(content), "\n")
|
||||
for _, line := range lines {
|
||||
arr := strings.Split(line, " ")
|
||||
if len(arr) == 2 && arr[0] == key {
|
||||
return ParseUint(arr[1], 10, 64)
|
||||
}
|
||||
}
|
||||
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// GetCgroupParamUint reads a single uint64 value from the specified cgroup file.
|
||||
// If the value read is "max", the math.MaxUint64 is returned.
|
||||
func GetCgroupParamUint(path, file string) (uint64, error) {
|
||||
contents, err := GetCgroupParamString(path, file)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
contents = strings.TrimSpace(contents)
|
||||
if contents == "max" {
|
||||
return math.MaxUint64, nil
|
||||
}
|
||||
|
||||
res, err := ParseUint(contents, 10, 64)
|
||||
if err != nil {
|
||||
return res, fmt.Errorf("unable to parse file %q", path+"/"+file)
|
||||
}
|
||||
return res, nil
|
||||
}
|
||||
|
||||
// GetCgroupParamInt reads a single int64 value from specified cgroup file.
|
||||
// If the value read is "max", the math.MaxInt64 is returned.
|
||||
func GetCgroupParamInt(path, file string) (int64, error) {
|
||||
contents, err := ReadFile(path, file)
|
||||
if err != nil {
|
||||
return 0, err
|
||||
}
|
||||
contents = strings.TrimSpace(contents)
|
||||
if contents == "max" {
|
||||
return math.MaxInt64, nil
|
||||
}
|
||||
|
||||
res, err := strconv.ParseInt(contents, 10, 64)
|
||||
if err != nil {
|
||||
return res, fmt.Errorf("unable to parse %q as a int from Cgroup file %q", contents, path+"/"+file)
|
||||
}
|
||||
return res, nil
|
||||
}
|
||||
|
||||
// GetCgroupParamString reads a string from the specified cgroup file.
|
||||
func GetCgroupParamString(path, file string) (string, error) {
|
||||
contents, err := ReadFile(path, file)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return strings.TrimSpace(contents), nil
|
||||
}
|
28
vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go
generated
vendored
28
vendor/github.com/opencontainers/runc/libcontainer/cgroups/stats.go
generated
vendored
|
@ -39,6 +39,33 @@ type CpuStats struct {
|
|||
ThrottlingData ThrottlingData `json:"throttling_data,omitempty"`
|
||||
}
|
||||
|
||||
type CPUSetStats struct {
|
||||
// List of the physical numbers of the CPUs on which processes
|
||||
// in that cpuset are allowed to execute
|
||||
CPUs []uint16 `json:"cpus,omitempty"`
|
||||
// cpu_exclusive flag
|
||||
CPUExclusive uint64 `json:"cpu_exclusive"`
|
||||
// List of memory nodes on which processes in that cpuset
|
||||
// are allowed to allocate memory
|
||||
Mems []uint16 `json:"mems,omitempty"`
|
||||
// mem_hardwall flag
|
||||
MemHardwall uint64 `json:"mem_hardwall"`
|
||||
// mem_exclusive flag
|
||||
MemExclusive uint64 `json:"mem_exclusive"`
|
||||
// memory_migrate flag
|
||||
MemoryMigrate uint64 `json:"memory_migrate"`
|
||||
// memory_spread page flag
|
||||
MemorySpreadPage uint64 `json:"memory_spread_page"`
|
||||
// memory_spread slab flag
|
||||
MemorySpreadSlab uint64 `json:"memory_spread_slab"`
|
||||
// memory_pressure
|
||||
MemoryPressure uint64 `json:"memory_pressure"`
|
||||
// sched_load balance flag
|
||||
SchedLoadBalance uint64 `json:"sched_load_balance"`
|
||||
// sched_relax_domain_level
|
||||
SchedRelaxDomainLevel int64 `json:"sched_relax_domain_level"`
|
||||
}
|
||||
|
||||
type MemoryData struct {
|
||||
Usage uint64 `json:"usage,omitempty"`
|
||||
MaxUsage uint64 `json:"max_usage,omitempty"`
|
||||
|
@ -121,6 +148,7 @@ type HugetlbStats struct {
|
|||
|
||||
type Stats struct {
|
||||
CpuStats CpuStats `json:"cpu_stats,omitempty"`
|
||||
CPUSetStats CPUSetStats `json:"cpuset_stats,omitempty"`
|
||||
MemoryStats MemoryStats `json:"memory_stats,omitempty"`
|
||||
PidsStats PidsStats `json:"pids_stats,omitempty"`
|
||||
BlkioStats BlkioStats `json:"blkio_stats,omitempty"`
|
||||
|
|
159
vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
generated
vendored
159
vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
generated
vendored
|
@ -15,7 +15,9 @@ import (
|
|||
"sync"
|
||||
"time"
|
||||
|
||||
units "github.com/docker/go-units"
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups/fscommon"
|
||||
"github.com/opencontainers/runc/libcontainer/userns"
|
||||
"github.com/sirupsen/logrus"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
|
@ -29,19 +31,19 @@ var (
|
|||
isUnified bool
|
||||
)
|
||||
|
||||
// HugePageSizeUnitList is a list of the units used by the linux kernel when
|
||||
// naming the HugePage control files.
|
||||
// https://www.kernel.org/doc/Documentation/cgroup-v1/hugetlb.txt
|
||||
// TODO Since the kernel only use KB, MB and GB; TB and PB should be removed,
|
||||
// depends on https://github.com/docker/go-units/commit/a09cd47f892041a4fac473133d181f5aea6fa393
|
||||
var HugePageSizeUnitList = []string{"B", "KB", "MB", "GB", "TB", "PB"}
|
||||
|
||||
// IsCgroup2UnifiedMode returns whether we are running in cgroup v2 unified mode.
|
||||
func IsCgroup2UnifiedMode() bool {
|
||||
isUnifiedOnce.Do(func() {
|
||||
var st unix.Statfs_t
|
||||
if err := unix.Statfs(unifiedMountpoint, &st); err != nil {
|
||||
panic("cannot statfs cgroup root")
|
||||
err := unix.Statfs(unifiedMountpoint, &st)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) && userns.RunningInUserNS() {
|
||||
// ignore the "not found" error if running in userns
|
||||
logrus.WithError(err).Debugf("%s missing, assuming cgroup v1", unifiedMountpoint)
|
||||
isUnified = false
|
||||
return
|
||||
}
|
||||
panic(fmt.Sprintf("cannot statfs cgroup root: %s", err))
|
||||
}
|
||||
isUnified = st.Type == unix.CGROUP2_SUPER_MAGIC
|
||||
})
|
||||
|
@ -86,11 +88,11 @@ func GetAllSubsystems() ([]string, error) {
|
|||
// - freezer: implemented in kernel 5.2
|
||||
// We assume these are always available, as it is hard to detect availability.
|
||||
pseudo := []string{"devices", "freezer"}
|
||||
data, err := ioutil.ReadFile("/sys/fs/cgroup/cgroup.controllers")
|
||||
data, err := fscommon.ReadFile("/sys/fs/cgroup", "cgroup.controllers")
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
subsystems := append(pseudo, strings.Fields(string(data))...)
|
||||
subsystems := append(pseudo, strings.Fields(data)...)
|
||||
return subsystems, nil
|
||||
}
|
||||
f, err := os.Open("/proc/cgroups")
|
||||
|
@ -207,20 +209,66 @@ func EnterPid(cgroupPaths map[string]string, pid int) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func rmdir(path string) error {
|
||||
err := unix.Rmdir(path)
|
||||
if err == nil || err == unix.ENOENT {
|
||||
return nil
|
||||
}
|
||||
return &os.PathError{Op: "rmdir", Path: path, Err: err}
|
||||
}
|
||||
|
||||
// RemovePath aims to remove cgroup path. It does so recursively,
|
||||
// by removing any subdirectories (sub-cgroups) first.
|
||||
func RemovePath(path string) error {
|
||||
// try the fast path first
|
||||
if err := rmdir(path); err == nil {
|
||||
return nil
|
||||
}
|
||||
|
||||
infos, err := ioutil.ReadDir(path)
|
||||
if err != nil {
|
||||
if os.IsNotExist(err) {
|
||||
err = nil
|
||||
}
|
||||
return err
|
||||
}
|
||||
for _, info := range infos {
|
||||
if info.IsDir() {
|
||||
// We should remove subcgroups dir first
|
||||
if err = RemovePath(filepath.Join(path, info.Name())); err != nil {
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
if err == nil {
|
||||
err = rmdir(path)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// RemovePaths iterates over the provided paths removing them.
|
||||
// We trying to remove all paths five times with increasing delay between tries.
|
||||
// If after all there are not removed cgroups - appropriate error will be
|
||||
// returned.
|
||||
func RemovePaths(paths map[string]string) (err error) {
|
||||
const retries = 5
|
||||
delay := 10 * time.Millisecond
|
||||
for i := 0; i < 5; i++ {
|
||||
for i := 0; i < retries; i++ {
|
||||
if i != 0 {
|
||||
time.Sleep(delay)
|
||||
delay *= 2
|
||||
}
|
||||
for s, p := range paths {
|
||||
os.RemoveAll(p)
|
||||
// TODO: here probably should be logging
|
||||
if err := RemovePath(p); err != nil {
|
||||
// do not log intermediate iterations
|
||||
switch i {
|
||||
case 0:
|
||||
logrus.WithError(err).Warnf("Failed to remove cgroup (will retry)")
|
||||
case retries - 1:
|
||||
logrus.WithError(err).Error("Failed to remove cgroup")
|
||||
}
|
||||
|
||||
}
|
||||
_, err := os.Stat(p)
|
||||
// We need this strange way of checking cgroups existence because
|
||||
// RemoveAll almost always returns error, even on already removed
|
||||
|
@ -230,6 +278,8 @@ func RemovePaths(paths map[string]string) (err error) {
|
|||
}
|
||||
}
|
||||
if len(paths) == 0 {
|
||||
//nolint:ineffassign,staticcheck // done to help garbage collecting: opencontainers/runc#2506
|
||||
paths = make(map[string]string)
|
||||
return nil
|
||||
}
|
||||
}
|
||||
|
@ -237,27 +287,50 @@ func RemovePaths(paths map[string]string) (err error) {
|
|||
}
|
||||
|
||||
func GetHugePageSize() ([]string, error) {
|
||||
files, err := ioutil.ReadDir("/sys/kernel/mm/hugepages")
|
||||
dir, err := os.OpenFile("/sys/kernel/mm/hugepages", unix.O_DIRECTORY|unix.O_RDONLY, 0)
|
||||
if err != nil {
|
||||
return []string{}, err
|
||||
return nil, err
|
||||
}
|
||||
var fileNames []string
|
||||
for _, st := range files {
|
||||
fileNames = append(fileNames, st.Name())
|
||||
files, err := dir.Readdirnames(0)
|
||||
dir.Close()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return getHugePageSizeFromFilenames(fileNames)
|
||||
|
||||
return getHugePageSizeFromFilenames(files)
|
||||
}
|
||||
|
||||
func getHugePageSizeFromFilenames(fileNames []string) ([]string, error) {
|
||||
var pageSizes []string
|
||||
for _, fileName := range fileNames {
|
||||
nameArray := strings.Split(fileName, "-")
|
||||
pageSize, err := units.RAMInBytes(nameArray[1])
|
||||
if err != nil {
|
||||
return []string{}, err
|
||||
pageSizes := make([]string, 0, len(fileNames))
|
||||
|
||||
for _, file := range fileNames {
|
||||
// example: hugepages-1048576kB
|
||||
val := strings.TrimPrefix(file, "hugepages-")
|
||||
if len(val) == len(file) {
|
||||
// unexpected file name: no prefix found
|
||||
continue
|
||||
}
|
||||
sizeString := units.CustomSize("%g%s", float64(pageSize), 1024.0, HugePageSizeUnitList)
|
||||
pageSizes = append(pageSizes, sizeString)
|
||||
// The suffix is always "kB" (as of Linux 5.9)
|
||||
eLen := len(val) - 2
|
||||
val = strings.TrimSuffix(val, "kB")
|
||||
if len(val) != eLen {
|
||||
logrus.Warnf("GetHugePageSize: %s: invalid filename suffix (expected \"kB\")", file)
|
||||
continue
|
||||
}
|
||||
size, err := strconv.Atoi(val)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
// Model after https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/mm/hugetlb_cgroup.c?id=eff48ddeab782e35e58ccc8853f7386bbae9dec4#n574
|
||||
// but in our case the size is in KB already.
|
||||
if size >= (1 << 20) {
|
||||
val = strconv.Itoa(size>>20) + "GB"
|
||||
} else if size >= (1 << 10) {
|
||||
val = strconv.Itoa(size>>10) + "MB"
|
||||
} else {
|
||||
val += "KB"
|
||||
}
|
||||
pageSizes = append(pageSizes, val)
|
||||
}
|
||||
|
||||
return pageSizes, nil
|
||||
|
@ -303,14 +376,14 @@ func WriteCgroupProc(dir string, pid int) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
|
||||
file, err := fscommon.OpenFile(dir, CgroupProcesses, os.O_WRONLY)
|
||||
if err != nil {
|
||||
return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
|
||||
}
|
||||
defer cgroupProcessesFile.Close()
|
||||
defer file.Close()
|
||||
|
||||
for i := 0; i < 5; i++ {
|
||||
_, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid))
|
||||
_, err = file.WriteString(strconv.Itoa(pid))
|
||||
if err == nil {
|
||||
return nil
|
||||
}
|
||||
|
@ -327,17 +400,6 @@ func WriteCgroupProc(dir string, pid int) error {
|
|||
return err
|
||||
}
|
||||
|
||||
// Since the OCI spec is designed for cgroup v1, in some cases
|
||||
// there is need to convert from the cgroup v1 configuration to cgroup v2
|
||||
// the formula for BlkIOWeight is y = (1 + (x - 10) * 9999 / 990)
|
||||
// convert linearly from [10-1000] to [1-10000]
|
||||
func ConvertBlkIOToCgroupV2Value(blkIoWeight uint16) uint64 {
|
||||
if blkIoWeight == 0 {
|
||||
return 0
|
||||
}
|
||||
return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
|
||||
}
|
||||
|
||||
// Since the OCI spec is designed for cgroup v1, in some cases
|
||||
// there is need to convert from the cgroup v1 configuration to cgroup v2
|
||||
// the formula for cpuShares is y = (1 + ((x - 2) * 9999) / 262142)
|
||||
|
@ -377,3 +439,14 @@ func ConvertMemorySwapToCgroupV2Value(memorySwap, memory int64) (int64, error) {
|
|||
|
||||
return memorySwap - memory, nil
|
||||
}
|
||||
|
||||
// Since the OCI spec is designed for cgroup v1, in some cases
|
||||
// there is need to convert from the cgroup v1 configuration to cgroup v2
|
||||
// the formula for BlkIOWeight to IOWeight is y = (1 + (x - 10) * 9999 / 990)
|
||||
// convert linearly from [10-1000] to [1-10000]
|
||||
func ConvertBlkIOToIOWeightValue(blkIoWeight uint16) uint64 {
|
||||
if blkIoWeight == 0 {
|
||||
return 0
|
||||
}
|
||||
return uint64(1 + (uint64(blkIoWeight)-10)*9999/990)
|
||||
}
|
||||
|
|
100
vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go
generated
vendored
100
vendor/github.com/opencontainers/runc/libcontainer/cgroups/v1_utils.go
generated
vendored
|
@ -1,16 +1,16 @@
|
|||
package cgroups
|
||||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"sync"
|
||||
"syscall"
|
||||
|
||||
securejoin "github.com/cyphar/filepath-securejoin"
|
||||
"github.com/moby/sys/mountinfo"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
|
@ -23,7 +23,12 @@ const (
|
|||
)
|
||||
|
||||
var (
|
||||
errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
|
||||
errUnified = errors.New("not implemented for cgroup v2 unified hierarchy")
|
||||
ErrV1NoUnified = errors.New("invalid configuration: cannot use unified on cgroup v1")
|
||||
|
||||
readMountinfoOnce sync.Once
|
||||
readMountinfoErr error
|
||||
cgroupMountinfo []*mountinfo.Info
|
||||
)
|
||||
|
||||
type NotFoundError struct {
|
||||
|
@ -90,6 +95,21 @@ func tryDefaultPath(cgroupPath, subsystem string) string {
|
|||
return path
|
||||
}
|
||||
|
||||
// readCgroupMountinfo returns a list of cgroup v1 mounts (i.e. the ones
|
||||
// with fstype of "cgroup") for the current running process.
|
||||
//
|
||||
// The results are cached (to avoid re-reading mountinfo which is relatively
|
||||
// expensive), so it is assumed that cgroup mounts are not being changed.
|
||||
func readCgroupMountinfo() ([]*mountinfo.Info, error) {
|
||||
readMountinfoOnce.Do(func() {
|
||||
cgroupMountinfo, readMountinfoErr = mountinfo.GetMounts(
|
||||
mountinfo.FSTypeFilter("cgroup"),
|
||||
)
|
||||
})
|
||||
|
||||
return cgroupMountinfo, readMountinfoErr
|
||||
}
|
||||
|
||||
// https://www.kernel.org/doc/Documentation/cgroup-v1/cgroups.txt
|
||||
func FindCgroupMountpoint(cgroupPath, subsystem string) (string, error) {
|
||||
if IsCgroup2UnifiedMode() {
|
||||
|
@ -110,56 +130,28 @@ func FindCgroupMountpointAndRoot(cgroupPath, subsystem string) (string, string,
|
|||
return "", "", errUnified
|
||||
}
|
||||
|
||||
// Avoid parsing mountinfo by checking if subsystem is valid/available.
|
||||
if !isSubsystemAvailable(subsystem) {
|
||||
return "", "", NewNotFoundError(subsystem)
|
||||
}
|
||||
|
||||
f, err := os.Open("/proc/self/mountinfo")
|
||||
mi, err := readCgroupMountinfo()
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
return findCgroupMountpointAndRootFromReader(f, cgroupPath, subsystem)
|
||||
return findCgroupMountpointAndRootFromMI(mi, cgroupPath, subsystem)
|
||||
}
|
||||
|
||||
func findCgroupMountpointAndRootFromReader(reader io.Reader, cgroupPath, subsystem string) (string, string, error) {
|
||||
scanner := bufio.NewScanner(reader)
|
||||
for scanner.Scan() {
|
||||
txt := scanner.Text()
|
||||
fields := strings.Fields(txt)
|
||||
if len(fields) < 9 {
|
||||
continue
|
||||
}
|
||||
if strings.HasPrefix(fields[4], cgroupPath) {
|
||||
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
|
||||
func findCgroupMountpointAndRootFromMI(mounts []*mountinfo.Info, cgroupPath, subsystem string) (string, string, error) {
|
||||
for _, mi := range mounts {
|
||||
if strings.HasPrefix(mi.Mountpoint, cgroupPath) {
|
||||
for _, opt := range strings.Split(mi.VFSOptions, ",") {
|
||||
if opt == subsystem {
|
||||
return fields[4], fields[3], nil
|
||||
return mi.Mountpoint, mi.Root, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return "", "", err
|
||||
}
|
||||
|
||||
return "", "", NewNotFoundError(subsystem)
|
||||
}
|
||||
|
||||
func isSubsystemAvailable(subsystem string) bool {
|
||||
if IsCgroup2UnifiedMode() {
|
||||
panic("don't call isSubsystemAvailable from cgroupv2 code")
|
||||
}
|
||||
|
||||
cgroups, err := ParseCgroupFile("/proc/self/cgroup")
|
||||
if err != nil {
|
||||
return false
|
||||
}
|
||||
_, avail := cgroups[subsystem]
|
||||
return avail
|
||||
}
|
||||
|
||||
func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
|
||||
if len(m.Subsystems) == 0 {
|
||||
return "", fmt.Errorf("no subsystem for mount")
|
||||
|
@ -168,25 +160,15 @@ func (m Mount) GetOwnCgroup(cgroups map[string]string) (string, error) {
|
|||
return getControllerPath(m.Subsystems[0], cgroups)
|
||||
}
|
||||
|
||||
func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount, error) {
|
||||
func getCgroupMountsHelper(ss map[string]bool, mounts []*mountinfo.Info, all bool) ([]Mount, error) {
|
||||
res := make([]Mount, 0, len(ss))
|
||||
scanner := bufio.NewScanner(mi)
|
||||
numFound := 0
|
||||
for scanner.Scan() && numFound < len(ss) {
|
||||
txt := scanner.Text()
|
||||
sepIdx := strings.Index(txt, " - ")
|
||||
if sepIdx == -1 {
|
||||
return nil, fmt.Errorf("invalid mountinfo format")
|
||||
}
|
||||
if txt[sepIdx+3:sepIdx+10] == "cgroup2" || txt[sepIdx+3:sepIdx+9] != "cgroup" {
|
||||
continue
|
||||
}
|
||||
fields := strings.Split(txt, " ")
|
||||
for _, mi := range mounts {
|
||||
m := Mount{
|
||||
Mountpoint: fields[4],
|
||||
Root: fields[3],
|
||||
Mountpoint: mi.Mountpoint,
|
||||
Root: mi.Root,
|
||||
}
|
||||
for _, opt := range strings.Split(fields[len(fields)-1], ",") {
|
||||
for _, opt := range strings.Split(mi.VFSOptions, ",") {
|
||||
seen, known := ss[opt]
|
||||
if !known || (!all && seen) {
|
||||
continue
|
||||
|
@ -199,19 +181,18 @@ func getCgroupMountsHelper(ss map[string]bool, mi io.Reader, all bool) ([]Mount,
|
|||
if len(m.Subsystems) > 0 || all {
|
||||
res = append(res, m)
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return nil, err
|
||||
if !all && numFound >= len(ss) {
|
||||
break
|
||||
}
|
||||
}
|
||||
return res, nil
|
||||
}
|
||||
|
||||
func getCgroupMountsV1(all bool) ([]Mount, error) {
|
||||
f, err := os.Open("/proc/self/mountinfo")
|
||||
mi, err := readCgroupMountinfo()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
allSubsystems, err := ParseCgroupFile("/proc/self/cgroup")
|
||||
if err != nil {
|
||||
|
@ -222,7 +203,8 @@ func getCgroupMountsV1(all bool) ([]Mount, error) {
|
|||
for s := range allSubsystems {
|
||||
allMap[s] = false
|
||||
}
|
||||
return getCgroupMountsHelper(allMap, f, all)
|
||||
|
||||
return getCgroupMountsHelper(allMap, mi, all)
|
||||
}
|
||||
|
||||
// GetOwnCgroup returns the relative path to the cgroup docker is running in.
|
||||
|
|
12
vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go
generated
vendored
12
vendor/github.com/opencontainers/runc/libcontainer/configs/cgroup_linux.go
generated
vendored
|
@ -2,6 +2,7 @@ package configs
|
|||
|
||||
import (
|
||||
systemdDbus "github.com/coreos/go-systemd/v22/dbus"
|
||||
"github.com/opencontainers/runc/libcontainer/devices"
|
||||
)
|
||||
|
||||
type FreezerState string
|
||||
|
@ -42,7 +43,7 @@ type Cgroup struct {
|
|||
|
||||
type Resources struct {
|
||||
// Devices is the set of access rules for devices in the container.
|
||||
Devices []*DeviceRule `json:"devices"`
|
||||
Devices []*devices.Rule `json:"devices"`
|
||||
|
||||
// Memory limit (in bytes)
|
||||
Memory int64 `json:"memory"`
|
||||
|
@ -53,12 +54,6 @@ type Resources struct {
|
|||
// Total memory usage (memory + swap); set `-1` to enable unlimited swap
|
||||
MemorySwap int64 `json:"memory_swap"`
|
||||
|
||||
// Kernel memory limit (in bytes)
|
||||
KernelMemory int64 `json:"kernel_memory"`
|
||||
|
||||
// Kernel memory limit for TCP use (in bytes)
|
||||
KernelMemoryTCP int64 `json:"kernel_memory_tcp"`
|
||||
|
||||
// CPU shares (relative weight vs. other containers)
|
||||
CpuShares uint64 `json:"cpu_shares"`
|
||||
|
||||
|
@ -127,6 +122,9 @@ type Resources struct {
|
|||
// CpuWeight sets a proportional bandwidth limit.
|
||||
CpuWeight uint64 `json:"cpu_weight"`
|
||||
|
||||
// Unified is cgroupv2-only key-value map.
|
||||
Unified map[string]string `json:"unified"`
|
||||
|
||||
// SkipDevices allows to skip configuring device permissions.
|
||||
// Used by e.g. kubelet while creating a parent cgroup (kubepods)
|
||||
// common for many containers.
|
||||
|
|
25
vendor/github.com/opencontainers/runc/libcontainer/configs/config.go
generated
vendored
25
vendor/github.com/opencontainers/runc/libcontainer/configs/config.go
generated
vendored
|
@ -7,6 +7,7 @@ import (
|
|||
"os/exec"
|
||||
"time"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/devices"
|
||||
"github.com/opencontainers/runtime-spec/specs-go"
|
||||
"github.com/pkg/errors"
|
||||
"github.com/sirupsen/logrus"
|
||||
|
@ -30,9 +31,10 @@ type IDMap struct {
|
|||
// for syscalls. Additional architectures can be added by specifying them in
|
||||
// Architectures.
|
||||
type Seccomp struct {
|
||||
DefaultAction Action `json:"default_action"`
|
||||
Architectures []string `json:"architectures"`
|
||||
Syscalls []*Syscall `json:"syscalls"`
|
||||
DefaultAction Action `json:"default_action"`
|
||||
Architectures []string `json:"architectures"`
|
||||
Syscalls []*Syscall `json:"syscalls"`
|
||||
DefaultErrnoRet *uint `json:"default_errno_ret"`
|
||||
}
|
||||
|
||||
// Action is taken upon rule match in Seccomp
|
||||
|
@ -92,6 +94,9 @@ type Config struct {
|
|||
// Path to a directory containing the container's root filesystem.
|
||||
Rootfs string `json:"rootfs"`
|
||||
|
||||
// Umask is the umask to use inside of the container.
|
||||
Umask *uint32 `json:"umask"`
|
||||
|
||||
// Readonlyfs will remount the container's rootfs as readonly where only externally mounted
|
||||
// bind mounts are writtable.
|
||||
Readonlyfs bool `json:"readonlyfs"`
|
||||
|
@ -104,7 +109,7 @@ type Config struct {
|
|||
Mounts []*Mount `json:"mounts"`
|
||||
|
||||
// The device nodes that should be automatically created within the container upon container start. Note, make sure that the node is marked as allowed in the cgroup as well!
|
||||
Devices []*Device `json:"devices"`
|
||||
Devices []*devices.Device `json:"devices"`
|
||||
|
||||
MountLabel string `json:"mount_label"`
|
||||
|
||||
|
@ -218,25 +223,25 @@ const (
|
|||
// the runtime environment has been created but before the pivot_root has been executed.
|
||||
// CreateRuntime is called immediately after the deprecated Prestart hook.
|
||||
// CreateRuntime commands are called in the Runtime Namespace.
|
||||
CreateRuntime = "createRuntime"
|
||||
CreateRuntime HookName = "createRuntime"
|
||||
|
||||
// CreateContainer commands MUST be called as part of the create operation after
|
||||
// the runtime environment has been created but before the pivot_root has been executed.
|
||||
// CreateContainer commands are called in the Container namespace.
|
||||
CreateContainer = "createContainer"
|
||||
CreateContainer HookName = "createContainer"
|
||||
|
||||
// StartContainer commands MUST be called as part of the start operation and before
|
||||
// the container process is started.
|
||||
// StartContainer commands are called in the Container namespace.
|
||||
StartContainer = "startContainer"
|
||||
StartContainer HookName = "startContainer"
|
||||
|
||||
// Poststart commands are executed after the container init process starts.
|
||||
// Poststart commands are called in the Runtime Namespace.
|
||||
Poststart = "poststart"
|
||||
Poststart HookName = "poststart"
|
||||
|
||||
// Poststop commands are executed after the container init process exits.
|
||||
// Poststop commands are called in the Runtime Namespace.
|
||||
Poststop = "poststop"
|
||||
Poststop HookName = "poststop"
|
||||
)
|
||||
|
||||
type Capabilities struct {
|
||||
|
@ -383,7 +388,7 @@ func (c Command) Run(s *specs.State) error {
|
|||
return err
|
||||
case <-timerCh:
|
||||
cmd.Process.Kill()
|
||||
cmd.Wait()
|
||||
<-errC
|
||||
return fmt.Errorf("hook ran past specified timeout of %.1fs", c.Timeout.Seconds())
|
||||
}
|
||||
}
|
||||
|
|
9
vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go
generated
vendored
Normal file
9
vendor/github.com/opencontainers/runc/libcontainer/configs/configs_fuzzer.go
generated
vendored
Normal file
|
@ -0,0 +1,9 @@
|
|||
// +build gofuzz
|
||||
|
||||
package configs
|
||||
|
||||
func FuzzUnmarshalJSON(data []byte) int {
|
||||
hooks := Hooks{}
|
||||
_ = hooks.UnmarshalJSON(data)
|
||||
return 1
|
||||
}
|
16
vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go
generated
vendored
16
vendor/github.com/opencontainers/runc/libcontainer/configs/device_unix.go
generated
vendored
|
@ -1,16 +0,0 @@
|
|||
// +build !windows
|
||||
|
||||
package configs
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
func (d *DeviceRule) Mkdev() (uint64, error) {
|
||||
if d.Major == Wildcard || d.Minor == Wildcard {
|
||||
return 0, errors.New("cannot mkdev() device with wildcards")
|
||||
}
|
||||
return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
|
||||
}
|
5
vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go
generated
vendored
5
vendor/github.com/opencontainers/runc/libcontainer/configs/device_windows.go
generated
vendored
|
@ -1,5 +0,0 @@
|
|||
package configs
|
||||
|
||||
func (d *DeviceRule) Mkdev() (uint64, error) {
|
||||
return 0, nil
|
||||
}
|
17
vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go
generated
vendored
Normal file
17
vendor/github.com/opencontainers/runc/libcontainer/configs/devices.go
generated
vendored
Normal file
|
@ -0,0 +1,17 @@
|
|||
package configs
|
||||
|
||||
import "github.com/opencontainers/runc/libcontainer/devices"
|
||||
|
||||
type (
|
||||
// Deprecated: use libcontainer/devices.Device
|
||||
Device = devices.Device
|
||||
|
||||
// Deprecated: use libcontainer/devices.Rule
|
||||
DeviceRule = devices.Rule
|
||||
|
||||
// Deprecated: use libcontainer/devices.Type
|
||||
DeviceType = devices.Type
|
||||
|
||||
// Deprecated: use libcontainer/devices.Permissions
|
||||
DevicePermissions = devices.Permissions
|
||||
)
|
2
vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go
generated
vendored
2
vendor/github.com/opencontainers/runc/libcontainer/configs/namespaces_linux.go
generated
vendored
|
@ -56,7 +56,7 @@ func IsNamespaceSupported(ns NamespaceType) bool {
|
|||
if nsFile == "" {
|
||||
return false
|
||||
}
|
||||
_, err := os.Stat(fmt.Sprintf("/proc/self/ns/%s", nsFile))
|
||||
_, err := os.Stat("/proc/self/ns/" + nsFile)
|
||||
// a namespace is supported if it exists and we have permissions to read it
|
||||
supported = err == nil
|
||||
supportedNamespaces[ns] = supported
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
package configs
|
||||
package devices
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
@ -11,7 +11,7 @@ const (
|
|||
)
|
||||
|
||||
type Device struct {
|
||||
DeviceRule
|
||||
Rule
|
||||
|
||||
// Path to the device.
|
||||
Path string `json:"path"`
|
||||
|
@ -26,10 +26,10 @@ type Device struct {
|
|||
Gid uint32 `json:"gid"`
|
||||
}
|
||||
|
||||
// DevicePermissions is a cgroupv1-style string to represent device access. It
|
||||
// Permissions is a cgroupv1-style string to represent device access. It
|
||||
// has to be a string for backward compatibility reasons, hence why it has
|
||||
// methods to do set operations.
|
||||
type DevicePermissions string
|
||||
type Permissions string
|
||||
|
||||
const (
|
||||
deviceRead uint = (1 << iota)
|
||||
|
@ -37,7 +37,7 @@ const (
|
|||
deviceMknod
|
||||
)
|
||||
|
||||
func (p DevicePermissions) toSet() uint {
|
||||
func (p Permissions) toSet() uint {
|
||||
var set uint
|
||||
for _, perm := range p {
|
||||
switch perm {
|
||||
|
@ -52,7 +52,7 @@ func (p DevicePermissions) toSet() uint {
|
|||
return set
|
||||
}
|
||||
|
||||
func fromSet(set uint) DevicePermissions {
|
||||
func fromSet(set uint) Permissions {
|
||||
var perm string
|
||||
if set&deviceRead == deviceRead {
|
||||
perm += "r"
|
||||
|
@ -63,53 +63,53 @@ func fromSet(set uint) DevicePermissions {
|
|||
if set&deviceMknod == deviceMknod {
|
||||
perm += "m"
|
||||
}
|
||||
return DevicePermissions(perm)
|
||||
return Permissions(perm)
|
||||
}
|
||||
|
||||
// Union returns the union of the two sets of DevicePermissions.
|
||||
func (p DevicePermissions) Union(o DevicePermissions) DevicePermissions {
|
||||
// Union returns the union of the two sets of Permissions.
|
||||
func (p Permissions) Union(o Permissions) Permissions {
|
||||
lhs := p.toSet()
|
||||
rhs := o.toSet()
|
||||
return fromSet(lhs | rhs)
|
||||
}
|
||||
|
||||
// Difference returns the set difference of the two sets of DevicePermissions.
|
||||
// Difference returns the set difference of the two sets of Permissions.
|
||||
// In set notation, A.Difference(B) gives you A\B.
|
||||
func (p DevicePermissions) Difference(o DevicePermissions) DevicePermissions {
|
||||
func (p Permissions) Difference(o Permissions) Permissions {
|
||||
lhs := p.toSet()
|
||||
rhs := o.toSet()
|
||||
return fromSet(lhs &^ rhs)
|
||||
}
|
||||
|
||||
// Intersection computes the intersection of the two sets of DevicePermissions.
|
||||
func (p DevicePermissions) Intersection(o DevicePermissions) DevicePermissions {
|
||||
// Intersection computes the intersection of the two sets of Permissions.
|
||||
func (p Permissions) Intersection(o Permissions) Permissions {
|
||||
lhs := p.toSet()
|
||||
rhs := o.toSet()
|
||||
return fromSet(lhs & rhs)
|
||||
}
|
||||
|
||||
// IsEmpty returns whether the set of permissions in a DevicePermissions is
|
||||
// IsEmpty returns whether the set of permissions in a Permissions is
|
||||
// empty.
|
||||
func (p DevicePermissions) IsEmpty() bool {
|
||||
return p == DevicePermissions("")
|
||||
func (p Permissions) IsEmpty() bool {
|
||||
return p == Permissions("")
|
||||
}
|
||||
|
||||
// IsValid returns whether the set of permissions is a subset of valid
|
||||
// permissions (namely, {r,w,m}).
|
||||
func (p DevicePermissions) IsValid() bool {
|
||||
func (p Permissions) IsValid() bool {
|
||||
return p == fromSet(p.toSet())
|
||||
}
|
||||
|
||||
type DeviceType rune
|
||||
type Type rune
|
||||
|
||||
const (
|
||||
WildcardDevice DeviceType = 'a'
|
||||
BlockDevice DeviceType = 'b'
|
||||
CharDevice DeviceType = 'c' // or 'u'
|
||||
FifoDevice DeviceType = 'p'
|
||||
WildcardDevice Type = 'a'
|
||||
BlockDevice Type = 'b'
|
||||
CharDevice Type = 'c' // or 'u'
|
||||
FifoDevice Type = 'p'
|
||||
)
|
||||
|
||||
func (t DeviceType) IsValid() bool {
|
||||
func (t Type) IsValid() bool {
|
||||
switch t {
|
||||
case WildcardDevice, BlockDevice, CharDevice, FifoDevice:
|
||||
return true
|
||||
|
@ -118,7 +118,7 @@ func (t DeviceType) IsValid() bool {
|
|||
}
|
||||
}
|
||||
|
||||
func (t DeviceType) CanMknod() bool {
|
||||
func (t Type) CanMknod() bool {
|
||||
switch t {
|
||||
case BlockDevice, CharDevice, FifoDevice:
|
||||
return true
|
||||
|
@ -127,7 +127,7 @@ func (t DeviceType) CanMknod() bool {
|
|||
}
|
||||
}
|
||||
|
||||
func (t DeviceType) CanCgroup() bool {
|
||||
func (t Type) CanCgroup() bool {
|
||||
switch t {
|
||||
case WildcardDevice, BlockDevice, CharDevice:
|
||||
return true
|
||||
|
@ -136,10 +136,10 @@ func (t DeviceType) CanCgroup() bool {
|
|||
}
|
||||
}
|
||||
|
||||
type DeviceRule struct {
|
||||
type Rule struct {
|
||||
// Type of device ('c' for char, 'b' for block). If set to 'a', this rule
|
||||
// acts as a wildcard and all fields other than Allow are ignored.
|
||||
Type DeviceType `json:"type"`
|
||||
Type Type `json:"type"`
|
||||
|
||||
// Major is the device's major number.
|
||||
Major int64 `json:"major"`
|
||||
|
@ -149,13 +149,13 @@ type DeviceRule struct {
|
|||
|
||||
// Permissions is the set of permissions that this rule applies to (in the
|
||||
// cgroupv1 format -- any combination of "rwm").
|
||||
Permissions DevicePermissions `json:"permissions"`
|
||||
Permissions Permissions `json:"permissions"`
|
||||
|
||||
// Allow specifies whether this rule is allowed.
|
||||
Allow bool `json:"allow"`
|
||||
}
|
||||
|
||||
func (d *DeviceRule) CgroupString() string {
|
||||
func (d *Rule) CgroupString() string {
|
||||
var (
|
||||
major = strconv.FormatInt(d.Major, 10)
|
||||
minor = strconv.FormatInt(d.Minor, 10)
|
||||
|
@ -168,3 +168,7 @@ func (d *DeviceRule) CgroupString() string {
|
|||
}
|
||||
return fmt.Sprintf("%c %s:%s %s", d.Type, major, minor, d.Permissions)
|
||||
}
|
||||
|
||||
func (d *Rule) Mkdev() (uint64, error) {
|
||||
return mkDev(d)
|
||||
}
|
|
@ -1,3 +1,5 @@
|
|||
// +build !windows
|
||||
|
||||
package devices
|
||||
|
||||
import (
|
||||
|
@ -6,7 +8,6 @@ import (
|
|||
"os"
|
||||
"path/filepath"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"golang.org/x/sys/unix"
|
||||
)
|
||||
|
||||
|
@ -21,9 +22,16 @@ var (
|
|||
ioutilReadDir = ioutil.ReadDir
|
||||
)
|
||||
|
||||
func mkDev(d *Rule) (uint64, error) {
|
||||
if d.Major == Wildcard || d.Minor == Wildcard {
|
||||
return 0, errors.New("cannot mkdev() device with wildcards")
|
||||
}
|
||||
return unix.Mkdev(uint32(d.Major), uint32(d.Minor)), nil
|
||||
}
|
||||
|
||||
// Given the path to a device and its cgroup_permissions(which cannot be easily queried) look up the
|
||||
// information about a linux device and return that information as a Device struct.
|
||||
func DeviceFromPath(path, permissions string) (*configs.Device, error) {
|
||||
func DeviceFromPath(path, permissions string) (*Device, error) {
|
||||
var stat unix.Stat_t
|
||||
err := unixLstat(path, &stat)
|
||||
if err != nil {
|
||||
|
@ -31,7 +39,7 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
|
|||
}
|
||||
|
||||
var (
|
||||
devType configs.DeviceType
|
||||
devType Type
|
||||
mode = stat.Mode
|
||||
devNumber = uint64(stat.Rdev)
|
||||
major = unix.Major(devNumber)
|
||||
|
@ -39,41 +47,41 @@ func DeviceFromPath(path, permissions string) (*configs.Device, error) {
|
|||
)
|
||||
switch mode & unix.S_IFMT {
|
||||
case unix.S_IFBLK:
|
||||
devType = configs.BlockDevice
|
||||
devType = BlockDevice
|
||||
case unix.S_IFCHR:
|
||||
devType = configs.CharDevice
|
||||
devType = CharDevice
|
||||
case unix.S_IFIFO:
|
||||
devType = configs.FifoDevice
|
||||
devType = FifoDevice
|
||||
default:
|
||||
return nil, ErrNotADevice
|
||||
}
|
||||
return &configs.Device{
|
||||
DeviceRule: configs.DeviceRule{
|
||||
return &Device{
|
||||
Rule: Rule{
|
||||
Type: devType,
|
||||
Major: int64(major),
|
||||
Minor: int64(minor),
|
||||
Permissions: configs.DevicePermissions(permissions),
|
||||
Permissions: Permissions(permissions),
|
||||
},
|
||||
Path: path,
|
||||
FileMode: os.FileMode(mode),
|
||||
FileMode: os.FileMode(mode &^ unix.S_IFMT),
|
||||
Uid: stat.Uid,
|
||||
Gid: stat.Gid,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// HostDevices returns all devices that can be found under /dev directory.
|
||||
func HostDevices() ([]*configs.Device, error) {
|
||||
func HostDevices() ([]*Device, error) {
|
||||
return GetDevices("/dev")
|
||||
}
|
||||
|
||||
// GetDevices recursively traverses a directory specified by path
|
||||
// and returns all devices found there.
|
||||
func GetDevices(path string) ([]*configs.Device, error) {
|
||||
func GetDevices(path string) ([]*Device, error) {
|
||||
files, err := ioutilReadDir(path)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
var out []*configs.Device
|
||||
var out []*Device
|
||||
for _, f := range files {
|
||||
switch {
|
||||
case f.IsDir():
|
||||
|
@ -104,7 +112,7 @@ func GetDevices(path string) ([]*configs.Device, error) {
|
|||
}
|
||||
return nil, err
|
||||
}
|
||||
if device.Type == configs.FifoDevice {
|
||||
if device.Type == FifoDevice {
|
||||
continue
|
||||
}
|
||||
out = append(out, device)
|
80
vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c
generated
vendored
80
vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c
generated
vendored
|
@ -59,14 +59,38 @@
|
|||
#include <sys/syscall.h>
|
||||
|
||||
/* Use our own wrapper for memfd_create. */
|
||||
#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
|
||||
# define SYS_memfd_create __NR_memfd_create
|
||||
#ifndef SYS_memfd_create
|
||||
# ifdef __NR_memfd_create
|
||||
# define SYS_memfd_create __NR_memfd_create
|
||||
# else
|
||||
/* These values come from <https://fedora.juszkiewicz.com.pl/syscalls.html>. */
|
||||
# warning "libc is outdated -- using hard-coded SYS_memfd_create"
|
||||
# if defined(__x86_64__)
|
||||
# define SYS_memfd_create 319
|
||||
# elif defined(__i386__)
|
||||
# define SYS_memfd_create 356
|
||||
# elif defined(__ia64__)
|
||||
# define SYS_memfd_create 1340
|
||||
# elif defined(__arm__)
|
||||
# define SYS_memfd_create 385
|
||||
# elif defined(__aarch64__)
|
||||
# define SYS_memfd_create 279
|
||||
# elif defined(__ppc__) || defined(__PPC64__) || defined(__powerpc64__)
|
||||
# define SYS_memfd_create 360
|
||||
# elif defined(__s390__) || defined(__s390x__)
|
||||
# define SYS_memfd_create 350
|
||||
# else
|
||||
# warning "unknown architecture -- cannot hard-code SYS_memfd_create"
|
||||
# endif
|
||||
# endif
|
||||
#endif
|
||||
|
||||
/* memfd_create(2) flags -- copied from <linux/memfd.h>. */
|
||||
#ifndef MFD_CLOEXEC
|
||||
# define MFD_CLOEXEC 0x0001U
|
||||
# define MFD_ALLOW_SEALING 0x0002U
|
||||
#endif
|
||||
|
||||
int memfd_create(const char *name, unsigned int flags)
|
||||
{
|
||||
#ifdef SYS_memfd_create
|
||||
|
@ -77,7 +101,6 @@ int memfd_create(const char *name, unsigned int flags)
|
|||
#endif
|
||||
}
|
||||
|
||||
|
||||
/* This comes directly from <linux/fcntl.h>. */
|
||||
#ifndef F_LINUX_SPECIFIC_BASE
|
||||
# define F_LINUX_SPECIFIC_BASE 1024
|
||||
|
@ -103,7 +126,7 @@ static void *must_realloc(void *ptr, size_t size)
|
|||
void *old = ptr;
|
||||
do {
|
||||
ptr = realloc(old, size);
|
||||
} while(!ptr);
|
||||
} while (!ptr);
|
||||
return ptr;
|
||||
}
|
||||
|
||||
|
@ -115,10 +138,10 @@ static void *must_realloc(void *ptr, size_t size)
|
|||
static int is_self_cloned(void)
|
||||
{
|
||||
int fd, ret, is_cloned = 0;
|
||||
struct stat statbuf = {};
|
||||
struct statfs fsbuf = {};
|
||||
struct stat statbuf = { };
|
||||
struct statfs fsbuf = { };
|
||||
|
||||
fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
|
||||
fd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
|
||||
if (fd < 0) {
|
||||
fprintf(stderr, "you have no read access to runc binary file\n");
|
||||
return -ENOTRECOVERABLE;
|
||||
|
@ -274,7 +297,7 @@ enum {
|
|||
static int make_execfd(int *fdtype)
|
||||
{
|
||||
int fd = -1;
|
||||
char template[PATH_MAX] = {0};
|
||||
char template[PATH_MAX] = { 0 };
|
||||
char *prefix = getenv("_LIBCONTAINER_STATEDIR");
|
||||
|
||||
if (!prefix || *prefix != '/')
|
||||
|
@ -303,7 +326,7 @@ static int make_execfd(int *fdtype)
|
|||
*fdtype = EFD_FILE;
|
||||
fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
|
||||
if (fd >= 0) {
|
||||
struct stat statbuf = {};
|
||||
struct stat statbuf = { };
|
||||
bool working_otmpfile = false;
|
||||
|
||||
/*
|
||||
|
@ -348,27 +371,27 @@ static int seal_execfd(int *fd, int fdtype)
|
|||
switch (fdtype) {
|
||||
case EFD_MEMFD:
|
||||
return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
|
||||
case EFD_FILE: {
|
||||
/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
|
||||
int newfd;
|
||||
char fdpath[PATH_MAX] = {0};
|
||||
case EFD_FILE:{
|
||||
/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
|
||||
int newfd;
|
||||
char fdpath[PATH_MAX] = { 0 };
|
||||
|
||||
if (fchmod(*fd, 0100) < 0)
|
||||
return -1;
|
||||
if (fchmod(*fd, 0100) < 0)
|
||||
return -1;
|
||||
|
||||
if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
|
||||
return -1;
|
||||
if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
|
||||
return -1;
|
||||
|
||||
newfd = open(fdpath, O_PATH | O_CLOEXEC);
|
||||
if (newfd < 0)
|
||||
return -1;
|
||||
newfd = open(fdpath, O_PATH | O_CLOEXEC);
|
||||
if (newfd < 0)
|
||||
return -1;
|
||||
|
||||
close(*fd);
|
||||
*fd = newfd;
|
||||
return 0;
|
||||
}
|
||||
close(*fd);
|
||||
*fd = newfd;
|
||||
return 0;
|
||||
}
|
||||
default:
|
||||
break;
|
||||
break;
|
||||
}
|
||||
return -1;
|
||||
}
|
||||
|
@ -376,7 +399,7 @@ static int seal_execfd(int *fd, int fdtype)
|
|||
static int try_bindfd(void)
|
||||
{
|
||||
int fd, ret = -1;
|
||||
char template[PATH_MAX] = {0};
|
||||
char template[PATH_MAX] = { 0 };
|
||||
char *prefix = getenv("_LIBCONTAINER_STATEDIR");
|
||||
|
||||
if (!prefix || *prefix != '/')
|
||||
|
@ -404,7 +427,6 @@ static int try_bindfd(void)
|
|||
if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
|
||||
goto out_umount;
|
||||
|
||||
|
||||
/* Get read-only handle that we're sure can't be made read-write. */
|
||||
ret = open(template, O_PATH | O_CLOEXEC);
|
||||
|
||||
|
@ -448,7 +470,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
|
|||
if (n < 0)
|
||||
return -1;
|
||||
nwritten += n;
|
||||
} while(nwritten < nread);
|
||||
} while (nwritten < nread);
|
||||
|
||||
total += nwritten;
|
||||
}
|
||||
|
@ -459,7 +481,7 @@ static ssize_t fd_to_fd(int outfd, int infd)
|
|||
static int clone_binary(void)
|
||||
{
|
||||
int binfd, execfd;
|
||||
struct stat statbuf = {};
|
||||
struct stat statbuf = { };
|
||||
size_t sent = 0;
|
||||
int fdtype = EFD_NONE;
|
||||
|
||||
|
|
142
vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c
generated
vendored
Normal file
142
vendor/github.com/opencontainers/runc/libcontainer/nsenter/escape.c
generated
vendored
Normal file
|
@ -0,0 +1,142 @@
|
|||
#include <stdlib.h>
|
||||
#include <string.h>
|
||||
|
||||
#ifdef ESCAPE_TEST
|
||||
# include <assert.h>
|
||||
# define test_assert(arg) assert(arg)
|
||||
#else
|
||||
# define test_assert(arg)
|
||||
#endif
|
||||
|
||||
#define DEL '\x7f'
|
||||
|
||||
/*
|
||||
* Poor man version of itoa with base=16 and input number from 0 to 15,
|
||||
* represented by a char. Converts it to a single hex digit ('0' to 'f').
|
||||
*/
|
||||
static char hex(char i)
|
||||
{
|
||||
test_assert(i >= 0 && i < 16);
|
||||
|
||||
if (i >= 0 && i < 10) {
|
||||
return '0' + i;
|
||||
}
|
||||
if (i >= 10 && i < 16) {
|
||||
return 'a' + i - 10;
|
||||
}
|
||||
return '?';
|
||||
}
|
||||
|
||||
/*
|
||||
* Given the character, tells how many _extra_ characters are needed
|
||||
* to JSON-escape it. If 0 is returned, the character does not need to
|
||||
* be escaped.
|
||||
*/
|
||||
static int need_escape(char c)
|
||||
{
|
||||
switch (c) {
|
||||
case '\\':
|
||||
case '"':
|
||||
case '\b':
|
||||
case '\n':
|
||||
case '\r':
|
||||
case '\t':
|
||||
case '\f':
|
||||
return 1;
|
||||
case DEL: // -> \u007f
|
||||
return 5;
|
||||
default:
|
||||
if (c > 0 && c < ' ') {
|
||||
// ASCII decimal 01 to 31 -> \u00xx
|
||||
return 5;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Escape the string so it can be used as a JSON string (per RFC4627,
|
||||
* section 2.5 minimal requirements, plus the DEL (0x7f) character).
|
||||
*
|
||||
* It is expected that the argument is a string allocated via malloc.
|
||||
* In case no escaping is needed, the original string is returned as is;
|
||||
* otherwise, the original string is free'd, and the newly allocated
|
||||
* escaped string is returned. Thus, in any case, the value returned
|
||||
* need to be free'd by the caller.
|
||||
*/
|
||||
char *escape_json_string(char *s)
|
||||
{
|
||||
int i, j, len;
|
||||
char *c, *out;
|
||||
|
||||
/*
|
||||
* First, check if escaping is at all needed -- if not, we can avoid
|
||||
* malloc and return the argument as is. While at it, count how much
|
||||
* extra space is required.
|
||||
*
|
||||
* XXX: the counting code must be in sync with the escaping code
|
||||
* (checked by test_assert()s below).
|
||||
*/
|
||||
for (i = j = 0; s[i] != '\0'; i++) {
|
||||
j += need_escape(s[i]);
|
||||
}
|
||||
if (j == 0) {
|
||||
// nothing to escape
|
||||
return s;
|
||||
}
|
||||
|
||||
len = i + j + 1;
|
||||
out = malloc(len);
|
||||
if (!out) {
|
||||
free(s);
|
||||
// As malloc failed, strdup can fail, too, so in the worst case
|
||||
// scenario NULL will be returned from here.
|
||||
return strdup("escape_json_string: out of memory");
|
||||
}
|
||||
for (c = s, j = 0; *c != '\0'; c++) {
|
||||
switch (*c) {
|
||||
case '"':
|
||||
case '\\':
|
||||
test_assert(need_escape(*c) == 1);
|
||||
out[j++] = '\\';
|
||||
out[j++] = *c;
|
||||
continue;
|
||||
}
|
||||
if ((*c < 0 || *c >= ' ') && (*c != DEL)) {
|
||||
// no escape needed
|
||||
test_assert(need_escape(*c) == 0);
|
||||
out[j++] = *c;
|
||||
continue;
|
||||
}
|
||||
out[j++] = '\\';
|
||||
switch (*c) {
|
||||
case '\b':
|
||||
out[j++] = 'b';
|
||||
break;
|
||||
case '\n':
|
||||
out[j++] = 'n';
|
||||
break;
|
||||
case '\r':
|
||||
out[j++] = 'r';
|
||||
break;
|
||||
case '\t':
|
||||
out[j++] = 't';
|
||||
break;
|
||||
case '\f':
|
||||
out[j++] = 'f';
|
||||
break;
|
||||
default:
|
||||
test_assert(need_escape(*c) == 5);
|
||||
out[j++] = 'u';
|
||||
out[j++] = '0';
|
||||
out[j++] = '0';
|
||||
out[j++] = hex(*c >> 4);
|
||||
out[j++] = hex(*c & 0x0f);
|
||||
}
|
||||
}
|
||||
test_assert(j + 1 == len);
|
||||
out[j] = '\0';
|
||||
|
||||
free(s);
|
||||
return out;
|
||||
}
|
353
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
generated
vendored
353
vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
generated
vendored
|
@ -29,6 +29,8 @@
|
|||
/* Get all of the CLONE_NEW* flags. */
|
||||
#include "namespace.h"
|
||||
|
||||
extern char *escape_json_string(char *str);
|
||||
|
||||
/* Synchronisation values. */
|
||||
enum sync_t {
|
||||
SYNC_USERMAP_PLS = 0x40, /* Request parent to map our users. */
|
||||
|
@ -36,7 +38,7 @@ enum sync_t {
|
|||
SYNC_RECVPID_PLS = 0x42, /* Tell parent we're sending the PID. */
|
||||
SYNC_RECVPID_ACK = 0x43, /* PID was correctly received by parent. */
|
||||
SYNC_GRANDCHILD = 0x44, /* The grandchild is ready to run. */
|
||||
SYNC_CHILD_READY = 0x45, /* The child or grandchild is ready to return. */
|
||||
SYNC_CHILD_FINISH = 0x45, /* The child or grandchild has finished. */
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -45,10 +47,14 @@ enum sync_t {
|
|||
*/
|
||||
#define CREATECGROUPNS 0x80
|
||||
|
||||
#define STAGE_SETUP -1
|
||||
/* longjmp() arguments. */
|
||||
#define JUMP_PARENT 0x00
|
||||
#define JUMP_CHILD 0xA0
|
||||
#define JUMP_INIT 0xA1
|
||||
#define STAGE_PARENT 0
|
||||
#define STAGE_CHILD 1
|
||||
#define STAGE_INIT 2
|
||||
|
||||
/* Stores the current stage of nsexec. */
|
||||
int current_stage = STAGE_SETUP;
|
||||
|
||||
/* Assume the stack grows down, so arguments should be above it. */
|
||||
struct clone_t {
|
||||
|
@ -56,7 +62,7 @@ struct clone_t {
|
|||
* Reserve some space for clone() to locate arguments
|
||||
* and retcode in this place
|
||||
*/
|
||||
char stack[4096] __attribute__ ((aligned(16)));
|
||||
char stack[4096] __attribute__((aligned(16)));
|
||||
char stack_ptr[0];
|
||||
|
||||
/* There's two children. This is used to execute the different code. */
|
||||
|
@ -102,31 +108,31 @@ static int logfd = -1;
|
|||
* List of netlink message types sent to us as part of bootstrapping the init.
|
||||
* These constants are defined in libcontainer/message_linux.go.
|
||||
*/
|
||||
#define INIT_MSG 62000
|
||||
#define INIT_MSG 62000
|
||||
#define CLONE_FLAGS_ATTR 27281
|
||||
#define NS_PATHS_ATTR 27282
|
||||
#define UIDMAP_ATTR 27283
|
||||
#define GIDMAP_ATTR 27284
|
||||
#define UIDMAP_ATTR 27283
|
||||
#define GIDMAP_ATTR 27284
|
||||
#define SETGROUP_ATTR 27285
|
||||
#define OOM_SCORE_ADJ_ATTR 27286
|
||||
#define ROOTLESS_EUID_ATTR 27287
|
||||
#define UIDMAPPATH_ATTR 27288
|
||||
#define GIDMAPPATH_ATTR 27289
|
||||
#define UIDMAPPATH_ATTR 27288
|
||||
#define GIDMAPPATH_ATTR 27289
|
||||
|
||||
/*
|
||||
* Use the raw syscall for versions of glibc which don't include a function for
|
||||
* it, namely (glibc 2.12).
|
||||
*/
|
||||
#if __GLIBC__ == 2 && __GLIBC_MINOR__ < 14
|
||||
# define _GNU_SOURCE
|
||||
# include "syscall.h"
|
||||
# if !defined(SYS_setns) && defined(__NR_setns)
|
||||
# define SYS_setns __NR_setns
|
||||
# endif
|
||||
# define _GNU_SOURCE
|
||||
# include "syscall.h"
|
||||
# if !defined(SYS_setns) && defined(__NR_setns)
|
||||
# define SYS_setns __NR_setns
|
||||
# endif
|
||||
|
||||
#ifndef SYS_setns
|
||||
# error "setns(2) syscall not supported by glibc version"
|
||||
#endif
|
||||
# ifndef SYS_setns
|
||||
# error "setns(2) syscall not supported by glibc version"
|
||||
# endif
|
||||
|
||||
int setns(int fd, int nstype)
|
||||
{
|
||||
|
@ -134,33 +140,43 @@ int setns(int fd, int nstype)
|
|||
}
|
||||
#endif
|
||||
|
||||
static void write_log_with_info(const char *level, const char *function, int line, const char *format, ...)
|
||||
static void write_log(const char *level, const char *format, ...)
|
||||
{
|
||||
char message[1024] = {};
|
||||
|
||||
char *message = NULL, *stage = NULL;
|
||||
va_list args;
|
||||
int ret;
|
||||
|
||||
if (logfd < 0 || level == NULL)
|
||||
return;
|
||||
goto out;
|
||||
|
||||
va_start(args, format);
|
||||
if (vsnprintf(message, sizeof(message), format, args) < 0)
|
||||
goto done;
|
||||
|
||||
dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s:%d %s\"}\n", level, function, line, message);
|
||||
done:
|
||||
ret = vasprintf(&message, format, args);
|
||||
va_end(args);
|
||||
}
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
#define write_log(level, fmt, ...) \
|
||||
write_log_with_info((level), __FUNCTION__, __LINE__, (fmt), ##__VA_ARGS__)
|
||||
message = escape_json_string(message);
|
||||
|
||||
if (current_stage == STAGE_SETUP)
|
||||
stage = strdup("nsexec");
|
||||
else
|
||||
ret = asprintf(&stage, "nsexec-%d", current_stage);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
|
||||
dprintf(logfd, "{\"level\":\"%s\", \"msg\": \"%s[%d]: %s\"}\n", level, stage, getpid(), message);
|
||||
|
||||
out:
|
||||
free(message);
|
||||
free(stage);
|
||||
}
|
||||
|
||||
/* XXX: This is ugly. */
|
||||
static int syncfd = -1;
|
||||
|
||||
#define bail(fmt, ...) \
|
||||
do { \
|
||||
write_log(FATAL, "nsenter: " fmt ": %m", ##__VA_ARGS__); \
|
||||
write_log(FATAL, fmt ": %m", ##__VA_ARGS__); \
|
||||
exit(1); \
|
||||
} while(0)
|
||||
|
||||
|
@ -187,7 +203,7 @@ static int write_file(char *data, size_t data_len, char *pathfmt, ...)
|
|||
goto out;
|
||||
}
|
||||
|
||||
out:
|
||||
out:
|
||||
close(fd);
|
||||
return ret;
|
||||
}
|
||||
|
@ -297,9 +313,11 @@ static void update_uidmap(const char *path, int pid, char *map, size_t map_len)
|
|||
if (map == NULL || map_len <= 0)
|
||||
return;
|
||||
|
||||
write_log(DEBUG, "update /proc/%d/uid_map to '%s'", pid, map);
|
||||
if (write_file(map, map_len, "/proc/%d/uid_map", pid) < 0) {
|
||||
if (errno != EPERM)
|
||||
bail("failed to update /proc/%d/uid_map", pid);
|
||||
write_log(DEBUG, "update /proc/%d/uid_map got -EPERM (trying %s)", pid, path);
|
||||
if (try_mapping_tool(path, pid, map, map_len))
|
||||
bail("failed to use newuid map on %d", pid);
|
||||
}
|
||||
|
@ -310,9 +328,11 @@ static void update_gidmap(const char *path, int pid, char *map, size_t map_len)
|
|||
if (map == NULL || map_len <= 0)
|
||||
return;
|
||||
|
||||
write_log(DEBUG, "update /proc/%d/gid_map to '%s'", pid, map);
|
||||
if (write_file(map, map_len, "/proc/%d/gid_map", pid) < 0) {
|
||||
if (errno != EPERM)
|
||||
bail("failed to update /proc/%d/gid_map", pid);
|
||||
write_log(DEBUG, "update /proc/%d/gid_map got -EPERM (trying %s)", pid, path);
|
||||
if (try_mapping_tool(path, pid, map, map_len))
|
||||
bail("failed to use newgid map on %d", pid);
|
||||
}
|
||||
|
@ -323,19 +343,20 @@ static void update_oom_score_adj(char *data, size_t len)
|
|||
if (data == NULL || len <= 0)
|
||||
return;
|
||||
|
||||
write_log(DEBUG, "update /proc/self/oom_score_adj to '%s'", data);
|
||||
if (write_file(data, len, "/proc/self/oom_score_adj") < 0)
|
||||
bail("failed to update /proc/self/oom_score_adj");
|
||||
}
|
||||
|
||||
/* A dummy function that just jumps to the given jumpval. */
|
||||
static int child_func(void *arg) __attribute__ ((noinline));
|
||||
static int child_func(void *arg) __attribute__((noinline));
|
||||
static int child_func(void *arg)
|
||||
{
|
||||
struct clone_t *ca = (struct clone_t *)arg;
|
||||
longjmp(*ca->env, ca->jmpval);
|
||||
}
|
||||
|
||||
static int clone_parent(jmp_buf *env, int jmpval) __attribute__ ((noinline));
|
||||
static int clone_parent(jmp_buf *env, int jmpval) __attribute__((noinline));
|
||||
static int clone_parent(jmp_buf *env, int jmpval)
|
||||
{
|
||||
struct clone_t ca = {
|
||||
|
@ -507,7 +528,6 @@ void join_namespaces(char *nslist)
|
|||
char *namespace = strtok_r(nslist, ",", &saveptr);
|
||||
struct namespace_t {
|
||||
int fd;
|
||||
int ns;
|
||||
char type[PATH_MAX];
|
||||
char path[PATH_MAX];
|
||||
} *namespaces = NULL;
|
||||
|
@ -542,7 +562,7 @@ void join_namespaces(char *nslist)
|
|||
bail("failed to open %s", path);
|
||||
|
||||
ns->fd = fd;
|
||||
ns->ns = nsflag(namespace);
|
||||
strncpy(ns->type, namespace, PATH_MAX - 1);
|
||||
strncpy(ns->path, path, PATH_MAX - 1);
|
||||
ns->path[PATH_MAX - 1] = '\0';
|
||||
} while ((namespace = strtok_r(NULL, ",", &saveptr)) != NULL);
|
||||
|
@ -555,12 +575,14 @@ void join_namespaces(char *nslist)
|
|||
*/
|
||||
|
||||
for (i = 0; i < num; i++) {
|
||||
struct namespace_t ns = namespaces[i];
|
||||
struct namespace_t *ns = &namespaces[i];
|
||||
int flag = nsflag(ns->type);
|
||||
|
||||
if (setns(ns.fd, ns.ns) < 0)
|
||||
bail("failed to setns to %s", ns.path);
|
||||
write_log(DEBUG, "setns(%#x) into %s namespace (with path %s)", flag, ns->type, ns->path);
|
||||
if (setns(ns->fd, flag) < 0)
|
||||
bail("failed to setns into %s namespace", ns->type);
|
||||
|
||||
close(ns.fd);
|
||||
close(ns->fd);
|
||||
}
|
||||
|
||||
free(namespaces);
|
||||
|
@ -569,6 +591,14 @@ void join_namespaces(char *nslist)
|
|||
/* Defined in cloned_binary.c. */
|
||||
extern int ensure_cloned_binary(void);
|
||||
|
||||
static inline int sane_kill(pid_t pid, int signum)
|
||||
{
|
||||
if (pid > 0)
|
||||
return kill(pid, signum);
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
|
||||
void nsexec(void)
|
||||
{
|
||||
int pipenum;
|
||||
|
@ -598,7 +628,14 @@ void nsexec(void)
|
|||
if (ensure_cloned_binary() < 0)
|
||||
bail("could not ensure we are a cloned binary");
|
||||
|
||||
write_log(DEBUG, "nsexec started");
|
||||
/*
|
||||
* Inform the parent we're past initial setup.
|
||||
* For the other side of this, see initWaiter.
|
||||
*/
|
||||
if (write(pipenum, "", 1) != 1)
|
||||
bail("could not inform the parent we are past initial setup");
|
||||
|
||||
write_log(DEBUG, "=> nsexec container setup");
|
||||
|
||||
/* Parse all of the netlink configuration. */
|
||||
nl_parse(pipenum, &config);
|
||||
|
@ -622,6 +659,7 @@ void nsexec(void)
|
|||
* containers), which is the recommendation from the kernel folks.
|
||||
*/
|
||||
if (config.namespaces) {
|
||||
write_log(DEBUG, "set process as non-dumpable");
|
||||
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
|
||||
bail("failed to set process as non-dumpable");
|
||||
}
|
||||
|
@ -686,45 +724,49 @@ void nsexec(void)
|
|||
* -- Aleksa "what has my life come to?" Sarai
|
||||
*/
|
||||
|
||||
switch (setjmp(env)) {
|
||||
current_stage = setjmp(env);
|
||||
switch (current_stage) {
|
||||
/*
|
||||
* Stage 0: We're in the parent. Our job is just to create a new child
|
||||
* (stage 1: JUMP_CHILD) process and write its uid_map and
|
||||
* (stage 1: STAGE_CHILD) process and write its uid_map and
|
||||
* gid_map. That process will go on to create a new process, then
|
||||
* it will send us its PID which we will send to the bootstrap
|
||||
* process.
|
||||
*/
|
||||
case JUMP_PARENT:{
|
||||
case STAGE_PARENT:{
|
||||
int len;
|
||||
pid_t child, first_child = -1;
|
||||
bool ready = false;
|
||||
pid_t stage1_pid = -1, stage2_pid = -1;
|
||||
bool stage1_complete, stage2_complete;
|
||||
|
||||
/* For debugging. */
|
||||
prctl(PR_SET_NAME, (unsigned long)"runc:[0:PARENT]", 0, 0, 0);
|
||||
write_log(DEBUG, "~> nsexec stage-0");
|
||||
|
||||
/* Start the process of getting a container. */
|
||||
child = clone_parent(&env, JUMP_CHILD);
|
||||
if (child < 0)
|
||||
bail("unable to fork: child_func");
|
||||
write_log(DEBUG, "spawn stage-1");
|
||||
stage1_pid = clone_parent(&env, STAGE_CHILD);
|
||||
if (stage1_pid < 0)
|
||||
bail("unable to spawn stage-1");
|
||||
|
||||
/*
|
||||
* State machine for synchronisation with the children.
|
||||
*
|
||||
* Father only return when both child and grandchild are
|
||||
* ready, so we can receive all possible error codes
|
||||
* generated by children.
|
||||
*/
|
||||
syncfd = sync_child_pipe[1];
|
||||
close(sync_child_pipe[0]);
|
||||
|
||||
while (!ready) {
|
||||
/*
|
||||
* State machine for synchronisation with the children. We only
|
||||
* return once both the child and grandchild are ready.
|
||||
*/
|
||||
write_log(DEBUG, "-> stage-1 synchronisation loop");
|
||||
stage1_complete = false;
|
||||
while (!stage1_complete) {
|
||||
enum sync_t s;
|
||||
|
||||
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
|
||||
bail("failed to sync with child: next state");
|
||||
bail("failed to sync with stage-1: next state");
|
||||
|
||||
switch (s) {
|
||||
case SYNC_USERMAP_PLS:
|
||||
write_log(DEBUG, "stage-1 requested userns mappings");
|
||||
|
||||
/*
|
||||
* Enable setgroups(2) if we've been asked to. But we also
|
||||
* have to explicitly disable setgroups(2) if we're
|
||||
|
@ -735,70 +777,78 @@ void nsexec(void)
|
|||
* For rootless multi-entry mapping, config.is_setgroup shall be true and
|
||||
* newuidmap/newgidmap shall be used.
|
||||
*/
|
||||
|
||||
if (config.is_rootless_euid && !config.is_setgroup)
|
||||
update_setgroups(child, SETGROUPS_DENY);
|
||||
update_setgroups(stage1_pid, SETGROUPS_DENY);
|
||||
|
||||
/* Set up mappings. */
|
||||
update_uidmap(config.uidmappath, child, config.uidmap, config.uidmap_len);
|
||||
update_gidmap(config.gidmappath, child, config.gidmap, config.gidmap_len);
|
||||
update_uidmap(config.uidmappath, stage1_pid, config.uidmap, config.uidmap_len);
|
||||
update_gidmap(config.gidmappath, stage1_pid, config.gidmap, config.gidmap_len);
|
||||
|
||||
s = SYNC_USERMAP_ACK;
|
||||
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
||||
kill(child, SIGKILL);
|
||||
bail("failed to sync with child: write(SYNC_USERMAP_ACK)");
|
||||
sane_kill(stage1_pid, SIGKILL);
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with stage-1: write(SYNC_USERMAP_ACK)");
|
||||
}
|
||||
break;
|
||||
case SYNC_RECVPID_PLS:{
|
||||
first_child = child;
|
||||
case SYNC_RECVPID_PLS:
|
||||
write_log(DEBUG, "stage-1 requested pid to be forwarded");
|
||||
|
||||
/* Get the init_func pid. */
|
||||
if (read(syncfd, &child, sizeof(child)) != sizeof(child)) {
|
||||
kill(first_child, SIGKILL);
|
||||
bail("failed to sync with child: read(childpid)");
|
||||
}
|
||||
/* Get the stage-2 pid. */
|
||||
if (read(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
|
||||
sane_kill(stage1_pid, SIGKILL);
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with stage-1: read(stage2_pid)");
|
||||
}
|
||||
|
||||
/* Send ACK. */
|
||||
s = SYNC_RECVPID_ACK;
|
||||
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
||||
kill(first_child, SIGKILL);
|
||||
kill(child, SIGKILL);
|
||||
bail("failed to sync with child: write(SYNC_RECVPID_ACK)");
|
||||
}
|
||||
/* Send ACK. */
|
||||
s = SYNC_RECVPID_ACK;
|
||||
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
||||
sane_kill(stage1_pid, SIGKILL);
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with stage-1: write(SYNC_RECVPID_ACK)");
|
||||
}
|
||||
|
||||
/* Send the init_func pid back to our parent.
|
||||
*
|
||||
* Send the init_func pid and the pid of the first child back to our parent.
|
||||
* We need to send both back because we can't reap the first child we created (CLONE_PARENT).
|
||||
* It becomes the responsibility of our parent to reap the first child.
|
||||
*/
|
||||
len = dprintf(pipenum, "{\"pid\": %d, \"pid_first\": %d}\n", child, first_child);
|
||||
if (len < 0) {
|
||||
kill(child, SIGKILL);
|
||||
bail("unable to generate JSON for child pid");
|
||||
}
|
||||
/*
|
||||
* Send both the stage-1 and stage-2 pids back to runc.
|
||||
* runc needs the stage-2 to continue process management,
|
||||
* but because stage-1 was spawned with CLONE_PARENT we
|
||||
* cannot reap it within stage-0 and thus we need to ask
|
||||
* runc to reap the zombie for us.
|
||||
*/
|
||||
write_log(DEBUG, "forward stage-1 (%d) and stage-2 (%d) pids to runc",
|
||||
stage1_pid, stage2_pid);
|
||||
len =
|
||||
dprintf(pipenum, "{\"stage1_pid\":%d,\"stage2_pid\":%d}\n", stage1_pid,
|
||||
stage2_pid);
|
||||
if (len < 0) {
|
||||
sane_kill(stage1_pid, SIGKILL);
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with runc: write(pid-JSON)");
|
||||
}
|
||||
break;
|
||||
case SYNC_CHILD_READY:
|
||||
ready = true;
|
||||
case SYNC_CHILD_FINISH:
|
||||
write_log(DEBUG, "stage-1 complete");
|
||||
stage1_complete = true;
|
||||
break;
|
||||
default:
|
||||
bail("unexpected sync value: %u", s);
|
||||
}
|
||||
}
|
||||
write_log(DEBUG, "<- stage-1 synchronisation loop");
|
||||
|
||||
/* Now sync with grandchild. */
|
||||
|
||||
syncfd = sync_grandchild_pipe[1];
|
||||
close(sync_grandchild_pipe[0]);
|
||||
|
||||
ready = false;
|
||||
while (!ready) {
|
||||
write_log(DEBUG, "-> stage-2 synchronisation loop");
|
||||
stage2_complete = false;
|
||||
while (!stage2_complete) {
|
||||
enum sync_t s;
|
||||
|
||||
write_log(DEBUG, "signalling stage-2 to run");
|
||||
s = SYNC_GRANDCHILD;
|
||||
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
||||
kill(child, SIGKILL);
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with child: write(SYNC_GRANDCHILD)");
|
||||
}
|
||||
|
||||
|
@ -806,27 +856,31 @@ void nsexec(void)
|
|||
bail("failed to sync with child: next state");
|
||||
|
||||
switch (s) {
|
||||
case SYNC_CHILD_READY:
|
||||
ready = true;
|
||||
case SYNC_CHILD_FINISH:
|
||||
write_log(DEBUG, "stage-2 complete");
|
||||
stage2_complete = true;
|
||||
break;
|
||||
default:
|
||||
bail("unexpected sync value: %u", s);
|
||||
}
|
||||
}
|
||||
write_log(DEBUG, "<- stage-2 synchronisation loop");
|
||||
write_log(DEBUG, "<~ nsexec stage-0");
|
||||
exit(0);
|
||||
}
|
||||
break;
|
||||
|
||||
/*
|
||||
* Stage 1: We're in the first child process. Our job is to join any
|
||||
* provided namespaces in the netlink payload and unshare all
|
||||
* of the requested namespaces. If we've been asked to
|
||||
* CLONE_NEWUSER, we will ask our parent (stage 0) to set up
|
||||
* our user mappings for us. Then, we create a new child
|
||||
* (stage 2: JUMP_INIT) for PID namespace. We then send the
|
||||
* child's PID to our parent (stage 0).
|
||||
* provided namespaces in the netlink payload and unshare all of
|
||||
* the requested namespaces. If we've been asked to CLONE_NEWUSER,
|
||||
* we will ask our parent (stage 0) to set up our user mappings
|
||||
* for us. Then, we create a new child (stage 2: STAGE_INIT) for
|
||||
* PID namespace. We then send the child's PID to our parent
|
||||
* (stage 0).
|
||||
*/
|
||||
case JUMP_CHILD:{
|
||||
pid_t child;
|
||||
case STAGE_CHILD:{
|
||||
pid_t stage2_pid = -1;
|
||||
enum sync_t s;
|
||||
|
||||
/* We're in a child and thus need to tell the parent if we die. */
|
||||
|
@ -835,11 +889,12 @@ void nsexec(void)
|
|||
|
||||
/* For debugging. */
|
||||
prctl(PR_SET_NAME, (unsigned long)"runc:[1:CHILD]", 0, 0, 0);
|
||||
write_log(DEBUG, "~> nsexec stage-1");
|
||||
|
||||
/*
|
||||
* We need to setns first. We cannot do this earlier (in stage 0)
|
||||
* because of the fact that we forked to get here (the PID of
|
||||
* [stage 2: JUMP_INIT]) would be meaningless). We could send it
|
||||
* [stage 2: STAGE_INIT]) would be meaningless). We could send it
|
||||
* using cmsg(3) but that's just annoying.
|
||||
*/
|
||||
if (config.namespaces)
|
||||
|
@ -865,40 +920,50 @@ void nsexec(void)
|
|||
* problem.
|
||||
*/
|
||||
if (config.cloneflags & CLONE_NEWUSER) {
|
||||
write_log(DEBUG, "unshare user namespace");
|
||||
if (unshare(CLONE_NEWUSER) < 0)
|
||||
bail("failed to unshare user namespace");
|
||||
config.cloneflags &= ~CLONE_NEWUSER;
|
||||
|
||||
/*
|
||||
* We don't have the privileges to do any mapping here (see the
|
||||
* clone_parent rant). So signal our parent to hook us up.
|
||||
* We need to set ourselves as dumpable temporarily so that the
|
||||
* parent process can write to our procfs files.
|
||||
*/
|
||||
|
||||
/* Switching is only necessary if we joined namespaces. */
|
||||
if (config.namespaces) {
|
||||
write_log(DEBUG, "temporarily set process as dumpable");
|
||||
if (prctl(PR_SET_DUMPABLE, 1, 0, 0, 0) < 0)
|
||||
bail("failed to set process as dumpable");
|
||||
bail("failed to temporarily set process as dumpable");
|
||||
}
|
||||
|
||||
/*
|
||||
* We don't have the privileges to do any mapping here (see the
|
||||
* clone_parent rant). So signal stage-0 to do the mapping for
|
||||
* us.
|
||||
*/
|
||||
write_log(DEBUG, "request stage-0 to map user namespace");
|
||||
s = SYNC_USERMAP_PLS;
|
||||
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
|
||||
bail("failed to sync with parent: write(SYNC_USERMAP_PLS)");
|
||||
|
||||
/* ... wait for mapping ... */
|
||||
|
||||
write_log(DEBUG, "request stage-0 to map user namespace");
|
||||
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
|
||||
bail("failed to sync with parent: read(SYNC_USERMAP_ACK)");
|
||||
if (s != SYNC_USERMAP_ACK)
|
||||
bail("failed to sync with parent: SYNC_USERMAP_ACK: got %u", s);
|
||||
/* Switching is only necessary if we joined namespaces. */
|
||||
|
||||
/* Revert temporary re-dumpable setting. */
|
||||
if (config.namespaces) {
|
||||
write_log(DEBUG, "re-set process as non-dumpable");
|
||||
if (prctl(PR_SET_DUMPABLE, 0, 0, 0, 0) < 0)
|
||||
bail("failed to set process as dumpable");
|
||||
bail("failed to re-set process as non-dumpable");
|
||||
}
|
||||
|
||||
/* Become root in the namespace proper. */
|
||||
if (setresuid(0, 0, 0) < 0)
|
||||
bail("failed to become root in user namespace");
|
||||
}
|
||||
|
||||
/*
|
||||
* Unshare all of the namespaces. Now, it should be noted that this
|
||||
* ordering might break in the future (especially with rootless
|
||||
|
@ -909,8 +974,9 @@ void nsexec(void)
|
|||
* some old kernel versions where clone(CLONE_PARENT | CLONE_NEWPID)
|
||||
* was broken, so we'll just do it the long way anyway.
|
||||
*/
|
||||
write_log(DEBUG, "unshare remaining namespace (except cgroupns)");
|
||||
if (unshare(config.cloneflags & ~CLONE_NEWCGROUP) < 0)
|
||||
bail("failed to unshare namespaces");
|
||||
bail("failed to unshare remaining namespaces (except cgroupns)");
|
||||
|
||||
/*
|
||||
* TODO: What about non-namespace clone flags that we're dropping here?
|
||||
|
@ -921,41 +987,45 @@ void nsexec(void)
|
|||
* which would break many applications and libraries, so we must fork
|
||||
* to actually enter the new PID namespace.
|
||||
*/
|
||||
child = clone_parent(&env, JUMP_INIT);
|
||||
if (child < 0)
|
||||
bail("unable to fork: init_func");
|
||||
write_log(DEBUG, "spawn stage-2");
|
||||
stage2_pid = clone_parent(&env, STAGE_INIT);
|
||||
if (stage2_pid < 0)
|
||||
bail("unable to spawn stage-2");
|
||||
|
||||
/* Send the child to our parent, which knows what it's doing. */
|
||||
write_log(DEBUG, "request stage-0 to forward stage-2 pid (%d)", stage2_pid);
|
||||
s = SYNC_RECVPID_PLS;
|
||||
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
||||
kill(child, SIGKILL);
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with parent: write(SYNC_RECVPID_PLS)");
|
||||
}
|
||||
if (write(syncfd, &child, sizeof(child)) != sizeof(child)) {
|
||||
kill(child, SIGKILL);
|
||||
bail("failed to sync with parent: write(childpid)");
|
||||
if (write(syncfd, &stage2_pid, sizeof(stage2_pid)) != sizeof(stage2_pid)) {
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with parent: write(stage2_pid)");
|
||||
}
|
||||
|
||||
/* ... wait for parent to get the pid ... */
|
||||
|
||||
if (read(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
||||
kill(child, SIGKILL);
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with parent: read(SYNC_RECVPID_ACK)");
|
||||
}
|
||||
if (s != SYNC_RECVPID_ACK) {
|
||||
kill(child, SIGKILL);
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with parent: SYNC_RECVPID_ACK: got %u", s);
|
||||
}
|
||||
|
||||
s = SYNC_CHILD_READY;
|
||||
write_log(DEBUG, "signal completion to stage-0");
|
||||
s = SYNC_CHILD_FINISH;
|
||||
if (write(syncfd, &s, sizeof(s)) != sizeof(s)) {
|
||||
kill(child, SIGKILL);
|
||||
bail("failed to sync with parent: write(SYNC_CHILD_READY)");
|
||||
sane_kill(stage2_pid, SIGKILL);
|
||||
bail("failed to sync with parent: write(SYNC_CHILD_FINISH)");
|
||||
}
|
||||
|
||||
/* Our work is done. [Stage 2: JUMP_INIT] is doing the rest of the work. */
|
||||
/* Our work is done. [Stage 2: STAGE_INIT] is doing the rest of the work. */
|
||||
write_log(DEBUG, "<~ nsexec stage-1");
|
||||
exit(0);
|
||||
}
|
||||
break;
|
||||
|
||||
/*
|
||||
* Stage 2: We're the final child process, and the only process that will
|
||||
|
@ -963,7 +1033,7 @@ void nsexec(void)
|
|||
* final cleanup steps and then return to the Go runtime to allow
|
||||
* init_linux.go to run.
|
||||
*/
|
||||
case JUMP_INIT:{
|
||||
case STAGE_INIT:{
|
||||
/*
|
||||
* We're inside the child now, having jumped from the
|
||||
* start_child() code after forking in the parent.
|
||||
|
@ -978,6 +1048,7 @@ void nsexec(void)
|
|||
|
||||
/* For debugging. */
|
||||
prctl(PR_SET_NAME, (unsigned long)"runc:[2:INIT]", 0, 0, 0);
|
||||
write_log(DEBUG, "~> nsexec stage-2");
|
||||
|
||||
if (read(syncfd, &s, sizeof(s)) != sizeof(s))
|
||||
bail("failed to sync with parent: read(SYNC_GRANDCHILD)");
|
||||
|
@ -998,21 +1069,30 @@ void nsexec(void)
|
|||
bail("setgroups failed");
|
||||
}
|
||||
|
||||
/* ... wait until our topmost parent has finished cgroup setup in p.manager.Apply() ... */
|
||||
/*
|
||||
* Wait until our topmost parent has finished cgroup setup in
|
||||
* p.manager.Apply().
|
||||
*
|
||||
* TODO(cyphar): Check if this code is actually needed because we
|
||||
* should be in the cgroup even from stage-0, so
|
||||
* waiting until now might not make sense.
|
||||
*/
|
||||
if (config.cloneflags & CLONE_NEWCGROUP) {
|
||||
uint8_t value;
|
||||
if (read(pipenum, &value, sizeof(value)) != sizeof(value))
|
||||
bail("read synchronisation value failed");
|
||||
if (value == CREATECGROUPNS) {
|
||||
write_log(DEBUG, "unshare cgroup namespace");
|
||||
if (unshare(CLONE_NEWCGROUP) < 0)
|
||||
bail("failed to unshare cgroup namespace");
|
||||
} else
|
||||
bail("received unknown synchronisation value");
|
||||
}
|
||||
|
||||
s = SYNC_CHILD_READY;
|
||||
write_log(DEBUG, "signal completion to stage-0");
|
||||
s = SYNC_CHILD_FINISH;
|
||||
if (write(syncfd, &s, sizeof(s)) != sizeof(s))
|
||||
bail("failed to sync with patent: write(SYNC_CHILD_READY)");
|
||||
bail("failed to sync with patent: write(SYNC_CHILD_FINISH)");
|
||||
|
||||
/* Close sync pipes. */
|
||||
close(sync_grandchild_pipe[0]);
|
||||
|
@ -1021,10 +1101,13 @@ void nsexec(void)
|
|||
nl_free(&config);
|
||||
|
||||
/* Finish executing, let the Go runtime take over. */
|
||||
write_log(DEBUG, "<= nsexec container setup");
|
||||
write_log(DEBUG, "booting up go runtime ...");
|
||||
return;
|
||||
}
|
||||
break;
|
||||
default:
|
||||
bail("unexpected jump value");
|
||||
bail("unknown stage '%d' for jump value", current_stage);
|
||||
}
|
||||
|
||||
/* Should never be reached. */
|
||||
|
|
1
vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c
generated
vendored
Symbolic link
1
vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.c
generated
vendored
Symbolic link
|
@ -0,0 +1 @@
|
|||
../escape.c
|
53
vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go
generated
vendored
Normal file
53
vendor/github.com/opencontainers/runc/libcontainer/nsenter/test/escape.go
generated
vendored
Normal file
|
@ -0,0 +1,53 @@
|
|||
package escapetest
|
||||
|
||||
// This file is part of escape_json_string unit test.
|
||||
// It is in a separate package so cgo can be used together
|
||||
// with go test.
|
||||
|
||||
// #include <stdlib.h>
|
||||
// extern char *escape_json_string(char *str);
|
||||
// #cgo CFLAGS: -DESCAPE_TEST=1
|
||||
import "C"
|
||||
|
||||
import (
|
||||
"testing"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
func testEscapeJsonString(t *testing.T, input, want string) {
|
||||
in := C.CString(input)
|
||||
out := C.escape_json_string(in)
|
||||
got := C.GoString(out)
|
||||
C.free(unsafe.Pointer(out))
|
||||
t.Logf("input: %q, output: %q", input, got)
|
||||
if got != want {
|
||||
t.Errorf("Failed on input: %q, want %q, got %q", input, want, got)
|
||||
}
|
||||
}
|
||||
|
||||
func testEscapeJson(t *testing.T) {
|
||||
testCases := []struct {
|
||||
input, output string
|
||||
}{
|
||||
{"", ""},
|
||||
{"abcdef", "abcdef"},
|
||||
{`\\\\\\`, `\\\\\\\\\\\\`},
|
||||
{`with"quote`, `with\"quote`},
|
||||
{"\n\r\b\t\f\\", `\n\r\b\t\f\\`},
|
||||
{"\007", "\\u0007"},
|
||||
{"\017 \020 \037", "\\u000f \\u0010 \\u001f"},
|
||||
{"\033", "\\u001b"},
|
||||
{`<->`, `<->`},
|
||||
{"\176\177\200", "~\\u007f\200"},
|
||||
{"\000", ""},
|
||||
{"a\x7fxc", "a\\u007fxc"},
|
||||
{"a\033xc", "a\\u001bxc"},
|
||||
{"a\nxc", "a\\nxc"},
|
||||
{"a\\xc", "a\\\\xc"},
|
||||
{"Barney B\303\244r", "Barney B\303\244r"},
|
||||
}
|
||||
|
||||
for _, tc := range testCases {
|
||||
testEscapeJsonString(t, tc.input, tc.output)
|
||||
}
|
||||
}
|
41
vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go
generated
vendored
41
vendor/github.com/opencontainers/runc/libcontainer/user/lookup.go
generated
vendored
|
@ -1,41 +0,0 @@
|
|||
package user
|
||||
|
||||
import (
|
||||
"errors"
|
||||
)
|
||||
|
||||
var (
|
||||
// The current operating system does not provide the required data for user lookups.
|
||||
ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
|
||||
// No matching entries found in file.
|
||||
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
|
||||
ErrNoGroupEntries = errors.New("no matching entries in group file")
|
||||
)
|
||||
|
||||
// LookupUser looks up a user by their username in /etc/passwd. If the user
|
||||
// cannot be found (or there is no /etc/passwd file on the filesystem), then
|
||||
// LookupUser returns an error.
|
||||
func LookupUser(username string) (User, error) {
|
||||
return lookupUser(username)
|
||||
}
|
||||
|
||||
// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
|
||||
// be found (or there is no /etc/passwd file on the filesystem), then LookupId
|
||||
// returns an error.
|
||||
func LookupUid(uid int) (User, error) {
|
||||
return lookupUid(uid)
|
||||
}
|
||||
|
||||
// LookupGroup looks up a group by its name in /etc/group. If the group cannot
|
||||
// be found (or there is no /etc/group file on the filesystem), then LookupGroup
|
||||
// returns an error.
|
||||
func LookupGroup(groupname string) (Group, error) {
|
||||
return lookupGroup(groupname)
|
||||
}
|
||||
|
||||
// LookupGid looks up a group by its group id in /etc/group. If the group cannot
|
||||
// be found (or there is no /etc/group file on the filesystem), then LookupGid
|
||||
// returns an error.
|
||||
func LookupGid(gid int) (Group, error) {
|
||||
return lookupGid(gid)
|
||||
}
|
20
vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go
generated
vendored
20
vendor/github.com/opencontainers/runc/libcontainer/user/lookup_unix.go
generated
vendored
|
@ -16,13 +16,19 @@ const (
|
|||
unixGroupPath = "/etc/group"
|
||||
)
|
||||
|
||||
func lookupUser(username string) (User, error) {
|
||||
// LookupUser looks up a user by their username in /etc/passwd. If the user
|
||||
// cannot be found (or there is no /etc/passwd file on the filesystem), then
|
||||
// LookupUser returns an error.
|
||||
func LookupUser(username string) (User, error) {
|
||||
return lookupUserFunc(func(u User) bool {
|
||||
return u.Name == username
|
||||
})
|
||||
}
|
||||
|
||||
func lookupUid(uid int) (User, error) {
|
||||
// LookupUid looks up a user by their user id in /etc/passwd. If the user cannot
|
||||
// be found (or there is no /etc/passwd file on the filesystem), then LookupId
|
||||
// returns an error.
|
||||
func LookupUid(uid int) (User, error) {
|
||||
return lookupUserFunc(func(u User) bool {
|
||||
return u.Uid == uid
|
||||
})
|
||||
|
@ -51,13 +57,19 @@ func lookupUserFunc(filter func(u User) bool) (User, error) {
|
|||
return users[0], nil
|
||||
}
|
||||
|
||||
func lookupGroup(groupname string) (Group, error) {
|
||||
// LookupGroup looks up a group by its name in /etc/group. If the group cannot
|
||||
// be found (or there is no /etc/group file on the filesystem), then LookupGroup
|
||||
// returns an error.
|
||||
func LookupGroup(groupname string) (Group, error) {
|
||||
return lookupGroupFunc(func(g Group) bool {
|
||||
return g.Name == groupname
|
||||
})
|
||||
}
|
||||
|
||||
func lookupGid(gid int) (Group, error) {
|
||||
// LookupGid looks up a group by its group id in /etc/group. If the group cannot
|
||||
// be found (or there is no /etc/group file on the filesystem), then LookupGid
|
||||
// returns an error.
|
||||
func LookupGid(gid int) (Group, error) {
|
||||
return lookupGroupFunc(func(g Group) bool {
|
||||
return g.Gid == gid
|
||||
})
|
||||
|
|
40
vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go
generated
vendored
40
vendor/github.com/opencontainers/runc/libcontainer/user/lookup_windows.go
generated
vendored
|
@ -1,40 +0,0 @@
|
|||
// +build windows
|
||||
|
||||
package user
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"os/user"
|
||||
)
|
||||
|
||||
func lookupUser(username string) (User, error) {
|
||||
u, err := user.Lookup(username)
|
||||
if err != nil {
|
||||
return User{}, err
|
||||
}
|
||||
return userFromOS(u)
|
||||
}
|
||||
|
||||
func lookupUid(uid int) (User, error) {
|
||||
u, err := user.LookupId(fmt.Sprintf("%d", uid))
|
||||
if err != nil {
|
||||
return User{}, err
|
||||
}
|
||||
return userFromOS(u)
|
||||
}
|
||||
|
||||
func lookupGroup(groupname string) (Group, error) {
|
||||
g, err := user.LookupGroup(groupname)
|
||||
if err != nil {
|
||||
return Group{}, err
|
||||
}
|
||||
return groupFromOS(g)
|
||||
}
|
||||
|
||||
func lookupGid(gid int) (Group, error) {
|
||||
g, err := user.LookupGroupId(fmt.Sprintf("%d", gid))
|
||||
if err != nil {
|
||||
return Group{}, err
|
||||
}
|
||||
return groupFromOS(g)
|
||||
}
|
52
vendor/github.com/opencontainers/runc/libcontainer/user/user.go
generated
vendored
52
vendor/github.com/opencontainers/runc/libcontainer/user/user.go
generated
vendored
|
@ -2,10 +2,10 @@ package user
|
|||
|
||||
import (
|
||||
"bufio"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"os/user"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
@ -16,6 +16,13 @@ const (
|
|||
)
|
||||
|
||||
var (
|
||||
// The current operating system does not provide the required data for user lookups.
|
||||
ErrUnsupported = errors.New("user lookup: operating system does not provide passwd-formatted data")
|
||||
|
||||
// No matching entries found in file.
|
||||
ErrNoPasswdEntries = errors.New("no matching entries in passwd file")
|
||||
ErrNoGroupEntries = errors.New("no matching entries in group file")
|
||||
|
||||
ErrRange = fmt.Errorf("uids and gids must be in range %d-%d", minId, maxId)
|
||||
)
|
||||
|
||||
|
@ -29,28 +36,6 @@ type User struct {
|
|||
Shell string
|
||||
}
|
||||
|
||||
// userFromOS converts an os/user.(*User) to local User
|
||||
//
|
||||
// (This does not include Pass, Shell or Gecos)
|
||||
func userFromOS(u *user.User) (User, error) {
|
||||
newUser := User{
|
||||
Name: u.Username,
|
||||
Home: u.HomeDir,
|
||||
}
|
||||
id, err := strconv.Atoi(u.Uid)
|
||||
if err != nil {
|
||||
return newUser, err
|
||||
}
|
||||
newUser.Uid = id
|
||||
|
||||
id, err = strconv.Atoi(u.Gid)
|
||||
if err != nil {
|
||||
return newUser, err
|
||||
}
|
||||
newUser.Gid = id
|
||||
return newUser, nil
|
||||
}
|
||||
|
||||
type Group struct {
|
||||
Name string
|
||||
Pass string
|
||||
|
@ -58,23 +43,6 @@ type Group struct {
|
|||
List []string
|
||||
}
|
||||
|
||||
// groupFromOS converts an os/user.(*Group) to local Group
|
||||
//
|
||||
// (This does not include Pass or List)
|
||||
func groupFromOS(g *user.Group) (Group, error) {
|
||||
newGroup := Group{
|
||||
Name: g.Name,
|
||||
}
|
||||
|
||||
id, err := strconv.Atoi(g.Gid)
|
||||
if err != nil {
|
||||
return newGroup, err
|
||||
}
|
||||
newGroup.Gid = id
|
||||
|
||||
return newGroup, nil
|
||||
}
|
||||
|
||||
// SubID represents an entry in /etc/sub{u,g}id
|
||||
type SubID struct {
|
||||
Name string
|
||||
|
@ -466,7 +434,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
|
|||
// we asked for a group but didn't find it. let's check to see
|
||||
// if we wanted a numeric group
|
||||
if !found {
|
||||
gid, err := strconv.Atoi(ag)
|
||||
gid, err := strconv.ParseInt(ag, 10, 64)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Unable to find group %s", ag)
|
||||
}
|
||||
|
@ -474,7 +442,7 @@ func GetAdditionalGroups(additionalGroups []string, group io.Reader) ([]int, err
|
|||
if gid < minId || gid > maxId {
|
||||
return nil, ErrRange
|
||||
}
|
||||
gidMap[gid] = struct{}{}
|
||||
gidMap[int(gid)] = struct{}{}
|
||||
}
|
||||
}
|
||||
gids := []int{}
|
||||
|
|
42
vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go
generated
vendored
Normal file
42
vendor/github.com/opencontainers/runc/libcontainer/user/user_fuzzer.go
generated
vendored
Normal file
|
@ -0,0 +1,42 @@
|
|||
// +build gofuzz
|
||||
|
||||
package user
|
||||
|
||||
import (
|
||||
"io"
|
||||
"strings"
|
||||
)
|
||||
|
||||
func IsDivisbleBy(n int, divisibleby int) bool {
|
||||
return (n % divisibleby) == 0
|
||||
}
|
||||
|
||||
func FuzzUser(data []byte) int {
|
||||
if len(data) == 0 {
|
||||
return -1
|
||||
}
|
||||
if !IsDivisbleBy(len(data), 5) {
|
||||
return -1
|
||||
}
|
||||
|
||||
var divided [][]byte
|
||||
|
||||
chunkSize := len(data) / 5
|
||||
|
||||
for i := 0; i < len(data); i += chunkSize {
|
||||
end := i + chunkSize
|
||||
|
||||
divided = append(divided, data[i:end])
|
||||
}
|
||||
|
||||
_, _ = ParsePasswdFilter(strings.NewReader(string(divided[0])), nil)
|
||||
|
||||
var passwd, group io.Reader
|
||||
|
||||
group = strings.NewReader(string(divided[1]))
|
||||
_, _ = GetAdditionalGroups([]string{string(divided[2])}, group)
|
||||
|
||||
passwd = strings.NewReader(string(divided[3]))
|
||||
_, _ = GetExecUser(string(divided[4]), nil, passwd, group)
|
||||
return 1
|
||||
}
|
5
vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go
generated
vendored
Normal file
5
vendor/github.com/opencontainers/runc/libcontainer/userns/userns.go
generated
vendored
Normal file
|
@ -0,0 +1,5 @@
|
|||
package userns
|
||||
|
||||
// RunningInUserNS detects whether we are currently running in a user namespace.
|
||||
// Originally copied from github.com/lxc/lxd/shared/util.go
|
||||
var RunningInUserNS = runningInUserNS
|
15
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go
generated
vendored
Normal file
15
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_fuzzer.go
generated
vendored
Normal file
|
@ -0,0 +1,15 @@
|
|||
// +build gofuzz
|
||||
|
||||
package userns
|
||||
|
||||
import (
|
||||
"strings"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/user"
|
||||
)
|
||||
|
||||
func FuzzUIDMap(data []byte) int {
|
||||
uidmap, _ := user.ParseIDMap(strings.NewReader(string(data)))
|
||||
_ = uidMapInUserNS(uidmap)
|
||||
return 1
|
||||
}
|
37
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go
generated
vendored
Normal file
37
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_linux.go
generated
vendored
Normal file
|
@ -0,0 +1,37 @@
|
|||
package userns
|
||||
|
||||
import (
|
||||
"sync"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/user"
|
||||
)
|
||||
|
||||
var (
|
||||
inUserNS bool
|
||||
nsOnce sync.Once
|
||||
)
|
||||
|
||||
// runningInUserNS detects whether we are currently running in a user namespace.
|
||||
// Originally copied from github.com/lxc/lxd/shared/util.go
|
||||
func runningInUserNS() bool {
|
||||
nsOnce.Do(func() {
|
||||
uidmap, err := user.CurrentProcessUIDMap()
|
||||
if err != nil {
|
||||
// This kernel-provided file only exists if user namespaces are supported
|
||||
return
|
||||
}
|
||||
inUserNS = uidMapInUserNS(uidmap)
|
||||
})
|
||||
return inUserNS
|
||||
}
|
||||
|
||||
func uidMapInUserNS(uidmap []user.IDMap) bool {
|
||||
/*
|
||||
* We assume we are in the initial user namespace if we have a full
|
||||
* range - 4294967295 uids starting at uid 0.
|
||||
*/
|
||||
if len(uidmap) == 1 && uidmap[0].ID == 0 && uidmap[0].ParentID == 0 && uidmap[0].Count == 4294967295 {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}
|
17
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go
generated
vendored
Normal file
17
vendor/github.com/opencontainers/runc/libcontainer/userns/userns_unsupported.go
generated
vendored
Normal file
|
@ -0,0 +1,17 @@
|
|||
// +build !linux
|
||||
|
||||
package userns
|
||||
|
||||
import "github.com/opencontainers/runc/libcontainer/user"
|
||||
|
||||
// runningInUserNS is a stub for non-Linux systems
|
||||
// Always returns false
|
||||
func runningInUserNS() bool {
|
||||
return false
|
||||
}
|
||||
|
||||
// uidMapInUserNS is a stub for non-Linux systems
|
||||
// Always returns false
|
||||
func uidMapInUserNS(uidmap []user.IDMap) bool {
|
||||
return false
|
||||
}
|
Loading…
Add table
Reference in a new issue