mirror of
https://github.com/moby/moby.git
synced 2022-11-09 12:21:53 -05:00
Merge pull request #16244 from calavera/libcontainer_0_0_4
Vendor libcontainer v0.0.4
This commit is contained in:
commit
dac92a8afb
28 changed files with 525 additions and 644 deletions
|
@ -1112,12 +1112,9 @@ func (container *Container) unmountVolumes(forceSyscall bool) error {
|
|||
|
||||
func (container *Container) networkMounts() []execdriver.Mount {
|
||||
var mounts []execdriver.Mount
|
||||
mode := "Z"
|
||||
if container.hostConfig.NetworkMode.IsContainer() {
|
||||
mode = "z"
|
||||
}
|
||||
shared := container.hostConfig.NetworkMode.IsContainer()
|
||||
if container.ResolvConfPath != "" {
|
||||
label.Relabel(container.ResolvConfPath, container.MountLabel, mode)
|
||||
label.Relabel(container.ResolvConfPath, container.MountLabel, shared)
|
||||
writable := !container.hostConfig.ReadonlyRootfs
|
||||
if m, exists := container.MountPoints["/etc/resolv.conf"]; exists {
|
||||
writable = m.RW
|
||||
|
@ -1130,7 +1127,7 @@ func (container *Container) networkMounts() []execdriver.Mount {
|
|||
})
|
||||
}
|
||||
if container.HostnamePath != "" {
|
||||
label.Relabel(container.HostnamePath, container.MountLabel, mode)
|
||||
label.Relabel(container.HostnamePath, container.MountLabel, shared)
|
||||
writable := !container.hostConfig.ReadonlyRootfs
|
||||
if m, exists := container.MountPoints["/etc/hostname"]; exists {
|
||||
writable = m.RW
|
||||
|
@ -1143,7 +1140,7 @@ func (container *Container) networkMounts() []execdriver.Mount {
|
|||
})
|
||||
}
|
||||
if container.HostsPath != "" {
|
||||
label.Relabel(container.HostsPath, container.MountLabel, mode)
|
||||
label.Relabel(container.HostsPath, container.MountLabel, shared)
|
||||
writable := !container.hostConfig.ReadonlyRootfs
|
||||
if m, exists := container.MountPoints["/etc/hosts"]; exists {
|
||||
writable = m.RW
|
||||
|
|
|
@ -59,7 +59,7 @@ func createContainerPlatformSpecificSettings(container *Container, config *runco
|
|||
return err
|
||||
}
|
||||
|
||||
if err := label.Relabel(v.Path(), container.MountLabel, "z"); err != nil {
|
||||
if err := label.Relabel(v.Path(), container.MountLabel, true); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
|
|
@ -355,7 +355,8 @@ func (daemon *Daemon) registerMountPoints(container *Container, hostConfig *runc
|
|||
}
|
||||
}
|
||||
|
||||
if err := label.Relabel(bind.Source, container.MountLabel, bind.Mode); err != nil {
|
||||
shared := label.IsShared(bind.Mode)
|
||||
if err := label.Relabel(bind.Source, container.MountLabel, shared); err != nil {
|
||||
return err
|
||||
}
|
||||
binds[bind.Destination] = true
|
||||
|
|
|
@ -42,7 +42,7 @@ clone git github.com/endophage/gotuf 9bcdad0308e34a49f38448b8ad436ad8860825ce
|
|||
clone git github.com/jfrazelle/go 6e461eb70cb4187b41a84e9a567d7137bdbe0f16
|
||||
clone git github.com/agl/ed25519 d2b94fd789ea21d12fac1a4443dd3a3f79cda72c
|
||||
|
||||
clone git github.com/opencontainers/runc v0.0.3 # libcontainer
|
||||
clone git github.com/opencontainers/runc v0.0.4 # libcontainer
|
||||
# libcontainer deps (see src/github.com/docker/libcontainer/update-vendor.sh)
|
||||
clone git github.com/coreos/go-systemd v3
|
||||
clone git github.com/godbus/dbus v2
|
||||
|
|
|
@ -83,7 +83,7 @@ type data struct {
|
|||
pid int
|
||||
}
|
||||
|
||||
func (m *Manager) Apply(pid int) error {
|
||||
func (m *Manager) Apply(pid int) (err error) {
|
||||
if m.Cgroups == nil {
|
||||
return nil
|
||||
}
|
||||
|
@ -235,12 +235,12 @@ func getCgroupData(c *configs.Cgroup, pid int) (*data, error) {
|
|||
}, nil
|
||||
}
|
||||
|
||||
func (raw *data) parent(subsystem, mountpoint, src string) (string, error) {
|
||||
initPath, err := cgroups.GetInitCgroupDir(subsystem)
|
||||
func (raw *data) parent(subsystem, mountpoint, root string) (string, error) {
|
||||
initPath, err := cgroups.GetThisCgroupDir(subsystem)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
relDir, err := filepath.Rel(src, initPath)
|
||||
relDir, err := filepath.Rel(root, initPath)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
@ -248,7 +248,7 @@ func (raw *data) parent(subsystem, mountpoint, src string) (string, error) {
|
|||
}
|
||||
|
||||
func (raw *data) path(subsystem string) (string, error) {
|
||||
mnt, src, err := cgroups.FindCgroupMountpointAndSource(subsystem)
|
||||
mnt, root, err := cgroups.FindCgroupMountpointAndRoot(subsystem)
|
||||
// If we didn't mount the subsystem, there is no point we make the path.
|
||||
if err != nil {
|
||||
return "", err
|
||||
|
@ -259,7 +259,7 @@ func (raw *data) path(subsystem string) (string, error) {
|
|||
return filepath.Join(raw.root, filepath.Base(mnt), raw.cgroup), nil
|
||||
}
|
||||
|
||||
parent, err := raw.parent(subsystem, mnt, src)
|
||||
parent, err := raw.parent(subsystem, mnt, root)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
|
|
@ -17,7 +17,7 @@ import (
|
|||
type MemoryGroup struct {
|
||||
}
|
||||
|
||||
func (s *MemoryGroup) Apply(d *data) error {
|
||||
func (s *MemoryGroup) Apply(d *data) (err error) {
|
||||
path, err := d.path("memory")
|
||||
if err != nil {
|
||||
if cgroups.IsNotFound(err) {
|
||||
|
@ -28,21 +28,22 @@ func (s *MemoryGroup) Apply(d *data) error {
|
|||
if err := os.MkdirAll(path, 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
defer func() {
|
||||
if err != nil {
|
||||
os.RemoveAll(path)
|
||||
}
|
||||
}()
|
||||
|
||||
if err := s.Set(path, d.c); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// We need to join memory cgroup after set memory limits, because
|
||||
// kmem.limit_in_bytes can only be set when the cgroup is empty.
|
||||
_, err = d.join("memory")
|
||||
if err != nil {
|
||||
if _, err = d.join("memory"); err != nil {
|
||||
return err
|
||||
}
|
||||
defer func() {
|
||||
if err != nil {
|
||||
os.RemoveAll(path)
|
||||
}
|
||||
}()
|
||||
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -21,6 +21,9 @@ const cgroupNamePrefix = "name="
|
|||
|
||||
// https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt
|
||||
func FindCgroupMountpoint(subsystem string) (string, error) {
|
||||
// We are not using mount.GetMounts() because it's super-inefficient,
|
||||
// parsing it directly sped up x10 times because of not using Sscanf.
|
||||
// It was one of two major performance drawbacks in container start.
|
||||
f, err := os.Open("/proc/self/mountinfo")
|
||||
if err != nil {
|
||||
return "", err
|
||||
|
@ -44,7 +47,7 @@ func FindCgroupMountpoint(subsystem string) (string, error) {
|
|||
return "", NewNotFoundError(subsystem)
|
||||
}
|
||||
|
||||
func FindCgroupMountpointAndSource(subsystem string) (string, string, error) {
|
||||
func FindCgroupMountpointAndRoot(subsystem string) (string, string, error) {
|
||||
f, err := os.Open("/proc/self/mountinfo")
|
||||
if err != nil {
|
||||
return "", "", err
|
||||
|
@ -69,15 +72,28 @@ func FindCgroupMountpointAndSource(subsystem string) (string, string, error) {
|
|||
}
|
||||
|
||||
func FindCgroupMountpointDir() (string, error) {
|
||||
mounts, err := mount.GetMounts()
|
||||
f, err := os.Open("/proc/self/mountinfo")
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
for _, mount := range mounts {
|
||||
if mount.Fstype == "cgroup" {
|
||||
return filepath.Dir(mount.Mountpoint), nil
|
||||
scanner := bufio.NewScanner(f)
|
||||
for scanner.Scan() {
|
||||
text := scanner.Text()
|
||||
fields := strings.Split(text, " ")
|
||||
// Safe as mountinfo encodes mountpoints with spaces as \040.
|
||||
index := strings.Index(text, " - ")
|
||||
postSeparatorFields := strings.Fields(text[index+3:])
|
||||
if len(postSeparatorFields) < 3 {
|
||||
return "", fmt.Errorf("Error found less than 3 fields post '-' in %q", text)
|
||||
}
|
||||
if postSeparatorFields[0] == "cgroup" {
|
||||
return filepath.Dir(fields[4]), nil
|
||||
}
|
||||
}
|
||||
if err := scanner.Err(); err != nil {
|
||||
return "", err
|
||||
}
|
||||
|
||||
return "", NewNotFoundError("cgroup")
|
||||
|
|
|
@ -1,5 +1,11 @@
|
|||
package configs
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"encoding/json"
|
||||
"os/exec"
|
||||
)
|
||||
|
||||
type Rlimit struct {
|
||||
Type int `json:"type"`
|
||||
Hard uint64 `json:"hard"`
|
||||
|
@ -13,36 +19,46 @@ type IDMap struct {
|
|||
Size int `json:"size"`
|
||||
}
|
||||
|
||||
// Seccomp represents syscall restrictions
|
||||
type Seccomp struct {
|
||||
Syscalls []*Syscall `json:"syscalls"`
|
||||
DefaultAction Action `json:"default_action"`
|
||||
Syscalls []*Syscall `json:"syscalls"`
|
||||
}
|
||||
|
||||
// An action to be taken upon rule match in Seccomp
|
||||
type Action int
|
||||
|
||||
const (
|
||||
Kill Action = iota - 3
|
||||
Kill Action = iota - 4
|
||||
Errno
|
||||
Trap
|
||||
Allow
|
||||
)
|
||||
|
||||
// A comparison operator to be used when matching syscall arguments in Seccomp
|
||||
type Operator int
|
||||
|
||||
const (
|
||||
EqualTo Operator = iota
|
||||
NotEqualTo
|
||||
GreatherThan
|
||||
GreaterThan
|
||||
GreaterThanOrEqualTo
|
||||
LessThan
|
||||
LessThanOrEqualTo
|
||||
MaskEqualTo
|
||||
)
|
||||
|
||||
// A rule to match a specific syscall argument in Seccomp
|
||||
type Arg struct {
|
||||
Index int `json:"index"`
|
||||
Value uint32 `json:"value"`
|
||||
Op Operator `json:"op"`
|
||||
Index uint `json:"index"`
|
||||
Value uint64 `json:"value"`
|
||||
ValueTwo uint64 `json:"value_two"`
|
||||
Op Operator `json:"op"`
|
||||
}
|
||||
|
||||
// An rule to match a syscall in Seccomp
|
||||
type Syscall struct {
|
||||
Value int `json:"value"`
|
||||
Name string `json:"name"`
|
||||
Action Action `json:"action"`
|
||||
Args []*Arg `json:"args"`
|
||||
}
|
||||
|
@ -117,6 +133,12 @@ type Config struct {
|
|||
// If Rlimits are not set, the container will inherit rlimits from the parent process
|
||||
Rlimits []Rlimit `json:"rlimits"`
|
||||
|
||||
// OomScoreAdj specifies the adjustment to be made by the kernel when calculating oom scores
|
||||
// for a process. Valid values are between the range [-1000, '1000'], where processes with
|
||||
// higher scores are preferred for being killed.
|
||||
// More information about kernel oom score calculation here: https://lwn.net/Articles/317814/
|
||||
OomScoreAdj int `json:"oom_score_adj"`
|
||||
|
||||
// AdditionalGroups specifies the gids that should be added to supplementary groups
|
||||
// in addition to those that the user belongs to.
|
||||
AdditionalGroups []string `json:"additional_groups"`
|
||||
|
@ -140,7 +162,79 @@ type Config struct {
|
|||
Sysctl map[string]string `json:"sysctl"`
|
||||
|
||||
// Seccomp allows actions to be taken whenever a syscall is made within the container.
|
||||
// By default, all syscalls are allowed with actions to allow, trap, kill, or return an errno
|
||||
// can be specified on a per syscall basis.
|
||||
// A number of rules are given, each having an action to be taken if a syscall matches it.
|
||||
// A default action to be taken if no rules match is also given.
|
||||
Seccomp *Seccomp `json:"seccomp"`
|
||||
|
||||
// Hooks are a collection of actions to perform at various container lifecycle events.
|
||||
// Hooks are not able to be marshaled to json but they are also not needed to.
|
||||
Hooks *Hooks `json:"-"`
|
||||
}
|
||||
|
||||
type Hooks struct {
|
||||
// Prestart commands are executed after the container namespaces are created,
|
||||
// but before the user supplied command is executed from init.
|
||||
Prestart []Hook
|
||||
|
||||
// Poststop commands are executed after the container init process exits.
|
||||
Poststop []Hook
|
||||
}
|
||||
|
||||
// HookState is the payload provided to a hook on execution.
|
||||
type HookState struct {
|
||||
ID string `json:"id"`
|
||||
Pid int `json:"pid"`
|
||||
Root string `json:"root"`
|
||||
}
|
||||
|
||||
type Hook interface {
|
||||
// Run executes the hook with the provided state.
|
||||
Run(HookState) error
|
||||
}
|
||||
|
||||
// NewFunctionHooks will call the provided function when the hook is run.
|
||||
func NewFunctionHook(f func(HookState) error) FuncHook {
|
||||
return FuncHook{
|
||||
run: f,
|
||||
}
|
||||
}
|
||||
|
||||
type FuncHook struct {
|
||||
run func(HookState) error
|
||||
}
|
||||
|
||||
func (f FuncHook) Run(s HookState) error {
|
||||
return f.run(s)
|
||||
}
|
||||
|
||||
type Command struct {
|
||||
Path string `json:"path"`
|
||||
Args []string `json:"args"`
|
||||
Env []string `json:"env"`
|
||||
Dir string `json:"dir"`
|
||||
}
|
||||
|
||||
// NewCommandHooks will execute the provided command when the hook is run.
|
||||
func NewCommandHook(cmd Command) CommandHook {
|
||||
return CommandHook{
|
||||
Command: cmd,
|
||||
}
|
||||
}
|
||||
|
||||
type CommandHook struct {
|
||||
Command
|
||||
}
|
||||
|
||||
func (c Command) Run(s HookState) error {
|
||||
b, err := json.Marshal(s)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
cmd := exec.Cmd{
|
||||
Path: c.Path,
|
||||
Args: c.Args,
|
||||
Env: c.Env,
|
||||
Stdin: bytes.NewReader(b),
|
||||
}
|
||||
return cmd.Run()
|
||||
}
|
||||
|
|
|
@ -25,10 +25,3 @@ type Mount struct {
|
|||
// Optional Command to be run after Source is mounted.
|
||||
PostmountCmds []Command `json:"postmount_cmds"`
|
||||
}
|
||||
|
||||
type Command struct {
|
||||
Path string `json:"path"`
|
||||
Args []string `json:"args"`
|
||||
Env []string `json:"env"`
|
||||
Dir string `json:"dir"`
|
||||
}
|
||||
|
|
|
@ -185,6 +185,7 @@ func (c *linuxContainer) newInitProcess(p *Process, cmd *exec.Cmd, parentPipe, c
|
|||
parentPipe: parentPipe,
|
||||
manager: c.cgroupManager,
|
||||
config: c.newInitConfig(p),
|
||||
container: c,
|
||||
}, nil
|
||||
}
|
||||
|
||||
|
@ -247,6 +248,17 @@ func (c *linuxContainer) Destroy() error {
|
|||
err = rerr
|
||||
}
|
||||
c.initProcess = nil
|
||||
if c.config.Hooks != nil {
|
||||
s := configs.HookState{
|
||||
ID: c.id,
|
||||
Root: c.config.Rootfs,
|
||||
}
|
||||
for _, hook := range c.config.Hooks.Poststop {
|
||||
if err := hook.Run(s); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -299,7 +311,7 @@ func (c *linuxContainer) checkCriuVersion() error {
|
|||
return nil
|
||||
}
|
||||
|
||||
const descriptors_filename = "descriptors.json"
|
||||
const descriptorsFilename = "descriptors.json"
|
||||
|
||||
func (c *linuxContainer) addCriuDumpMount(req *criurpc.CriuReq, m *configs.Mount) {
|
||||
mountDest := m.Destination
|
||||
|
@ -406,7 +418,7 @@ func (c *linuxContainer) Checkpoint(criuOpts *CriuOpts) error {
|
|||
return err
|
||||
}
|
||||
|
||||
err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptors_filename), fdsJSON, 0655)
|
||||
err = ioutil.WriteFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename), fdsJSON, 0655)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -532,13 +544,19 @@ func (c *linuxContainer) Restore(process *Process, criuOpts *CriuOpts) error {
|
|||
break
|
||||
}
|
||||
}
|
||||
for _, i := range criuOpts.VethPairs {
|
||||
veth := new(criurpc.CriuVethPair)
|
||||
veth.IfOut = proto.String(i.HostInterfaceName)
|
||||
veth.IfIn = proto.String(i.ContainerInterfaceName)
|
||||
req.Opts.Veths = append(req.Opts.Veths, veth)
|
||||
}
|
||||
|
||||
var (
|
||||
fds []string
|
||||
fdJSON []byte
|
||||
)
|
||||
|
||||
if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptors_filename)); err != nil {
|
||||
if fdJSON, err = ioutil.ReadFile(filepath.Join(criuOpts.ImagesDirectory, descriptorsFilename)); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
|
@ -568,6 +586,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
|
|||
return err
|
||||
}
|
||||
|
||||
logPath := filepath.Join(opts.WorkDirectory, req.GetOpts().GetLogFile())
|
||||
criuClient := os.NewFile(uintptr(fds[0]), "criu-transport-client")
|
||||
criuServer := os.NewFile(uintptr(fds[1]), "criu-transport-server")
|
||||
defer criuClient.Close()
|
||||
|
@ -631,7 +650,8 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
|
|||
return err
|
||||
}
|
||||
if !resp.GetSuccess() {
|
||||
return fmt.Errorf("criu failed: type %s errno %d", req.GetType().String(), resp.GetCrErrno())
|
||||
typeString := req.GetType().String()
|
||||
return fmt.Errorf("criu failed: type %s errno %d\nlog file: %s", typeString, resp.GetCrErrno(), logPath)
|
||||
}
|
||||
|
||||
t := resp.GetType()
|
||||
|
@ -671,7 +691,7 @@ func (c *linuxContainer) criuSwrk(process *Process, req *criurpc.CriuReq, opts *
|
|||
return err
|
||||
}
|
||||
if !st.Success() {
|
||||
return fmt.Errorf("criu failed: %s", st.String())
|
||||
return fmt.Errorf("criu failed: %s\nlog file: %s", st.String(), logPath)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -5,6 +5,11 @@ type CriuPageServerInfo struct {
|
|||
Port int32 // port number of CRIU page server
|
||||
}
|
||||
|
||||
type VethPairName struct {
|
||||
ContainerInterfaceName string
|
||||
HostInterfaceName string
|
||||
}
|
||||
|
||||
type CriuOpts struct {
|
||||
ImagesDirectory string // directory for storing image files
|
||||
WorkDirectory string // directory to cd and write logs/pidfiles/stats to
|
||||
|
@ -14,4 +19,5 @@ type CriuOpts struct {
|
|||
ShellJob bool // allow to dump and restore shell jobs
|
||||
FileLocks bool // handle file locks, for safety
|
||||
PageServer CriuPageServerInfo // allow to dump to criu page server
|
||||
VethPairs []VethPairName // pass the veth to criu when restore
|
||||
}
|
||||
|
|
|
@ -5,7 +5,9 @@ package libcontainer
|
|||
import (
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io/ioutil"
|
||||
"os"
|
||||
"strconv"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
||||
|
@ -13,7 +15,6 @@ import (
|
|||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/opencontainers/runc/libcontainer/netlink"
|
||||
"github.com/opencontainers/runc/libcontainer/seccomp"
|
||||
"github.com/opencontainers/runc/libcontainer/system"
|
||||
"github.com/opencontainers/runc/libcontainer/user"
|
||||
"github.com/opencontainers/runc/libcontainer/utils"
|
||||
|
@ -239,6 +240,11 @@ func setupRlimits(config *configs.Config) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func setOomScoreAdj(oomScoreAdj int) error {
|
||||
path := "/proc/self/oom_score_adj"
|
||||
return ioutil.WriteFile(path, []byte(strconv.Itoa(oomScoreAdj)), 0700)
|
||||
}
|
||||
|
||||
// killCgroupProcesses freezes then iterates over all the processes inside the
|
||||
// manager's cgroups sending a SIGKILL to each process then waiting for them to
|
||||
// exit.
|
||||
|
@ -270,61 +276,3 @@ func killCgroupProcesses(m cgroups.Manager) error {
|
|||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func finalizeSeccomp(config *initConfig) error {
|
||||
if config.Config.Seccomp == nil {
|
||||
return nil
|
||||
}
|
||||
context := seccomp.New()
|
||||
for _, s := range config.Config.Seccomp.Syscalls {
|
||||
ss := &seccomp.Syscall{
|
||||
Value: uint32(s.Value),
|
||||
Action: seccompAction(s.Action),
|
||||
}
|
||||
if len(s.Args) > 0 {
|
||||
ss.Args = seccompArgs(s.Args)
|
||||
}
|
||||
context.Add(ss)
|
||||
}
|
||||
return context.Load()
|
||||
}
|
||||
|
||||
func seccompAction(a configs.Action) seccomp.Action {
|
||||
switch a {
|
||||
case configs.Kill:
|
||||
return seccomp.Kill
|
||||
case configs.Trap:
|
||||
return seccomp.Trap
|
||||
case configs.Allow:
|
||||
return seccomp.Allow
|
||||
}
|
||||
return seccomp.Error(syscall.Errno(int(a)))
|
||||
}
|
||||
|
||||
func seccompArgs(args []*configs.Arg) seccomp.Args {
|
||||
var sa []seccomp.Arg
|
||||
for _, a := range args {
|
||||
sa = append(sa, seccomp.Arg{
|
||||
Index: uint32(a.Index),
|
||||
Op: seccompOperator(a.Op),
|
||||
Value: uint(a.Value),
|
||||
})
|
||||
}
|
||||
return seccomp.Args{sa}
|
||||
}
|
||||
|
||||
func seccompOperator(o configs.Operator) seccomp.Operator {
|
||||
switch o {
|
||||
case configs.EqualTo:
|
||||
return seccomp.EqualTo
|
||||
case configs.NotEqualTo:
|
||||
return seccomp.NotEqualTo
|
||||
case configs.GreatherThan:
|
||||
return seccomp.GreatherThan
|
||||
case configs.LessThan:
|
||||
return seccomp.LessThan
|
||||
case configs.MaskEqualTo:
|
||||
return seccomp.MaskEqualTo
|
||||
}
|
||||
return 0
|
||||
}
|
||||
|
|
|
@ -29,7 +29,7 @@ func SetFileCreateLabel(fileLabel string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
func Relabel(path string, fileLabel string, relabel string) error {
|
||||
func Relabel(path string, fileLabel string, shared bool) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
|
@ -59,3 +59,13 @@ func DupSecOpt(src string) []string {
|
|||
func DisableSecOpt() []string {
|
||||
return nil
|
||||
}
|
||||
|
||||
// Validate checks that the label does not include unexpected options
|
||||
func Validate(label string) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsShared checks that the label includes a "shared" mark
|
||||
func IsShared(label string) bool {
|
||||
return false
|
||||
}
|
||||
|
|
|
@ -9,6 +9,8 @@ import (
|
|||
"github.com/opencontainers/runc/libcontainer/selinux"
|
||||
)
|
||||
|
||||
var ErrIncompatibleLabel = fmt.Errorf("Bad SELinux option z and Z can not be used together")
|
||||
|
||||
// InitLabels returns the process label and file labels to be used within
|
||||
// the container. A list of options can be passed into this function to alter
|
||||
// the labels. The labels returned will include a random MCS String, that is
|
||||
|
@ -95,28 +97,24 @@ func SetFileCreateLabel(fileLabel string) error {
|
|||
return nil
|
||||
}
|
||||
|
||||
// Change the label of path to the filelabel string. If the relabel string
|
||||
// is "z", relabel will change the MCS label to s0. This will allow all
|
||||
// containers to share the content. If the relabel string is a "Z" then
|
||||
// the MCS label should continue to be used. SELinux will use this field
|
||||
// to make sure the content can not be shared by other containes.
|
||||
func Relabel(path string, fileLabel string, relabel string) error {
|
||||
exclude_path := []string{"/", "/usr", "/etc"}
|
||||
// Change the label of path to the filelabel string.
|
||||
// It changes the MCS label to s0 if shared is true.
|
||||
// This will allow all containers to share the content.
|
||||
func Relabel(path string, fileLabel string, shared bool) error {
|
||||
if !selinux.SelinuxEnabled() {
|
||||
return nil
|
||||
}
|
||||
|
||||
if fileLabel == "" {
|
||||
return nil
|
||||
}
|
||||
if !strings.ContainsAny(relabel, "zZ") {
|
||||
return nil
|
||||
|
||||
exclude_paths := map[string]bool{"/": true, "/usr": true, "/etc": true}
|
||||
if exclude_paths[path] {
|
||||
return fmt.Errorf("Relabeling of %s is not allowed", path)
|
||||
}
|
||||
for _, p := range exclude_path {
|
||||
if path == p {
|
||||
return fmt.Errorf("Relabeling of %s is not allowed", path)
|
||||
}
|
||||
}
|
||||
if strings.Contains(relabel, "z") && strings.Contains(relabel, "Z") {
|
||||
return fmt.Errorf("Bad SELinux option z and Z can not be used together")
|
||||
}
|
||||
if strings.Contains(relabel, "z") {
|
||||
|
||||
if shared {
|
||||
c := selinux.NewContext(fileLabel)
|
||||
c["level"] = "s0"
|
||||
fileLabel = c.Get()
|
||||
|
@ -161,3 +159,16 @@ func DupSecOpt(src string) []string {
|
|||
func DisableSecOpt() []string {
|
||||
return selinux.DisableSecOpt()
|
||||
}
|
||||
|
||||
// Validate checks that the label does not include unexpected options
|
||||
func Validate(label string) error {
|
||||
if strings.Contains(label, "z") && strings.Contains(label, "Z") {
|
||||
return ErrIncompatibleLabel
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// IsShared checks that the label includes a "shared" mark
|
||||
func IsShared(label string) bool {
|
||||
return strings.Contains(label, "z")
|
||||
}
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// +build arm ppc64
|
||||
// +build arm ppc64 ppc64le
|
||||
|
||||
package netlink
|
||||
|
||||
|
|
|
@ -1,4 +1,4 @@
|
|||
// +build !arm,!ppc64
|
||||
// +build !arm,!ppc64,!ppc64le
|
||||
|
||||
package netlink
|
||||
|
||||
|
|
|
@ -13,6 +13,7 @@ import (
|
|||
"syscall"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/cgroups"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/opencontainers/runc/libcontainer/system"
|
||||
)
|
||||
|
||||
|
@ -138,11 +139,9 @@ func (p *setnsProcess) terminate() error {
|
|||
|
||||
func (p *setnsProcess) wait() (*os.ProcessState, error) {
|
||||
err := p.cmd.Wait()
|
||||
if err != nil {
|
||||
return p.cmd.ProcessState, err
|
||||
}
|
||||
|
||||
return p.cmd.ProcessState, nil
|
||||
// Return actual ProcessState even on Wait error
|
||||
return p.cmd.ProcessState, err
|
||||
}
|
||||
|
||||
func (p *setnsProcess) pid() int {
|
||||
|
@ -175,9 +174,9 @@ func (p *initProcess) externalDescriptors() []string {
|
|||
return p.fds
|
||||
}
|
||||
|
||||
func (p *initProcess) start() error {
|
||||
func (p *initProcess) start() (err error) {
|
||||
defer p.parentPipe.Close()
|
||||
err := p.cmd.Start()
|
||||
err = p.cmd.Start()
|
||||
p.childPipe.Close()
|
||||
if err != nil {
|
||||
return newSystemError(err)
|
||||
|
@ -202,6 +201,18 @@ func (p *initProcess) start() error {
|
|||
p.manager.Destroy()
|
||||
}
|
||||
}()
|
||||
if p.config.Config.Hooks != nil {
|
||||
s := configs.HookState{
|
||||
ID: p.container.id,
|
||||
Pid: p.pid(),
|
||||
Root: p.config.Config.Rootfs,
|
||||
}
|
||||
for _, hook := range p.config.Config.Hooks.Prestart {
|
||||
if err := hook.Run(s); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
if err := p.createNetworkInterfaces(); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
|
@ -286,9 +297,7 @@ func (p *initProcess) setExternalDescriptors(newFds []string) {
|
|||
}
|
||||
|
||||
func getPipeFds(pid int) ([]string, error) {
|
||||
var fds []string
|
||||
|
||||
fds = make([]string, 3)
|
||||
fds := make([]string, 3)
|
||||
|
||||
dirPath := filepath.Join("/proc", strconv.Itoa(pid), "/fd")
|
||||
for i := 0; i < 3; i++ {
|
||||
|
|
|
@ -27,6 +27,8 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
|
|||
if err := prepareRoot(config); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
|
||||
setupDev := len(config.Devices) == 0
|
||||
for _, m := range config.Mounts {
|
||||
for _, precmd := range m.PremountCmds {
|
||||
if err := mountCmd(precmd); err != nil {
|
||||
|
@ -43,14 +45,16 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
|
|||
}
|
||||
}
|
||||
}
|
||||
if err := createDevices(config); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
if err := setupPtmx(config, console); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
if err := setupDevSymlinks(config.Rootfs); err != nil {
|
||||
return newSystemError(err)
|
||||
if !setupDev {
|
||||
if err := createDevices(config); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
if err := setupPtmx(config, console); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
if err := setupDevSymlinks(config.Rootfs); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
}
|
||||
if err := syscall.Chdir(config.Rootfs); err != nil {
|
||||
return newSystemError(err)
|
||||
|
@ -63,8 +67,10 @@ func setupRootfs(config *configs.Config, console *linuxConsole) (err error) {
|
|||
if err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
if err := reOpenDevNull(config.Rootfs); err != nil {
|
||||
return newSystemError(err)
|
||||
if !setupDev {
|
||||
if err := reOpenDevNull(config.Rootfs); err != nil {
|
||||
return newSystemError(err)
|
||||
}
|
||||
}
|
||||
if config.Readonlyfs {
|
||||
if err := setReadonly(); err != nil {
|
||||
|
@ -131,6 +137,11 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
|
|||
return err
|
||||
}
|
||||
return syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data)
|
||||
case "securityfs":
|
||||
if err := os.MkdirAll(dest, 0755); err != nil {
|
||||
return err
|
||||
}
|
||||
return syscall.Mount(m.Source, dest, m.Device, uintptr(m.Flags), data)
|
||||
case "bind":
|
||||
stat, err := os.Stat(m.Source)
|
||||
if err != nil {
|
||||
|
@ -160,7 +171,11 @@ func mountToRootfs(m *configs.Mount, rootfs, mountLabel string) error {
|
|||
}
|
||||
}
|
||||
if m.Relabel != "" {
|
||||
if err := label.Relabel(m.Source, mountLabel, m.Relabel); err != nil {
|
||||
if err := label.Validate(m.Relabel); err != nil {
|
||||
return err
|
||||
}
|
||||
shared := label.IsShared(m.Relabel)
|
||||
if err := label.Relabel(m.Source, mountLabel, shared); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1,34 +0,0 @@
|
|||
// +build linux
|
||||
|
||||
package seccomp
|
||||
|
||||
import "strings"
|
||||
|
||||
type bpfLabel struct {
|
||||
label string
|
||||
location uint32
|
||||
}
|
||||
|
||||
type bpfLabels []bpfLabel
|
||||
|
||||
// labelIndex returns the index for the label if it exists in the slice.
|
||||
// if it does not exist in the slice it appends the label lb to the end
|
||||
// of the slice and returns the index.
|
||||
func labelIndex(labels *bpfLabels, lb string) uint32 {
|
||||
var id uint32
|
||||
for id = 0; id < uint32(len(*labels)); id++ {
|
||||
if strings.EqualFold(lb, (*labels)[id].label) {
|
||||
return id
|
||||
}
|
||||
}
|
||||
*labels = append(*labels, bpfLabel{lb, 0xffffffff})
|
||||
return id
|
||||
}
|
||||
|
||||
func scmpBpfStmt(code uint16, k uint32) sockFilter {
|
||||
return sockFilter{code, 0, 0, k}
|
||||
}
|
||||
|
||||
func scmpBpfJump(code uint16, k uint32, jt, jf uint8) sockFilter {
|
||||
return sockFilter{code, jt, jf, k}
|
||||
}
|
53
vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/config.go
vendored
Normal file
53
vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/config.go
vendored
Normal file
|
@ -0,0 +1,53 @@
|
|||
package seccomp
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
)
|
||||
|
||||
// ConvertStringToOperator converts a string into a Seccomp comparison operator.
|
||||
// Comparison operators use the names they are assigned by Libseccomp's header.
|
||||
// Attempting to convert a string that is not a valid operator results in an
|
||||
// error.
|
||||
func ConvertStringToOperator(in string) (configs.Operator, error) {
|
||||
switch in {
|
||||
case "SCMP_CMP_NE":
|
||||
return configs.NotEqualTo, nil
|
||||
case "SCMP_CMP_LT":
|
||||
return configs.LessThan, nil
|
||||
case "SCMP_CMP_LE":
|
||||
return configs.LessThanOrEqualTo, nil
|
||||
case "SCMP_CMP_EQ":
|
||||
return configs.EqualTo, nil
|
||||
case "SCMP_CMP_GE":
|
||||
return configs.GreaterThan, nil
|
||||
case "SCMP_CMP_GT":
|
||||
return configs.GreaterThanOrEqualTo, nil
|
||||
case "SCMP_CMP_MASKED_EQ":
|
||||
return configs.MaskEqualTo, nil
|
||||
default:
|
||||
return 0, fmt.Errorf("string %s is not a valid operator for seccomp", in)
|
||||
}
|
||||
}
|
||||
|
||||
// ConvertStringToAction converts a string into a Seccomp rule match action.
|
||||
// Actions use the named they are assigned in Libseccomp's header, though some
|
||||
// (notable, SCMP_ACT_TRACE) are not available in this implementation and will
|
||||
// return errors.
|
||||
// Attempting to convert a string that is not a valid action results in an
|
||||
// error.
|
||||
func ConvertStringToAction(in string) (configs.Action, error) {
|
||||
switch in {
|
||||
case "SCMP_ACT_KILL":
|
||||
return configs.Kill, nil
|
||||
case "SCMP_ACT_ERRNO":
|
||||
return configs.Errno, nil
|
||||
case "SCMP_ACT_TRAP":
|
||||
return configs.Trap, nil
|
||||
case "SCMP_ACT_ALLOW":
|
||||
return configs.Allow, nil
|
||||
default:
|
||||
return 0, fmt.Errorf("string %s is not a valid action for seccomp", in)
|
||||
}
|
||||
}
|
|
@ -1,146 +0,0 @@
|
|||
// +build linux
|
||||
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"syscall"
|
||||
)
|
||||
|
||||
const labelTemplate = "lb-%d-%d"
|
||||
|
||||
// Action is the type of action that will be taken when a
|
||||
// syscall is performed.
|
||||
type Action int
|
||||
|
||||
const (
|
||||
Kill Action = iota - 3 // Kill the calling process of the syscall.
|
||||
Trap // Trap and coredump the calling process of the syscall.
|
||||
Allow // Allow the syscall to be completed.
|
||||
)
|
||||
|
||||
// Syscall is the specified syscall, action, and any type of arguments
|
||||
// to filter on.
|
||||
type Syscall struct {
|
||||
// Value is the syscall number.
|
||||
Value uint32
|
||||
// Action is the action to perform when the specified syscall is made.
|
||||
Action Action
|
||||
// Args are filters that can be specified on the arguments to the syscall.
|
||||
Args Args
|
||||
}
|
||||
|
||||
func (s *Syscall) scmpAction() uint32 {
|
||||
switch s.Action {
|
||||
case Allow:
|
||||
return retAllow
|
||||
case Trap:
|
||||
return retTrap
|
||||
case Kill:
|
||||
return retKill
|
||||
}
|
||||
return actionErrno(uint32(s.Action))
|
||||
}
|
||||
|
||||
// Arg represents an argument to the syscall with the argument's index,
|
||||
// the operator to apply when matching, and the argument's value at that time.
|
||||
type Arg struct {
|
||||
Index uint32 // index of args which start from zero
|
||||
Op Operator // operation, such as EQ/NE/GE/LE
|
||||
Value uint // the value of arg
|
||||
}
|
||||
|
||||
type Args [][]Arg
|
||||
|
||||
var (
|
||||
ErrUnresolvedLabel = errors.New("seccomp: unresolved label")
|
||||
ErrDuplicateLabel = errors.New("seccomp: duplicate label use")
|
||||
ErrUnsupportedOperation = errors.New("seccomp: unsupported operation for argument")
|
||||
)
|
||||
|
||||
// Error returns an Action that will be used to send the calling
|
||||
// process the specified errno when the syscall is made.
|
||||
func Error(code syscall.Errno) Action {
|
||||
return Action(code)
|
||||
}
|
||||
|
||||
// New returns a new syscall context for use.
|
||||
func New() *Context {
|
||||
return &Context{
|
||||
syscalls: make(map[uint32]*Syscall),
|
||||
}
|
||||
}
|
||||
|
||||
// Context holds syscalls for the current process to limit the type of
|
||||
// actions the calling process can make.
|
||||
type Context struct {
|
||||
syscalls map[uint32]*Syscall
|
||||
}
|
||||
|
||||
// Add will add the specified syscall, action, and arguments to the seccomp
|
||||
// Context.
|
||||
func (c *Context) Add(s *Syscall) {
|
||||
c.syscalls[s.Value] = s
|
||||
}
|
||||
|
||||
// Remove removes the specified syscall configuration from the Context.
|
||||
func (c *Context) Remove(call uint32) {
|
||||
delete(c.syscalls, call)
|
||||
}
|
||||
|
||||
// Load will apply the Context to the calling process makeing any secccomp process changes
|
||||
// apply after the context is loaded.
|
||||
func (c *Context) Load() error {
|
||||
filter, err := c.newFilter()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := prctl(prSetNoNewPrivileges, 1, 0, 0, 0); err != nil {
|
||||
return err
|
||||
}
|
||||
prog := newSockFprog(filter)
|
||||
return prog.set()
|
||||
}
|
||||
|
||||
func (c *Context) newFilter() ([]sockFilter, error) {
|
||||
var (
|
||||
labels bpfLabels
|
||||
f = newFilter()
|
||||
)
|
||||
for _, s := range c.syscalls {
|
||||
f.addSyscall(s, &labels)
|
||||
}
|
||||
f.allow()
|
||||
// process args for the syscalls
|
||||
for _, s := range c.syscalls {
|
||||
if err := f.addArguments(s, &labels); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
// apply labels for arguments
|
||||
idx := int32(len(*f) - 1)
|
||||
for ; idx >= 0; idx-- {
|
||||
lf := &(*f)[idx]
|
||||
if lf.code != (syscall.BPF_JMP + syscall.BPF_JA) {
|
||||
continue
|
||||
}
|
||||
rel := int32(lf.jt)<<8 | int32(lf.jf)
|
||||
if ((jumpJT << 8) | jumpJF) == rel {
|
||||
if labels[lf.k].location == 0xffffffff {
|
||||
return nil, ErrUnresolvedLabel
|
||||
}
|
||||
lf.k = labels[lf.k].location - uint32(idx+1)
|
||||
lf.jt = 0
|
||||
lf.jf = 0
|
||||
} else if ((labelJT << 8) | labelJF) == rel {
|
||||
if labels[lf.k].location != 0xffffffff {
|
||||
return nil, ErrDuplicateLabel
|
||||
}
|
||||
labels[lf.k].location = uint32(idx)
|
||||
lf.k = 0
|
||||
lf.jt = 0
|
||||
lf.jf = 0
|
||||
}
|
||||
}
|
||||
return *f, nil
|
||||
}
|
|
@ -1,118 +0,0 @@
|
|||
// +build linux
|
||||
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
type sockFilter struct {
|
||||
code uint16
|
||||
jt uint8
|
||||
jf uint8
|
||||
k uint32
|
||||
}
|
||||
|
||||
func newFilter() *filter {
|
||||
var f filter
|
||||
f = append(f, sockFilter{
|
||||
pfLD + syscall.BPF_W + syscall.BPF_ABS,
|
||||
0,
|
||||
0,
|
||||
uint32(unsafe.Offsetof(secData.nr)),
|
||||
})
|
||||
return &f
|
||||
}
|
||||
|
||||
type filter []sockFilter
|
||||
|
||||
func (f *filter) addSyscall(s *Syscall, labels *bpfLabels) {
|
||||
if len(s.Args) == 0 {
|
||||
f.call(s.Value, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, s.scmpAction()))
|
||||
} else {
|
||||
if len(s.Args[0]) > 0 {
|
||||
lb := fmt.Sprintf(labelTemplate, s.Value, s.Args[0][0].Index)
|
||||
f.call(s.Value,
|
||||
scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb),
|
||||
jumpJT, jumpJF))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (f *filter) addArguments(s *Syscall, labels *bpfLabels) error {
|
||||
for i := 0; len(s.Args) > i; i++ {
|
||||
if len(s.Args[i]) > 0 {
|
||||
lb := fmt.Sprintf(labelTemplate, s.Value, s.Args[i][0].Index)
|
||||
f.label(labels, lb)
|
||||
f.arg(s.Args[i][0].Index)
|
||||
}
|
||||
for j := 0; j < len(s.Args[i]); j++ {
|
||||
var jf sockFilter
|
||||
if len(s.Args)-1 > i && len(s.Args[i+1]) > 0 {
|
||||
lbj := fmt.Sprintf(labelTemplate, s.Value, s.Args[i+1][0].Index)
|
||||
jf = scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA,
|
||||
labelIndex(labels, lbj), jumpJT, jumpJF)
|
||||
} else {
|
||||
jf = scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, s.scmpAction())
|
||||
}
|
||||
if err := f.op(s.Args[i][j].Op, s.Args[i][j].Value, jf); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
f.allow()
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (f *filter) label(labels *bpfLabels, lb string) {
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb), labelJT, labelJF))
|
||||
}
|
||||
|
||||
func (f *filter) call(nr uint32, jt sockFilter) {
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, nr, 0, 1))
|
||||
*f = append(*f, jt)
|
||||
}
|
||||
|
||||
func (f *filter) allow() {
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, retAllow))
|
||||
}
|
||||
|
||||
func (f *filter) deny() {
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_RET+syscall.BPF_K, retTrap))
|
||||
}
|
||||
|
||||
func (f *filter) arg(index uint32) {
|
||||
arg(f, index)
|
||||
}
|
||||
|
||||
func (f *filter) op(operation Operator, v uint, jf sockFilter) error {
|
||||
switch operation {
|
||||
case EqualTo:
|
||||
jumpEqualTo(f, v, jf)
|
||||
case NotEqualTo:
|
||||
jumpNotEqualTo(f, v, jf)
|
||||
case GreatherThan:
|
||||
jumpGreaterThan(f, v, jf)
|
||||
case LessThan:
|
||||
jumpLessThan(f, v, jf)
|
||||
case MaskEqualTo:
|
||||
jumpMaskEqualTo(f, v, jf)
|
||||
default:
|
||||
return ErrUnsupportedOperation
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func arg(f *filter, idx uint32) {
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_W+syscall.BPF_ABS, endian.low(idx)))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_ST, 0))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_W+syscall.BPF_ABS, endian.hi(idx)))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_ST, 1))
|
||||
}
|
||||
|
||||
func jump(f *filter, labels *bpfLabels, lb string) {
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JA, labelIndex(labels, lb),
|
||||
jumpJT, jumpJF))
|
||||
}
|
|
@ -1,68 +0,0 @@
|
|||
// +build linux,amd64
|
||||
|
||||
package seccomp
|
||||
|
||||
// Using BPF filters
|
||||
//
|
||||
// ref: http://www.gsp.com/cgi-bin/man.cgi?topic=bpf
|
||||
import "syscall"
|
||||
|
||||
func jumpGreaterThan(f *filter, v uint, jt sockFilter) {
|
||||
lo := uint32(uint64(v) % 0x100000000)
|
||||
hi := uint32(uint64(v) / 0x100000000)
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (hi), 4, 0))
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 5))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0))
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGE+syscall.BPF_K, (lo), 0, 2))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
|
||||
*f = append(*f, jt)
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
|
||||
}
|
||||
|
||||
func jumpEqualTo(f *filter, v uint, jt sockFilter) {
|
||||
lo := uint32(uint64(v) % 0x100000000)
|
||||
hi := uint32(uint64(v) / 0x100000000)
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 5))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0))
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (lo), 0, 2))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
|
||||
*f = append(*f, jt)
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
|
||||
}
|
||||
|
||||
func jumpLessThan(f *filter, v uint, jt sockFilter) {
|
||||
lo := uint32(uint64(v) % 0x100000000)
|
||||
hi := uint32(uint64(v) / 0x100000000)
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (hi), 6, 0))
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, (hi), 0, 3))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0))
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JGT+syscall.BPF_K, (lo), 2, 0))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
|
||||
*f = append(*f, jt)
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
|
||||
}
|
||||
|
||||
func jumpNotEqualTo(f *filter, v uint, jt sockFilter) {
|
||||
lo := uint32(uint64(v) % 0x100000000)
|
||||
hi := uint32(uint64(v) / 0x100000000)
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, hi, 5, 0))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0))
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, lo, 2, 0))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
|
||||
*f = append(*f, jt)
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
|
||||
}
|
||||
|
||||
// this checks for a value inside a mask. The evalusation is equal to doing
|
||||
// CLONE_NEWUSER & syscallMask == CLONE_NEWUSER
|
||||
func jumpMaskEqualTo(f *filter, v uint, jt sockFilter) {
|
||||
lo := uint32(uint64(v) % 0x100000000)
|
||||
hi := uint32(uint64(v) / 0x100000000)
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, hi, 0, 6))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 0))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_ALU+syscall.BPF_AND, uint32(v)))
|
||||
*f = append(*f, scmpBpfJump(syscall.BPF_JMP+syscall.BPF_JEQ+syscall.BPF_K, lo, 0, 2))
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
|
||||
*f = append(*f, jt)
|
||||
*f = append(*f, scmpBpfStmt(syscall.BPF_LD+syscall.BPF_MEM, 1))
|
||||
}
|
165
vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go
vendored
Normal file
165
vendor/src/github.com/opencontainers/runc/libcontainer/seccomp/seccomp_linux.go
vendored
Normal file
|
@ -0,0 +1,165 @@
|
|||
// +build linux,cgo,seccomp
|
||||
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"log"
|
||||
"syscall"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
libseccomp "github.com/seccomp/libseccomp-golang"
|
||||
)
|
||||
|
||||
var (
|
||||
actAllow = libseccomp.ActAllow
|
||||
actTrap = libseccomp.ActTrap
|
||||
actKill = libseccomp.ActKill
|
||||
actErrno = libseccomp.ActErrno.SetReturnCode(int16(syscall.EPERM))
|
||||
)
|
||||
|
||||
// Filters given syscalls in a container, preventing them from being used
|
||||
// Started in the container init process, and carried over to all child processes
|
||||
// Setns calls, however, require a separate invocation, as they are not children
|
||||
// of the init until they join the namespace
|
||||
func InitSeccomp(config *configs.Seccomp) error {
|
||||
if config == nil {
|
||||
return fmt.Errorf("cannot initialize Seccomp - nil config passed")
|
||||
}
|
||||
|
||||
defaultAction, err := getAction(config.DefaultAction)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error initializing seccomp - invalid default action")
|
||||
}
|
||||
|
||||
filter, err := libseccomp.NewFilter(defaultAction)
|
||||
if err != nil {
|
||||
return fmt.Errorf("error creating filter: %s", err)
|
||||
}
|
||||
|
||||
// Unset no new privs bit
|
||||
if err := filter.SetNoNewPrivsBit(false); err != nil {
|
||||
return fmt.Errorf("error setting no new privileges: %s", err)
|
||||
}
|
||||
|
||||
// Add a rule for each syscall
|
||||
for _, call := range config.Syscalls {
|
||||
if call == nil {
|
||||
return fmt.Errorf("encountered nil syscall while initializing Seccomp")
|
||||
}
|
||||
|
||||
if err = matchCall(filter, call); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
if err = filter.Load(); err != nil {
|
||||
return fmt.Errorf("error loading seccomp filter into kernel: %s", err)
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
||||
|
||||
// Convert Libcontainer Action to Libseccomp ScmpAction
|
||||
func getAction(act configs.Action) (libseccomp.ScmpAction, error) {
|
||||
switch act {
|
||||
case configs.Kill:
|
||||
return actKill, nil
|
||||
case configs.Errno:
|
||||
return actErrno, nil
|
||||
case configs.Trap:
|
||||
return actTrap, nil
|
||||
case configs.Allow:
|
||||
return actAllow, nil
|
||||
default:
|
||||
return libseccomp.ActInvalid, fmt.Errorf("invalid action, cannot use in rule")
|
||||
}
|
||||
}
|
||||
|
||||
// Convert Libcontainer Operator to Libseccomp ScmpCompareOp
|
||||
func getOperator(op configs.Operator) (libseccomp.ScmpCompareOp, error) {
|
||||
switch op {
|
||||
case configs.EqualTo:
|
||||
return libseccomp.CompareEqual, nil
|
||||
case configs.NotEqualTo:
|
||||
return libseccomp.CompareNotEqual, nil
|
||||
case configs.GreaterThan:
|
||||
return libseccomp.CompareGreater, nil
|
||||
case configs.GreaterThanOrEqualTo:
|
||||
return libseccomp.CompareGreaterEqual, nil
|
||||
case configs.LessThan:
|
||||
return libseccomp.CompareLess, nil
|
||||
case configs.LessThanOrEqualTo:
|
||||
return libseccomp.CompareLessOrEqual, nil
|
||||
case configs.MaskEqualTo:
|
||||
return libseccomp.CompareMaskedEqual, nil
|
||||
default:
|
||||
return libseccomp.CompareInvalid, fmt.Errorf("invalid operator, cannot use in rule")
|
||||
}
|
||||
}
|
||||
|
||||
// Convert Libcontainer Arg to Libseccomp ScmpCondition
|
||||
func getCondition(arg *configs.Arg) (libseccomp.ScmpCondition, error) {
|
||||
cond := libseccomp.ScmpCondition{}
|
||||
|
||||
if arg == nil {
|
||||
return cond, fmt.Errorf("cannot convert nil to syscall condition")
|
||||
}
|
||||
|
||||
op, err := getOperator(arg.Op)
|
||||
if err != nil {
|
||||
return cond, err
|
||||
}
|
||||
|
||||
return libseccomp.MakeCondition(arg.Index, op, arg.Value, arg.ValueTwo)
|
||||
}
|
||||
|
||||
// Add a rule to match a single syscall
|
||||
func matchCall(filter *libseccomp.ScmpFilter, call *configs.Syscall) error {
|
||||
if call == nil || filter == nil {
|
||||
return fmt.Errorf("cannot use nil as syscall to block")
|
||||
}
|
||||
|
||||
if len(call.Name) == 0 {
|
||||
return fmt.Errorf("empty string is not a valid syscall")
|
||||
}
|
||||
|
||||
// If we can't resolve the syscall, assume it's not supported on this kernel
|
||||
// Ignore it, don't error out
|
||||
callNum, err := libseccomp.GetSyscallFromName(call.Name)
|
||||
if err != nil {
|
||||
log.Printf("Error resolving syscall name %s: %s - ignoring syscall.", call.Name, err)
|
||||
return nil
|
||||
}
|
||||
|
||||
// Convert the call's action to the libseccomp equivalent
|
||||
callAct, err := getAction(call.Action)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
// Unconditional match - just add the rule
|
||||
if len(call.Args) == 0 {
|
||||
if err = filter.AddRule(callNum, callAct); err != nil {
|
||||
return err
|
||||
}
|
||||
} else {
|
||||
// Conditional match - convert the per-arg rules into library format
|
||||
conditions := []libseccomp.ScmpCondition{}
|
||||
|
||||
for _, cond := range call.Args {
|
||||
newCond, err := getCondition(cond)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
conditions = append(conditions, newCond)
|
||||
}
|
||||
|
||||
if err = filter.AddRuleConditional(callNum, callAct, conditions); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
|
||||
return nil
|
||||
}
|
|
@ -1,124 +0,0 @@
|
|||
// +build linux
|
||||
|
||||
// Package seccomp provides native seccomp ( https://www.kernel.org/doc/Documentation/prctl/seccomp_filter.txt ) support for go.
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"syscall"
|
||||
"unsafe"
|
||||
)
|
||||
|
||||
// Operator that is used for argument comparison.
|
||||
type Operator int
|
||||
|
||||
const (
|
||||
EqualTo Operator = iota
|
||||
NotEqualTo
|
||||
GreatherThan
|
||||
LessThan
|
||||
MaskEqualTo
|
||||
)
|
||||
|
||||
const (
|
||||
jumpJT = 0xff
|
||||
jumpJF = 0xff
|
||||
labelJT = 0xfe
|
||||
labelJF = 0xfe
|
||||
)
|
||||
|
||||
const (
|
||||
pfLD = 0x0
|
||||
retKill = 0x00000000
|
||||
retTrap = 0x00030000
|
||||
retAllow = 0x7fff0000
|
||||
modeFilter = 0x2
|
||||
prSetNoNewPrivileges = 0x26
|
||||
)
|
||||
|
||||
func actionErrno(errno uint32) uint32 {
|
||||
return 0x00050000 | (errno & 0x0000ffff)
|
||||
}
|
||||
|
||||
var (
|
||||
secData = struct {
|
||||
nr int32
|
||||
arch uint32
|
||||
insPointer uint64
|
||||
args [6]uint64
|
||||
}{0, 0, 0, [6]uint64{0, 0, 0, 0, 0, 0}}
|
||||
)
|
||||
|
||||
var isLittle = func() bool {
|
||||
var (
|
||||
x = 0x1234
|
||||
p = unsafe.Pointer(&x)
|
||||
p2 = (*[unsafe.Sizeof(0)]byte)(p)
|
||||
)
|
||||
if p2[0] == 0 {
|
||||
return false
|
||||
}
|
||||
return true
|
||||
}()
|
||||
|
||||
var endian endianSupport
|
||||
|
||||
type endianSupport struct {
|
||||
}
|
||||
|
||||
func (e endianSupport) hi(i uint32) uint32 {
|
||||
if isLittle {
|
||||
return e.little(i)
|
||||
}
|
||||
return e.big(i)
|
||||
}
|
||||
|
||||
func (e endianSupport) low(i uint32) uint32 {
|
||||
if isLittle {
|
||||
return e.big(i)
|
||||
}
|
||||
return e.little(i)
|
||||
}
|
||||
|
||||
func (endianSupport) big(idx uint32) uint32 {
|
||||
if idx >= 6 {
|
||||
return 0
|
||||
}
|
||||
return uint32(unsafe.Offsetof(secData.args)) + 8*idx
|
||||
}
|
||||
|
||||
func (endianSupport) little(idx uint32) uint32 {
|
||||
if idx < 0 || idx >= 6 {
|
||||
return 0
|
||||
}
|
||||
return uint32(unsafe.Offsetof(secData.args)) +
|
||||
uint32(unsafe.Alignof(secData.args[0]))*idx + uint32(unsafe.Sizeof(secData.arch))
|
||||
}
|
||||
|
||||
func prctl(option int, arg2, arg3, arg4, arg5 uintptr) error {
|
||||
_, _, err := syscall.Syscall6(syscall.SYS_PRCTL, uintptr(option), arg2, arg3, arg4, arg5, 0)
|
||||
if err != 0 {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func newSockFprog(filter []sockFilter) *sockFprog {
|
||||
return &sockFprog{
|
||||
len: uint16(len(filter)),
|
||||
filt: filter,
|
||||
}
|
||||
}
|
||||
|
||||
type sockFprog struct {
|
||||
len uint16
|
||||
filt []sockFilter
|
||||
}
|
||||
|
||||
func (s *sockFprog) set() error {
|
||||
_, _, err := syscall.Syscall(syscall.SYS_PRCTL, uintptr(syscall.PR_SET_SECCOMP),
|
||||
uintptr(modeFilter), uintptr(unsafe.Pointer(s)))
|
||||
if err != 0 {
|
||||
return err
|
||||
}
|
||||
return nil
|
||||
}
|
|
@ -1,3 +1,19 @@
|
|||
// +build !linux
|
||||
// +build !linux !cgo !seccomp
|
||||
|
||||
package seccomp
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
)
|
||||
|
||||
var ErrSeccompNotEnabled = errors.New("seccomp: config provided but seccomp not supported")
|
||||
|
||||
// Seccomp not supported, do nothing
|
||||
func InitSeccomp(config *configs.Seccomp) error {
|
||||
if config != nil {
|
||||
return ErrSeccompNotEnabled
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
|
|
@ -7,6 +7,7 @@ import (
|
|||
|
||||
"github.com/opencontainers/runc/libcontainer/apparmor"
|
||||
"github.com/opencontainers/runc/libcontainer/label"
|
||||
"github.com/opencontainers/runc/libcontainer/seccomp"
|
||||
"github.com/opencontainers/runc/libcontainer/system"
|
||||
)
|
||||
|
||||
|
@ -20,6 +21,14 @@ func (l *linuxSetnsInit) Init() error {
|
|||
if err := setupRlimits(l.config.Config); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
|
||||
return err
|
||||
}
|
||||
if l.config.Config.Seccomp != nil {
|
||||
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := finalizeNamespace(l.config); err != nil {
|
||||
return err
|
||||
}
|
||||
|
|
|
@ -9,6 +9,7 @@ import (
|
|||
"github.com/opencontainers/runc/libcontainer/apparmor"
|
||||
"github.com/opencontainers/runc/libcontainer/configs"
|
||||
"github.com/opencontainers/runc/libcontainer/label"
|
||||
"github.com/opencontainers/runc/libcontainer/seccomp"
|
||||
"github.com/opencontainers/runc/libcontainer/system"
|
||||
)
|
||||
|
||||
|
@ -46,6 +47,10 @@ func (l *linuxStandardInit) Init() error {
|
|||
if err := setupRlimits(l.config.Config); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := setOomScoreAdj(l.config.Config.OomScoreAdj); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
label.Init()
|
||||
// InitializeMountNamespace() can be executed only for a new mount namespace
|
||||
if l.config.Config.Namespaces.Contains(configs.NEWNS) {
|
||||
|
@ -85,6 +90,11 @@ func (l *linuxStandardInit) Init() error {
|
|||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if l.config.Config.Seccomp != nil {
|
||||
if err := seccomp.InitSeccomp(l.config.Config.Seccomp); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if err := finalizeNamespace(l.config); err != nil {
|
||||
return err
|
||||
}
|
||||
|
@ -99,8 +109,5 @@ func (l *linuxStandardInit) Init() error {
|
|||
if syscall.Getppid() != l.parentPid {
|
||||
return syscall.Kill(syscall.Getpid(), syscall.SIGKILL)
|
||||
}
|
||||
if err := finalizeSeccomp(l.config); err != nil {
|
||||
return err
|
||||
}
|
||||
return system.Execv(l.config.Args[0], l.config.Args[0:], os.Environ())
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue