Merge pull request #38873 from thaJeztah/update_containerd_1.2.5

Update containerd v1.2.5, runc 2b18fe1d885ee5083ef9f0838fee39b62d653e30
2022-11-09 12:21:53 -05:00 · 2019-03-14 19:12:41 -07:00 · 2019-03-14 19:12:41 -07:00 · 05e7d000f2
commit 05e7d000f2
parent ca0b64ee3b 386b06eacd
24 changed files with 918 additions and 168 deletions
--- a/hack/dockerfile/install/containerd.installer
+++ b/hack/dockerfile/install/containerd.installer
@ -4,7 +4,7 @@
 # containerd is also pinned in vendor.conf. When updating the binary
 # version you may also need to update the vendor version to pick up bug
 # fixes or new APIs.
-CONTAINERD_COMMIT=e6b3f5632f50dbc4e9cb6288d911bf4f5e95b18e # v1.2.4
+CONTAINERD_COMMIT=bb71b10fd8f58240ca47fbb579b9d1028eea7c84 # v1.2.5

 install_containerd() {
 	echo "Install containerd version $CONTAINERD_COMMIT"
--- a/hack/dockerfile/install/runc.installer
+++ b/hack/dockerfile/install/runc.installer
@ -4,7 +4,7 @@
 # The version of runc should match the version that is used by the containerd
 # version that is used. If you need to update runc, open a pull request in
 # the containerd project first, and update both after that is merged.
-RUNC_COMMIT=6635b4f0c6af3810594d2770f662f34ddc15b40d
+RUNC_COMMIT=2b18fe1d885ee5083ef9f0838fee39b62d653e30

 install_runc() {
 	# If using RHEL7 kernels (3.10.0 el7), disable kmem accounting/limiting
--- a/vendor.conf
+++ b/vendor.conf
@ -79,7 +79,7 @@ google.golang.org/grpc v1.12.0
 # the containerd project first, and update both after that is merged.
 # This commit does not need to match RUNC_COMMIT as it is used for helper
 # packages but should be newer or equal.
-github.com/opencontainers/runc 12f6a991201fdb8f82579582d5e00e28fba06d0a
+github.com/opencontainers/runc 2b18fe1d885ee5083ef9f0838fee39b62d653e30
 github.com/opencontainers/runtime-spec 29686dbc5559d93fb1ef402eeda3e35c38d75af4 # v1.0.1-59-g29686db
 github.com/opencontainers/image-spec v1.0.1
 github.com/seccomp/libseccomp-golang 32f571b70023028bd57d9288c20efbcb237f3ce0
@ -118,10 +118,10 @@ github.com/googleapis/gax-go v2.0.0
 google.golang.org/genproto 694d95ba50e67b2e363f3483057db5d4910c18f9

 # containerd
-github.com/containerd/containerd e6b3f5632f50dbc4e9cb6288d911bf4f5e95b18e # v1.2.4
+github.com/containerd/containerd bb71b10fd8f58240ca47fbb579b9d1028eea7c84 # v1.2.5
 github.com/containerd/fifo 3d5202aec260678c48179c56f40e6f38a095738c
 github.com/containerd/continuity 004b46473808b3e7a4a3049c20e4376c91eb966d
-github.com/containerd/cgroups 5e610833b72089b37d0e615de9a92dfc043757c2
+github.com/containerd/cgroups dbea6f2bd41658b84b00417ceefa416b979cbf10
 github.com/containerd/console c12b1e7919c14469339a5d38f2f8ed9b64a9de23
 github.com/containerd/go-runc 5a6d9f37cfa36b15efba46dc7ea349fa9b7143c3
 github.com/containerd/typeurl a93fcdb778cd272c6e9b3028b2f42d813e785d40
--- a/vendor/github.com/containerd/cgroups/README.md
+++ b/vendor/github.com/containerd/cgroups/README.md
@ -1,8 +1,9 @@
 # cgroups

 [![Build Status](https://travis-ci.org/containerd/cgroups.svg?branch=master)](https://travis-ci.org/containerd/cgroups)
-
 [![codecov](https://codecov.io/gh/containerd/cgroups/branch/master/graph/badge.svg)](https://codecov.io/gh/containerd/cgroups)
+[![GoDoc](https://godoc.org/github.com/containerd/cgroups?status.svg)](https://godoc.org/github.com/containerd/cgroups)
+[![Go Report Card](https://goreportcard.com/badge/github.com/containerd/cgroups)](https://goreportcard.com/report/github.com/containerd/cgroups)

 Go package for creating, managing, inspecting, and destroying cgroups.
 The resources format for settings on the cgroup uses the OCI runtime-spec found
@ -110,3 +111,14 @@ err := control.MoveTo(destination)
 ```go
 subCgroup, err := control.New("child", resources)
 ```
+
+## Project details
+
+Cgroups is a containerd sub-project, licensed under the [Apache 2.0 license](./LICENSE).
+As a containerd sub-project, you will find the:
+
+ * [Project governance](https://github.com/containerd/project/blob/master/GOVERNANCE.md),
+ * [Maintainers](https://github.com/containerd/project/blob/master/MAINTAINERS),
+ * and [Contributing guidelines](https://github.com/containerd/project/blob/master/CONTRIBUTING.md)
+
+information in our [`containerd/project`](https://github.com/containerd/project) repository.
--- a/vendor/github.com/containerd/cgroups/blkio.go
+++ b/vendor/github.com/containerd/cgroups/blkio.go
@ -191,31 +191,42 @@ func (b *blkioController) readEntry(devices map[deviceKey]string, path, name str
 }

 func createBlkioSettings(blkio *specs.LinuxBlockIO) []blkioSettings {
-	settings := []blkioSettings{
-		{
-			name:   "weight",
-			value:  blkio.Weight,
-			format: uintf,
-		},
-		{
-			name:   "leaf_weight",
-			value:  blkio.LeafWeight,
-			format: uintf,
-		},
-	}
-	for _, wd := range blkio.WeightDevice {
+	settings := []blkioSettings{}
+
+	if blkio.Weight != nil {
 		settings = append(settings,
 			blkioSettings{
-				name:   "weight_device",
-				value:  wd,
-				format: weightdev,
-			},
-			blkioSettings{
-				name:   "leaf_weight_device",
-				value:  wd,
-				format: weightleafdev,
+				name:   "weight",
+				value:  blkio.Weight,
+				format: uintf,
 			})
 	}
+	if blkio.LeafWeight != nil {
+		settings = append(settings,
+			blkioSettings{
+				name:   "leaf_weight",
+				value:  blkio.LeafWeight,
+				format: uintf,
+			})
+	}
+	for _, wd := range blkio.WeightDevice {
+		if wd.Weight != nil {
+			settings = append(settings,
+				blkioSettings{
+					name:   "weight_device",
+					value:  wd,
+					format: weightdev,
+				})
+		}
+		if wd.LeafWeight != nil {
+			settings = append(settings,
+				blkioSettings{
+					name:   "leaf_weight_device",
+					value:  wd,
+					format: weightleafdev,
+				})
+		}
+	}
 	for _, t := range []struct {
 		name string
 		list []specs.LinuxThrottleDevice
@ -265,12 +276,12 @@ func uintf(v interface{}) []byte {

 func weightdev(v interface{}) []byte {
 	wd := v.(specs.LinuxWeightDevice)
-	return []byte(fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.Weight))
+	return []byte(fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, *wd.Weight))
 }

 func weightleafdev(v interface{}) []byte {
 	wd := v.(specs.LinuxWeightDevice)
-	return []byte(fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, wd.LeafWeight))
+	return []byte(fmt.Sprintf("%d:%d %d", wd.Major, wd.Minor, *wd.LeafWeight))
 }

 func throttleddev(v interface{}) []byte {
--- a/vendor/github.com/containerd/cgroups/cgroup.go
+++ b/vendor/github.com/containerd/cgroups/cgroup.go
@ -30,47 +30,84 @@ import (
 )

 // New returns a new control via the cgroup cgroups interface
-func New(hierarchy Hierarchy, path Path, resources *specs.LinuxResources) (Cgroup, error) {
+func New(hierarchy Hierarchy, path Path, resources *specs.LinuxResources, opts ...InitOpts) (Cgroup, error) {
+	config := newInitConfig()
+	for _, o := range opts {
+		if err := o(config); err != nil {
+			return nil, err
+		}
+	}
 	subsystems, err := hierarchy()
 	if err != nil {
 		return nil, err
 	}
+	var active []Subsystem
 	for _, s := range subsystems {
+		// check if subsystem exists
 		if err := initializeSubsystem(s, path, resources); err != nil {
+			if err == ErrControllerNotActive {
+				if config.InitCheck != nil {
+					if skerr := config.InitCheck(s, path, err); skerr != nil {
+						if skerr != ErrIgnoreSubsystem {
+							return nil, skerr
+						}
+					}
+				}
+				continue
+			}
 			return nil, err
 		}
+		active = append(active, s)
 	}
 	return &cgroup{
 		path:       path,
-		subsystems: subsystems,
+		subsystems: active,
 	}, nil
 }

 // Load will load an existing cgroup and allow it to be controlled
-func Load(hierarchy Hierarchy, path Path) (Cgroup, error) {
+func Load(hierarchy Hierarchy, path Path, opts ...InitOpts) (Cgroup, error) {
+	config := newInitConfig()
+	for _, o := range opts {
+		if err := o(config); err != nil {
+			return nil, err
+		}
+	}
+	var activeSubsystems []Subsystem
 	subsystems, err := hierarchy()
 	if err != nil {
 		return nil, err
 	}
-	// check the the subsystems still exist
+	// check that the subsystems still exist, and keep only those that actually exist
 	for _, s := range pathers(subsystems) {
 		p, err := path(s.Name())
 		if err != nil {
 			if os.IsNotExist(errors.Cause(err)) {
 				return nil, ErrCgroupDeleted
 			}
+			if err == ErrControllerNotActive {
+				if config.InitCheck != nil {
+					if skerr := config.InitCheck(s, path, err); skerr != nil {
+						if skerr != ErrIgnoreSubsystem {
+							return nil, skerr
+						}
+					}
+				}
+				continue
+			}
 			return nil, err
 		}
 		if _, err := os.Lstat(s.Path(p)); err != nil {
 			if os.IsNotExist(err) {
-				return nil, ErrCgroupDeleted
+				continue
 			}
 			return nil, err
 		}
+		activeSubsystems = append(activeSubsystems, s)
 	}
 	return &cgroup{
 		path:       path,
-		subsystems: subsystems,
+		subsystems: activeSubsystems,
 	}, nil
 }

@ -319,6 +356,49 @@ func (c *cgroup) processes(subsystem Name, recursive bool) ([]Process, error) {
 	return processes, err
 }

+// Tasks returns the tasks found under the given subsystem of this cgroup, each
+// with its subsystem name, pid, and path; it returns c.err first when that is set
+func (c *cgroup) Tasks(subsystem Name, recursive bool) ([]Task, error) {
+	c.mu.Lock() // held for the whole call, including the directory walk done by c.tasks
+	defer c.mu.Unlock()
+	if c.err != nil {
+		return nil, c.err
+	}
+	return c.tasks(subsystem, recursive)
+}
+
+func (c *cgroup) tasks(subsystem Name, recursive bool) ([]Task, error) { // unexported worker for Tasks, which calls it with c.mu held
+	s := c.getSubsystem(subsystem)
+	sp, err := c.path(subsystem)
+	if err != nil {
+		return nil, err
+	}
+	path := s.(pather).Path(sp) // NOTE(review): unchecked assertion panics if s is not a pather — confirm getSubsystem cannot return nil here
+	var tasks []Task
+	err = filepath.Walk(path, func(p string, info os.FileInfo, err error) error {
+		if err != nil {
+			return err // any walk error aborts the walk and is returned to the caller
+		}
+		if !recursive && info.IsDir() {
+			if p == path {
+				return nil // the walk root itself must still be entered
+			}
+			return filepath.SkipDir // non-recursive: do not descend into child directories
+		}
+		dir, name := filepath.Split(p)
+		if name != cgroupTasks {
+			return nil // only entries named cgroupTasks are read
+		}
+		procs, err := readTasksPids(dir, subsystem)
+		if err != nil {
+			return err
+		}
+		tasks = append(tasks, procs...)
+		return nil
+	})
+	return tasks, err // tasks gathered before a failure are returned alongside the error
+}
+
 // Freeze freezes the entire cgroup and all the processes inside it
 func (c *cgroup) Freeze() error {
 	c.mu.Lock()
--- a/vendor/github.com/containerd/cgroups/control.go
+++ b/vendor/github.com/containerd/cgroups/control.go
@ -44,6 +44,15 @@ type Process struct {
 	Path string
 }

+type Task struct { // Task describes one pid entry together with the subsystem and path it was read from
+	// Subsystem is the name of the subsystem that the task is in
+	Subsystem Name
+	// Pid is the process id of the task
+	Pid int
+	// Path is the full path of the subsystem and location that the task is in
+	Path string
+}
+
 // Cgroup handles interactions with the individual groups to perform
 // actions on them as the main interface to this cgroup package
 type Cgroup interface {
@ -64,6 +73,8 @@ type Cgroup interface {
 	Update(resources *specs.LinuxResources) error
 	// Processes returns all the processes in a select subsystem for the cgroup
 	Processes(Name, bool) ([]Process, error)
+	// Tasks returns all the tasks in a select subsystem for the cgroup
+	Tasks(Name, bool) ([]Task, error)
 	// Freeze freezes or pauses all processes inside the cgroup
 	Freeze() error
 	// Thaw thaws or resumes all processes inside the cgroup
--- a/vendor/github.com/containerd/cgroups/cpuset.go
+++ b/vendor/github.com/containerd/cgroups/cpuset.go
@ -57,21 +57,21 @@ func (c *cpusetController) Create(path string, resources *specs.LinuxResources)
 	if resources.CPU != nil {
 		for _, t := range []struct {
 			name  string
-			value *string
+			value string
 		}{
 			{
 				name:  "cpus",
-				value: &resources.CPU.Cpus,
+				value: resources.CPU.Cpus,
 			},
 			{
 				name:  "mems",
-				value: &resources.CPU.Mems,
+				value: resources.CPU.Mems,
 			},
 		} {
-			if t.value != nil {
+			if t.value != "" {
 				if err := ioutil.WriteFile(
 					filepath.Join(c.Path(path), fmt.Sprintf("cpuset.%s", t.name)),
-					[]byte(*t.value),
+					[]byte(t.value),
 					defaultFilePerm,
 				); err != nil {
 					return err
--- a/vendor/github.com/containerd/cgroups/devices.go
+++ b/vendor/github.com/containerd/cgroups/devices.go
@ -58,6 +58,9 @@ func (d *devicesController) Create(path string, resources *specs.LinuxResources)
 		if device.Allow {
 			file = allowDeviceFile
 		}
+		if device.Type == "" {
+			device.Type = "a"
+		}
 		if err := ioutil.WriteFile(
 			filepath.Join(d.Path(path), file),
 			[]byte(deviceString(device)),
--- a/vendor/github.com/containerd/cgroups/net_prio.go
+++ b/vendor/github.com/containerd/cgroups/net_prio.go
@ -50,7 +50,7 @@ func (n *netprioController) Create(path string, resources *specs.LinuxResources)
 	if resources.Network != nil {
 		for _, prio := range resources.Network.Priorities {
 			if err := ioutil.WriteFile(
-				filepath.Join(n.Path(path), "net_prio_ifpriomap"),
+				filepath.Join(n.Path(path), "net_prio.ifpriomap"),
 				formatPrio(prio.Name, prio.Priority),
 				defaultFilePerm,
 			); err != nil {
--- a/vendor/github.com/containerd/cgroups/opts.go
+++ b/vendor/github.com/containerd/cgroups/opts.go
@ -0,0 +1,61 @@
+/*
+   Copyright The containerd Authors.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+*/
+
+package cgroups
+
+import (
+	"github.com/pkg/errors"
+)
+
+var (
+	// ErrIgnoreSubsystem is the sentinel an InitCheck returns to have the subsystem skipped
+	ErrIgnoreSubsystem = errors.New("skip subsystem")
+	// ErrDevicesRequired is returned by RequireDevices when the check is invoked
+	// for the devices subsystem, i.e. it does not exist or is not active
+	ErrDevicesRequired = errors.New("devices subsystem is required")
+)
+
+// InitOpts is an option applied to the InitConfig used when creating or loading a cgroup; a non-nil error aborts that call
+type InitOpts func(*InitConfig) error
+
+// InitConfig holds the settings that InitOpts may adjust before a cgroup
+// and its subsystems are created or loaded
+type InitConfig struct {
+	// InitCheck is consulted when a subsystem reports ErrControllerNotActive; when nil the subsystem is simply skipped
+	InitCheck InitCheck
+}
+
+func newInitConfig() *InitConfig { // default configuration: RequireDevices is the init check
+	return &InitConfig{
+		InitCheck: RequireDevices,
+	}
+}
+
+// InitCheck inspects a subsystem error seen during create or load: nil or ErrIgnoreSubsystem skips the subsystem, any other error aborts
+type InitCheck func(Subsystem, Path, error) error
+
+// AllowAny is an InitCheck that ignores its arguments and always returns ErrIgnoreSubsystem
+func AllowAny(s Subsystem, p Path, err error) error {
+	return ErrIgnoreSubsystem
+}
+
+// RequireDevices is an InitCheck that returns ErrDevicesRequired for the Devices subsystem and ErrIgnoreSubsystem for any other
+func RequireDevices(s Subsystem, p Path, err error) error {
+	if s.Name() == Devices {
+		return ErrDevicesRequired
+	}
+	return ErrIgnoreSubsystem
+}
--- a/vendor/github.com/containerd/cgroups/paths.go
+++ b/vendor/github.com/containerd/cgroups/paths.go
@ -57,6 +57,9 @@ func PidPath(pid int) Path {
 	return existingPath(paths, "")
 }

+// ErrControllerNotActive is returned when a controller is found neither under its own name nor as name=<controller> in the known paths
+var ErrControllerNotActive = errors.New("controller is not supported")
+
 func existingPath(paths map[string]string, suffix string) Path {
 	// localize the paths based on the root mount dest for nested cgroups
 	for n, p := range paths {
@ -77,7 +80,7 @@ func existingPath(paths map[string]string, suffix string) Path {
 		root, ok := paths[string(name)]
 		if !ok {
 			if root, ok = paths[fmt.Sprintf("name=%s", name)]; !ok {
-				return "", fmt.Errorf("unable to find %q in controller set", name)
+				return "", ErrControllerNotActive
 			}
 		}
 		if suffix != "" {
--- a/vendor/github.com/containerd/cgroups/subsystem.go
+++ b/vendor/github.com/containerd/cgroups/subsystem.go
@ -42,7 +42,7 @@ const (
 )

 // Subsystems returns a complete list of the default cgroups
-// avaliable on most linux systems
+// available on most linux systems
 func Subsystems() []Name {
 	n := []Name{
 		Hugetlb,
--- a/vendor/github.com/containerd/cgroups/systemd.go
+++ b/vendor/github.com/containerd/cgroups/systemd.go
@ -32,6 +32,11 @@ const (
 	defaultSlice      = "system.slice"
 )

+var (
+	canDelegate bool      // result of the Delegate probe performed in SystemdController.Create
+	once        sync.Once // ensures that probe runs only once
+)
+
 func Systemd() ([]Subsystem, error) {
 	root, err := v1MountPoint()
 	if err != nil {
@ -54,7 +59,7 @@ func Slice(slice, name string) Path {
 		slice = defaultSlice
 	}
 	return func(subsystem Name) (string, error) {
-		return filepath.Join(slice, unitName(name)), nil
+		return filepath.Join(slice, name), nil
 	}
 }

@ -80,15 +85,39 @@ func (s *SystemdController) Create(path string, resources *specs.LinuxResources)
 	}
 	defer conn.Close()
 	slice, name := splitName(path)
+	// We need to see if systemd can handle the delegate property
+	// Systemd will return an error if it cannot handle delegate regardless
+	// of its bool setting.
+	checkDelegate := func() {
+		canDelegate = true
+		dlSlice := newProperty("Delegate", true)
+		if _, err := conn.StartTransientUnit(slice, "testdelegate", []systemdDbus.Property{dlSlice}, nil); err != nil {
+			if dbusError, ok := err.(dbus.Error); ok {
+				// Starting with systemd v237, Delegate is not even a property of slices anymore,
+				// so the D-Bus call fails with "InvalidArgs" error.
+				if strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.PropertyReadOnly") || strings.Contains(dbusError.Name, "org.freedesktop.DBus.Error.InvalidArgs") {
+					canDelegate = false
+				}
+			}
+		}
+
+		conn.StopUnit(slice, "testDelegate", nil)
+	}
+	once.Do(checkDelegate)
 	properties := []systemdDbus.Property{
 		systemdDbus.PropDescription(fmt.Sprintf("cgroup %s", name)),
 		systemdDbus.PropWants(slice),
 		newProperty("DefaultDependencies", false),
-		newProperty("Delegate", true),
 		newProperty("MemoryAccounting", true),
 		newProperty("CPUAccounting", true),
 		newProperty("BlockIOAccounting", true),
 	}
+
+	// If we can delegate, we add the property back in
+	if canDelegate {
+		properties = append(properties, newProperty("Delegate", true))
+	}
+
 	ch := make(chan string)
 	_, err = conn.StartTransientUnit(name, "replace", properties, ch)
 	if err != nil {
--- a/vendor/github.com/containerd/cgroups/utils.go
+++ b/vendor/github.com/containerd/cgroups/utils.go
@ -111,7 +111,7 @@ func remove(path string) error {
 	return fmt.Errorf("cgroups: unable to remove path %q", path)
 }

-// readPids will read all the pids in a cgroup by the provided path
+// readPids will read all the pids of processes in a cgroup by the provided path
 func readPids(path string, subsystem Name) ([]Process, error) {
 	f, err := os.Open(filepath.Join(path, cgroupProcs))
 	if err != nil {
@ -138,6 +138,33 @@ func readPids(path string, subsystem Name) ([]Process, error) {
 	return out, nil
 }

+// readTasksPids reads the cgroupTasks file in path and returns one Task per non-empty line, tagged with subsystem and path
+func readTasksPids(path string, subsystem Name) ([]Task, error) {
+	f, err := os.Open(filepath.Join(path, cgroupTasks))
+	if err != nil {
+		return nil, err
+	}
+	defer f.Close()
+	var (
+		out []Task
+		s   = bufio.NewScanner(f)
+	)
+	for s.Scan() {
+		if t := s.Text(); t != "" { // blank lines are skipped
+			pid, err := strconv.Atoi(t)
+			if err != nil {
+				return nil, err // a non-numeric line discards everything read so far
+			}
+			out = append(out, Task{
+				Pid:       pid,
+				Subsystem: subsystem,
+				Path:      path,
+			})
+		}
+	}
+	return out, nil // NOTE(review): s.Err() is never consulted, so a scanner read error is silently dropped
+}
+
 func hugePageSizes() ([]string, error) {
 	var (
 		pageSizes []string
--- a/vendor/github.com/containerd/containerd/archive/tar.go
+++ b/vendor/github.com/containerd/containerd/archive/tar.go
@ -194,7 +194,7 @@ func applyNaive(ctx context.Context, root string, tr *tar.Reader, options ApplyO
 				parentPath = filepath.Dir(path)
 			}
 			if _, err := os.Lstat(parentPath); err != nil && os.IsNotExist(err) {
-				err = mkdirAll(parentPath, 0700)
+				err = mkdirAll(parentPath, 0755)
 				if err != nil {
 					return 0, err
 				}
--- a/vendor/github.com/containerd/containerd/image.go
+++ b/vendor/github.com/containerd/containerd/image.go
@ -170,26 +170,22 @@ func (i *image) Unpack(ctx context.Context, snapshotterName string) error {
 		chain = append(chain, layer.Diff.Digest)
 	}

-	if unpacked {
-		desc, err := i.i.Config(ctx, cs, i.platform)
-		if err != nil {
-			return err
-		}
-
-		rootfs := identity.ChainID(chain).String()
-
-		cinfo := content.Info{
-			Digest: desc.Digest,
-			Labels: map[string]string{
-				fmt.Sprintf("containerd.io/gc.ref.snapshot.%s", snapshotterName): rootfs,
-			},
-		}
-		if _, err := cs.Update(ctx, cinfo, fmt.Sprintf("labels.containerd.io/gc.ref.snapshot.%s", snapshotterName)); err != nil {
-			return err
-		}
+	desc, err := i.i.Config(ctx, cs, i.platform)
+	if err != nil {
+		return err
 	}

-	return nil
+	rootfs := identity.ChainID(chain).String()
+
+	cinfo := content.Info{
+		Digest: desc.Digest,
+		Labels: map[string]string{
+			fmt.Sprintf("containerd.io/gc.ref.snapshot.%s", snapshotterName): rootfs,
+		},
+	}
+
+	_, err = cs.Update(ctx, cinfo, fmt.Sprintf("labels.containerd.io/gc.ref.snapshot.%s", snapshotterName))
+	return err
 }

 func (i *image) getLayers(ctx context.Context, platform platforms.MatchComparer) ([]rootfs.Layer, error) {
--- a/vendor/github.com/containerd/containerd/metadata/gc.go
+++ b/vendor/github.com/containerd/containerd/metadata/gc.go
@ -64,6 +64,18 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error {
 	// iterate through each namespace
 	v1c := v1bkt.Cursor()

+	// cerr indicates the scan did not successfully send all
+	// the roots. The scan does not need to be cancelled but
+	// must return error at the end.
+	var cerr error
+	fn := func(n gc.Node) {
+		select {
+		case nc <- n:
+		case <-ctx.Done():
+			cerr = ctx.Err()
+		}
+	}
+
 	for k, v := v1c.First(); k != nil; k, v = v1c.Next() {
 		if v != nil {
 			continue
@ -92,11 +104,7 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error {
 					}
 				}

-				select {
-				case nc <- gcnode(ResourceLease, ns, string(k)):
-				case <-ctx.Done():
-					return ctx.Err()
-				}
+				fn(gcnode(ResourceLease, ns, string(k)))

 				// Emit content and snapshots as roots instead of implementing
 				// in references. Since leases cannot be referenced there is
@ -106,11 +114,7 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error {
 				cbkt := libkt.Bucket(bucketKeyObjectContent)
 				if cbkt != nil {
 					if err := cbkt.ForEach(func(k, v []byte) error {
-						select {
-						case nc <- gcnode(ResourceContent, ns, string(k)):
-						case <-ctx.Done():
-							return ctx.Err()
-						}
+						fn(gcnode(ResourceContent, ns, string(k)))
 						return nil
 					}); err != nil {
 						return err
@ -126,11 +130,7 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error {
 						snbkt := sbkt.Bucket(sk)

 						return snbkt.ForEach(func(k, v []byte) error {
-							select {
-							case nc <- gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", sk, k)):
-							case <-ctx.Done():
-								return ctx.Err()
-							}
+							fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", sk, k)))
 							return nil
 						})
 					}); err != nil {
@ -141,11 +141,7 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error {
 				ibkt := libkt.Bucket(bucketKeyObjectIngests)
 				if ibkt != nil {
 					if err := ibkt.ForEach(func(k, v []byte) error {
-						select {
-						case nc <- gcnode(ResourceIngest, ns, string(k)):
-						case <-ctx.Done():
-							return ctx.Err()
-						}
+						fn(gcnode(ResourceIngest, ns, string(k)))
 						return nil
 					}); err != nil {
 						return err
@ -168,18 +164,9 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error {
 				target := ibkt.Bucket(k).Bucket(bucketKeyTarget)
 				if target != nil {
 					contentKey := string(target.Get(bucketKeyDigest))
-					select {
-					case nc <- gcnode(ResourceContent, ns, contentKey):
-					case <-ctx.Done():
-						return ctx.Err()
-					}
+					fn(gcnode(ResourceContent, ns, contentKey))
 				}
-				return sendSnapshotRefs(ns, ibkt.Bucket(k), func(n gc.Node) {
-					select {
-					case nc <- n:
-					case <-ctx.Done():
-					}
-				})
+				return sendLabelRefs(ns, ibkt.Bucket(k), fn)
 			}); err != nil {
 				return err
 			}
@ -200,11 +187,7 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error {
 					if ea == nil || expThreshold.After(*ea) {
 						return nil
 					}
-					select {
-					case nc <- gcnode(ResourceIngest, ns, string(k)):
-					case <-ctx.Done():
-						return ctx.Err()
-					}
+					fn(gcnode(ResourceIngest, ns, string(k)))
 					return nil
 				}); err != nil {
 					return err
@ -216,7 +199,12 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error {
 					if v != nil {
 						return nil
 					}
-					return sendRootRef(ctx, nc, gcnode(ResourceContent, ns, string(k)), cbkt.Bucket(k))
+
+					if isRootRef(cbkt.Bucket(k)) {
+						fn(gcnode(ResourceContent, ns, string(k)))
+					}
+
+					return nil
 				}); err != nil {
 					return err
 				}
@ -229,23 +217,15 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error {
 				if v != nil {
 					return nil
 				}
-				snapshotter := string(cbkt.Bucket(k).Get(bucketKeySnapshotter))
+
+				cibkt := cbkt.Bucket(k)
+				snapshotter := string(cibkt.Get(bucketKeySnapshotter))
 				if snapshotter != "" {
-					ss := string(cbkt.Bucket(k).Get(bucketKeySnapshotKey))
-					select {
-					case nc <- gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", snapshotter, ss)):
-					case <-ctx.Done():
-						return ctx.Err()
-					}
+					ss := string(cibkt.Get(bucketKeySnapshotKey))
+					fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", snapshotter, ss)))
 				}

-				// TODO: Send additional snapshot refs through labels
-				return sendSnapshotRefs(ns, cbkt.Bucket(k), func(n gc.Node) {
-					select {
-					case nc <- n:
-					case <-ctx.Done():
-					}
-				})
+				return sendLabelRefs(ns, cibkt, fn)
 			}); err != nil {
 				return err
 			}
@ -263,15 +243,17 @@ func scanRoots(ctx context.Context, tx *bolt.Tx, nc chan<- gc.Node) error {
 					if v != nil {
 						return nil
 					}
-
-					return sendRootRef(ctx, nc, gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", sk, k)), snbkt.Bucket(k))
+					if isRootRef(snbkt.Bucket(k)) {
+						fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", sk, k)))
+					}
+					return nil
 				})
 			}); err != nil {
 				return err
 			}
 		}
 	}
-	return nil
+	return cerr
 }

 func references(ctx context.Context, tx *bolt.Tx, node gc.Node, fn func(gc.Node)) error {
@ -282,10 +264,7 @@ func references(ctx context.Context, tx *bolt.Tx, node gc.Node, fn func(gc.Node)
 			return nil
 		}

-		if err := sendSnapshotRefs(node.Namespace, bkt, fn); err != nil {
-			return err
-		}
-		return sendContentRefs(node.Namespace, bkt, fn)
+		return sendLabelRefs(node.Namespace, bkt, fn)
 	} else if node.Type == ResourceSnapshot {
 		parts := strings.SplitN(node.Key, "/", 2)
 		if len(parts) != 2 {
@ -304,7 +283,7 @@ func references(ctx context.Context, tx *bolt.Tx, node gc.Node, fn func(gc.Node)
 			fn(gcnode(ResourceSnapshot, node.Namespace, fmt.Sprintf("%s/%s", ss, pv)))
 		}

-		return sendSnapshotRefs(node.Namespace, bkt, fn)
+		return sendLabelRefs(node.Namespace, bkt, fn)
 	} else if node.Type == ResourceIngest {
 		// Send expected value
 		bkt := getBucket(tx, bucketKeyVersion, []byte(node.Namespace), bucketKeyObjectContent, bucketKeyObjectIngests, []byte(node.Key))
@ -456,25 +435,8 @@ func remove(ctx context.Context, tx *bolt.Tx, node gc.Node) error {
 	return nil
 }

-// sendSnapshotRefs sends all snapshot references referred to by the labels in the bkt
-func sendSnapshotRefs(ns string, bkt *bolt.Bucket, fn func(gc.Node)) error {
-	lbkt := bkt.Bucket(bucketKeyObjectLabels)
-	if lbkt != nil {
-		lc := lbkt.Cursor()
-
-		for k, v := lc.Seek(labelGCSnapRef); k != nil && strings.HasPrefix(string(k), string(labelGCSnapRef)); k, v = lc.Next() {
-			snapshotter := k[len(labelGCSnapRef):]
-			if i := bytes.IndexByte(snapshotter, '/'); i >= 0 {
-				snapshotter = snapshotter[:i]
-			}
-			fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", snapshotter, v)))
-		}
-	}
-	return nil
-}
-
-// sendContentRefs sends all content references referred to by the labels in the bkt
-func sendContentRefs(ns string, bkt *bolt.Bucket, fn func(gc.Node)) error {
+// sendLabelRefs sends all snapshot and content references referred to by the labels in the bkt
+func sendLabelRefs(ns string, bkt *bolt.Bucket, fn func(gc.Node)) error {
 	lbkt := bkt.Bucket(bucketKeyObjectLabels)
 	if lbkt != nil {
 		lc := lbkt.Cursor()
@ -490,6 +452,15 @@ func sendContentRefs(ns string, bkt *bolt.Bucket, fn func(gc.Node)) error {

 			fn(gcnode(ResourceContent, ns, string(v)))
 		}
+
+		for k, v := lc.Seek(labelGCSnapRef); k != nil && strings.HasPrefix(string(k), string(labelGCSnapRef)); k, v = lc.Next() {
+			snapshotter := k[len(labelGCSnapRef):]
+			if i := bytes.IndexByte(snapshotter, '/'); i >= 0 {
+				snapshotter = snapshotter[:i]
+			}
+			fn(gcnode(ResourceSnapshot, ns, fmt.Sprintf("%s/%s", snapshotter, v)))
+		}
+
 	}
 	return nil
 }
@ -506,17 +477,6 @@ func isRootRef(bkt *bolt.Bucket) bool {
 	return false
 }

-func sendRootRef(ctx context.Context, nc chan<- gc.Node, n gc.Node, bkt *bolt.Bucket) error {
-	if isRootRef(bkt) {
-		select {
-		case nc <- n:
-		case <-ctx.Done():
-			return ctx.Err()
-		}
-	}
-	return nil
-}
-
 func gcnode(t gc.ResourceType, ns, key string) gc.Node {
 	return gc.Node{
 		Type:      t,
--- a/vendor/github.com/containerd/containerd/vendor.conf
+++ b/vendor/github.com/containerd/containerd/vendor.conf
@ -1,6 +1,6 @@
 github.com/containerd/go-runc 5a6d9f37cfa36b15efba46dc7ea349fa9b7143c3
 github.com/containerd/console c12b1e7919c14469339a5d38f2f8ed9b64a9de23
-github.com/containerd/cgroups 5e610833b72089b37d0e615de9a92dfc043757c2
+github.com/containerd/cgroups dbea6f2bd41658b84b00417ceefa416b979cbf10
 github.com/containerd/typeurl a93fcdb778cd272c6e9b3028b2f42d813e785d40
 github.com/containerd/fifo 3d5202aec260678c48179c56f40e6f38a095738c
 github.com/containerd/btrfs 2e1aa0ddf94f91fa282b6ed87c23bf0d64911244
@ -20,7 +20,7 @@ github.com/gogo/protobuf v1.0.0
 github.com/gogo/googleapis 08a7655d27152912db7aaf4f983275eaf8d128ef
 github.com/golang/protobuf v1.1.0
 github.com/opencontainers/runtime-spec eba862dc2470385a233c7507392675cbeadf7353 # v1.0.1-45-geba862d
-github.com/opencontainers/runc 6635b4f0c6af3810594d2770f662f34ddc15b40d
+github.com/opencontainers/runc 2b18fe1d885ee5083ef9f0838fee39b62d653e30
 github.com/sirupsen/logrus v1.0.0
 github.com/urfave/cli 7bc6a0acffa589f415f88aca16cc1de5ffd66f9c
 golang.org/x/net b3756b4b77d7b13260a0a2ec658753cf48922eac
@ -43,7 +43,7 @@ github.com/google/go-cmp v0.1.0
 go.etcd.io/bbolt v1.3.1-etcd.8

 # cri dependencies
-github.com/containerd/cri da0c016c830b2ea97fd1d737c49a568a816bf964 # release/1.2 branch
+github.com/containerd/cri a92c40017473cbe0239ce180125f12669757e44f # release/1.2 branch
 github.com/containerd/go-cni 40bcf8ec8acd7372be1d77031d585d5d8e561c90
 github.com/blang/semver v3.1.0
 github.com/containernetworking/cni v0.6.0
--- a/vendor/github.com/opencontainers/runc/README.md
+++ b/vendor/github.com/opencontainers/runc/README.md
@ -16,10 +16,9 @@ This means that `runc` 1.0.0 should implement the 1.0 version of the specificati

 You can find official releases of `runc` on the [release](https://github.com/opencontainers/runc/releases) page.

-### Security
+## Security

-If you wish to report a security issue, please disclose the issue responsibly
-to security@opencontainers.org.
+Reporting process and disclosure communications are outlined in [/org/security](https://github.com/opencontainers/org/blob/master/security/)

 ## Building

--- a/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
+++ b/vendor/github.com/opencontainers/runc/libcontainer/cgroups/utils.go
@ -14,6 +14,7 @@ import (
 	"time"

 	units "github.com/docker/go-units"
+	"golang.org/x/sys/unix"
 )

 const (
@ -463,11 +464,40 @@ func WriteCgroupProc(dir string, pid int) error {
 		return fmt.Errorf("no such directory for %s", CgroupProcesses)
 	}

-	// Don't attach any pid to the cgroup if -1 is specified as a pid
-	if pid != -1 {
-		if err := ioutil.WriteFile(filepath.Join(dir, CgroupProcesses), []byte(strconv.Itoa(pid)), 0700); err != nil {
-			return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
-		}
+	// Don't attach any pid to the cgroup if -1 is specified as a pid
+	if pid == -1 {
+		return nil
+	}
+
+	cgroupProcessesFile, err := os.OpenFile(filepath.Join(dir, CgroupProcesses), os.O_WRONLY|os.O_CREATE|os.O_TRUNC, 0700)
+	if err != nil {
+		return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
+	}
+	defer cgroupProcessesFile.Close()
+
+	for i := 0; i < 5; i++ {
+		_, err = cgroupProcessesFile.WriteString(strconv.Itoa(pid))
+		if err == nil {
+			return nil
+		}
+
+		// EINVAL might mean that the task being added to cgroup.procs is in state
+		// TASK_NEW. We should attempt to do so again.
+		if isEINVAL(err) {
+			time.Sleep(30 * time.Millisecond)
+			continue
+		}
+
+		return fmt.Errorf("failed to write %v to %v: %v", pid, CgroupProcesses, err)
+	}
+	return err
+}
+
+// isEINVAL reports whether err is an *os.PathError whose wrapped error is
+// unix.EINVAL. Any other error type yields false.
+func isEINVAL(err error) bool {
+	switch err := err.(type) {
+	case *os.PathError:
+		return err.Err == unix.EINVAL
+	default:
+		return false
 	}
-	return nil
 }
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/cloned_binary.c
@ -0,0 +1,516 @@
+/*
+ * Copyright (C) 2019 Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2019 SUSE LLC
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define _GNU_SOURCE
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <limits.h>
+#include <fcntl.h>
+#include <errno.h>
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/statfs.h>
+#include <sys/vfs.h>
+#include <sys/mman.h>
+#include <sys/mount.h>
+#include <sys/sendfile.h>
+#include <sys/syscall.h>
+
+/* Use our own wrapper for memfd_create. */
+#if !defined(SYS_memfd_create) && defined(__NR_memfd_create)
+#  define SYS_memfd_create __NR_memfd_create
+#endif
+/* memfd_create(2) flags -- copied from <linux/memfd.h>. */
+#ifndef MFD_CLOEXEC
+#  define MFD_CLOEXEC       0x0001U
+#  define MFD_ALLOW_SEALING 0x0002U
+#endif
+/*
+ * Thin wrapper around the memfd_create syscall. When no syscall number is
+ * known at build time the call fails with errno set to ENOSYS.
+ */
+int memfd_create(const char *name, unsigned int flags)
+{
+#ifndef SYS_memfd_create
+	errno = ENOSYS;
+	return -1;
+#else
+	return syscall(SYS_memfd_create, name, flags);
+#endif
+}
+
+
+/* This comes directly from <linux/fcntl.h>. */
+#ifndef F_LINUX_SPECIFIC_BASE
+#  define F_LINUX_SPECIFIC_BASE 1024
+#endif
+#ifndef F_ADD_SEALS
+#  define F_ADD_SEALS (F_LINUX_SPECIFIC_BASE + 9)
+#  define F_GET_SEALS (F_LINUX_SPECIFIC_BASE + 10)
+#endif
+#ifndef F_SEAL_SEAL
+#  define F_SEAL_SEAL   0x0001	/* prevent further seals from being set */
+#  define F_SEAL_SHRINK 0x0002	/* prevent file from shrinking */
+#  define F_SEAL_GROW   0x0004	/* prevent file from growing */
+#  define F_SEAL_WRITE  0x0008	/* prevent writes */
+#endif
+
+#define CLONED_BINARY_ENV "_LIBCONTAINER_CLONED_BINARY"
+#define RUNC_MEMFD_COMMENT "runc_cloned:/proc/self/exe"
+#define RUNC_MEMFD_SEALS \
+	(F_SEAL_SEAL | F_SEAL_SHRINK | F_SEAL_GROW | F_SEAL_WRITE)
+
+/*
+ * realloc(3) that never reports failure: the call is retried until it
+ * yields a non-NULL pointer, which is then returned.
+ */
+static void *must_realloc(void *ptr, size_t size)
+{
+	void *grown;
+
+	for (;;) {
+		grown = realloc(ptr, size);
+		if (grown)
+			return grown;
+	}
+}
+
+/*
+ * Verify whether we are currently in a self-cloned program (namely, is
+ * /proc/self/exe a memfd). F_GET_SEALS will only succeed for memfds (or rather
+ * for shmem files), and we want to be sure it's actually sealed.
+ *
+ * Returns non-zero when /proc/self/exe is considered a cloned binary, 0 when
+ * it is not, and -ENOTRECOVERABLE when /proc/self/exe cannot be opened.
+ *
+ * When F_GET_SEALS succeeds the answer depends only on the seals being
+ * exactly RUNC_MEMFD_SEALS. Otherwise CLONED_BINARY_ENV must be set, and the
+ * binary counts as cloned if it is on a read-only filesystem or has no
+ * hardlinks.
+ */
+static int is_self_cloned(void)
+{
+	int fd, ret, is_cloned = 0;
+	struct stat statbuf = {};
+	struct statfs fsbuf = {};
+
+	fd = open("/proc/self/exe", O_RDONLY|O_CLOEXEC);
+	if (fd < 0)
+		return -ENOTRECOVERABLE;
+
+	/*
+	 * Is the binary a fully-sealed memfd? We don't need CLONED_BINARY_ENV for
+	 * this, because you cannot write to a sealed memfd no matter what (so
+	 * sharing it isn't a bad thing -- and an admin could bind-mount a sealed
+	 * memfd to /usr/bin/runc to allow re-use).
+	 */
+	ret = fcntl(fd, F_GET_SEALS);
+	if (ret >= 0) {
+		/* Partially sealed files do not count: the seal set must match exactly. */
+		is_cloned = (ret == RUNC_MEMFD_SEALS);
+		goto out;
+	}
+
+	/*
+	 * All other forms require CLONED_BINARY_ENV, since they are potentially
+	 * writeable (or we can't tell if they're fully safe) and thus we must
+	 * check the environment as an extra layer of defence.
+	 */
+	if (!getenv(CLONED_BINARY_ENV)) {
+		is_cloned = false;
+		goto out;
+	}
+
+	/*
+	 * Is the binary on a read-only filesystem? We can't detect bind-mounts in
+	 * particular (in-kernel they are identical to regular mounts) but we can
+	 * at least be sure that it's read-only. In addition, to make sure that
+	 * it's *our* bind-mount we check CLONED_BINARY_ENV.
+	 */
+	if (fstatfs(fd, &fsbuf) >= 0)
+		is_cloned |= (fsbuf.f_flags & MS_RDONLY);
+
+	/*
+	 * Okay, we're a tmpfile -- or we're currently running on RHEL <=7.6
+	 * which appears to have a borked backport of F_GET_SEALS. Either way,
+	 * having a file which has no hardlinks indicates that we aren't using
+	 * a host-side "runc" binary and this is something that a container
+	 * cannot fake (because unlinking requires being able to resolve the
+	 * path that you want to unlink).
+	 */
+	if (fstat(fd, &statbuf) >= 0)
+		is_cloned |= (statbuf.st_nlink == 0);
+
+out:
+	/* Every path that opened the fd leaves through here. */
+	close(fd);
+	return is_cloned;
+}
+
+/*
+ * Read the whole of the file at path into a freshly allocated buffer and
+ * store the number of bytes read in *length. The caller owns the returned
+ * buffer. NULL is returned if length is NULL, if the file cannot be opened or
+ * read, or if the file is empty.
+ */
+static char *read_file(char *path, size_t *length)
+{
+	char chunk[4096];
+	char *data = NULL;
+	ssize_t got;
+	int fd;
+
+	if (!length)
+		return NULL;
+
+	fd = open(path, O_RDONLY | O_CLOEXEC);
+	if (fd < 0)
+		return NULL;
+
+	*length = 0;
+	/* Grow the buffer one chunk at a time until EOF (0) or an error (<0). */
+	while ((got = read(fd, chunk, sizeof(chunk))) > 0) {
+		data = must_realloc(data, (*length + got) * sizeof(*data));
+		memcpy(data + *length, chunk, got);
+		*length += got;
+	}
+	close(fd);
+
+	if (got < 0) {
+		free(data);
+		return NULL;
+	}
+	return data;
+}
+
+/*
+ * A poor-man's version of "xargs -0". Basically parses a given block of
+ * NUL-delimited data, within the given length and adds a pointer to each entry
+ * to the array of pointers.
+ *
+ * *output must be NULL on entry. On success it points to a NULL-terminated
+ * array whose elements point into data (so data must outlive the array), and
+ * the number of entries is returned. If data_length is 0 nothing is allocated
+ * and 0 is returned. Returns -1 on invalid arguments or if the final entry is
+ * not NUL-terminated within data_length; *output is NULL in that case.
+ */
+static int parse_xargs(char *data, int data_length, char ***output)
+{
+	int num = 0;
+	char *cur = data;
+	char *end;
+
+	if (!data || data_length < 0 || *output != NULL)
+		return -1;
+	end = data + data_length;
+
+	while (cur < end) {
+		/* Never scan past the end of the buffer looking for the NUL. */
+		size_t len = strnlen(cur, end - cur);
+
+		/* An entry with no terminating NUL is not usable as a C string. */
+		if (len == (size_t)(end - cur))
+			goto error;
+
+		num++;
+		*output = must_realloc(*output, (num + 1) * sizeof(**output));
+		(*output)[num - 1] = cur;
+		cur += len + 1;
+	}
+	/* With no entries nothing was allocated, so there is nothing to terminate. */
+	if (*output)
+		(*output)[num] = NULL;
+	return num;
+
+error:
+	free(*output);
+	*output = NULL;
+	return -1;
+}
+
+/*
+ * Rebuild argv by reading /proc/self/cmdline and splitting it on NUL bytes.
+ * We have to do it this way because this code runs without a main() to take
+ * the arguments from.
+ *
+ * Returns 0 once *argv holds at least one entry, -EINVAL otherwise.
+ */
+static int fetchve(char ***argv)
+{
+	size_t size = 0;
+	char *buf = read_file("/proc/self/cmdline", &size);
+
+	/* On success the buffer is kept alive: argv points into it. */
+	if (buf && parse_xargs(buf, size, argv) > 0)
+		return 0;
+
+	free(buf);
+	return -EINVAL;
+}
+
+enum {
+	EFD_NONE = 0,
+	EFD_MEMFD,
+	EFD_FILE,
+};
+
+/*
+ * This comes from <linux/fcntl.h>. We can't hard-code __O_TMPFILE because it
+ * changes depending on the architecture. If we don't have O_TMPFILE we always
+ * have the mkostemp(3) fallback.
+ */
+#ifndef O_TMPFILE
+#  if defined(__O_TMPFILE) && defined(O_DIRECTORY)
+#    define O_TMPFILE (__O_TMPFILE | O_DIRECTORY)
+#  endif
+#endif
+
+/*
+ * Create an empty file descriptor to copy the binary into.
+ *
+ * Three mechanisms are tried in order: memfd_create(2), an O_TMPFILE open in
+ * the state directory (only when O_TMPFILE is defined at build time), and
+ * finally mkostemp(3) followed by unlink(2). The state directory comes from
+ * _LIBCONTAINER_STATEDIR and falls back to "/tmp" when that is unset or does
+ * not start with '/'.
+ *
+ * On success the descriptor is returned and *fdtype is EFD_MEMFD or EFD_FILE.
+ * On failure -1 is returned; *fdtype is EFD_NONE except when building the
+ * template path fails, in which case it is left untouched.
+ */
+static int make_execfd(int *fdtype)
+{
+	int fd = -1;
+	char template[PATH_MAX] = {0};
+	char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR");
+
+	if (!prefix || *prefix != '/')
+		prefix = "/tmp";
+	if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
+		return -1;
+
+	/*
+	 * Now try memfd, it's much nicer than actually creating a file in STATEDIR
+	 * since it's easily detected thanks to sealing and also doesn't require
+	 * assumptions about STATEDIR.
+	 */
+	*fdtype = EFD_MEMFD;
+	fd = memfd_create(RUNC_MEMFD_COMMENT, MFD_CLOEXEC | MFD_ALLOW_SEALING);
+	if (fd >= 0)
+		return fd;
+	/* Only ENOSYS and EINVAL fall through to the file-based options. */
+	if (errno != ENOSYS && errno != EINVAL)
+		goto error;
+
+#ifdef O_TMPFILE
+	/*
+	 * Try O_TMPFILE to avoid races where someone might snatch our file. Note
+	 * that O_EXCL isn't actually a security measure here (since you can just
+	 * fd re-open it and clear O_EXCL).
+	 */
+	*fdtype = EFD_FILE;
+	fd = open(prefix, O_TMPFILE | O_EXCL | O_RDWR | O_CLOEXEC, 0700);
+	if (fd >= 0) {
+		struct stat statbuf = {};
+		bool working_otmpfile = false;
+
+		/*
+		 * open(2) ignores unknown O_* flags -- yeah, I was surprised when I
+		 * found this out too. As a result we can't check for EINVAL. However,
+		 * if we get nlink != 0 (or EISDIR) then we know that this kernel
+		 * doesn't support O_TMPFILE.
+		 */
+		if (fstat(fd, &statbuf) >= 0)
+			working_otmpfile = (statbuf.st_nlink == 0);
+
+		if (working_otmpfile)
+			return fd;
+
+		/* Pretend that we got EISDIR since O_TMPFILE failed. */
+		close(fd);
+		errno = EISDIR;
+	}
+	/* Any O_TMPFILE failure other than EISDIR aborts without trying mkostemp. */
+	if (errno != EISDIR)
+		goto error;
+#endif /* defined(O_TMPFILE) */
+
+	/*
+	 * Our final option is to create a temporary file the old-school way, and
+	 * then unlink it so that nothing else sees it by accident.
+	 */
+	*fdtype = EFD_FILE;
+	fd = mkostemp(template, O_CLOEXEC);
+	if (fd >= 0) {
+		if (unlink(template) >= 0)
+			return fd;
+		close(fd);
+	}
+
+error:
+	*fdtype = EFD_NONE;
+	return -1;
+}
+
+/*
+ * Make the descriptor created for the binary copy safe to execute.
+ *
+ * For EFD_MEMFD the RUNC_MEMFD_SEALS are added and the fcntl(2) result is
+ * returned. For EFD_FILE the mode is changed to 0100 and *fd is replaced by
+ * an O_PATH re-open of the same file (the old descriptor is closed); 0 is
+ * returned on success. Any failure, or any other fdtype, yields -1.
+ */
+static int seal_execfd(int *fd, int fdtype)
+{
+	char fdpath[PATH_MAX] = {0};
+	int reopened;
+
+	if (fdtype == EFD_MEMFD)
+		return fcntl(*fd, F_ADD_SEALS, RUNC_MEMFD_SEALS);
+	if (fdtype != EFD_FILE)
+		return -1;
+
+	if (fchmod(*fd, 0100) < 0)
+		return -1;
+	if (snprintf(fdpath, sizeof(fdpath), "/proc/self/fd/%d", *fd) < 0)
+		return -1;
+
+	/* Need to re-open our pseudo-memfd as an O_PATH to avoid execve(2) giving -ETXTBSY. */
+	reopened = open(fdpath, O_PATH | O_CLOEXEC);
+	if (reopened < 0)
+		return -1;
+
+	close(*fd);
+	*fd = reopened;
+	return 0;
+}
+
+/*
+ * Try to obtain a handle to the binary through a read-only bind-mount of
+ * /proc/self/exe onto a temporary file in the state directory.
+ *
+ * The state directory comes from _LIBCONTAINER_STATEDIR and falls back to
+ * "/tmp" when that is unset or does not start with '/'. The mount is
+ * lazily detached and the temporary file unlinked before returning, so only
+ * the returned descriptor keeps a reference to it.
+ *
+ * Returns an O_PATH descriptor on success. On failure returns a negative
+ * value: -1 if the temporary file could not be created or the final open(2)
+ * failed, -EPERM if either mount(2) call failed, and -ENOTRECOVERABLE if the
+ * mount could not be detached.
+ */
+static int try_bindfd(void)
+{
+	int fd, ret = -1;
+	char template[PATH_MAX] = {0};
+	char *prefix = secure_getenv("_LIBCONTAINER_STATEDIR");
+
+	if (!prefix || *prefix != '/')
+		prefix = "/tmp";
+	if (snprintf(template, sizeof(template), "%s/runc.XXXXXX", prefix) < 0)
+		return ret;
+
+	/*
+	 * We need somewhere to mount it, mounting anything over /proc/self is a
+	 * BAD idea on the host -- even if we do it temporarily.
+	 */
+	fd = mkstemp(template);
+	if (fd < 0)
+		return ret;
+	/* Only the path is needed as a mount target, not the descriptor. */
+	close(fd);
+
+	/*
+	 * For obvious reasons this won't work in rootless mode because we haven't
+	 * created a userns+mntns -- but getting that to work will be a bit
+	 * complicated and it's only worth doing if someone actually needs it.
+	 */
+	ret = -EPERM;
+	if (mount("/proc/self/exe", template, "", MS_BIND, "") < 0)
+		goto out;
+	/* The bind-mount exists from here on, so failures must unmount it. */
+	if (mount("", template, "", MS_REMOUNT | MS_BIND | MS_RDONLY, "") < 0)
+		goto out_umount;
+
+
+	/* Get read-only handle that we're sure can't be made read-write. */
+	ret = open(template, O_PATH | O_CLOEXEC);
+
+out_umount:
+	/*
+	 * Make sure the MNT_DETACH works, otherwise we could get remounted
+	 * read-write and that would be quite bad (the fd would be made read-write
+	 * too, invalidating the protection).
+	 */
+	if (umount2(template, MNT_DETACH) < 0) {
+		if (ret >= 0)
+			close(ret);
+		ret = -ENOTRECOVERABLE;
+	}
+
+out:
+	/*
+	 * We don't care about unlink errors, the worst that happens is that
+	 * there's an empty file left around in STATEDIR.
+	 */
+	unlink(template);
+	return ret;
+}
+
+/*
+ * Copy everything that can be read from infd to outfd through a user-space
+ * buffer. Returns the number of bytes copied, or -1 if a read(2) or write(2)
+ * call fails.
+ */
+static ssize_t fd_to_fd(int outfd, int infd)
+{
+	char chunk[4096];
+	ssize_t copied = 0;
+	ssize_t got;
+
+	while ((got = read(infd, chunk, sizeof(chunk))) != 0) {
+		ssize_t done = 0;
+
+		if (got < 0)
+			return -1;
+
+		/* write(2) may be short, so keep going until the chunk is flushed. */
+		do {
+			ssize_t n = write(outfd, chunk + done, got - done);
+			if (n < 0)
+				return -1;
+			done += n;
+		} while (done < got);
+
+		copied += done;
+	}
+
+	return copied;
+}
+
+/*
+ * Produce a file descriptor for a protected copy of /proc/self/exe.
+ *
+ * A read-only bind-mount handle is tried first; if that fails the binary is
+ * copied into a descriptor from make_execfd() and then sealed.
+ *
+ * Returns the descriptor on success, -ENOTRECOVERABLE if no destination
+ * descriptor could be created, and -EIO if opening, copying or sealing fails
+ * (including a copy that ends before the whole binary was transferred).
+ */
+static int clone_binary(void)
+{
+	int binfd, execfd;
+	struct stat statbuf = {};
+	size_t sent = 0;
+	int fdtype = EFD_NONE;
+
+	/*
+	 * Before we resort to copying, let's try creating an ro-binfd in one shot
+	 * by getting a handle for a read-only bind-mount of the execfd.
+	 */
+	execfd = try_bindfd();
+	if (execfd >= 0)
+		return execfd;
+
+	/*
+	 * Dammit, that didn't work -- time to copy the binary to a safe place we
+	 * can seal the contents.
+	 */
+	execfd = make_execfd(&fdtype);
+	if (execfd < 0 || fdtype == EFD_NONE)
+		return -ENOTRECOVERABLE;
+
+	binfd = open("/proc/self/exe", O_RDONLY | O_CLOEXEC);
+	if (binfd < 0)
+		goto error;
+
+	if (fstat(binfd, &statbuf) < 0)
+		goto error_binfd;
+
+	while (sent < statbuf.st_size) {
+		/* sendfile(2) returns ssize_t; do not narrow it to int. */
+		ssize_t n = sendfile(execfd, binfd, NULL, statbuf.st_size - sent);
+		if (n < 0) {
+			/* sendfile can fail so we fallback to a dumb user-space copy. */
+			n = fd_to_fd(execfd, binfd);
+			if (n < 0)
+				goto error_binfd;
+		}
+		/*
+		 * No progress means we hit EOF before st_size bytes were copied.
+		 * Stop instead of spinning; the size check below reports the error.
+		 */
+		if (n == 0)
+			break;
+		sent += n;
+	}
+	close(binfd);
+	if (sent != statbuf.st_size)
+		goto error;
+
+	if (seal_execfd(&execfd, fdtype) < 0)
+		goto error;
+
+	return execfd;
+
+error_binfd:
+	close(binfd);
+error:
+	close(execfd);
+	return -EIO;
+}
+
+/* Get cheap access to the environment. */
+extern char **environ;
+
+/*
+ * Make sure the current process is running from a cloned copy of its own
+ * binary, re-executing itself from such a copy if it is not.
+ *
+ * Returns the positive result of is_self_cloned() when already cloned. If the
+ * re-exec succeeds this function does not return. Otherwise a negative value
+ * is returned: -ENOTRECOVERABLE from is_self_cloned(), -EINVAL if argv could
+ * not be rebuilt, -EIO if the binary could not be cloned, or -ENOEXEC if
+ * setting the environment marker or the exec itself failed.
+ */
+int ensure_cloned_binary(void)
+{
+	char **argv = NULL;
+	int status, execfd;
+
+	/* Check that we're not self-cloned, and if we are then bail. */
+	status = is_self_cloned();
+	if (status > 0 || status == -ENOTRECOVERABLE)
+		return status;
+
+	if (fetchve(&argv) < 0)
+		return -EINVAL;
+
+	execfd = clone_binary();
+	if (execfd < 0)
+		return -EIO;
+
+	if (putenv(CLONED_BINARY_ENV "=1") == 0)
+		fexecve(execfd, argv, environ);
+
+	/* Only reached when putenv(3) or fexecve(3) failed. */
+	close(execfd);
+	return -ENOEXEC;
+}
--- a/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
+++ b/vendor/github.com/opencontainers/runc/libcontainer/nsenter/nsexec.c
@ -534,6 +534,9 @@ void join_namespaces(char *nslist)
 	free(namespaces);
 }

+/* Defined in cloned_binary.c. */
+extern int ensure_cloned_binary(void);
+
 void nsexec(void)
 {
 	int pipenum;
@ -549,6 +552,14 @@ void nsexec(void)
 	if (pipenum == -1)
 		return;

+	/*
+	 * We need to re-exec if we are not in a cloned binary. This is necessary
+	 * to ensure that containers won't be able to access the host binary
+	 * through /proc/self/exe. See CVE-2019-5736.
+	 */
+	if (ensure_cloned_binary() < 0)
+		bail("could not ensure we are a cloned binary");
+
 	/* Parse all of the netlink configuration. */
 	nl_parse(pipenum, &config);

--- a/vendor/github.com/opencontainers/runc/vendor.conf
+++ b/vendor/github.com/opencontainers/runc/vendor.conf
@ -1,8 +1,9 @@
 # OCI runtime-spec. When updating this, make sure you use a version tag rather
 # than a commit ID so it's much more obvious what version of the spec we are
 # using.
-github.com/opencontainers/runtime-spec 5684b8af48c1ac3b1451fa499724e30e3c20a294
+github.com/opencontainers/runtime-spec 29686dbc5559d93fb1ef402eeda3e35c38d75af4
 # Core libcontainer functionality.
+github.com/checkpoint-restore/go-criu v3.11
 github.com/mrunalp/fileutils ed869b029674c0e9ce4c0dfa781405c2d9946d08
 github.com/opencontainers/selinux v1.0.0-rc1
 github.com/seccomp/libseccomp-golang 84e90a91acea0f4e51e62bc1a75de18b1fc0790f
@ -18,7 +19,7 @@ github.com/golang/protobuf 18c9bb3261723cd5401db4d0c9fbc5c3b6c70fe8
 github.com/cyphar/filepath-securejoin v0.2.1
 github.com/docker/go-units v0.2.0
 github.com/urfave/cli d53eb991652b1d438abdd34ce4bfa3ef1539108e
-golang.org/x/sys 7ddbeae9ae08c6a06a59597f0c9edbc5ff2444ce https://github.com/golang/sys
+golang.org/x/sys 41f3e6584952bb034a481797859f6ab34b6803bd https://github.com/golang/sys

 # console dependencies
 github.com/containerd/console 2748ece16665b45a47f884001d5831ec79703880