libcontainerd: remove LCOW bits

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
This commit is contained in:
Sebastiaan van Stijn 2021-03-19 11:56:41 +01:00
parent 86b4d88e55
commit 08ddbfbdac
No known key found for this signature in database
GPG Key ID: 76698F39D527CE8C
19 changed files with 7 additions and 1621 deletions

View File

@ -2,43 +2,14 @@ package daemon // import "github.com/docker/docker/daemon"
import (
"github.com/Microsoft/hcsshim/cmd/containerd-shim-runhcs-v1/options"
"github.com/Microsoft/opengcs/client"
"github.com/docker/docker/container"
"github.com/docker/docker/pkg/system"
)
func (daemon *Daemon) getLibcontainerdCreateOptions(container *container.Container) (string, interface{}, error) {
// Set the runtime options to debug regardless of current logging level.
func (daemon *Daemon) getLibcontainerdCreateOptions(_ *container.Container) (string, interface{}, error) {
if system.ContainerdRuntimeSupported() {
opts := &options.Options{Debug: true}
return "", opts, nil
// Set the runtime options to debug regardless of current logging level.
return "", &options.Options{Debug: true}, nil
}
// TODO (containerd) - Probably need to revisit LCOW options here
// rather than blindly ignoring them.
// LCOW options.
if container.OS == "linux" {
config := &client.Config{}
if err := config.GenerateDefault(daemon.configStore.GraphOptions); err != nil {
return "", nil, err
}
// Override from user-supplied options.
for k, v := range container.HostConfig.StorageOpt {
switch k {
case "lcow.kirdpath":
config.KirdPath = v
case "lcow.bootparameters":
config.BootParameters = v
}
}
if err := config.Validate(); err != nil {
return "", nil, err
}
return "", config, nil
}
return "", nil, nil
}

View File

@ -9,7 +9,6 @@ import (
"fmt"
"io/ioutil"
"os"
"path"
"path/filepath"
"regexp"
"strings"
@ -19,7 +18,6 @@ import (
"github.com/Microsoft/hcsshim"
"github.com/Microsoft/hcsshim/osversion"
opengcs "github.com/Microsoft/opengcs/client"
"github.com/containerd/containerd"
"github.com/containerd/containerd/cio"
containerderrdefs "github.com/containerd/containerd/errdefs"
@ -150,11 +148,10 @@ func (c *client) Create(_ context.Context, id string, spec *specs.Spec, shim str
}
var err error
if spec.Linux == nil {
err = c.createWindows(id, spec, runtimeOptions)
} else {
err = c.createLinux(id, spec, runtimeOptions)
if spec.Linux != nil {
return errors.New("linux containers are not supported on this platform")
}
err = c.createWindows(id, spec, runtimeOptions)
if err == nil {
c.eventQ.Append(id, func() {
@ -366,203 +363,6 @@ func (c *client) createWindows(id string, spec *specs.Spec, runtimeOptions inter
}
// createLinux creates and starts an LCOW (Linux Containers on Windows)
// container: a Linux container hosted in a Hyper-V utility VM, driven through
// the HCS v1 API. runtimeOptions must be a *opengcs.Config carrying the
// kernel/initrd location for the utility VM. On success the container is
// registered in c.containers in the Created state; on a failed Start it is
// terminated so nothing is left behind in HCS.
func (c *client) createLinux(id string, spec *specs.Spec, runtimeOptions interface{}) error {
	logrus.Debugf("libcontainerd: createLinux(): containerId %s ", id)
	logger := c.logger.WithField("container", id)

	if runtimeOptions == nil {
		return fmt.Errorf("lcow option must be supplied to the runtime")
	}
	lcowConfig, ok := runtimeOptions.(*opengcs.Config)
	if !ok {
		return fmt.Errorf("lcow option must be supplied to the runtime")
	}

	// HvPartition + ContainerType "linux" tells HCS to boot a utility VM
	// from the supplied kernel/initrd rather than run a process-isolated
	// Windows container.
	configuration := &hcsshim.ContainerConfig{
		HvPartition:                 true,
		Name:                        id,
		SystemType:                  "container",
		ContainerType:               "linux",
		Owner:                       defaultOwner,
		TerminateOnLastHandleClosed: true,
		HvRuntime: &hcsshim.HvRuntime{
			ImagePath:           lcowConfig.KirdPath,
			LinuxKernelFile:     lcowConfig.KernelFile,
			LinuxInitrdFile:     lcowConfig.InitrdFile,
			LinuxBootParameters: lcowConfig.BootParameters,
		},
	}

	// Even for Linux containers the Windows-specific section of the OCI spec
	// carries the layer and network configuration on this platform.
	if spec.Windows == nil {
		return fmt.Errorf("spec.Windows must not be nil for LCOW containers")
	}

	c.extractResourcesFromSpec(spec, configuration)

	// We must have least one layer in the spec
	if spec.Windows.LayerFolders == nil || len(spec.Windows.LayerFolders) == 0 {
		return fmt.Errorf("OCI spec is invalid - at least one LayerFolders must be supplied to the runtime")
	}

	// Strip off the top-most layer as that's passed in separately to HCS
	configuration.LayerFolderPath = spec.Windows.LayerFolders[len(spec.Windows.LayerFolders)-1]
	layerFolders := spec.Windows.LayerFolders[:len(spec.Windows.LayerFolders)-1]

	// Each remaining (read-only) layer is identified to HCS by a GUID derived
	// from the folder name, pointing at the layer.vhd inside that folder.
	for _, layerPath := range layerFolders {
		_, filename := filepath.Split(layerPath)
		g, err := hcsshim.NameToGuid(filename)
		if err != nil {
			return err
		}
		configuration.Layers = append(configuration.Layers, hcsshim.Layer{
			ID:   g.ToString(),
			Path: filepath.Join(layerPath, "layer.vhd"),
		})
	}

	if spec.Windows.Network != nil {
		configuration.EndpointList = spec.Windows.Network.EndpointList
		configuration.AllowUnqualifiedDNSQuery = spec.Windows.Network.AllowUnqualifiedDNSQuery
		if spec.Windows.Network.DNSSearchList != nil {
			configuration.DNSSearchList = strings.Join(spec.Windows.Network.DNSSearchList, ",")
		}
		configuration.NetworkSharedContainerName = spec.Windows.Network.NetworkSharedContainerName
	}

	// Add the mounts (volumes, bind mounts etc) to the structure. We have to do
	// some translation for both the mapped directories passed into HCS and in
	// the spec.
	//
	// For HCS, we only pass in the mounts from the spec which are type "bind".
	// Further, the "ContainerPath" field (which is a little mis-leadingly
	// named when it applies to the utility VM rather than the container in the
	// utility VM) is moved to under /tmp/gcs/<ID>/binds, where this is passed
	// by the caller through a 'uvmpath' option.
	//
	// We do similar translation for the mounts in the spec by stripping out
	// the uvmpath option, and translating the Source path to the location in the
	// utility VM calculated above.
	//
	// From inside the utility VM, you would see a 9p mount such as in the following
	// where a host folder has been mapped to /target. The line with /tmp/gcs/<ID>/binds
	// specifically:
	//
	// / # mount
	// rootfs on / type rootfs (rw,size=463736k,nr_inodes=115934)
	// proc on /proc type proc (rw,relatime)
	// sysfs on /sys type sysfs (rw,relatime)
	// udev on /dev type devtmpfs (rw,relatime,size=498100k,nr_inodes=124525,mode=755)
	// tmpfs on /run type tmpfs (rw,relatime)
	// cgroup on /sys/fs/cgroup type cgroup (rw,relatime,cpuset,cpu,cpuacct,blkio,memory,devices,freezer,net_cls,perf_event,net_prio,hugetlb,pids,rdma)
	// mqueue on /dev/mqueue type mqueue (rw,relatime)
	// devpts on /dev/pts type devpts (rw,relatime,mode=600,ptmxmode=000)
	// /binds/b3ea9126d67702173647ece2744f7c11181c0150e9890fc9a431849838033edc/target on /binds/b3ea9126d67702173647ece2744f7c11181c0150e9890fc9a431849838033edc/target type 9p (rw,sync,dirsync,relatime,trans=fd,rfdno=6,wfdno=6)
	// /dev/pmem0 on /tmp/gcs/b3ea9126d67702173647ece2744f7c11181c0150e9890fc9a431849838033edc/layer0 type ext4 (ro,relatime,block_validity,delalloc,norecovery,barrier,dax,user_xattr,acl)
	// /dev/sda on /tmp/gcs/b3ea9126d67702173647ece2744f7c11181c0150e9890fc9a431849838033edc/scratch type ext4 (rw,relatime,block_validity,delalloc,barrier,user_xattr,acl)
	// overlay on /tmp/gcs/b3ea9126d67702173647ece2744f7c11181c0150e9890fc9a431849838033edc/rootfs type overlay (rw,relatime,lowerdir=/tmp/base/:/tmp/gcs/b3ea9126d67702173647ece2744f7c11181c0150e9890fc9a431849838033edc/layer0,upperdir=/tmp/gcs/b3ea9126d67702173647ece2744f7c11181c0150e9890fc9a431849838033edc/scratch/upper,workdir=/tmp/gcs/b3ea9126d67702173647ece2744f7c11181c0150e9890fc9a431849838033edc/scratch/work)
	//
	// /tmp/gcs/b3ea9126d67702173647ece2744f7c11181c0150e9890fc9a431849838033edc # ls -l
	// total 16
	// drwx------    3 0        0               60 Sep  7 18:54 binds
	// -rw-r--r--    1 0        0             3345 Sep  7 18:54 config.json
	// drwxr-xr-x   10 0        0             4096 Sep  6 17:26 layer0
	// drwxr-xr-x    1 0        0             4096 Sep  7 18:54 rootfs
	// drwxr-xr-x    5 0        0             4096 Sep  7 18:54 scratch
	//
	// /tmp/gcs/b3ea9126d67702173647ece2744f7c11181c0150e9890fc9a431849838033edc # ls -l binds
	// total 0
	// drwxrwxrwt    2 0        0             4096 Sep  7 16:51 target
	mds := []hcsshim.MappedDir{}
	specMounts := []specs.Mount{}
	for _, mount := range spec.Mounts {
		specMount := mount
		if mount.Type == "bind" {
			// Strip out the uvmpath from the options
			updatedOptions := []string{}
			uvmPath := ""
			readonly := false
			for _, opt := range mount.Options {
				dropOption := false
				elements := strings.SplitN(opt, "=", 2)
				switch elements[0] {
				case "uvmpath":
					uvmPath = elements[1]
					dropOption = true
				case "rw":
				case "ro":
					readonly = true
				case "rbind":
				default:
					return fmt.Errorf("unsupported option %q", opt)
				}
				if !dropOption {
					updatedOptions = append(updatedOptions, opt)
				}
			}
			mount.Options = updatedOptions
			if uvmPath == "" {
				return fmt.Errorf("no uvmpath for bind mount %+v", mount)
			}
			md := hcsshim.MappedDir{
				HostPath:          mount.Source,
				ContainerPath:     path.Join(uvmPath, mount.Destination),
				CreateInUtilityVM: true,
				ReadOnly:          readonly,
			}
			// If we are 1803/RS4+ enable LinuxMetadata support by default
			if osversion.Build() >= osversion.RS4 {
				md.LinuxMetadata = true
			}
			mds = append(mds, md)
			specMount.Source = path.Join(uvmPath, mount.Destination)
		}
		specMounts = append(specMounts, specMount)
	}
	configuration.MappedDirectories = mds

	hcsContainer, err := hcsshim.CreateContainer(id, configuration)
	if err != nil {
		return err
	}

	// The translated mounts replace the originals so the in-VM paths are what
	// the GCS sees in the spec.
	spec.Mounts = specMounts

	// Construct a container object for calling start on it.
	ctr := &container{
		id:           id,
		execs:        make(map[string]*process),
		isWindows:    false,
		ociSpec:      spec,
		hcsContainer: hcsContainer,
		status:       containerd.Created,
		waitCh:       make(chan struct{}),
	}

	// Start the container.
	logger.Debug("starting container")
	if err = hcsContainer.Start(); err != nil {
		c.logger.WithError(err).Error("failed to start container")
		ctr.debugGCS()
		ctr.Lock()
		// Best-effort cleanup so the failed container is not leaked in HCS.
		if err := c.terminateContainer(ctr); err != nil {
			c.logger.WithError(err).Error("failed to cleanup after a failed Start")
		} else {
			c.logger.Debug("cleaned up after failed Start by calling Terminate")
		}
		ctr.Unlock()
		return err
	}
	ctr.debugGCS()

	c.Lock()
	c.containers[id] = ctr
	c.Unlock()

	logger.Debug("createLinux() completed successfully")
	return nil
}
func (c *client) extractResourcesFromSpec(spec *specs.Spec, configuration *hcsshim.ContainerConfig) {
if spec.Windows.Resources != nil {
if spec.Windows.Resources.CPU != nil {

View File

@ -1,10 +1,6 @@
package local // import "github.com/docker/docker/libcontainerd/local"
import (
"strings"
opengcs "github.com/Microsoft/opengcs/client"
)
import "strings"
// setupEnvironmentVariables converts a string array of environment variables
// into a map as required by the HCS. Source array is in format [v1=k1] [v2=k2] etc.
@ -18,26 +14,3 @@ func setupEnvironmentVariables(a []string) map[string]string {
}
return r
}
// LCOWOption is a CreateOption required for LCOW configuration.
type LCOWOption struct {
	Config *opengcs.Config // Utility-VM configuration used to boot the Linux host VM
}

// Apply for the LCOW option is a no-op: the option only carries its Config
// payload, which consumers read directly.
func (s *LCOWOption) Apply(interface{}) error {
	return nil
}
// debugGCS is a dirty hack for debugging for Linux Utility VMs. It simply
// runs a bunch of commands inside the UVM, but seriously aides in advanced debugging.
// debugGCS is a dirty hack for debugging Linux utility VMs: it shells a batch
// of diagnostic commands into the UVM via opengcs. It is a no-op for Windows
// containers, nil receivers, and containers without an HCS handle.
func (c *container) debugGCS() {
	if c == nil {
		return
	}
	if c.isWindows || c.hcsContainer == nil {
		return
	}
	debugCfg := opengcs.Config{
		Uvm:               c.hcsContainer,
		UvmTimeoutSeconds: 600,
	}
	debugCfg.DebugGCS()
}

View File

@ -94,6 +94,5 @@ func (c *client) UpdateResources(ctx context.Context, containerID string, resour
// getSpecUser returns the (uid, gid) the container process should run as.
// On this platform user resolution is not implemented, so it always
// returns 0, 0 rather than failing the caller.
func getSpecUser(ociSpec *specs.Spec) (int, int) {
	// TODO: (containerd): Not implemented, but don't error.
	// Not clear if we can even do this for LCOW.
	return 0, 0
}

View File

@ -5,7 +5,6 @@ github.com/docker/libtrust 9cbd2a1374f46905c68a4eb3694a
github.com/golang/gddo 72a348e765d293ed6d1ded7b699591f14d6cd921
github.com/google/uuid 0cd6bf5da1e1c83f8b45653022c74f71af0538a4 # v1.1.1
github.com/gorilla/mux 98cb6bf42e086f6af920b965c38cacc07402d51b # v1.8.0
github.com/Microsoft/opengcs a10967154e143a36014584a6f664344e3bb0aa64
github.com/moby/locker 281af2d563954745bea9d1487c965f24d30742fe # v1.0.1
github.com/moby/term bea5bbe245bf407372d477f1361d2ff042d2f556

View File

@ -1,21 +0,0 @@
MIT License
Copyright (c) Microsoft Corporation. All rights reserved.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

View File

@ -1,14 +0,0 @@
# Open Guest Compute Service (opengcs) [![Build Status](https://travis-ci.org/Microsoft/opengcs.svg?branch=master)](https://travis-ci.org/Microsoft/opengcs)
Open Guest Compute Service is a Linux open source project to further the development of a production quality implementation of Linux Hyper-V container on Windows (LCOW). It's designed to run inside a custom Linux OS for supporting Linux container payload.
# Getting Started
[How to build GCS binaries](./docs/gcsbuildinstructions.md/)
[How to build custom Linux OS images](./docs/customosbuildinstructions.md/)
# Contributing
This project has adopted the [Microsoft Open Source Code of Conduct](https://opensource.microsoft.com/codeofconduct/). For more information see the [Code of Conduct FAQ](https://opensource.microsoft.com/codeofconduct/faq/) or contact [opencode@microsoft.com](mailto:opencode@microsoft.com) with any additional questions or comments.

View File

@ -1,201 +0,0 @@
// +build windows
package client
import (
"encoding/json"
"fmt"
"os"
"path/filepath"
"strconv"
"strings"
"github.com/Microsoft/hcsshim"
"github.com/sirupsen/logrus"
)
// Mode is the operational mode, both requested, and actual after verification
type Mode uint
const (
// defaultUvmTimeoutSeconds is the default time to wait for utility VM operations
defaultUvmTimeoutSeconds = 5 * 60
// DefaultVhdxSizeGB is the size of the default sandbox & scratch in GB
DefaultVhdxSizeGB = 20
// defaultVhdxBlockSizeMB is the block-size for the sandbox/scratch VHDx's this package can create.
defaultVhdxBlockSizeMB = 1
)
// Config is the structure used to configuring a utility VM. There are two ways
// of starting. Either supply a VHD, or a Kernel+Initrd. For the latter, both
// must be supplied, and both must be in the same directory.
//
// VHD is the priority.
type Config struct {
Options // Configuration options
Name string // Name of the utility VM
UvmTimeoutSeconds int // How long to wait for the utility VM to respond in seconds
Uvm hcsshim.Container // The actual container
MappedVirtualDisks []hcsshim.MappedVirtualDisk // Data-disks to be attached
}
// Options is the structure used by a client to define configurable options for a utility VM.
type Options struct {
KirdPath string // Path to where kernel/initrd are found (defaults to %PROGRAMFILES%\Linux Containers)
KernelFile string // Kernel for Utility VM (embedded in a UEFI bootloader) - does NOT include full path, just filename
InitrdFile string // Initrd image for Utility VM - does NOT include full path, just filename
TimeoutSeconds int // Requested time for the utility VM to respond in seconds (may be over-ridden by environment)
BootParameters string // Additional boot parameters for initrd booting
}
// ParseOptions parses a set of K-V pairs into options used by opengcs. Note
// for consistency with the LCOW graphdriver in docker, we keep the same
// convention of an `lcow.` prefix.
// ParseOptions parses a set of K-V pairs into options used by opengcs. Note
// for consistency with the LCOW graphdriver in docker, we keep the same
// convention of an `lcow.` prefix. Unrecognised keys and entries without an
// "=" are silently ignored; missing values fall back to built-in defaults.
func ParseOptions(options []string) (Options, error) {
	parsed := Options{TimeoutSeconds: 0}
	for _, raw := range options {
		kv := strings.SplitN(raw, "=", 2)
		if len(kv) != 2 {
			continue
		}
		switch strings.ToLower(kv[0]) {
		case "lcow.kirdpath":
			parsed.KirdPath = kv[1]
		case "lcow.kernel":
			parsed.KernelFile = kv[1]
		case "lcow.initrd":
			parsed.InitrdFile = kv[1]
		case "lcow.bootparameters":
			parsed.BootParameters = kv[1]
		case "lcow.timeout":
			var err error
			if parsed.TimeoutSeconds, err = strconv.Atoi(kv[1]); err != nil {
				return parsed, fmt.Errorf("lcow.timeout option could not be interpreted as an integer")
			}
			if parsed.TimeoutSeconds < 0 {
				return parsed, fmt.Errorf("lcow.timeout option cannot be negative")
			}
		}
	}

	// Fill in defaults for anything the caller did not supply.
	if parsed.KirdPath == "" {
		parsed.KirdPath = filepath.Join(os.Getenv("ProgramFiles"), "Linux Containers")
	}
	if parsed.KernelFile == "" {
		parsed.KernelFile = `kernel`
	}
	if parsed.InitrdFile == "" {
		parsed.InitrdFile = `initrd.img`
	}
	return parsed, nil
}
// GenerateDefault generates a default config from a set of options
// If baseDir is not supplied, defaults to $env:ProgramFiles\Linux Containers
// GenerateDefault generates a default config from a set of options.
// If baseDir is not supplied, defaults to $env:ProgramFiles\Linux Containers.
// The utility-VM timeout is resolved with the precedence: explicit option,
// then OPENGCS_UVM_TIMEOUT_SECONDS from the environment, then the package
// default.
func (config *Config) GenerateDefault(options []string) error {
	// Parse the user-supplied options first.
	var err error
	if config.Options, err = ParseOptions(options); err != nil {
		return err
	}

	// Read an operator-supplied timeout from the environment, if present.
	fromEnv := 0
	if raw := os.Getenv("OPENGCS_UVM_TIMEOUT_SECONDS"); len(raw) > 0 {
		if fromEnv, err = strconv.Atoi(raw); err != nil {
			return fmt.Errorf("OPENGCS_UVM_TIMEOUT_SECONDS could not be interpreted as an integer")
		}
		if fromEnv < 0 {
			return fmt.Errorf("OPENGCS_UVM_TIMEOUT_SECONDS cannot be negative")
		}
	}

	// Resolve the effective timeout by precedence.
	switch {
	case config.TimeoutSeconds != 0:
		config.UvmTimeoutSeconds = config.TimeoutSeconds
	case fromEnv != 0:
		config.UvmTimeoutSeconds = fromEnv
	default:
		config.UvmTimeoutSeconds = defaultUvmTimeoutSeconds
	}
	return nil
}
// Validate validates a Config structure for starting a utility VM.
func (config *Config) Validate() error {
if _, err := os.Stat(filepath.Join(config.KirdPath, config.KernelFile)); os.IsNotExist(err) {
return fmt.Errorf("kernel '%s' not found", filepath.Join(config.KirdPath, config.KernelFile))
}
if _, err := os.Stat(filepath.Join(config.KirdPath, config.InitrdFile)); os.IsNotExist(err) {
return fmt.Errorf("initrd '%s' not found", filepath.Join(config.KirdPath, config.InitrdFile))
}
// Ensure all the MappedVirtualDisks exist on the host
for _, mvd := range config.MappedVirtualDisks {
if _, err := os.Stat(mvd.HostPath); err != nil {
return fmt.Errorf("mapped virtual disk '%s' not found", mvd.HostPath)
}
if mvd.ContainerPath == "" {
return fmt.Errorf("mapped virtual disk '%s' requested without a container path", mvd.HostPath)
}
}
return nil
}
// StartUtilityVM creates and starts a utility VM from a configuration.
func (config *Config) StartUtilityVM() error {
logrus.Debugf("opengcs: StartUtilityVM: %+v", config)
if err := config.Validate(); err != nil {
return err
}
configuration := &hcsshim.ContainerConfig{
HvPartition: true,
Name: config.Name,
SystemType: "container",
ContainerType: "linux",
TerminateOnLastHandleClosed: true,
MappedVirtualDisks: config.MappedVirtualDisks,
HvRuntime: &hcsshim.HvRuntime{
ImagePath: config.KirdPath,
LinuxInitrdFile: config.InitrdFile,
LinuxKernelFile: config.KernelFile,
LinuxBootParameters: config.BootParameters,
},
}
configurationS, _ := json.Marshal(configuration)
logrus.Debugf("opengcs: StartUtilityVM: calling HCS with '%s'", string(configurationS))
uvm, err := hcsshim.CreateContainer(config.Name, configuration)
if err != nil {
return err
}
logrus.Debugf("opengcs: StartUtilityVM: uvm created, starting...")
err = uvm.Start()
if err != nil {
logrus.Debugf("opengcs: StartUtilityVM: uvm failed to start: %s", err)
// Make sure we don't leave it laying around as it's been created in HCS
uvm.Terminate()
return err
}
config.Uvm = uvm
logrus.Debugf("opengcs StartUtilityVM: uvm %s is running", config.Name)
return nil
}

View File

@ -1,167 +0,0 @@
// +build windows
package client
import (
"bytes"
"fmt"
"os"
"strings"
"time"
winio "github.com/Microsoft/go-winio/vhd"
// "github.com/Microsoft/hcsshim"
"github.com/sirupsen/logrus"
)
// dismount is a simple utility function wrapping a conditional HotRemove. It would
// have been easier if you could cancel a deferred function, but this works just
// as well.
// dismount is a simple utility function wrapping a conditional HotRemove. It
// would have been easier if you could cancel a deferred function, but this
// works just as well. Failures are logged as warnings and returned.
func (config *Config) dismount(file string) error {
	logrus.Debugf("opengcs: CreateExt4Vhdx: hot-remove of %s", file)
	if err := config.HotRemoveVhd(file); err != nil {
		logrus.Warnf("failed to hot-remove: %s", err)
		return err
	}
	return nil
}
// CreateExt4Vhdx does what it says on the tin. It is the responsibility of the caller to synchronise
// simultaneous attempts to create the cache file.
func (config *Config) CreateExt4Vhdx(destFile string, sizeGB uint32, cacheFile string) error {
// Smallest we can accept is the default sandbox size as we can't size down, only expand.
if sizeGB < DefaultVhdxSizeGB {
sizeGB = DefaultVhdxSizeGB
}
logrus.Debugf("opengcs: CreateExt4Vhdx: %s size:%dGB cache:%s", destFile, sizeGB, cacheFile)
// Retrieve from cache if the default size and already on disk
if cacheFile != "" && sizeGB == DefaultVhdxSizeGB {
if _, err := os.Stat(cacheFile); err == nil {
if err := CopyFile(cacheFile, destFile, false); err != nil {
return fmt.Errorf("failed to copy cached file '%s' to '%s': %s", cacheFile, destFile, err)
}
logrus.Debugf("opengcs: CreateExt4Vhdx: %s fulfilled from cache", destFile)
return nil
}
}
// Must have a utility VM to operate on
if config.Uvm == nil {
return fmt.Errorf("no utility VM")
}
// Create the VHDX
if err := winio.CreateVhdx(destFile, sizeGB, defaultVhdxBlockSizeMB); err != nil {
return fmt.Errorf("failed to create VHDx %s: %s", destFile, err)
}
defer config.DebugGCS()
// Attach it to the utility VM, but don't mount it (as there's no filesystem on it)
if err := config.HotAddVhd(destFile, "", false, false); err != nil {
return fmt.Errorf("opengcs: CreateExt4Vhdx: failed to hot-add %s to utility VM: %s", cacheFile, err)
}
// Get the list of mapped virtual disks to find the controller and LUN IDs
logrus.Debugf("opengcs: CreateExt4Vhdx: %s querying mapped virtual disks", destFile)
mvdControllers, err := config.Uvm.MappedVirtualDisks()
if err != nil {
return fmt.Errorf("failed to get mapped virtual disks: %s", err)
}
// Find our mapped disk from the list of all currently added.
controller := -1
lun := -1
for controllerNumber, controllerElement := range mvdControllers {
for diskNumber, diskElement := range controllerElement.MappedVirtualDisks {
if diskElement.HostPath == destFile {
controller = controllerNumber
lun = diskNumber
break
}
}
}
if controller == -1 || lun == -1 {
config.dismount(destFile)
return fmt.Errorf("failed to find %s in mapped virtual disks after hot-adding", destFile)
}
logrus.Debugf("opengcs: CreateExt4Vhdx: %s at C=%d L=%d", destFile, controller, lun)
// Validate /sys/bus/scsi/devices/C:0:0:L exists as a directory
testdCommand := fmt.Sprintf(`test -d /sys/bus/scsi/devices/%d:0:0:%d`, controller, lun)
testdProc, err := config.RunProcess(testdCommand, nil, nil, nil)
if err != nil {
config.dismount(destFile)
return fmt.Errorf("failed to `%s` following hot-add %s to utility VM: %s", testdCommand, destFile, err)
}
defer testdProc.Close()
testdProc.WaitTimeout(time.Second * time.Duration(config.UvmTimeoutSeconds))
testdExitCode, err := testdProc.ExitCode()
if err != nil {
config.dismount(destFile)
return fmt.Errorf("failed to get exit code from `%s` following hot-add %s to utility VM: %s", testdCommand, destFile, err)
}
if testdExitCode != 0 {
config.dismount(destFile)
return fmt.Errorf("`%s` return non-zero exit code (%d) following hot-add %s to utility VM", testdCommand, testdExitCode, destFile)
}
// Get the device from under the block subdirectory by doing a simple ls. This will come back as (eg) `sda`
lsCommand := fmt.Sprintf(`ls /sys/bus/scsi/devices/%d:0:0:%d/block`, controller, lun)
var lsOutput bytes.Buffer
lsProc, err := config.RunProcess(lsCommand, nil, &lsOutput, nil)
if err != nil {
config.dismount(destFile)
return fmt.Errorf("failed to `%s` following hot-add %s to utility VM: %s", lsCommand, destFile, err)
}
defer lsProc.Close()
lsProc.WaitTimeout(time.Second * time.Duration(config.UvmTimeoutSeconds))
lsExitCode, err := lsProc.ExitCode()
if err != nil {
config.dismount(destFile)
return fmt.Errorf("failed to get exit code from `%s` following hot-add %s to utility VM: %s", lsCommand, destFile, err)
}
if lsExitCode != 0 {
config.dismount(destFile)
return fmt.Errorf("`%s` return non-zero exit code (%d) following hot-add %s to utility VM", lsCommand, lsExitCode, destFile)
}
device := fmt.Sprintf(`/dev/%s`, strings.TrimSpace(lsOutput.String()))
logrus.Debugf("opengcs: CreateExt4Vhdx: %s: device at %s", destFile, device)
// Format it ext4
mkfsCommand := fmt.Sprintf(`mkfs.ext4 -q -E lazy_itable_init=1 -O ^has_journal,sparse_super2,uninit_bg,^resize_inode %s`, device)
var mkfsStderr bytes.Buffer
mkfsProc, err := config.RunProcess(mkfsCommand, nil, nil, &mkfsStderr)
if err != nil {
config.dismount(destFile)
return fmt.Errorf("failed to RunProcess %q following hot-add %s to utility VM: %s", destFile, mkfsCommand, err)
}
defer mkfsProc.Close()
mkfsProc.WaitTimeout(time.Second * time.Duration(config.UvmTimeoutSeconds))
mkfsExitCode, err := mkfsProc.ExitCode()
if err != nil {
config.dismount(destFile)
return fmt.Errorf("failed to get exit code from `%s` following hot-add %s to utility VM: %s", mkfsCommand, destFile, err)
}
if mkfsExitCode != 0 {
config.dismount(destFile)
return fmt.Errorf("`%s` return non-zero exit code (%d) following hot-add %s to utility VM: %s", mkfsCommand, mkfsExitCode, destFile, strings.TrimSpace(mkfsStderr.String()))
}
// Dismount before we copy it
if err := config.dismount(destFile); err != nil {
return fmt.Errorf("failed to hot-remove: %s", err)
}
// Populate the cache.
if cacheFile != "" && (sizeGB == DefaultVhdxSizeGB) {
if err := CopyFile(destFile, cacheFile, true); err != nil {
return fmt.Errorf("failed to seed cache '%s' from '%s': %s", destFile, cacheFile, err)
}
}
logrus.Debugf("opengcs: CreateExt4Vhdx: %s created (non-cache)", destFile)
return nil
}

View File

@ -1,42 +0,0 @@
// +build windows
package client
import (
"fmt"
"github.com/Microsoft/hcsshim"
"github.com/sirupsen/logrus"
)
// HotAddVhd hot-adds a VHD to a utility VM. This is used in the global one-utility-VM-
// service-VM per host scenario. In order to do a graphdriver `Diff`, we hot-add the
// sandbox to /mnt/<id> so that we can run `exportSandbox` inside the utility VM to
// get a tar-stream of the sandboxes contents back to the daemon.
func (config *Config) HotAddVhd(hostPath string, containerPath string, readOnly bool, mount bool) error {
logrus.Debugf("opengcs: HotAddVhd: %s: %s", hostPath, containerPath)
if config.Uvm == nil {
return fmt.Errorf("cannot hot-add VHD as no utility VM is in configuration")
}
defer config.DebugGCS()
modification := &hcsshim.ResourceModificationRequestResponse{
Resource: "MappedVirtualDisk",
Data: hcsshim.MappedVirtualDisk{
HostPath: hostPath,
ContainerPath: containerPath,
CreateInUtilityVM: true,
ReadOnly: readOnly,
AttachOnly: !mount,
},
Request: "Add",
}
if err := config.Uvm.Modify(modification); err != nil {
return fmt.Errorf("failed to modify utility VM configuration for hot-add: %s", err)
}
logrus.Debugf("opengcs: HotAddVhd: %s added successfully", hostPath)
return nil
}

View File

@ -1,36 +0,0 @@
// +build windows
package client
import (
"fmt"
"github.com/Microsoft/hcsshim"
"github.com/sirupsen/logrus"
)
// HotRemoveVhd hot-removes a VHD from a utility VM. This is used in the global one-utility-VM-
// service-VM per host scenario.
func (config *Config) HotRemoveVhd(hostPath string) error {
logrus.Debugf("opengcs: HotRemoveVhd: %s", hostPath)
if config.Uvm == nil {
return fmt.Errorf("cannot hot-add VHD as no utility VM is in configuration")
}
defer config.DebugGCS()
modification := &hcsshim.ResourceModificationRequestResponse{
Resource: "MappedVirtualDisk",
Data: hcsshim.MappedVirtualDisk{
HostPath: hostPath,
CreateInUtilityVM: true,
},
Request: "Remove",
}
if err := config.Uvm.Modify(modification); err != nil {
return fmt.Errorf("failed modifying utility VM for hot-remove %s: %s", hostPath, err)
}
logrus.Debugf("opengcs: HotRemoveVhd: %s removed successfully", hostPath)
return nil
}

View File

@ -1,24 +0,0 @@
// +build windows
package client
import (
"os"
"strconv"
)
// logDataFromUVM is read from the OPENGCS_LOG_DATA_FROM_UVM environment
// variable at package init. Presumably a byte count of log data to pull back
// from the utility VM — confirm against the consumers of this variable.
var logDataFromUVM int64

func init() {
	raw := os.Getenv("OPENGCS_LOG_DATA_FROM_UVM")
	if raw == "" {
		return
	}
	// Unparseable values are silently ignored, leaving the zero default.
	if parsed, err := strconv.ParseUint(raw, 10, 32); err == nil {
		logDataFromUVM = int64(parsed)
	}
}

View File

@ -1,31 +0,0 @@
// +build windows
package client
import (
"fmt"
"os"
"path/filepath"
)
// LayerVhdDetails is a utility for getting a file name, size and indication of
// sandbox for a VHD(x) in a folder. A read-only layer will be layer.vhd. A
// read-write layer will be sandbox.vhdx.
func LayerVhdDetails(folder string) (string, int64, bool, error) {
var fileInfo os.FileInfo
isSandbox := false
filename := filepath.Join(folder, "layer.vhd")
var err error
if fileInfo, err = os.Stat(filename); err != nil {
filename = filepath.Join(folder, "sandbox.vhdx")
if fileInfo, err = os.Stat(filename); err != nil {
if os.IsNotExist(err) {
return "", 0, isSandbox, fmt.Errorf("could not find layer or sandbox in %s", folder)
}
return "", 0, isSandbox, fmt.Errorf("error locating layer or sandbox in %s: %s", folder, err)
}
isSandbox = true
}
return filename, fileInfo.Size(), isSandbox, nil
}

View File

@ -1,164 +0,0 @@
// +build windows
package client
import (
"bytes"
"fmt"
"io"
"os"
"strings"
"time"
"github.com/Microsoft/hcsshim"
"github.com/sirupsen/logrus"
)
// Process is the structure pertaining to a process running in a utility VM.
type process struct {
Process hcsshim.Process
Stdin io.WriteCloser
Stdout io.ReadCloser
Stderr io.ReadCloser
}
// createUtilsProcess is a convenient wrapper for hcsshim.createUtilsProcess to use when
// communicating with a utility VM.
func (config *Config) createUtilsProcess(commandLine string) (process, error) {
logrus.Debugf("opengcs: createUtilsProcess")
if config.Uvm == nil {
return process{}, fmt.Errorf("cannot create utils process as no utility VM is in configuration")
}
var (
err error
proc process
)
env := make(map[string]string)
env["PATH"] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:"
processConfig := &hcsshim.ProcessConfig{
EmulateConsole: false,
CreateStdInPipe: true,
CreateStdOutPipe: true,
CreateStdErrPipe: true,
CreateInUtilityVm: true,
WorkingDirectory: "/bin",
Environment: env,
CommandLine: commandLine,
}
proc.Process, err = config.Uvm.CreateProcess(processConfig)
if err != nil {
return process{}, fmt.Errorf("failed to create process (%+v) in utility VM: %s", config, err)
}
if proc.Stdin, proc.Stdout, proc.Stderr, err = proc.Process.Stdio(); err != nil {
proc.Process.Kill() // Should this have a timeout?
proc.Process.Close()
return process{}, fmt.Errorf("failed to get stdio pipes for process %+v: %s", config, err)
}
logrus.Debugf("opengcs: createUtilsProcess success: pid %d", proc.Process.Pid())
return proc, nil
}
// RunProcess runs the given command line program in the utilityVM. It takes in
// an input to the reader to feed into stdin and returns stdout to output.
// IMPORTANT: It is the responsibility of the caller to call Close() on the returned process.
func (config *Config) RunProcess(commandLine string, stdin io.Reader, stdout io.Writer, stderr io.Writer) (hcsshim.Process, error) {
logrus.Debugf("opengcs: RunProcess: %s", commandLine)
process, err := config.createUtilsProcess(commandLine)
if err != nil {
return nil, err
}
// Send the data into the process's stdin
if stdin != nil {
if _, err = copyWithTimeout(process.Stdin,
stdin,
0,
config.UvmTimeoutSeconds,
fmt.Sprintf("send to stdin of %s", commandLine)); err != nil {
return nil, err
}
// Don't need stdin now we've sent everything. This signals GCS that we are finished sending data.
if err := process.Process.CloseStdin(); err != nil && !hcsshim.IsNotExist(err) && !hcsshim.IsAlreadyClosed(err) {
// This error will occur if the compute system is currently shutting down
if perr, ok := err.(*hcsshim.ProcessError); ok && perr.Err != hcsshim.ErrVmcomputeOperationInvalidState {
return nil, err
}
}
}
if stdout != nil {
// Copy the data over to the writer.
if _, err := copyWithTimeout(stdout,
process.Stdout,
0,
config.UvmTimeoutSeconds,
fmt.Sprintf("RunProcess: copy back from %s", commandLine)); err != nil {
return nil, err
}
}
if stderr != nil {
// Copy the data over to the writer.
if _, err := copyWithTimeout(stderr,
process.Stderr,
0,
config.UvmTimeoutSeconds,
fmt.Sprintf("RunProcess: copy back from %s", commandLine)); err != nil {
return nil, err
}
}
logrus.Debugf("opengcs: runProcess success: %s", commandLine)
return process.Process, nil
}
// debugCommand wraps a single shell command s so that, when run, it first
// echoes a "DEBUG COMMAND" banner, then executes s, then echoes a trailing
// blank section. The returned fragment ends with ';' so fragments can be
// concatenated into one `sh -c` command line.
func debugCommand(s string) string {
	const tmpl = `echo -e 'DEBUG COMMAND: %s\\n--------------\\n';%s;echo -e '\\n\\n';`
	return fmt.Sprintf(tmpl, s, s)
}
// DebugGCS extracts logs from the GCS. It's a useful hack for debugging,
// but not necessarily optimal, but all that is available to us in RS3.
//
// It is a no-op unless debug logging is enabled AND OPENGCS_DEBUG_ENABLE is
// set. The command run inside the utility VM can be overridden wholesale via
// OPENGCS_DEBUG_COMMAND.
func (config *Config) DebugGCS() {
	if logrus.GetLevel() < logrus.DebugLevel || os.Getenv("OPENGCS_DEBUG_ENABLE") == "" {
		return
	}

	cmd := os.Getenv("OPENGCS_DEBUG_COMMAND")
	if cmd == "" {
		// Default diagnostics: stack-dump the GCS daemon, then dump its
		// logs, panic logs, runc state and the process table.
		fragments := []string{
			debugCommand("kill -10 `pidof gcs`"), // SIGUSR1 for stackdump
			debugCommand("ls -l /tmp"),
			debugCommand("cat /tmp/gcs.log"),
			debugCommand("cat /tmp/gcs/gcs-stacks*"),
			debugCommand("cat /tmp/gcs/paniclog*"),
			debugCommand("ls -l /tmp/gcs"),
			debugCommand("ls -l /tmp/gcs/*"),
			debugCommand("cat /tmp/gcs/*/config.json"),
			debugCommand("ls -lR /var/run/gcsrunc"),
			debugCommand("cat /tmp/gcs/global-runc.log"),
			debugCommand("cat /tmp/gcs/*/runc.log"),
			debugCommand("ps -ef"),
		}
		cmd = `sh -c "` + strings.Join(fragments, "") + `"`
	}

	var out bytes.Buffer
	proc, err := config.RunProcess(cmd, nil, &out, nil)
	defer func() {
		if proc != nil {
			proc.Kill()
			proc.Close()
		}
	}()
	if err != nil {
		logrus.Debugln("benign failure getting gcs logs: ", err)
	}
	if proc != nil {
		proc.WaitTimeout(time.Second * 30)
	}
	logrus.Debugf("GCS Debugging:\n%s\n\nEnd GCS Debugging", strings.TrimSpace(out.String()))
}

View File

@ -1,3 +0,0 @@
// +build !windows
package client

View File

@ -1,122 +0,0 @@
// +build windows
package client
import (
"bytes"
"encoding/hex"
"fmt"
"io"
"os"
"syscall"
"time"
"unsafe"
"github.com/sirupsen/logrus"
)
// Lazily-loaded kernel32.dll binding for the Win32 CopyFileW API,
// used by CopyFile below. NewLazyDLL/NewProc defer resolution until
// the first call, so declaring these at package level is cheap.
var (
modkernel32 = syscall.NewLazyDLL("kernel32.dll")
procCopyFileW = modkernel32.NewProc("CopyFileW")
)
// writeFileFromReader writes an output file from an io.Reader
//
// path is created (truncating any existing file); the reader is streamed
// into it bounded by timeoutSeconds. context is a human-readable tag used
// in log and error messages. Returns the number of bytes written.
func writeFileFromReader(path string, reader io.Reader, timeoutSeconds int, context string) (int64, error) {
	f, err := os.Create(path)
	if err != nil {
		return 0, fmt.Errorf("opengcs: writeFileFromReader: failed to create %s: %s", path, err)
	}
	defer f.Close()

	return copyWithTimeout(f, reader, 0, timeoutSeconds, context)
}
// copyWithTimeout is a wrapper for io.Copy using a timeout duration
//
// dst/src are the copy endpoints. size is advisory only (used for logging
// and to cap the debug hexdump); timeoutSeconds bounds the whole copy, and
// context is a human-readable tag for log/error messages. Returns the number
// of bytes copied.
//
// NOTE: on timeout the copying goroutine is not cancelled (io.Copy cannot be
// interrupted); it is abandoned and exits when the underlying reader/writer
// is closed by the caller.
func copyWithTimeout(dst io.Writer, src io.Reader, size int64, timeoutSeconds int, context string) (int64, error) {
	logrus.Debugf("opengcs: copywithtimeout: size %d: timeout %d: (%s)", size, timeoutSeconds, context)

	type resultType struct {
		err   error
		bytes int64
	}

	done := make(chan resultType, 1)
	go func() {
		result := resultType{}
		if logrus.GetLevel() < logrus.DebugLevel || logDataFromUVM == 0 {
			result.bytes, result.err = io.Copy(dst, src)
		} else {
			// In advanced debug mode where we log (hexdump format) what is copied
			// up to the number of bytes defined by environment variable
			// OPENGCS_LOG_DATA_FROM_UVM
			var buf bytes.Buffer
			tee := io.TeeReader(src, &buf)
			result.bytes, result.err = io.Copy(dst, tee)
			if result.err == nil {
				// Cap the dump at logDataFromUVM bytes. (Renamed from `size`,
				// which shadowed the parameter of the same name.)
				n := result.bytes
				if n > logDataFromUVM {
					n = logDataFromUVM
				}
				if n > 0 {
					// Renamed from `bytes`, which shadowed the bytes package.
					data := make([]byte, n)
					if _, err := buf.Read(data); err == nil {
						// Pass the dump as an argument, NOT as the format string:
						// a literal '%' inside the copied data must not be
						// interpreted as a printf verb by Debugf.
						logrus.Debugf("opengcs: copyWithTimeout\n%s", hex.Dump(data))
					}
				}
			}
		}
		done <- result
	}()

	var result resultType
	timedout := time.After(time.Duration(timeoutSeconds) * time.Second)

	select {
	case <-timedout:
		return 0, fmt.Errorf("opengcs: copyWithTimeout: timed out (%s)", context)
	case result = <-done:
		if result.err != nil && result.err != io.EOF {
			// See https://github.com/golang/go/blob/f3f29d1dea525f48995c1693c609f5e67c046893/src/os/exec/exec_windows.go for a clue as to why we are doing this :)
			if se, ok := result.err.(syscall.Errno); ok {
				const (
					errNoData     = syscall.Errno(232)
					errBrokenPipe = syscall.Errno(109)
				)
				if se == errNoData || se == errBrokenPipe {
					logrus.Debugf("opengcs: copyWithTimeout: hit NoData or BrokenPipe: %d: %s", se, context)
					return result.bytes, nil
				}
			}
			return 0, fmt.Errorf("opengcs: copyWithTimeout: error reading: '%s' after %d bytes (%s)", result.err, result.bytes, context)
		}
	}
	logrus.Debugf("opengcs: copyWithTimeout: success - copied %d bytes (%s)", result.bytes, context)
	return result.bytes, nil
}
// CopyFile is a utility for copying a file - used for the sandbox cache.
// Uses CopyFileW win32 API for performance
//
// overwrite controls whether an existing destFile is replaced; when false,
// the copy fails if destFile already exists.
func CopyFile(srcFile, destFile string, overwrite bool) error {
	// CopyFileW takes bFailIfExists — the inverse of overwrite.
	var failIfExists uint32
	if !overwrite {
		failIfExists = 1
	}

	src, err := syscall.UTF16PtrFromString(srcFile)
	if err != nil {
		return err
	}
	dst, err := syscall.UTF16PtrFromString(destFile)
	if err != nil {
		return err
	}

	r1, _, err := syscall.Syscall(
		procCopyFileW.Addr(),
		3,
		uintptr(unsafe.Pointer(src)),
		uintptr(unsafe.Pointer(dst)),
		uintptr(failIfExists))
	// CopyFileW returns zero on failure; err then carries GetLastError.
	if r1 == 0 {
		return fmt.Errorf("failed CopyFileW Win32 call from '%s' to '%s': %s", srcFile, destFile, err)
	}
	return nil
}

View File

@ -1,69 +0,0 @@
// +build windows
package client
import (
"fmt"
"io"
"os"
"github.com/sirupsen/logrus"
)
// VhdToTar does what is says - it exports a VHD in a specified
// folder (either a read-only layer.vhd, or a read-write sandbox.vhd) to a
// ReadCloser containing a tar-stream of the layers contents.
//
// vhdFile is the host path of the VHD. uvmMountPath is where the sandbox is
// mounted inside the utility VM (used only when isSandbox is true). vhdSize
// is passed to copyWithTimeout as a sizing hint for logging/debug capping.
// The caller must Close() the returned ReadCloser.
//
// NOTE(review): errors in the background stdout-copy goroutine are only
// logged; the pipe writer is closed without an error, so a consumer observes
// a truncated tar stream ending in EOF rather than an error — confirm
// callers tolerate this.
func (config *Config) VhdToTar(vhdFile string, uvmMountPath string, isSandbox bool, vhdSize int64) (io.ReadCloser, error) {
logrus.Debugf("opengcs: VhdToTar: %s isSandbox: %t", vhdFile, isSandbox)
if config.Uvm == nil {
return nil, fmt.Errorf("cannot VhdToTar as no utility VM is in configuration")
}
// No-op unless debug logging and OPENGCS_DEBUG_ENABLE are set (see DebugGCS).
defer config.DebugGCS()
vhdHandle, err := os.Open(vhdFile)
if err != nil {
return nil, fmt.Errorf("opengcs: VhdToTar: failed to open %s: %s", vhdFile, err)
}
defer vhdHandle.Close()
logrus.Debugf("opengcs: VhdToTar: exporting %s, size %d, isSandbox %t", vhdHandle.Name(), vhdSize, isSandbox)
// Different binary depending on whether a RO layer or a RW sandbox
command := "vhd2tar"
if isSandbox {
command = fmt.Sprintf("exportSandbox -path %s", uvmMountPath)
}
// Start the binary in the utility VM
process, err := config.createUtilsProcess(command)
if err != nil {
return nil, fmt.Errorf("opengcs: VhdToTar: %s: failed to create utils process %s: %s", vhdHandle.Name(), command, err)
}
if !isSandbox {
// Send the VHD contents to the utility VM processes stdin handle if not a sandbox
logrus.Debugf("opengcs: VhdToTar: copying the layer VHD into the utility VM")
if _, err = copyWithTimeout(process.Stdin, vhdHandle, vhdSize, config.UvmTimeoutSeconds, fmt.Sprintf("vhdtotarstream: sending %s to %s", vhdHandle.Name(), command)); err != nil {
process.Process.Close()
return nil, fmt.Errorf("opengcs: VhdToTar: %s: failed to copyWithTimeout on the stdin pipe (to utility VM): %s", vhdHandle.Name(), err)
}
}
// Start a goroutine which copies the stdout (ie the tar stream)
reader, writer := io.Pipe()
go func() {
// Close the pipe (signals EOF to the reader) and the UVM process handle
// once the copy finishes, whatever the outcome.
defer writer.Close()
defer process.Process.Close()
logrus.Debugf("opengcs: VhdToTar: copying tar stream back from the utility VM")
bytes, err := copyWithTimeout(writer, process.Stdout, vhdSize, config.UvmTimeoutSeconds, fmt.Sprintf("vhdtotarstream: copy tarstream from %s", command))
if err != nil {
logrus.Errorf("opengcs: VhdToTar: %s: copyWithTimeout on the stdout pipe (from utility VM) failed: %s", vhdHandle.Name(), err)
}
logrus.Debugf("opengcs: VhdToTar: copied %d bytes of the tarstream of %s from the utility VM", bytes, vhdHandle.Name())
}()
// Return the read-side of the pipe connected to the goroutine which is reading from the stdout of the process in the utility VM
return reader, nil
}

View File

@ -1,333 +0,0 @@
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <getopt.h>
#include <net/if.h>
#include <netinet/ip.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mount.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
// Minimal PATH exported to every process exec'd by this init.
#define DEFAULT_PATH_ENV "PATH=/sbin:/usr/sbin:/bin:/usr/bin"
// Environment passed to the child via execvpe (PATH only).
const char *const default_envp[] = {
DEFAULT_PATH_ENV,
NULL,
};
// When nothing is passed, default to the LCOWv1 behavior.
const char *const default_argv[] = { "/bin/gcs", "-loglevel", "debug", "-logfile=/tmp/gcs.log" };
// Debug shell launched alongside the default child (see main()).
const char *const default_shell = "/bin/sh";
// Mount describes one mount(2) call to perform at boot.
struct Mount {
const char *source, *target, *type;
unsigned long flags;
const void *data;
};
// Mkdir describes a directory to create (EEXIST is tolerated by init_fs).
struct Mkdir {
const char *path;
mode_t mode;
};
// Mknod describes a device node to create (EEXIST is tolerated by init_fs).
struct Mknod {
const char *path;
mode_t mode;
int major, minor;
};
// Symlink describes a symbolic link to create (EEXIST is tolerated by init_fs).
struct Symlink {
const char *linkpath, *target;
};
// OpType tags which member of the InitOp union is active.
enum OpType {
OpMount,
OpMkdir,
OpMknod,
OpSymlink,
};
// InitOp is one tagged step of the boot-time filesystem setup table (ops[]).
struct InitOp {
enum OpType op;
union {
struct Mount mount;
struct Mkdir mkdir;
struct Mknod mknod;
struct Symlink symlink;
};
};
// ops is the ordered boot-time filesystem setup table executed by init_fs().
// Entries run top to bottom; order matters (e.g. /dev symlinks rely on /dev
// being mounted first by init_dev, and the cgroup tmpfs relies on /sys).
const struct InitOp ops[] = {
// mount /proc (which should already exist)
{ OpMount, .mount = { "proc", "/proc", "proc", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
// add symlinks in /dev (which is already mounted)
{ OpSymlink, .symlink = { "/dev/fd", "/proc/self/fd" } },
{ OpSymlink, .symlink = { "/dev/stdin", "/proc/self/fd/0" } },
{ OpSymlink, .symlink = { "/dev/stdout", "/proc/self/fd/1" } },
{ OpSymlink, .symlink = { "/dev/stderr", "/proc/self/fd/2" } },
// mount tmpfs on /run and /tmp (which should already exist)
{ OpMount, .mount = { "tmpfs", "/run", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755" } },
{ OpMount, .mount = { "tmpfs", "/tmp", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
// mount shm and devpts
{ OpMkdir, .mkdir = { "/dev/shm", 0755 } },
{ OpMount, .mount = { "shm", "/dev/shm", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
{ OpMkdir, .mkdir = { "/dev/pts", 0755 } },
{ OpMount, .mount = { "devpts", "/dev/pts", "devpts", MS_NOSUID | MS_NOEXEC } },
// mount /sys (which should already exist)
{ OpMount, .mount = { "sysfs", "/sys", "sysfs", MS_NODEV | MS_NOSUID | MS_NOEXEC } },
{ OpMount, .mount = { "cgroup_root", "/sys/fs/cgroup", "tmpfs", MS_NODEV | MS_NOSUID | MS_NOEXEC, "mode=0755" } },
};
// warn prints msg followed by the current errno text to stderr (via perror),
// taking care to leave errno unchanged for the caller.
void warn(const char *msg) {
    int saved_errno = errno;
    perror(msg);
    errno = saved_errno;
}
// warn2 prints "msg1: msg2: <errno text>" to stderr, preserving errno.
// The fputs calls may themselves clobber errno, so it is restored before
// delegating to warn().
void warn2(const char *msg1, const char *msg2) {
    int saved_errno = errno;
    fputs(msg1, stderr);
    fputs(": ", stderr);
    errno = saved_errno;
    warn(msg2);
}
// dien terminates the process using the current errno as the exit status.
_Noreturn void dien() {
exit(errno);
}
// die prints msg with the errno text (via warn) and exits with errno.
_Noreturn void die(const char *msg) {
warn(msg);
dien();
}
// die2 prints "msg1: msg2: <errno text>" (via warn2) and exits with errno.
_Noreturn void die2(const char *msg1, const char *msg2) {
warn2(msg1, msg2);
dien();
}
// init_dev mounts devtmpfs on /dev. An EBUSY failure is tolerated because
// the kernel may have mounted it already (see the comment below); any other
// failure is fatal.
void init_dev() {
if (mount("dev", "/dev", "devtmpfs", MS_NOSUID | MS_NOEXEC, NULL) < 0) {
warn2("mount", "/dev");
// /dev will be already mounted if devtmpfs.mount = 1 on the kernel
// command line or CONFIG_DEVTMPFS_MOUNT is set. Do not consider this
// an error.
if (errno != EBUSY) {
dien();
}
}
}
// init_fs executes the boot-time filesystem setup table in order.
// Mount failures are fatal; mkdir/mknod/symlink tolerate EEXIST (the target
// may already exist on some kernels/images) but die on any other error.
void init_fs(const struct InitOp *ops, size_t count) {
    for (size_t idx = 0; idx < count; idx++) {
        const struct InitOp *cur = &ops[idx];
        switch (cur->op) {
        case OpMount: {
            const struct Mount *mnt = &cur->mount;
            if (mount(mnt->source, mnt->target, mnt->type, mnt->flags, mnt->data) < 0) {
                die2("mount", mnt->target);
            }
            break;
        }
        case OpMkdir: {
            const struct Mkdir *mk = &cur->mkdir;
            if (mkdir(mk->path, mk->mode) < 0) {
                warn2("mkdir", mk->path);
                if (errno != EEXIST) {
                    dien();
                }
            }
            break;
        }
        case OpMknod: {
            const struct Mknod *nod = &cur->mknod;
            if (mknod(nod->path, nod->mode, makedev(nod->major, nod->minor)) < 0) {
                warn2("mknod", nod->path);
                if (errno != EEXIST) {
                    dien();
                }
            }
            break;
        }
        case OpSymlink: {
            const struct Symlink *lnk = &cur->symlink;
            if (symlink(lnk->target, lnk->linkpath) < 0) {
                warn2("symlink", lnk->linkpath);
                if (errno != EEXIST) {
                    dien();
                }
            }
            break;
        }
        }
    }
}
// init_cgroups parses /proc/cgroups and, for every controller reported as
// enabled, creates /sys/fs/cgroup/<name> and mounts the cgroup-v1 controller
// there. Assumes /sys/fs/cgroup is already a writable tmpfs (mounted via the
// ops table). Any failure is fatal.
void init_cgroups() {
    const char *fpath = "/proc/cgroups";
    FILE *f = fopen(fpath, "r");
    if (f == NULL) {
        die2("fopen", fpath);
    }
    // Skip the first line.
    for (;;) {
        // fgetc returns int; storing it in a char broke EOF detection on
        // platforms where char is unsigned (and conflated EOF with 0xFF).
        int c = fgetc(f);
        if (c == EOF || c == '\n') {
            break;
        }
    }
    for (;;) {
        // path holds "/sys/fs/cgroup/" followed by the controller name;
        // name points at the suffix so fscanf writes the name in place.
        // The "+ 1" reserves room for the terminating NUL: "%64s" stores up
        // to 64 characters PLUS a NUL, which previously overflowed by one.
        static const char base_path[] = "/sys/fs/cgroup/";
        char path[sizeof(base_path) - 1 + 64 + 1];
        char* name = path + sizeof(base_path) - 1;
        int hier, groups, enabled;
        int r = fscanf(f, "%64s %d %d %d\n", name, &hier, &groups, &enabled);
        if (r == EOF) {
            break;
        }
        if (r != 4) {
            // GNU "?:" — keep a real errno if fscanf set one, else EINVAL.
            errno = errno ? : EINVAL;
            die2("fscanf", fpath);
        }
        if (enabled) {
            memcpy(path, base_path, sizeof(base_path) - 1);
            if (mkdir(path, 0755) < 0) {
                die2("mkdir", path);
            }
            if (mount(name, path, "cgroup", MS_NODEV | MS_NOSUID | MS_NOEXEC, name) < 0) {
                die2("mount", path);
            }
        }
    }
    fclose(f);
}
// init_network brings interface iface up for the given address family
// (AF_INET or AF_INET6) by setting IFF_UP | IFF_RUNNING via ioctl on a
// throwaway datagram socket. A kernel without support for the family
// (EAFNOSUPPORT) is silently tolerated; other failures are fatal.
void init_network(const char *iface, int domain) {
int s = socket(domain, SOCK_DGRAM, IPPROTO_IP);
if (s < 0) {
if (errno == EAFNOSUPPORT) {
return;
}
die("socket");
}
struct ifreq request = {0};
// NOTE(review): strncpy leaves ifr_name unterminated if iface is exactly
// sizeof(ifr_name) long; the only caller passes "lo", so this is benign.
strncpy(request.ifr_name, iface, sizeof(request.ifr_name));
if (ioctl(s, SIOCGIFFLAGS, &request) < 0) {
die2("ioctl(SIOCGIFFLAGS)", iface);
}
request.ifr_flags |= IFF_UP | IFF_RUNNING;
if (ioctl(s, SIOCSIFFLAGS, &request) < 0) {
die2("ioctl(SIOCSIFFLAGS)", iface);
}
close(s);
}
// launch forks and execs argv (argc entries; need not be NULL-terminated)
// in a new session and process group, with all signals unblocked and PATH
// set to DEFAULT_PATH_ENV. Returns the child's pid in the parent; never
// returns in the child (exec succeeds or the child dies).
pid_t launch(int argc, char **argv) {
int pid = fork();
if (pid != 0) {
if (pid < 0) {
die("fork");
}
// Parent: hand the child's pid back for reaping.
return pid;
}
// Child from here on.
// Unblock signals before execing.
sigset_t set;
sigfillset(&set);
sigprocmask(SIG_UNBLOCK, &set, 0);
// Create a session and process group.
setsid();
setpgid(0, 0);
// Terminate the arguments and exec.
char **argvn = alloca(sizeof(argv[0]) * (argc + 1));
memcpy(argvn, argv, sizeof(argv[0]) * argc);
argvn[argc] = NULL;
if (putenv(DEFAULT_PATH_ENV)) { // Specify the PATH used for execvpe
die("putenv");
}
execvpe(argvn[0], argvn, (char**)default_envp);
die2("execvpe", argvn[0]);
}
// reap_until reaps exited children (init's job) until the child with pid
// until_pid dies, then returns its exit status — or 128 + the signal number
// if it was killed by a signal, mirroring shell convention.
int reap_until(pid_t until_pid) {
for (;;) {
int status;
pid_t pid = wait(&status);
if (pid < 0) {
die("wait");
}
if (pid == until_pid) {
// The initial child process died. Pass through the exit status.
if (WIFEXITED(status)) {
if (WEXITSTATUS(status) != 0) {
fputs("child exited with error\n", stderr);
}
return WEXITSTATUS(status);
}
fputs("child exited by signal\n", stderr);
return 128 + WTERMSIG(status);
}
}
}
// main is the init process of the LCOW utility VM. With no arguments it runs
// the default GCS daemon plus a debug shell; otherwise it runs the given
// program (optionally "-d <shell>" also starts a debug shell, which becomes
// the primary child). It performs basic system setup (devices, filesystems,
// cgroups, loopback) and then reaps children until the primary child exits,
// passing that exit status through.
int main(int argc, char **argv) {
char *debug_shell = NULL;
if (argc <= 1) {
// No arguments: fall back to the LCOWv1 defaults (GCS + /bin/sh).
argv = (char **)default_argv;
argc = sizeof(default_argv) / sizeof(default_argv[0]);
optind = 0;
debug_shell = (char*)default_shell;
} else {
for (int opt; (opt = getopt(argc, argv, "+d:")) >= 0; ) {
switch (opt) {
case 'd':
debug_shell = optarg;
break;
default:
exit(1);
}
}
}
// Everything after the options is the child program and its arguments.
char **child_argv = argv + optind;
int child_argc = argc - optind;
// Block all signals in init. SIGCHLD will still cause wait() to return.
sigset_t set;
sigfillset(&set);
sigprocmask(SIG_BLOCK, &set, 0);
init_dev();
init_fs(ops, sizeof(ops) / sizeof(ops[0]));
init_cgroups();
init_network("lo", AF_INET);
init_network("lo", AF_INET6);
pid_t pid = launch(child_argc, child_argv);
if (debug_shell != NULL) {
// The debug shell takes over as the primary child.
pid = launch(1, &debug_shell);
}
// Reap until the initial child process dies.
return reap_until(pid);
}

View File

@ -1,129 +0,0 @@
// vsockexec opens vsock connections for the specified stdio descriptors and
// then execs the specified process.
#include <errno.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>
#include <netinet/in.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
// Compile-time transport switch: building with -DUSE_TCP makes the stdio
// ports loopback TCP connections (opentcp); otherwise they are AF_VSOCK
// connections to the host (openvsock).
#ifdef USE_TCP
static const int tcpmode = 1;
#else
static const int tcpmode;
#endif
// openvsock connects an AF_VSOCK stream socket to (cid, port).
// Returns the connected fd, or -1 after printing a diagnostic to stderr.
static int openvsock(unsigned int cid, unsigned int port)
{
    int s = socket(AF_VSOCK, SOCK_STREAM, 0);
    if (s < 0) {
        perror("socket: AF_VSOCK");
        return -1;
    }
    struct sockaddr_vm addr = {0};
    addr.svm_family = AF_VSOCK;
    addr.svm_port = port;
    addr.svm_cid = cid;
    if (connect(s, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        // Trailing newline added for parity with opentcp(); the message
        // previously ran into subsequent output.
        fprintf(stderr, "connect: port %u: %s\n", port, strerror(errno));
        close(s); // don't leak the fd on failure
        return -1;
    }
    return s;
}
// opentcp connects a loopback TCP socket to the given port (USE_TCP builds).
// Returns the connected fd, or -1 after printing a diagnostic to stderr.
static int opentcp(unsigned short port)
{
    int s = socket(AF_INET, SOCK_STREAM, 0);
    if (s < 0) {
        perror("socket: AF_INET");
        return -1;
    }
    struct sockaddr_in addr = {0};
    addr.sin_family = AF_INET;
    addr.sin_port = htons(port);
    addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    if (connect(s, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
        fprintf(stderr, "connect: port %u: %s\n", port, strerror(errno));
        close(s); // don't leak the fd on failure
        return -1;
    }
    return s;
}
// usage prints the accepted command line to stderr and exits non-zero.
_Noreturn static void usage(const char *argv0)
{
fprintf(stderr, "%s [-i port] [-o port] [-e port] -- program [args...]\n", argv0);
exit(1);
}
// main parses -i/-o/-e (ports for stdin/stdout/stderr), opens one connection
// per distinct port (duplicating the fd when two stdio streams share a
// port), wires the sockets onto fds 0/1/2, and finally execs the target
// program with the remaining arguments.
int main(int argc, char **argv)
{
unsigned int ports[3] = {0};
int sockets[3] = {-1, -1, -1};
int c;
while ((c = getopt(argc, argv, "+i:o:e:")) != -1) {
switch (c) {
case 'i':
ports[0] = strtoul(optarg, NULL, 10);
break;
case 'o':
ports[1] = strtoul(optarg, NULL, 10);
break;
case 'e':
ports[2] = strtoul(optarg, NULL, 10);
break;
default:
usage(argv[0]);
}
}
if (optind == argc) {
fprintf(stderr, "%s: missing program argument\n", argv[0]);
usage(argv[0]);
}
for (int i = 0; i < 3; i++) {
if (ports[i] != 0) {
int j;
// Reuse an already-opened socket when an earlier stream uses the same port.
for (j = 0; j < i; j++) {
if (ports[i] == ports[j]) {
int s = dup(sockets[j]);
if (s < 0) {
perror("dup");
return 1;
}
sockets[i] = s;
break;
}
}
// No earlier stream matched: open a fresh connection for this port.
if (j == i) {
int s = tcpmode ? opentcp(ports[i]) : openvsock(VMADDR_CID_HOST, ports[i]);
if (s < 0) {
return 1;
}
sockets[i] = s;
}
}
}
// Move the connected sockets onto stdin/stdout/stderr.
for (int i = 0; i < 3; i++) {
if (sockets[i] >= 0) {
dup2(sockets[i], i);
close(sockets[i]);
}
}
execvp(argv[optind], argv + optind);
fprintf(stderr, "execvp: %s: %s\n", argv[optind], strerror(errno));
return 1;
}