moby--moby/daemon/graphdriver/lcow/lcow.go

// +build windows

// Locale:      en-gb
// About:       Graph-driver for Linux Containers On Windows (LCOW)
//
// This graphdriver runs in two modes. Yet to be determined which one will
// be the shipping mode. The global mode is where a single utility VM
// is used for all service VM tool operations. This isn't safe security-wise
// as it's attaching a sandbox of multiple containers to it, containing
// untrusted data. This may be fine for client devops scenarios. In
// safe mode, a unique utility VM is instantiated for all service VM tool
// operations. The downside of safe-mode is that operations are slower as
// a new service utility VM has to be started and torn-down when needed.
//
// Options:
//
// The following options are read by the graphdriver itself:
//
//   * lcow.globalmode - Enables global service VM Mode
//        -- Possible values:     true/false
//        -- Default if omitted:  false
//
//   * lcow.sandboxsize - Specifies a custom sandbox size in GB for starting a container
//        -- Possible values:      >= default sandbox size (opengcs defined, currently 20)
//        -- Default if omitted:  20
//
// The following options are read by opengcs:
//
//   * lcow.kirdpath - Specifies a custom path to a kernel/initrd pair
//        -- Possible values:      Any local path that is not a mapped drive
//        -- Default if omitted:  %ProgramFiles%\Linux Containers
//
//   * lcow.bootparameters - Specifies additional boot parameters for booting in kernel+initrd mode
//        -- Possible values:      Any valid linux kernel boot options
//        -- Default if omitted:  <nil>
//
//   * lcow.timeout - Specifies a timeout for utility VM operations in seconds
//        -- Possible values:      >=0
//        -- Default if omitted:  300

// TODO: Grab logs from SVM at terminate or errors

package lcow // import "github.com/docker/docker/daemon/graphdriver/lcow"

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io"
	"io/ioutil"
	"os"
	"path"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/Microsoft/go-winio/pkg/security"
	"github.com/Microsoft/hcsshim"
	"github.com/Microsoft/hcsshim/ext4/tar2ext4"
	"github.com/Microsoft/opengcs/client"
	"github.com/docker/docker/daemon/graphdriver"
	"github.com/docker/docker/pkg/archive"
	"github.com/docker/docker/pkg/containerfs"
	"github.com/docker/docker/pkg/idtools"
	"github.com/docker/docker/pkg/ioutils"
	"github.com/docker/docker/pkg/reexec"
	"github.com/sirupsen/logrus"
)

// noreexec disables the re-exec code path when true. It is off by default and
// is only switched on for debugging purposes.
var noreexec bool

// init registers this driver with the graphdriver registry under the name
// "lcow", using InitDriver (implemented in this file) as its initialiser.
// Unless re-exec has been disabled, it also registers the tar2ext4 re-exec
// entry point.
func init() {
	graphdriver.Register("lcow", InitDriver)

	// DOCKER_LCOW_NOREEXEC allows for inline processing, which makes
	// debugging issues in the re-exec codepath significantly easier.
	if os.Getenv("DOCKER_LCOW_NOREEXEC") == "" {
		reexec.Register("docker-lcow-tar2ext4", tar2ext4Reexec)
		return
	}

	logrus.Warnf("LCOW Graphdriver is set to not re-exec. This is intended for debugging purposes only.")
	noreexec = true
}

// File names, in-VM paths, map keys and error codes shared by the driver.
const (
	// sandboxFilename is the name of the file containing a layer's sandbox (read-write layer).
	sandboxFilename = "sandbox.vhdx"

	// scratchFilename is the name of the scratch-space used by an SVM to avoid running out of memory.
	scratchFilename = "scratch.vhdx"

	// layerFilename is the name of the file containing a layer's read-only contents.
	// Note this really is VHD format, not VHDX.
	layerFilename = "layer.vhd"

	// toolsScratchPath is a location in a service utility VM that the tools can use as a
	// scratch space to avoid running out of memory.
	toolsScratchPath = "/tmp/scratch"

	// svmGlobalID is the ID used in the serviceVMs map for the global service VM when running in "global" mode.
	svmGlobalID = "_lcow_global_svm_"

	// cacheDirectory is the sub-folder under the driver's data-root used to cache blank sandbox and scratch VHDs.
	cacheDirectory = "cache"

	// scratchDirectory is the sub-folder under the driver's data-root used for scratch VHDs in service VMs.
	scratchDirectory = "scratch"

	// errOperationPending is the HRESULT returned by the HCS when the VM termination operation is still pending.
	// waitTerminate treats it as "keep waiting" rather than as a failure.
	errOperationPending syscall.Errno = 0xc0370103
)

// Driver represents an LCOW graph driver. It holds the on-disk locations the
// driver works with, the options it was initialised with, and the map of
// service VMs it has started.
type Driver struct {
	dataRoot           string     // Root path on the host where we are storing everything.
	cachedSandboxFile  string     // Location of the local default-sized cached sandbox.
	cachedSandboxMutex sync.Mutex // Protects race conditions from multiple threads creating the cached sandbox.
	cachedScratchFile  string     // Location of the local cached empty scratch space.
	cachedScratchMutex sync.Mutex // Protects race conditions from multiple threads creating the cached scratch.
	options            []string   // Graphdriver options we are initialised with.
	globalMode         bool       // Indicates if running in an unsafe/global service VM mode.
	defaultSandboxSize uint64     // The default sandbox size (in GB) to use if one is not specified.

	// NOTE: It is OK to use a cache here because Windows does not support
	// restoring containers when the daemon dies.
	serviceVms *serviceVMMap // Map of the configs representing the service VM(s) we are running.
}

// layerDetails is the structure returned by the helper function
// `getLayerDetails`, describing the VHD(X) file found in a layer folder.
type layerDetails struct {
	filename  string // \path\to\sandbox.vhdx or \path\to\layer.vhd
	size      int64  // size of the above file
	isSandbox bool   // true if sandbox.vhdx
}

// deletefiles is a filepath.WalkFunc used during initialisation to delete any
// left-over scratch files (*.vhdx) in case we were previously forcibly
// terminated.
//
// When the walk reports an error for an entry, the FileInfo may be nil, so
// the entry is skipped (with a warning) rather than inspected. Skipping keeps
// the clean-up best-effort: one unreadable entry does not stop the walk.
// An error is returned only when removing a stale file fails.
func deletefiles(path string, f os.FileInfo, err error) error {
	if err != nil || f == nil {
		logrus.Warnf("lcowdriver: init: skipping %s while deleting stale scratch files: %v", path, err)
		return nil
	}
	if strings.HasSuffix(f.Name(), ".vhdx") {
		logrus.Warnf("lcowdriver: init: deleting stale scratch file %s", path)
		return os.Remove(path)
	}
	return nil
}

// InitDriver returns a new LCOW storage driver rooted at dataRoot.
//
// The options slice is scanned for "key=value" entries. Two keys are
// interpreted here (case-insensitively); everything else is kept untouched in
// the driver for later use:
//
//   - lcow.globalmode:  boolean, enables the global service VM mode.
//   - lcow.sandboxsize: default sandbox size, which must not be smaller
//     than client.DefaultVhdxSizeGB.
//
// The data root, cache and scratch directories are created if missing, and
// any stale *.vhdx files left in the scratch directory are deleted. An error
// is returned if an option fails to parse or a directory cannot be created;
// a failure to delete stale scratch files is only logged.
func InitDriver(dataRoot string, options []string, _, _ []idtools.IDMap) (graphdriver.Driver, error) {
	title := "lcowdriver: init:"

	cd := filepath.Join(dataRoot, cacheDirectory)
	sd := filepath.Join(dataRoot, scratchDirectory)

	d := &Driver{
		dataRoot:          dataRoot,
		options:           options,
		cachedSandboxFile: filepath.Join(cd, sandboxFilename),
		cachedScratchFile: filepath.Join(cd, scratchFilename),
		serviceVms: &serviceVMMap{
			svms: make(map[string]*serviceVMMapItem),
		},
		globalMode:         false,
		defaultSandboxSize: client.DefaultVhdxSizeGB,
	}

	// Look for the options this driver understands; ignore anything that is
	// not of the form key=value.
	for _, v := range options {
		opt := strings.SplitN(v, "=", 2)
		if len(opt) != 2 {
			continue
		}
		switch strings.ToLower(opt[0]) {
		case "lcow.globalmode":
			var err error
			d.globalMode, err = strconv.ParseBool(opt[1])
			if err != nil {
				return nil, fmt.Errorf("%s failed to parse value for 'lcow.globalmode' - must be 'true' or 'false'", title)
			}
		case "lcow.sandboxsize":
			var err error
			d.defaultSandboxSize, err = strconv.ParseUint(opt[1], 10, 32)
			if err != nil {
				return nil, fmt.Errorf("%s failed to parse value '%s' for 'lcow.sandboxsize'", title, v)
			}
			if d.defaultSandboxSize < client.DefaultVhdxSizeGB {
				return nil, fmt.Errorf("%s 'lcow.sandboxsize' option cannot be less than %d", title, client.DefaultVhdxSizeGB)
			}
		}
	}

	// Make sure the dataRoot directory exists, followed by the cache and
	// scratch directories underneath it.
	for _, dir := range []string{dataRoot, cd, sd} {
		if err := idtools.MkdirAllAndChown(dir, 0700, idtools.Identity{UID: 0, GID: 0}); err != nil {
			return nil, fmt.Errorf("%s failed to create '%s': %v", title, dir, err)
		}
	}

	// Delete any items in the scratch directory. This is best-effort: a
	// failure is reported but does not prevent the driver from starting.
	if err := filepath.Walk(sd, deletefiles); err != nil {
		logrus.Warnf("%s failed to delete stale scratch files in '%s': %v", title, sd, err)
	}

	logrus.Infof("%s dataRoot: %s globalMode: %t", title, dataRoot, d.globalMode)

	return d, nil
}

// getVMID returns the service VM identifier to use for the given id: the id
// itself normally, or the shared svmGlobalID when running in global mode.
func (d *Driver) getVMID(id string) string {
	if !d.globalMode {
		return id
	}
	return svmGlobalID
}

// remapLongToShortContainerPath maps the long container path of a SCSI
// attached disk to the short container path where it is actually mounted,
// namely /tmp/d<attachCounter>. An empty path and the tools scratch path are
// returned unchanged. svmName is only used for logging.
func remapLongToShortContainerPath(longContainerPath string, attachCounter uint64, svmName string) string {
	if longContainerPath == "" || longContainerPath == toolsScratchPath {
		return longContainerPath
	}

	shortContainerPath := fmt.Sprintf("/tmp/d%d", attachCounter)
	logrus.Debugf("lcowdriver: UVM %s: remapping %s --> %s", svmName, longContainerPath, shortContainerPath)
	return shortContainerPath
}

// startServiceVMIfNotRunning starts a service utility VM if it is not currently running.
// It can optionally be started with mapped virtual disks. Returns an opengcs config structure
// representing the VM.
//
// Parameters:
//   - id:       the layer/container ID; replaced by the global ID when in global mode.
//   - mvdToAdd: disks to attach. They are hot-added if the VM already exists,
//     otherwise they are added to its startup configuration.
//   - context:  free-form text used only in log and error messages.
//
// The named return value err is inspected by the deferred functions below, which
// signal start completion and undo the work on failure.
func (d *Driver) startServiceVMIfNotRunning(id string, mvdToAdd []hcsshim.MappedVirtualDisk, context string) (_ *serviceVM, err error) {
	// Use the global ID if in global mode
	id = d.getVMID(id)

	title := "lcowdriver: startServiceVMIfNotRunning " + id

	// Attempt to add ID to the service vm map
	logrus.Debugf("%s: adding entry to service vm map", title)
	svm, exists, err := d.serviceVms.add(id)
	if err != nil && err == errVMisTerminating {
		// VM is in the process of terminating. Wait until it's done and then try again
		logrus.Debugf("%s: VM with current ID still in the process of terminating", title)
		if err := svm.getStopError(); err != nil {
			logrus.Debugf("%s: VM did not stop successfully: %s", title, err)
			return nil, err
		}
		return d.startServiceVMIfNotRunning(id, mvdToAdd, context)
	} else if err != nil {
		logrus.Debugf("%s: failed to add service vm to map: %s", title, err)
		return nil, fmt.Errorf("%s: failed to add to service vm map: %s", title, err)
	}

	if exists {
		// Service VM is already up and running. In this case, just hot add the vhds.
		// Note that hotAddVHDs will remap long to short container paths, so no need
		// for us to do that here.
		logrus.Debugf("%s: service vm already exists. Just hot adding: %+v", title, mvdToAdd)
		if err := svm.hotAddVHDs(mvdToAdd...); err != nil {
			logrus.Debugf("%s: failed to hot add vhds on service vm creation: %s", title, err)
			return nil, fmt.Errorf("%s: failed to hot add vhds on service vm: %s", title, err)
		}
		return svm, nil
	}

	// We are the first service for this id, so we need to start it
	logrus.Debugf("%s: service vm doesn't exist. Now starting it up", title)

	defer func() {
		// Signal that start has finished, passing in the error if any.
		svm.signalStartFinished(err)
		if err != nil {
			// We added a ref to the VM, since we failed, we should delete the ref.
			d.terminateServiceVM(id, "error path on startServiceVMIfNotRunning", false)
		}
	}()

	// Generate a default configuration
	if err := svm.config.GenerateDefault(d.options); err != nil {
		return nil, fmt.Errorf("%s: failed to generate default gogcs configuration for global svm (%s): %s", title, context, err)
	}

	// For the name, we deliberately suffix if safe-mode to ensure that it doesn't
	// clash with another utility VM which may be running for the container itself.
	// This also makes it easier to correlate through Get-ComputeProcess.
	if id == svmGlobalID {
		svm.config.Name = svmGlobalID
	} else {
		svm.config.Name = fmt.Sprintf("%s_svm", id)
	}

	// Ensure we take the cached scratch mutex around the check to ensure the file is complete
	// and not in the process of being created by another thread.
	scratchTargetFile := filepath.Join(d.dataRoot, scratchDirectory, fmt.Sprintf("%s.vhdx", id))

	logrus.Debugf("%s: locking cachedScratchMutex", title)
	d.cachedScratchMutex.Lock()
	if _, err := os.Stat(d.cachedScratchFile); err == nil {
		// Make a copy of cached scratch to the scratch directory
		logrus.Debugf("%s: (%s) cloning cached scratch for mvd", title, context)
		if err := client.CopyFile(d.cachedScratchFile, scratchTargetFile, true); err != nil {
			logrus.Debugf("%s: releasing cachedScratchMutex on err: %s", title, err)
			d.cachedScratchMutex.Unlock()
			return nil, err
		}

		// Add the cached clone as a mapped virtual disk
		logrus.Debugf("%s: (%s) adding cloned scratch as mvd", title, context)
		mvd := hcsshim.MappedVirtualDisk{
			HostPath:          scratchTargetFile,
			ContainerPath:     toolsScratchPath,
			CreateInUtilityVM: true,
		}
		svm.config.MappedVirtualDisks = append(svm.config.MappedVirtualDisks, mvd)
		svm.scratchAttached = true
	}

	logrus.Debugf("%s: releasing cachedScratchMutex", title)
	d.cachedScratchMutex.Unlock()

	// Add mapped virtual disks. First those that are already in the configuration. Generally,
	// the only one that will be here is the service VMs scratch. The exception is when invoked
	// via the graphdrivers DiffGetter implementation.
	for i, mvd := range svm.config.MappedVirtualDisks {
		svm.attachCounter++
		svm.attachedVHDs[mvd.HostPath] = &attachedVHD{refCount: 1, attachCounter: svm.attachCounter}

		// No-op for the service VMs scratch disk. Only applicable in the DiffGetter interface invocation.
		svm.config.MappedVirtualDisks[i].ContainerPath = remapLongToShortContainerPath(mvd.ContainerPath, svm.attachCounter, svm.config.Name)
	}

	// Then the remaining ones to add, and adding them to the startup configuration.
	for _, mvd := range mvdToAdd {
		svm.attachCounter++
		svm.attachedVHDs[mvd.HostPath] = &attachedVHD{refCount: 1, attachCounter: svm.attachCounter}
		mvd.ContainerPath = remapLongToShortContainerPath(mvd.ContainerPath, svm.attachCounter, svm.config.Name)
		svm.config.MappedVirtualDisks = append(svm.config.MappedVirtualDisks, mvd)
	}

	// Start it.
	logrus.Debugf("%s: (%s) starting %s", title, context, svm.config.Name)
	if err := svm.config.StartUtilityVM(); err != nil {
		return nil, fmt.Errorf("failed to start service utility VM (%s): %s", context, err)
	}

	// Deferred function to terminate the VM if the next steps fail. Being registered
	// later, it runs before the deferred function above that signals start completion.
	defer func() {
		if err != nil {
			waitTerminate(svm, fmt.Sprintf("%s: (%s)", title, context))
		}
	}()

	// Now we have a running service VM, we can create the cached scratch file if it doesn't exist.
	logrus.Debugf("%s: locking cachedScratchMutex", title)
	d.cachedScratchMutex.Lock()
	if _, err := os.Stat(d.cachedScratchFile); err != nil {
		logrus.Debugf("%s: (%s) creating an SVM scratch", title, context)

		// Don't use svm.CreateExt4Vhdx since that only works when the service vm is setup,
		// but we're still in that process right now.
		if err := svm.config.CreateExt4Vhdx(scratchTargetFile, client.DefaultVhdxSizeGB, d.cachedScratchFile); err != nil {
			logrus.Debugf("%s: (%s) releasing cachedScratchMutex on error path", title, context)
			d.cachedScratchMutex.Unlock()
			logrus.Debugf("%s: failed to create vm scratch %s: %s", title, scratchTargetFile, err)
			return nil, fmt.Errorf("failed to create SVM scratch VHDX (%s): %s", context, err)
		}
	}
	logrus.Debugf("%s: (%s) releasing cachedScratchMutex", title, context)
	d.cachedScratchMutex.Unlock()

	// Hot-add the scratch-space if not already attached
	if !svm.scratchAttached {
		logrus.Debugf("%s: (%s) hot-adding scratch %s", title, context, scratchTargetFile)
		if err := svm.hotAddVHDsAtStart(hcsshim.MappedVirtualDisk{
			HostPath:          scratchTargetFile,
			ContainerPath:     toolsScratchPath,
			CreateInUtilityVM: true,
		}); err != nil {
			logrus.Debugf("%s: failed to hot-add scratch %s: %s", title, scratchTargetFile, err)
			return nil, fmt.Errorf("failed to hot-add %s failed: %s", scratchTargetFile, err)
		}
		svm.scratchAttached = true
		// Don't need to ref-count here as it will be done via hotAddVHDsAtStart() call above.
	}

	logrus.Debugf("%s: (%s) success", title, context)
	return svm, nil
}

// terminateServiceVM terminates a service utility VM if it is running and is
// not being used by any other goroutine. It does nothing when in global mode,
// as the global VM's lifetime is that of the daemon. If the force flag is set,
// the VM will be killed regardless of the ref count or whether it is global.
//
// Parameters:
//   - id:      the layer/container ID; mapped to the global ID when in global mode.
//   - context: free-form text used only in log messages.
//   - force:   ignore the reference count (and global mode) and terminate anyway.
//
// The named return value err is also what gets passed to signalStopFinished by
// the deferred clean-up, and is overwritten there if deleting the scratch file fails.
func (d *Driver) terminateServiceVM(id, context string, force bool) (err error) {
	// We don't do anything in global mode unless the force flag has been passed, which
	// is only the case for cleanup at driver termination.
	if d.globalMode && !force {
		logrus.Debugf("lcowdriver: terminateservicevm: %s (%s) - doing nothing as in global mode", id, context)
		return nil
	}

	id = d.getVMID(id)

	var svm *serviceVM
	var lastRef bool
	if !force {
		// In the not force case, we ref count
		svm, lastRef, err = d.serviceVms.decrementRefCount(id)
	} else {
		// In the force case, we ignore the ref count and just set it to 0
		svm, err = d.serviceVms.setRefCountZero(id)
		lastRef = true
	}

	if err == errVMUnknown {
		// Nothing is registered under this id, so there is nothing to terminate.
		return nil
	} else if err == errVMisTerminating {
		// Termination is already in progress elsewhere; report its outcome.
		return svm.getStopError()
	} else if !lastRef {
		// Other references remain, so the VM is left running.
		return nil
	}

	// We run the deletion of the scratch as a deferred function to at least attempt
	// clean-up in case of errors.
	defer func() {
		if svm.scratchAttached {
			scratchTargetFile := filepath.Join(d.dataRoot, scratchDirectory, fmt.Sprintf("%s.vhdx", id))
			logrus.Debugf("lcowdriver: terminateservicevm: %s (%s) - deleting scratch %s", id, context, scratchTargetFile)
			if errRemove := os.Remove(scratchTargetFile); errRemove != nil {
				logrus.Warnf("failed to remove scratch file %s (%s): %s", scratchTargetFile, context, errRemove)
				err = errRemove
			}
		}

		// This function shouldn't actually return error unless there is a bug
		if errDelete := d.serviceVms.deleteID(id); errDelete != nil {
			logrus.Warnf("failed to service vm from svm map %s (%s): %s", id, context, errDelete)
		}

		// Signal that this VM has stopped
		svm.signalStopFinished(err)
	}()

	// Now it's possible that the service VM failed to start and now we are trying to terminate it.
	// In this case, we will relay the error to the goroutines waiting for this vm to stop.
	if err := svm.getStartError(); err != nil {
		logrus.Debugf("lcowdriver: terminateservicevm: %s had failed to start up: %s", id, err)
		return err
	}

	if err := waitTerminate(svm, fmt.Sprintf("terminateservicevm: %s (%s)", id, context)); err != nil {
		return err
	}

	logrus.Debugf("lcowdriver: terminateservicevm: %s (%s) - success", id, context)
	return nil
}

// waitTerminate asks the utility VM behind svm to terminate and then waits,
// for up to the configured UvmTimeoutSeconds, for it to do so.
//
// An "operation pending" result from the terminate call is not treated as a
// failure, because the wait that follows covers it. context is free-form text
// included in log and error messages. An error is returned if svm has no
// configuration, if terminate fails for any other reason, or if the wait fails.
func waitTerminate(svm *serviceVM, context string) error {
	if svm.config == nil {
		return fmt.Errorf("lcowdriver: waitTerminate: Nil utility VM. %s", context)
	}

	logrus.Debugf("lcowdriver: waitTerminate: Calling terminate: %s", context)
	if err := svm.config.Uvm.Terminate(); err != nil {
		// We might get operation still pending from the HCS. In that case, we shouldn't return
		// an error since we call wait right after. The HCS error may be wrapped in a
		// ContainerError, so unwrap it before comparing.
		underlyingError := err
		if conterr, ok := err.(*hcsshim.ContainerError); ok {
			underlyingError = conterr.Err
		}

		if underlyingError != errOperationPending {
			return fmt.Errorf("failed to terminate utility VM (%s): %s", context, err)
		}
		logrus.Debugf("lcowdriver: waitTerminate: uvm.Terminate() returned operation pending (%s)", context)
	}

	logrus.Debugf("lcowdriver: waitTerminate: (%s) - waiting for utility VM to terminate", context)
	if err := svm.config.Uvm.WaitTimeout(time.Duration(svm.config.UvmTimeoutSeconds) * time.Second); err != nil {
		return fmt.Errorf("failed waiting for utility VM to terminate (%s): %s", context, err)
	}
	return nil
}

// String returns the string representation of a driver, which should match
// the name under which the graph driver was registered.
func (d *Driver) String() string {
	const driverName = "lcow"
	return driverName
}

// Status returns the status of the driver as a list of key/value pairs.
// Currently this is the single pair {"LCOW", ""}.
func (d *Driver) Status() [][2]string {
	// TODO: Add some more info here - mode, home, ....
	status := make([][2]string, 0, 1)
	status = append(status, [2]string{"LCOW", ""})
	return status
}

// Exists reports whether the given id is registered with this driver, which
// is determined by whether the layer's directory can be stat'ed.
func (d *Driver) Exists(id string) bool {
	_, statErr := os.Lstat(d.dir(id))
	found := statErr == nil
	logrus.Debugf("lcowdriver: exists: id %s %t", id, found)
	return found
}

// CreateReadWrite creates a layer that is writable for use as a container
// file system. That equates to creating a sandbox.
//
// The layer folder is created first (via Create). The sandbox size defaults to
// the driver's default and may be overridden with the "lcow.sandboxsize"
// storage option; opts may be nil, in which case the default is used. A
// default-sized sandbox is copied from the cached sandbox when that exists;
// otherwise a service VM is started to create the ext4 VHDX.
//
// An error is returned if the folder cannot be created, the size option is
// invalid or below the minimum, or the sandbox cannot be copied or created.
func (d *Driver) CreateReadWrite(id, parent string, opts *graphdriver.CreateOpts) error {
	title := fmt.Sprintf("lcowdriver: createreadwrite: id %s", id)
	logrus.Debug(title)

	// First we need to create the folder
	if err := d.Create(id, parent, opts); err != nil {
		return err
	}

	// Look for an explicit sandbox size option. Callers are allowed to pass
	// no options at all, so guard against a nil opts.
	sandboxSize := d.defaultSandboxSize
	if opts != nil {
		for k, v := range opts.StorageOpt {
			if strings.ToLower(k) != "lcow.sandboxsize" {
				continue
			}
			var err error
			sandboxSize, err = strconv.ParseUint(v, 10, 32)
			if err != nil {
				return fmt.Errorf("%s failed to parse value '%s' for 'lcow.sandboxsize'", title, v)
			}
			if sandboxSize < client.DefaultVhdxSizeGB {
				return fmt.Errorf("%s 'lcow.sandboxsize' option cannot be less than %d", title, client.DefaultVhdxSizeGB)
			}
		}
	}

	// Massive perf optimisation here. If we know that the RW layer is the default size,
	// and that the cached sandbox already exists, and we are running in safe mode, we
	// can just do a simple copy into the layers sandbox file without needing to start a
	// unique service VM. For a global service VM, it doesn't really matter. Of course,
	// this is only the case where the sandbox is the default size.
	//
	// Make sure we have the sandbox mutex taken while we are examining it.
	if sandboxSize == client.DefaultVhdxSizeGB {
		logrus.Debugf("%s: locking cachedSandboxMutex", title)
		d.cachedSandboxMutex.Lock()
		_, err := os.Stat(d.cachedSandboxFile)
		logrus.Debugf("%s: releasing cachedSandboxMutex", title)
		d.cachedSandboxMutex.Unlock()
		if err == nil {
			logrus.Debugf("%s: using cached sandbox to populate", title)
			if err := client.CopyFile(d.cachedSandboxFile, filepath.Join(d.dir(id), sandboxFilename), true); err != nil {
				return err
			}
			return nil
		}
	}

	logrus.Debugf("%s: creating SVM to create sandbox", title)
	svm, err := d.startServiceVMIfNotRunning(id, nil, "createreadwrite")
	if err != nil {
		return err
	}
	defer d.terminateServiceVM(id, "createreadwrite", false)

	// So the sandbox needs creating. If default size ensure we are the only thread populating the cache.
	// Non-default size we don't store, just create them one-off so no need to lock the cachedSandboxMutex.
	if sandboxSize == client.DefaultVhdxSizeGB {
		logrus.Debugf("%s: locking cachedSandboxMutex for creation", title)
		d.cachedSandboxMutex.Lock()
		defer func() {
			logrus.Debugf("%s: releasing cachedSandboxMutex for creation", title)
			d.cachedSandboxMutex.Unlock()
		}()
	}

	// Make sure we don't write to our local cached copy if this is for a non-default size request.
	targetCacheFile := d.cachedSandboxFile
	if sandboxSize != client.DefaultVhdxSizeGB {
		targetCacheFile = ""
	}

	// Create the ext4 vhdx
	logrus.Debugf("%s: creating sandbox ext4 vhdx", title)
	if err := svm.createExt4VHDX(filepath.Join(d.dir(id), sandboxFilename), uint32(sandboxSize), targetCacheFile); err != nil {
		logrus.Debugf("%s: failed to create sandbox vhdx for %s: %s", title, id, err)
		return err
	}
	return nil
}

// Create creates the folder for the layer with the given id and records its
// layer chain: the parent's folder (when a parent is given) followed by the
// parent's own chain. It fails if the parent's chain cannot be read, if a
// named parent does not exist, or if the folder or chain cannot be written.
// A folder whose chain could not be written is removed again.
func (d *Driver) Create(id, parent string, opts *graphdriver.CreateOpts) error {
	logrus.Debugf("lcowdriver: create: id %s parent: %s", id, parent)

	parentChain, err := d.getLayerChain(parent)
	if err != nil {
		return err
	}

	if parent != "" && !d.Exists(parent) {
		return fmt.Errorf("lcowdriver: cannot create layer folder with missing parent %s", parent)
	}

	// The chain starts with the immediate parent, then continues with
	// whatever the parent itself is layered on.
	var layerChain []string
	if parent != "" {
		layerChain = append(layerChain, d.dir(parent))
	}
	layerChain = append(layerChain, parentChain...)

	layerPath := d.dir(id)
	logrus.Debugf("lcowdriver: create: id %s: creating %s", id, layerPath)
	// Standard mkdir here, not with SDDL as the dataroot was created with
	// inheritance to just local system and administrators.
	if mkdirErr := os.MkdirAll(layerPath, 0700); mkdirErr != nil {
		return mkdirErr
	}

	chainErr := d.setLayerChain(id, layerChain)
	if chainErr == nil {
		logrus.Debugf("lcowdriver: create: id %s: success", id)
		return nil
	}

	// Writing the chain failed, so do not leave a half-created layer behind.
	if rmErr := os.RemoveAll(layerPath); rmErr != nil {
		logrus.Warnf("failed to remove layer %s: %s", layerPath, rmErr)
	}
	return chainErr
}

// Remove unmounts the layer and removes its directory. Outside global mode
// the layer's service VM is forcibly terminated first. The directory is
// renamed to "<id>-removing" before being deleted.
func (d *Driver) Remove(id string) error {
	logrus.Debugf("lcowdriver: remove: id %s", id)

	tmpLayerPath := d.dir(id + "-removing")
	layerPath := d.dir(id)
	logrus.Debugf("lcowdriver: remove: id %s: layerPath %s", id, layerPath)

	// Unmount all the layers
	if putErr := d.Put(id); putErr != nil {
		logrus.Debugf("lcowdriver: remove id %s: failed to unmount: %s", id, putErr)
		return putErr
	}

	// for non-global case just kill the vm
	if !d.globalMode {
		termErr := d.terminateServiceVM(id, fmt.Sprintf("Remove %s", id), true)
		if termErr != nil {
			return termErr
		}
	}

	// A layer folder that is already gone is not an error.
	renameErr := os.Rename(layerPath, tmpLayerPath)
	if renameErr != nil && !os.IsNotExist(renameErr) {
		return renameErr
	}

	if rmErr := os.RemoveAll(tmpLayerPath); rmErr != nil {
		return rmErr
	}

	logrus.Debugf("lcowdriver: remove: id %s: layerPath %s succeeded", id, layerPath)
	return nil
}

// Get returns the rootfs path for the id. It is reference counted and
// effectively can be thought of as a "mount the layer into the utility
// vm if it isn't already". The contract from the caller of this is that
// all Gets and Puts are matched. It -should- be the case that on cleanup,
// nothing is mounted.
//
// For optimisation, we don't actually mount the filesystem (which in our
// case means [hot-]adding it to a service VM). Instead we work out the mounts
// that would be needed and defer the actual adding to the point where the
// filesystem is accessed.
func (d *Driver) Get(id, mountLabel string) (containerfs.ContainerFS, error) {
	title := fmt.Sprintf("lcowdriver: get: %s", id)
	logrus.Debugf(title)

	// Generate the mounts needed for the deferred operation.
	disks, mountsErr := d.getAllMounts(id)
	if mountsErr != nil {
		layerDir := d.dir(id)
		logrus.Debugf("%s failed to get all layer details for %s: %s", title, layerDir, mountsErr)
		return nil, fmt.Errorf("%s failed to get layer details for %s: %s", title, layerDir, mountsErr)
	}
	logrus.Debugf("%s: got layer mounts: %+v", title, disks)

	fs := &lcowfs{
		root:        unionMountName(disks),
		d:           d,
		mappedDisks: disks,
		vmID:        d.getVMID(id),
	}
	return fs, nil
}

// Put does the reverse of Get. It deletes the layer's union mount, hot-removes
// its disks from the service VM, and releases this caller's reference on the
// service VM (which terminates it if this was the last reference).
//
// If no service VM is known for the id there is nothing to undo and nil is
// returned. If the VM is already terminating, the outcome of that termination
// is returned. Otherwise all clean-up steps are attempted even when an earlier
// one fails, and the first error encountered is returned.
func (d *Driver) Put(id string) error {
	title := fmt.Sprintf("lcowdriver: put: %s", id)

	// Get the service VM that we need to remove from
	svm, err := d.serviceVms.get(d.getVMID(id))
	if err == errVMUnknown {
		return nil
	} else if err == errVMisTerminating {
		return svm.getStopError()
	}

	// Generate the mounts that Get() might have mounted
	disks, err := d.getAllMounts(id)
	if err != nil {
		logrus.Debugf("%s failed to get all layer details for %s: %s", title, d.dir(id), err)
		return fmt.Errorf("%s failed to get layer details for %s: %s", title, d.dir(id), err)
	}

	// Now, we want to perform the unmounts, hot-remove and stop the service vm.
	// We want to go through all the steps even if we have an error to clean up properly
	err = svm.deleteUnionMount(unionMountName(disks), disks...)
	if err != nil {
		logrus.Debugf("%s failed to delete union mount %s: %s", title, id, err)
	}

	err1 := svm.hotRemoveVHDs(disks...)
	if err1 != nil {
		logrus.Debugf("%s failed to hot remove vhds %s: %s", title, id, err1)
		if err == nil {
			err = err1
		}
	}

	err1 = d.terminateServiceVM(id, fmt.Sprintf("Put %s", id), false)
	if err1 != nil {
		logrus.Debugf("%s failed to terminate service vm %s: %s", title, id, err1)
		if err == nil {
			err = err1
		}
	}

	// Only claim success when every step actually succeeded.
	if err == nil {
		logrus.Debugf("Put succeeded on id %s", id)
	}
	return err
}

// Cleanup ensures the information the driver stores is properly removed.
// We use this opportunity to cleanup any -removing folders which may be
// still left if the daemon was killed while it was removing a layer, and to
// forcibly terminate every service VM still present in the service VM map.
//
// Only a failure to list the data root is returned as an error (a missing
// data root is not an error). Failures while removing folders or terminating
// service VMs are logged as warnings.
func (d *Driver) Cleanup() error {
	title := "lcowdriver: cleanup"

	items, err := ioutil.ReadDir(d.dataRoot)
	if err != nil {
		if os.IsNotExist(err) {
			return nil
		}
		return err
	}

	// Note we don't return an error below - it's possible the files
	// are locked. However, next time around after the daemon exits,
	// we likely will be able to cleanup successfully. Instead we log
	// warnings if there are errors.
	for _, item := range items {
		if item.IsDir() && strings.HasSuffix(item.Name(), "-removing") {
			if err := os.RemoveAll(filepath.Join(d.dataRoot, item.Name())); err != nil {
				logrus.Warnf("%s failed to cleanup %s: %s", title, item.Name(), err)
			} else {
				logrus.Infof("%s cleaned up %s", title, item.Name())
			}
		}
	}

	// Cleanup any service VMs we have running, along with their scratch spaces.
	// We don't take the lock for this as it's taken in terminateServiceVm.
	// As above, failures are logged rather than returned.
	for k, v := range d.serviceVms.svms {
		logrus.Debugf("%s svm entry: %s: %+v", title, k, v)
		if err := d.terminateServiceVM(k, "cleanup", true); err != nil {
			logrus.Warnf("%s failed to terminate service vm %s: %s", title, k, err)
		}
	}

	return nil
}

// Diff takes a layer (and its parent layer which may be null, but
// is ignored by this implementation below) and returns a reader for
// a tarstream representing the layer's contents. The id could be
// a read-only "layer.vhd" or a read-write "sandbox.vhdx". The semantics
// of this function dictate that the layer is already mounted.
// However, as we do lazy mounting as a performance optimisation,
// this will likely not be the case.
//
// The layer's VHD is attached read-only to a service VM (started if needed),
// and that attachment plus the service VM reference are held until the
// returned reader is closed. On every error path after the service VM has
// been obtained, the reference is released again before returning.
func (d *Driver) Diff(id, parent string) (io.ReadCloser, error) {
	title := fmt.Sprintf("lcowdriver: diff: %s", id)
	svmContext := fmt.Sprintf("diff %s", id)

	// Get VHDX info
	ld, err := getLayerDetails(d.dir(id))
	if err != nil {
		logrus.Debugf("%s: failed to get vhdx information of %s: %s", title, d.dir(id), err)
		return nil, err
	}

	// Start the SVM with a mapped virtual disk. Note that if the SVM is
	// already running and we are in global mode, this will be
	// hot-added.
	mvd := hcsshim.MappedVirtualDisk{
		HostPath:          ld.filename,
		ContainerPath:     hostToGuest(ld.filename),
		CreateInUtilityVM: true,
		ReadOnly:          true,
	}

	logrus.Debugf("%s: starting service VM", title)
	svm, err := d.startServiceVMIfNotRunning(id, []hcsshim.MappedVirtualDisk{mvd}, svmContext)
	if err != nil {
		return nil, err
	}

	logrus.Debugf("lcowdriver: diff: waiting for svm to finish booting")
	err = svm.getStartError()
	if err != nil {
		d.terminateServiceVM(id, svmContext, false)
		return nil, fmt.Errorf("lcowdriver: diff: svm failed to boot: %s", err)
	}

	// Obtain the tar stream for it
	// The actual container path will have been remapped to a short name, so use that.
	actualContainerPath := svm.getShortContainerPath(&mvd)
	if actualContainerPath == "" {
		// Release what we acquired above, exactly as the other error paths
		// do, so the service VM reference is not leaked.
		svm.hotRemoveVHDs(mvd)
		d.terminateServiceVM(id, svmContext, false)
		return nil, fmt.Errorf("failed to get short container path for %+v in SVM %s", mvd, svm.config.Name)
	}
	logrus.Debugf("%s: %s %s, size %d, isSandbox %t", title, ld.filename, actualContainerPath, ld.size, ld.isSandbox)
	tarReadCloser, err := svm.config.VhdToTar(mvd.HostPath, actualContainerPath, ld.isSandbox, ld.size)
	if err != nil {
		svm.hotRemoveVHDs(mvd)
		d.terminateServiceVM(id, svmContext, false)
		return nil, fmt.Errorf("%s failed to export layer to tar stream for id: %s, parent: %s : %s", title, id, parent, err)
	}

	logrus.Debugf("%s id %s parent %s completed successfully", title, id, parent)

	// In safe/non-global mode, we can't tear down the service VM until things have been read.
	return ioutils.NewReadCloserWrapper(tarReadCloser, func() error {
		tarReadCloser.Close()
		svm.hotRemoveVHDs(mvd)
		d.terminateServiceVM(id, svmContext, false)
		return nil
	}), nil
}

// ApplyDiff extracts the changeset from the given diff into the
// layer with the specified id and parent, returning the size of the
// new layer in bytes. The layer should not be mounted when calling
// this function. Another way of describing this is that ApplyDiff writes
// to a new layer (a VHD in LCOW) the contents of a tarstream it's given.
//
// Unless noreexec is set, the conversion is delegated to a re-exec'd child
// process ("docker-lcow-tar2ext4") which receives the tar stream on stdin
// and prints the resulting file size on stdout. With noreexec set the
// conversion runs inline in this process. In both cases applySID is called
// on the destination file once the conversion has succeeded. Any failure is
// returned with a size of 0. The parent argument is not used.
func (d *Driver) ApplyDiff(id, parent string, diff io.Reader) (int64, error) {
	logrus.Debugf("lcowdriver: applydiff: id %s", id)

	// Log failures here as it's undiagnosable sometimes, due to a possible panic.
	// See https://github.com/moby/moby/issues/37955 for more information.

	dest := filepath.Join(d.dataRoot, id, layerFilename)
	if !noreexec {
		cmd := reexec.Command([]string{"docker-lcow-tar2ext4", dest}...)
		stdout := bytes.NewBuffer(nil)
		stderr := bytes.NewBuffer(nil)
		cmd.Stdin = diff
		cmd.Stdout = stdout
		cmd.Stderr = stderr

		if err := cmd.Start(); err != nil {
			logrus.Warnf("lcowdriver: applydiff: id %s failed to start re-exec: %s", id, err)
			return 0, err
		}

		if err := cmd.Wait(); err != nil {
			logrus.Warnf("lcowdriver: applydiff: id %s failed %s", id, err)
			return 0, fmt.Errorf("re-exec error: %v: stderr: %s", err, stderr)
		}

		// The child prints nothing but the layer size on stdout.
		size, err := strconv.ParseInt(stdout.String(), 10, 64)
		if err != nil {
			logrus.Warnf("lcowdriver: applydiff: id %s failed to parse output %s", id, err)
			return 0, fmt.Errorf("re-exec error: %v: stdout: %s", err, stdout)
		}
		return applySID(id, size, dest)
	}

	// The inline case
	size, err := tar2ext4Actual(dest, diff)
	if err != nil {
		logrus.Warnf("lcowdriver: applydiff: id %s failed %s", id, err)
		// Do not fall through to applySID: the layer file was not written
		// successfully, so reporting a size (and no error) would be wrong.
		return 0, err
	}
	return applySID(id, size, dest)
}

// applySID calls security.GrantVmGroupAccess on dest. On success the size
// passed in is handed back unchanged; on failure a warning mentioning id is
// logged and the error is returned with a size of 0.
func applySID(id string, size int64, dest string) (int64, error) {
	err := security.GrantVmGroupAccess(dest)
	if err == nil {
		return size, nil
	}
	logrus.Warnf("lcowdriver: applySIDs: id %s failed %s", id, err)
	return 0, err
}

// tar2ext4Reexec is the re-exec entry point for writing a layer from a tar file.
//
// It expects the destination file path as os.Args[1] and the tar stream on
// stdin. On success the size of the written file is printed to stdout. On
// any failure the reason is printed to stderr and the process exits with
// status 1.
func tar2ext4Reexec() {
	// Fail with a readable message rather than an index-out-of-range panic
	// when the destination argument is missing.
	if len(os.Args) < 2 {
		fmt.Fprint(os.Stderr, "missing destination path argument")
		os.Exit(1)
	}
	size, err := tar2ext4Actual(os.Args[1], os.Stdin)
	if err != nil {
		fmt.Fprint(os.Stderr, err)
		os.Exit(1)
	}
	fmt.Fprint(os.Stdout, size)
}

// tar2ext4Actual converts the tar stream in diff into the file at dest using
// tar2ext4.Convert and returns the size of the resulting file in bytes.
// It is reached either through the re-exec entry point (default) or inline
// for debugging.
func tar2ext4Actual(dest string, diff io.Reader) (int64, error) {
	// Upper bound on the generated layer VHD from an EXT4 layout
	// perspective (128GB). This is unrelated to the sandbox size.
	const maxDiskSize = 128 << 30

	vhd, err := os.Create(dest)
	if err != nil {
		return 0, err
	}
	defer vhd.Close()

	err = tar2ext4.Convert(
		diff,
		vhd,
		tar2ext4.AppendVhdFooter,
		tar2ext4.ConvertWhiteout,
		tar2ext4.MaximumDiskSize(maxDiskSize),
	)
	if err != nil {
		return 0, err
	}

	info, err := os.Stat(dest)
	if err != nil {
		return 0, err
	}
	return info.Size(), nil
}

// Changes is meant to produce the list of changes between the specified
// layer and its parent layer (all ADD changes when parent is ""). The layer
// should not be mounted when calling this function.
//
// It is currently a stub: the call is logged and a nil list with a nil
// error is returned.
func (d *Driver) Changes(id, parent string) ([]archive.Change, error) {
	logrus.Debugf("lcowdriver: changes: id %s parent %s", id, parent)
	// TODO @gupta-ak. Needs implementation with assistance from service VM
	var changes []archive.Change // intentionally left nil
	return changes, nil
}

// DiffSize is meant to calculate the changes between the specified layer
// and its parent and return the size in bytes of the changes relative to
// its base filesystem directory.
//
// It is currently a stub: the call is logged and 0 with a nil error is
// returned.
func (d *Driver) DiffSize(id, parent string) (size int64, err error) {
	logrus.Debugf("lcowdriver: diffsize: id %s", id)
	// TODO @gupta-ak. Needs implementation with assistance from service VM
	// Both named results are still at their zero values (0, nil).
	return size, err
}

// GetMetadata returns custom driver information for the layer: a map with a
// single "dir" entry holding the layer's directory as computed by d.dir.
// The returned error is always nil.
func (d *Driver) GetMetadata(id string) (map[string]string, error) {
	logrus.Debugf("lcowdriver: getmetadata: id %s", id)
	return map[string]string{"dir": d.dir(id)}, nil
}

// GetLayerPath returns the host directory of the layer with the given id,
// i.e. the folder in which the layer's VHD/VHDX is looked up. The returned
// error is always nil.
func (d *Driver) GetLayerPath(id string) (string, error) {
	layerDir := d.dir(id)
	return layerDir, nil
}

// dir returns the path of the layer's directory beneath the driver's data
// root. Only the final path element of id is used, so any directory
// components in id are dropped.
func (d *Driver) dir(id string) string {
	leaf := filepath.Base(id)
	return filepath.Join(d.dataRoot, leaf)
}

// getLayerChain loads the layer chain recorded in layerchain.json inside the
// layer's directory. A missing file is not an error: it yields a nil chain.
// Any other read failure, or malformed JSON, is returned as an error.
func (d *Driver) getLayerChain(id string) ([]string, error) {
	jPath := filepath.Join(d.dir(id), "layerchain.json")
	logrus.Debugf("lcowdriver: getlayerchain: id %s json %s", id, jPath)

	raw, readErr := ioutil.ReadFile(jPath)
	switch {
	case os.IsNotExist(readErr):
		// No chain file has been written for this layer.
		return nil, nil
	case readErr != nil:
		return nil, fmt.Errorf("lcowdriver: getlayerchain: %s unable to read layerchain file %s: %s", id, jPath, readErr)
	}

	var chain []string
	if err := json.Unmarshal(raw, &chain); err != nil {
		return nil, fmt.Errorf("lcowdriver: getlayerchain: %s failed to unmarshall layerchain file %s: %s", id, jPath, err)
	}
	return chain, nil
}

// setLayerChain serialises chain as JSON and writes it to layerchain.json in
// the layer's directory with mode 0600.
func (d *Driver) setLayerChain(id string, chain []string) error {
	encoded, err := json.Marshal(&chain)
	if err != nil {
		return fmt.Errorf("lcowdriver: setlayerchain: %s failed to marshall layerchain json: %s", id, err)
	}

	jPath := filepath.Join(d.dir(id), "layerchain.json")
	logrus.Debugf("lcowdriver: setlayerchain: id %s json %s", id, jPath)

	if err := ioutil.WriteFile(jPath, encoded, 0600); err != nil {
		return fmt.Errorf("lcowdriver: setlayerchain: %s failed to write layerchain file: %s", id, err)
	}
	return nil
}

// getLayerDetails reports the file name, size and sandbox indication of the
// VHD(x) found in folder. The read-only layer file (layerFilename) is tried
// first; if it cannot be stat'ed, the read-write sandbox file
// (sandboxFilename) is tried instead and the result is flagged as a sandbox.
// An error is returned when neither file can be stat'ed.
func getLayerDetails(folder string) (*layerDetails, error) {
	layerPath := filepath.Join(folder, layerFilename)
	if info, err := os.Stat(layerPath); err == nil {
		return &layerDetails{
			filename:  layerPath,
			isSandbox: false,
			size:      info.Size(),
		}, nil
	}

	// No usable layer file, so fall back to the sandbox.
	sandboxPath := filepath.Join(folder, sandboxFilename)
	info, err := os.Stat(sandboxPath)
	if err != nil {
		return nil, fmt.Errorf("failed to locate layer or sandbox in %s", folder)
	}
	return &layerDetails{
		filename:  sandboxPath,
		isSandbox: true,
		size:      info.Size(),
	}, nil
}

// getAllMounts builds one mapped virtual disk per layer for the given id:
// the layer's own directory first, followed by every entry of its stored
// layer chain. Each disk is read-only unless its backing file is a sandbox.
func (d *Driver) getAllMounts(id string) ([]hcsshim.MappedVirtualDisk, error) {
	parents, err := d.getLayerChain(id)
	if err != nil {
		return nil, err
	}
	layerChain := append([]string{d.dir(id)}, parents...)

	logrus.Debugf("getting all  layers: %v", layerChain)
	disks := make([]hcsshim.MappedVirtualDisk, 0, len(layerChain))
	for _, folder := range layerChain {
		ld, err := getLayerDetails(folder)
		if err != nil {
			logrus.Debugf("Failed to get LayerVhdDetails from %s: %s", folder, err)
			return nil, err
		}
		disks = append(disks, hcsshim.MappedVirtualDisk{
			HostPath:          ld.filename,
			ContainerPath:     hostToGuest(ld.filename),
			CreateInUtilityVM: true,
			ReadOnly:          !ld.isSandbox,
		})
	}
	return disks, nil
}

// hostToGuest derives the "long" container path for a host VHD(X) path:
// "/tmp/" followed by the name of the directory that contains the file.
func hostToGuest(hostpath string) string {
	// When this is computed we do not yet know which service VM will be
	// used, so the path cannot be translated to a short one here; that is
	// deferred until the disk is actually added to an SVM. Long container
	// paths are not used in SVMs for SCSI disks because they can push the
	// command lines we invoke over ~4200 characters once ~47 layers are
	// involved (for example the mount call that creates the overlay across
	// multiple SCSI-attached disks). VPMem attached layers are unaffected
	// during container creation, as openGCS maps those to /tmp/N/M where N
	// is a container instance number and M is a layer number.
	layerDir := filepath.Dir(hostpath)
	return fmt.Sprintf("/tmp/%s", filepath.Base(layerDir))
}

// unionMountName returns the container path of the first disk with "-mount"
// appended. disks must contain at least one element.
func unionMountName(disks []hcsshim.MappedVirtualDisk) string {
	top := disks[0].ContainerPath
	return fmt.Sprintf("%s-mount", top)
}

// nopCloser turns an io.Reader into an io.ReadCloser whose Close does
// nothing.
type nopCloser struct {
	io.Reader
}

// Compile-time check that nopCloser can be used as an io.ReadCloser.
var _ io.ReadCloser = nopCloser{}

// Close has nothing to release and always reports success.
func (nopCloser) Close() error { return nil }

// fileGetCloserFromSVM serves individual files of a layer by reading them
// out of a service VM (see Get), and releases the disk mapping and the
// service VM reference again on Close.
type fileGetCloserFromSVM struct {
	// id is the layer id passed to terminateServiceVM on Close.
	id  string
	// svm is the service VM in which files are read and from which the
	// disk is hot-removed.
	svm *serviceVM
	// mvd is the mapped virtual disk whose short container path is the
	// root for file lookups.
	mvd *hcsshim.MappedVirtualDisk
	// d is the driver used to terminate the service VM on Close.
	d   *Driver
}

// Close hot-removes the mapped disk from the service VM (when both are set)
// and then asks the driver to terminate the service VM for this id. A
// hot-remove failure is only logged; a terminate failure is returned.
func (fgc *fileGetCloserFromSVM) Close() error {
	if fgc.svm != nil && fgc.mvd != nil {
		if err := fgc.svm.hotRemoveVHDs(*fgc.mvd); err != nil {
			// We just log this as we're going to tear down the SVM imminently unless in global mode
			logrus.Errorf("failed to remove mvd %s: %s", fgc.mvd.ContainerPath, err)
		}
	}

	// Nothing to terminate without a driver, a service VM and an id.
	if fgc.d == nil || fgc.svm == nil || fgc.id == "" {
		return nil
	}
	return fgc.d.terminateServiceVM(fgc.id, fmt.Sprintf("diffgetter %s", fgc.id), false)
}

// Get returns the contents of filename, resolved relative to the short
// container path at which the mapped disk is mounted in the service VM.
// The file is read by running "cat" in the service VM and buffering its
// entire output in memory; closing the returned reader is a no-op.
func (fgc *fileGetCloserFromSVM) Get(filename string) (io.ReadCloser, error) {
	// Must map to the actual "short" container path where the SCSI disk was mounted
	shortPath := fgc.svm.getShortContainerPath(fgc.mvd)
	if shortPath == "" {
		return nil, fmt.Errorf("inconsistency detected: couldn't get short container path for %+v in utility VM %s", fgc.mvd, fgc.svm.config.Name)
	}

	// Ugly fix for MSFT internal bug VSO#19696554
	// If a file name contains a space, pushing an image fails.
	// Using solution from https://groups.google.com/forum/#!topic/Golang-Nuts/DpldsmrhPio to escape for shell execution:
	// wrap the path in single quotes and rewrite each embedded single quote as '"'"'.
	quoted := "'" + strings.Replace(path.Join(shortPath, filename), "'", `'"'"'`, -1) + "'"

	var stdout, stderr bytes.Buffer
	if err := fgc.svm.runProcess(fmt.Sprintf("cat %s", quoted), nil, &stdout, &stderr); err != nil {
		logrus.Debugf("cat %s failed: %s", quoted, stderr.String())
		return nil, err
	}
	return nopCloser{bytes.NewReader(stdout.Bytes())}, nil
}

// DiffGetter returns a FileGetCloser that can read files from the directory that
// contains files for the layer differences. Used for direct access for tar-split.
//
// The layer's VHD(X) is mapped read-only into a service VM, which is started
// if it is not already running. The caller must Close the returned value to
// hot-remove the disk and release the service VM. An error is returned if
// the layer file cannot be located, or the service VM cannot be started or
// fails to boot.
func (d *Driver) DiffGetter(id string) (graphdriver.FileGetCloser, error) {
	title := fmt.Sprintf("lcowdriver: diffgetter: %s", id)
	// title embeds the id, so it must not be used as a format string.
	logrus.Debug(title)

	ld, err := getLayerDetails(d.dir(id))
	if err != nil {
		logrus.Debugf("%s: failed to get vhdx information of %s: %s", title, d.dir(id), err)
		return nil, err
	}

	// Start the SVM with a mapped virtual disk. Note that if the SVM is
	// already running and we are in global mode, this will be hot-added.
	mvd := hcsshim.MappedVirtualDisk{
		HostPath:          ld.filename,
		ContainerPath:     hostToGuest(ld.filename),
		CreateInUtilityVM: true,
		ReadOnly:          true,
	}

	// The same context label is used for start and for terminate (both here
	// and in fileGetCloserFromSVM.Close) so the operations can be correlated.
	svmContext := fmt.Sprintf("diffgetter %s", id)

	logrus.Debugf("%s: starting service VM", title)
	svm, err := d.startServiceVMIfNotRunning(id, []hcsshim.MappedVirtualDisk{mvd}, svmContext)
	if err != nil {
		return nil, err
	}

	logrus.Debugf("%s: waiting for svm to finish booting", title)
	err = svm.getStartError()
	if err != nil {
		// Best-effort clean-up; the boot failure is the error worth reporting.
		d.terminateServiceVM(id, svmContext, false)
		return nil, fmt.Errorf("%s: svm failed to boot: %s", title, err)
	}

	return &fileGetCloserFromSVM{
		id:  id,
		svm: svm,
		mvd: &mvd,
		d:   d}, nil
}