//go:build windows // +build windows // Locale: en-gb // About: Graph-driver for Linux Containers On Windows (LCOW) // // This graphdriver runs in two modes. Yet to be determined which one will // be the shipping mode. The global mode is where a single utility VM // is used for all service VM tool operations. This isn't safe security-wise // as it's attaching a sandbox of multiple containers to it, containing // untrusted data. This may be fine for client devops scenarios. In // safe mode, a unique utility VM is instantiated for all service VM tool // operations. The downside of safe-mode is that operations are slower as // a new service utility VM has to be started and torn-down when needed. // // Options: // // The following options are read by the graphdriver itself: // // * lcow.globalmode - Enables global service VM Mode // -- Possible values: true/false // -- Default if omitted: false // // * lcow.sandboxsize - Specifies a custom sandbox size in GB for starting a container // -- Possible values: >= default sandbox size (opengcs defined, currently 20) // -- Default if omitted: 20 // // The following options are read by opengcs: // // * lcow.kirdpath - Specifies a custom path to a kernel/initrd pair // -- Possible values: Any local path that is not a mapped drive // -- Default if omitted: %ProgramFiles%\Linux Containers // // * lcow.bootparameters - Specifies additional boot parameters for booting in kernel+initrd mode // -- Possible values: Any valid linux kernel boot options // -- Default if omitted: // // * lcow.timeout - Specifies a timeout for utility VM operations in seconds // -- Possible values: >=0 // -- Default if omitted: 300 // TODO: Grab logs from SVM at terminate or errors package lcow // import "github.com/docker/docker/daemon/graphdriver/lcow" import ( "bytes" "encoding/json" "fmt" "io" "io/ioutil" "os" "path" "path/filepath" "strconv" "strings" "sync" "syscall" "time" "github.com/Microsoft/go-winio/pkg/security" "github.com/Microsoft/hcsshim" "github.com/Microsoft/hcsshim/ext4/tar2ext4" "github.com/Microsoft/opengcs/client" "github.com/docker/docker/daemon/graphdriver" "github.com/docker/docker/pkg/archive" "github.com/docker/docker/pkg/containerfs" "github.com/docker/docker/pkg/idtools" "github.com/docker/docker/pkg/ioutils" "github.com/docker/docker/pkg/reexec" "github.com/sirupsen/logrus" ) // noreexec controls reexec functionality. Off by default, on for debugging purposes. var noreexec = false // init registers this driver to the register. It gets initialised by the // function passed in the second parameter, implemented in this file. func init() { graphdriver.Register("lcow", InitDriver) // DOCKER_LCOW_NOREEXEC allows for inline processing which makes // debugging issues in the re-exec codepath significantly easier. if os.Getenv("DOCKER_LCOW_NOREEXEC") != "" { logrus.Warnf("LCOW Graphdriver is set to not re-exec. This is intended for debugging purposes only.") noreexec = true } else { reexec.Register("docker-lcow-tar2ext4", tar2ext4Reexec) } } const ( // sandboxFilename is the name of the file containing a layer's sandbox (read-write layer). sandboxFilename = "sandbox.vhdx" // scratchFilename is the name of the scratch-space used by an SVM to avoid running out of memory. scratchFilename = "scratch.vhdx" // layerFilename is the name of the file containing a layer's read-only contents. // Note this really is VHD format, not VHDX. layerFilename = "layer.vhd" // toolsScratchPath is a location in a service utility VM that the tools can use as a // scratch space to avoid running out of memory. toolsScratchPath = "/tmp/scratch" // svmGlobalID is the ID used in the serviceVMs map for the global service VM when running in "global" mode. svmGlobalID = "_lcow_global_svm_" // cacheDirectory is the sub-folder under the driver's data-root used to cache blank sandbox and scratch VHDs. cacheDirectory = "cache" // scratchDirectory is the sub-folder under the driver's data-root used for scratch VHDs in service VMs scratchDirectory = "scratch" // errOperationPending is the HRESULT returned by the HCS when the VM termination operation is still pending. errOperationPending syscall.Errno = 0xc0370103 ) // Driver represents an LCOW graph driver. type Driver struct { dataRoot string // Root path on the host where we are storing everything. cachedSandboxFile string // Location of the local default-sized cached sandbox. cachedSandboxMutex sync.Mutex // Protects race conditions from multiple threads creating the cached sandbox. cachedScratchFile string // Location of the local cached empty scratch space. cachedScratchMutex sync.Mutex // Protects race conditions from multiple threads creating the cached scratch. options []string // Graphdriver options we are initialised with. globalMode bool // Indicates if running in an unsafe/global service VM mode. // NOTE: It is OK to use a cache here because Windows does not support // restoring containers when the daemon dies. serviceVms *serviceVMMap // Map of the configs representing the service VM(s) we are running. } // layerDetails is the structure returned by a helper function `getLayerDetails` // for getting information about a layer folder type layerDetails struct { filename string // \path\to\sandbox.vhdx or \path\to\layer.vhd size int64 // size of the above file isSandbox bool // true if sandbox.vhdx } // deletefiles is a helper function for initialisation where we delete any // left-over scratch files in case we were previously forcibly terminated. func deletefiles(path string, f os.FileInfo, err error) error { if strings.HasSuffix(f.Name(), ".vhdx") { logrus.Warnf("lcowdriver: init: deleting stale scratch file %s", path) return os.Remove(path) } return nil } // InitDriver returns a new LCOW storage driver. func InitDriver(dataRoot string, options []string, _, _ []idtools.IDMap) (graphdriver.Driver, error) { title := "lcowdriver: init:" cd := filepath.Join(dataRoot, cacheDirectory) sd := filepath.Join(dataRoot, scratchDirectory) d := &Driver{ dataRoot: dataRoot, options: options, cachedSandboxFile: filepath.Join(cd, sandboxFilename), cachedScratchFile: filepath.Join(cd, scratchFilename), serviceVms: &serviceVMMap{ svms: make(map[string]*serviceVMMapItem), }, globalMode: false, } // Looks for relevant options for _, v := range options { opt := strings.SplitN(v, "=", 2) if len(opt) == 2 { switch strings.ToLower(opt[0]) { case "lcow.globalmode": var err error d.globalMode, err = strconv.ParseBool(opt[1]) if err != nil { return nil, fmt.Errorf("%s failed to parse value for 'lcow.globalmode' - must be 'true' or 'false'", title) } break } } } // Make sure the dataRoot directory is created if err := idtools.MkdirAllAndChown(dataRoot, 0700, idtools.Identity{UID: 0, GID: 0}); err != nil { return nil, fmt.Errorf("%s failed to create '%s': %v", title, dataRoot, err) } // Make sure the cache directory is created under dataRoot if err := idtools.MkdirAllAndChown(cd, 0700, idtools.Identity{UID: 0, GID: 0}); err != nil { return nil, fmt.Errorf("%s failed to create '%s': %v", title, cd, err) } // Make sure the scratch directory is created under dataRoot if err := idtools.MkdirAllAndChown(sd, 0700, idtools.Identity{UID: 0, GID: 0}); err != nil { return nil, fmt.Errorf("%s failed to create '%s': %v", title, sd, err) } // Delete any items in the scratch directory filepath.Walk(sd, deletefiles) logrus.Infof("%s dataRoot: %s globalMode: %t", title, dataRoot, d.globalMode) return d, nil } func (d *Driver) getVMID(id string) string { if d.globalMode { return svmGlobalID } return id } // remapLongToShortContainerPath does the mapping of a long container path for a // SCSI attached disk, to a short container path where it's actually mounted. func remapLongToShortContainerPath(longContainerPath string, attachCounter uint64, svmName string) string { shortContainerPath := longContainerPath if shortContainerPath != "" && shortContainerPath != toolsScratchPath { shortContainerPath = fmt.Sprintf("/tmp/d%d", attachCounter) logrus.Debugf("lcowdriver: UVM %s: remapping %s --> %s", svmName, longContainerPath, shortContainerPath) } return shortContainerPath } // startServiceVMIfNotRunning starts a service utility VM if it is not currently running. // It can optionally be started with a mapped virtual disk. Returns a opengcs config structure // representing the VM. func (d *Driver) startServiceVMIfNotRunning(id string, mvdToAdd []hcsshim.MappedVirtualDisk, context string) (_ *serviceVM, err error) { // Use the global ID if in global mode id = d.getVMID(id) title := "lcowdriver: startServiceVMIfNotRunning " + id // Attempt to add ID to the service vm map logrus.Debugf("%s: adding entry to service vm map", title) svm, exists, err := d.serviceVms.add(id) if err != nil && err == errVMisTerminating { // VM is in the process of terminating. Wait until it's done and then try again logrus.Debugf("%s: VM with current ID still in the process of terminating", title) if err := svm.getStopError(); err != nil { logrus.Debugf("%s: VM did not stop successfully: %s", title, err) return nil, err } return d.startServiceVMIfNotRunning(id, mvdToAdd, context) } else if err != nil { logrus.Debugf("%s: failed to add service vm to map: %s", title, err) return nil, fmt.Errorf("%s: failed to add to service vm map: %s", title, err) } if exists { // Service VM is already up and running. In this case, just hot add the vhds. // Note that hotAddVHDs will remap long to short container paths, so no need // for us to that here. logrus.Debugf("%s: service vm already exists. Just hot adding: %+v", title, mvdToAdd) if err := svm.hotAddVHDs(mvdToAdd...); err != nil { logrus.Debugf("%s: failed to hot add vhds on service vm creation: %s", title, err) return nil, fmt.Errorf("%s: failed to hot add vhds on service vm: %s", title, err) } return svm, nil } // We are the first service for this id, so we need to start it logrus.Debugf("%s: service vm doesn't exist. Now starting it up", title) defer func() { // Signal that start has finished, passing in the error if any. svm.signalStartFinished(err) if err != nil { // We added a ref to the VM, since we failed, we should delete the ref. d.terminateServiceVM(id, "error path on startServiceVMIfNotRunning", false) } }() // Generate a default configuration if err := svm.config.GenerateDefault(d.options); err != nil { return nil, fmt.Errorf("%s: failed to generate default gogcs configuration for global svm (%s): %s", title, context, err) } // For the name, we deliberately suffix if safe-mode to ensure that it doesn't // clash with another utility VM which may be running for the container itself. // This also makes it easier to correlate through Get-ComputeProcess. if id == svmGlobalID { svm.config.Name = svmGlobalID } else { svm.config.Name = fmt.Sprintf("%s_svm", id) } // Ensure we take the cached scratch mutex around the check to ensure the file is complete // and not in the process of being created by another thread. scratchTargetFile := filepath.Join(d.dataRoot, scratchDirectory, fmt.Sprintf("%s.vhdx", id)) logrus.Debugf("%s: locking cachedScratchMutex", title) d.cachedScratchMutex.Lock() if _, err := os.Stat(d.cachedScratchFile); err == nil { // Make a copy of cached scratch to the scratch directory logrus.Debugf("%s: (%s) cloning cached scratch for mvd", title, context) if err := client.CopyFile(d.cachedScratchFile, scratchTargetFile, true); err != nil { logrus.Debugf("%s: releasing cachedScratchMutex on err: %s", title, err) d.cachedScratchMutex.Unlock() return nil, err } // Add the cached clone as a mapped virtual disk logrus.Debugf("%s: (%s) adding cloned scratch as mvd", title, context) mvd := hcsshim.MappedVirtualDisk{ HostPath: scratchTargetFile, ContainerPath: toolsScratchPath, CreateInUtilityVM: true, } svm.config.MappedVirtualDisks = append(svm.config.MappedVirtualDisks, mvd) svm.scratchAttached = true } logrus.Debugf("%s: releasing cachedScratchMutex", title) d.cachedScratchMutex.Unlock() // Add mapped virtual disks. First those that are already in the configuration. Generally, // the only one that will be here is the service VMs scratch. The exception is when invoked // via the graphdrivers DiffGetter implementation. for i, mvd := range svm.config.MappedVirtualDisks { svm.attachCounter++ svm.attachedVHDs[mvd.HostPath] = &attachedVHD{refCount: 1, attachCounter: svm.attachCounter} // No-op for the service VMs scratch disk. Only applicable in the DiffGetter interface invocation. svm.config.MappedVirtualDisks[i].ContainerPath = remapLongToShortContainerPath(mvd.ContainerPath, svm.attachCounter, svm.config.Name) } // Then the remaining ones to add, and adding them to the startup configuration. for _, mvd := range mvdToAdd { svm.attachCounter++ svm.attachedVHDs[mvd.HostPath] = &attachedVHD{refCount: 1, attachCounter: svm.attachCounter} mvd.ContainerPath = remapLongToShortContainerPath(mvd.ContainerPath, svm.attachCounter, svm.config.Name) svm.config.MappedVirtualDisks = append(svm.config.MappedVirtualDisks, mvd) } // Start it. logrus.Debugf("%s: (%s) starting %s", title, context, svm.config.Name) if err := svm.config.StartUtilityVM(); err != nil { return nil, fmt.Errorf("failed to start service utility VM (%s): %s", context, err) } // defer function to terminate the VM if the next steps fail defer func() { if err != nil { waitTerminate(svm, fmt.Sprintf("%s: (%s)", title, context)) } }() // Now we have a running service VM, we can create the cached scratch file if it doesn't exist. logrus.Debugf("%s: locking cachedScratchMutex", title) d.cachedScratchMutex.Lock() if _, err := os.Stat(d.cachedScratchFile); err != nil { logrus.Debugf("%s: (%s) creating an SVM scratch", title, context) // Don't use svm.CreateExt4Vhdx since that only works when the service vm is setup, // but we're still in that process right now. if err := svm.config.CreateExt4Vhdx(scratchTargetFile, client.DefaultVhdxSizeGB, d.cachedScratchFile); err != nil { logrus.Debugf("%s: (%s) releasing cachedScratchMutex on error path", title, context) d.cachedScratchMutex.Unlock() logrus.Debugf("%s: failed to create vm scratch %s: %s", title, scratchTargetFile, err) return nil, fmt.Errorf("failed to create SVM scratch VHDX (%s): %s", context, err) } } logrus.Debugf("%s: (%s) releasing cachedScratchMutex", title, context) d.cachedScratchMutex.Unlock() // Hot-add the scratch-space if not already attached if !svm.scratchAttached { logrus.Debugf("%s: (%s) hot-adding scratch %s", title, context, scratchTargetFile) if err := svm.hotAddVHDsAtStart(hcsshim.MappedVirtualDisk{ HostPath: scratchTargetFile, ContainerPath: toolsScratchPath, CreateInUtilityVM: true, }); err != nil { logrus.Debugf("%s: failed to hot-add scratch %s: %s", title, scratchTargetFile, err) return nil, fmt.Errorf("failed to hot-add %s failed: %s", scratchTargetFile, err) } svm.scratchAttached = true // Don't need to ref-count here as it will be done via hotAddVHDsAtStart() call above. } logrus.Debugf("%s: (%s) success", title, context) return svm, nil } // terminateServiceVM terminates a service utility VM if its running if it's, // not being used by any goroutine, but does nothing when in global mode as it's // lifetime is limited to that of the daemon. If the force flag is set, then // the VM will be killed regardless of the ref count or if it's global. func (d *Driver) terminateServiceVM(id, context string, force bool) (err error) { // We don't do anything in safe mode unless the force flag has been passed, which // is only the case for cleanup at driver termination. if d.globalMode && !force { logrus.Debugf("lcowdriver: terminateservicevm: %s (%s) - doing nothing as in global mode", id, context) return nil } id = d.getVMID(id) var svm *serviceVM var lastRef bool if !force { // In the not force case, we ref count svm, lastRef, err = d.serviceVms.decrementRefCount(id) } else { // In the force case, we ignore the ref count and just set it to 0 svm, err = d.serviceVms.setRefCountZero(id) lastRef = true } if err == errVMUnknown { return nil } else if err == errVMisTerminating { return svm.getStopError() } else if !lastRef { return nil } // We run the deletion of the scratch as a deferred function to at least attempt // clean-up in case of errors. defer func() { if svm.scratchAttached { scratchTargetFile := filepath.Join(d.dataRoot, scratchDirectory, fmt.Sprintf("%s.vhdx", id)) logrus.Debugf("lcowdriver: terminateservicevm: %s (%s) - deleting scratch %s", id, context, scratchTargetFile) if errRemove := os.Remove(scratchTargetFile); errRemove != nil { logrus.Warnf("failed to remove scratch file %s (%s): %s", scratchTargetFile, context, errRemove) err = errRemove } } // This function shouldn't actually return error unless there is a bug if errDelete := d.serviceVms.deleteID(id); errDelete != nil { logrus.Warnf("failed to service vm from svm map %s (%s): %s", id, context, errDelete) } // Signal that this VM has stopped svm.signalStopFinished(err) }() // Now it's possible that the service VM failed to start and now we are trying to terminate it. // In this case, we will relay the error to the goroutines waiting for this vm to stop. if err := svm.getStartError(); err != nil { logrus.Debugf("lcowdriver: terminateservicevm: %s had failed to start up: %s", id, err) return err } if err := waitTerminate(svm, fmt.Sprintf("terminateservicevm: %s (%s)", id, context)); err != nil { return err } logrus.Debugf("lcowdriver: terminateservicevm: %s (%s) - success", id, context) return nil } func waitTerminate(svm *serviceVM, context string) error { if svm.config == nil { return fmt.Errorf("lcowdriver: waitTermiante: Nil utility VM. %s", context) } logrus.Debugf("lcowdriver: waitTerminate: Calling terminate: %s", context) if err := svm.config.Uvm.Terminate(); err != nil { // We might get operation still pending from the HCS. In that case, we shouldn't return // an error since we call wait right after. underlyingError := err if conterr, ok := err.(*hcsshim.ContainerError); ok { underlyingError = conterr.Err } if syscallErr, ok := underlyingError.(syscall.Errno); ok { underlyingError = syscallErr } if underlyingError != errOperationPending { return fmt.Errorf("failed to terminate utility VM (%s): %s", context, err) } logrus.Debugf("lcowdriver: waitTerminate: uvm.Terminate() returned operation pending (%s)", context) } logrus.Debugf("lcowdriver: waitTerminate: (%s) - waiting for utility VM to terminate", context) if err := svm.config.Uvm.WaitTimeout(time.Duration(svm.config.UvmTimeoutSeconds) * time.Second); err != nil { return fmt.Errorf("failed waiting for utility VM to terminate (%s): %s", context, err) } return nil } // String returns the string representation of a driver. This should match // the name the graph driver has been registered with. func (d *Driver) String() string { return "lcow" } // Status returns the status of the driver. func (d *Driver) Status() [][2]string { return [][2]string{ {"LCOW", ""}, // TODO: Add some more info here - mode, home, .... } } // Exists returns true if the given id is registered with this driver. func (d *Driver) Exists(id string) bool { _, err := os.Lstat(d.dir(id)) logrus.Debugf("lcowdriver: exists: id %s %t", id, err == nil) return err == nil } // CreateReadWrite creates a layer that is writable for use as a container // file system. That equates to creating a sandbox. func (d *Driver) CreateReadWrite(id, parent string, opts *graphdriver.CreateOpts) error { title := fmt.Sprintf("lcowdriver: createreadwrite: id %s", id) logrus.Debugf(title) // First we need to create the folder if err := d.Create(id, parent, opts); err != nil { return err } // Look for an explicit sandbox size option. sandboxSize := uint64(client.DefaultVhdxSizeGB) for k, v := range opts.StorageOpt { switch strings.ToLower(k) { case "lcow.sandboxsize": var err error sandboxSize, err = strconv.ParseUint(v, 10, 32) if err != nil { return fmt.Errorf("%s failed to parse value '%s' for 'lcow.sandboxsize'", title, v) } if sandboxSize < client.DefaultVhdxSizeGB { return fmt.Errorf("%s 'lcow.sandboxsize' option cannot be less than %d", title, client.DefaultVhdxSizeGB) } break } } // Massive perf optimisation here. If we know that the RW layer is the default size, // and that the cached sandbox already exists, and we are running in safe mode, we // can just do a simple copy into the layers sandbox file without needing to start a // unique service VM. For a global service VM, it doesn't really matter. Of course, // this is only the case where the sandbox is the default size. // // Make sure we have the sandbox mutex taken while we are examining it. if sandboxSize == client.DefaultVhdxSizeGB { logrus.Debugf("%s: locking cachedSandboxMutex", title) d.cachedSandboxMutex.Lock() _, err := os.Stat(d.cachedSandboxFile) logrus.Debugf("%s: releasing cachedSandboxMutex", title) d.cachedSandboxMutex.Unlock() if err == nil { logrus.Debugf("%s: using cached sandbox to populate", title) if err := client.CopyFile(d.cachedSandboxFile, filepath.Join(d.dir(id), sandboxFilename), true); err != nil { return err } return nil } } logrus.Debugf("%s: creating SVM to create sandbox", title) svm, err := d.startServiceVMIfNotRunning(id, nil, "createreadwrite") if err != nil { return err } defer d.terminateServiceVM(id, "createreadwrite", false) // So the sandbox needs creating. If default size ensure we are the only thread populating the cache. // Non-default size we don't store, just create them one-off so no need to lock the cachedSandboxMutex. if sandboxSize == client.DefaultVhdxSizeGB { logrus.Debugf("%s: locking cachedSandboxMutex for creation", title) d.cachedSandboxMutex.Lock() defer func() { logrus.Debugf("%s: releasing cachedSandboxMutex for creation", title) d.cachedSandboxMutex.Unlock() }() } // Make sure we don't write to our local cached copy if this is for a non-default size request. targetCacheFile := d.cachedSandboxFile if sandboxSize != client.DefaultVhdxSizeGB { targetCacheFile = "" } // Create the ext4 vhdx logrus.Debugf("%s: creating sandbox ext4 vhdx", title) if err := svm.createExt4VHDX(filepath.Join(d.dir(id), sandboxFilename), uint32(sandboxSize), targetCacheFile); err != nil { logrus.Debugf("%s: failed to create sandbox vhdx for %s: %s", title, id, err) return err } return nil } // Create creates the folder for the layer with the given id, and // adds it to the layer chain. func (d *Driver) Create(id, parent string, opts *graphdriver.CreateOpts) error { logrus.Debugf("lcowdriver: create: id %s parent: %s", id, parent) parentChain, err := d.getLayerChain(parent) if err != nil { return err } var layerChain []string if parent != "" { if !d.Exists(parent) { return fmt.Errorf("lcowdriver: cannot create layer folder with missing parent %s", parent) } layerChain = []string{d.dir(parent)} } layerChain = append(layerChain, parentChain...) layerPath := d.dir(id) logrus.Debugf("lcowdriver: create: id %s: creating %s", id, layerPath) // Standard mkdir here, not with SDDL as the dataroot was created with // inheritance to just local system and administrators. if err := os.MkdirAll(layerPath, 0700); err != nil { return err } if err := d.setLayerChain(id, layerChain); err != nil { if err2 := os.RemoveAll(layerPath); err2 != nil { logrus.Warnf("failed to remove layer %s: %s", layerPath, err2) } return err } logrus.Debugf("lcowdriver: create: id %s: success", id) return nil } // Remove unmounts and removes the dir information. func (d *Driver) Remove(id string) error { logrus.Debugf("lcowdriver: remove: id %s", id) tmpID := fmt.Sprintf("%s-removing", id) tmpLayerPath := d.dir(tmpID) layerPath := d.dir(id) logrus.Debugf("lcowdriver: remove: id %s: layerPath %s", id, layerPath) // Unmount all the layers err := d.Put(id) if err != nil { logrus.Debugf("lcowdriver: remove id %s: failed to unmount: %s", id, err) return err } // for non-global case just kill the vm if !d.globalMode { if err := d.terminateServiceVM(id, fmt.Sprintf("Remove %s", id), true); err != nil { return err } } if err := os.Rename(layerPath, tmpLayerPath); err != nil && !os.IsNotExist(err) { return err } if err := os.RemoveAll(tmpLayerPath); err != nil { return err } logrus.Debugf("lcowdriver: remove: id %s: layerPath %s succeeded", id, layerPath) return nil } // Get returns the rootfs path for the id. It is reference counted and // effectively can be thought of as a "mount the layer into the utility // vm if it isn't already". The contract from the caller of this is that // all Gets and Puts are matched. It -should- be the case that on cleanup, // nothing is mounted. // // For optimisation, we don't actually mount the filesystem (which in our // case means [hot-]adding it to a service VM. But we track that and defer // the actual adding to the point we need to access it. func (d *Driver) Get(id, mountLabel string) (containerfs.ContainerFS, error) { title := fmt.Sprintf("lcowdriver: get: %s", id) logrus.Debugf(title) // Generate the mounts needed for the deferred operation. disks, err := d.getAllMounts(id) if err != nil { logrus.Debugf("%s failed to get all layer details for %s: %s", title, d.dir(id), err) return nil, fmt.Errorf("%s failed to get layer details for %s: %s", title, d.dir(id), err) } logrus.Debugf("%s: got layer mounts: %+v", title, disks) return &lcowfs{ root: unionMountName(disks), d: d, mappedDisks: disks, vmID: d.getVMID(id), }, nil } // Put does the reverse of get. If there are no more references to // the layer, it unmounts it from the utility VM. func (d *Driver) Put(id string) error { title := fmt.Sprintf("lcowdriver: put: %s", id) // Get the service VM that we need to remove from svm, err := d.serviceVms.get(d.getVMID(id)) if err == errVMUnknown { return nil } else if err == errVMisTerminating { return svm.getStopError() } // Generate the mounts that Get() might have mounted disks, err := d.getAllMounts(id) if err != nil { logrus.Debugf("%s failed to get all layer details for %s: %s", title, d.dir(id), err) return fmt.Errorf("%s failed to get layer details for %s: %s", title, d.dir(id), err) } // Now, we want to perform the unmounts, hot-remove and stop the service vm. // We want to go though all the steps even if we have an error to clean up properly err = svm.deleteUnionMount(unionMountName(disks), disks...) if err != nil { logrus.Debugf("%s failed to delete union mount %s: %s", title, id, err) } err1 := svm.hotRemoveVHDs(disks...) if err1 != nil { logrus.Debugf("%s failed to hot remove vhds %s: %s", title, id, err) if err == nil { err = err1 } } err1 = d.terminateServiceVM(id, fmt.Sprintf("Put %s", id), false) if err1 != nil { logrus.Debugf("%s failed to terminate service vm %s: %s", title, id, err1) if err == nil { err = err1 } } logrus.Debugf("Put succeeded on id %s", id) return err } // Cleanup ensures the information the driver stores is properly removed. // We use this opportunity to cleanup any -removing folders which may be // still left if the daemon was killed while it was removing a layer. func (d *Driver) Cleanup() error { title := "lcowdriver: cleanup" items, err := ioutil.ReadDir(d.dataRoot) if err != nil { if os.IsNotExist(err) { return nil } return err } // Note we don't return an error below - it's possible the files // are locked. However, next time around after the daemon exits, // we likely will be able to cleanup successfully. Instead we log // warnings if there are errors. for _, item := range items { if item.IsDir() && strings.HasSuffix(item.Name(), "-removing") { if err := os.RemoveAll(filepath.Join(d.dataRoot, item.Name())); err != nil { logrus.Warnf("%s failed to cleanup %s: %s", title, item.Name(), err) } else { logrus.Infof("%s cleaned up %s", title, item.Name()) } } } // Cleanup any service VMs we have running, along with their scratch spaces. // We don't take the lock for this as it's taken in terminateServiceVm. for k, v := range d.serviceVms.svms { logrus.Debugf("%s svm entry: %s: %+v", title, k, v) d.terminateServiceVM(k, "cleanup", true) } return nil } // Diff takes a layer (and it's parent layer which may be null, but // is ignored by this implementation below) and returns a reader for // a tarstream representing the layers contents. The id could be // a read-only "layer.vhd" or a read-write "sandbox.vhdx". The semantics // of this function dictate that the layer is already mounted. // However, as we do lazy mounting as a performance optimisation, // this will likely not be the case. func (d *Driver) Diff(id, parent string) (io.ReadCloser, error) { title := fmt.Sprintf("lcowdriver: diff: %s", id) // Get VHDX info ld, err := getLayerDetails(d.dir(id)) if err != nil { logrus.Debugf("%s: failed to get vhdx information of %s: %s", title, d.dir(id), err) return nil, err } // Start the SVM with a mapped virtual disk. Note that if the SVM is // already running and we are in global mode, this will be // hot-added. mvd := hcsshim.MappedVirtualDisk{ HostPath: ld.filename, ContainerPath: hostToGuest(ld.filename), CreateInUtilityVM: true, ReadOnly: true, } logrus.Debugf("%s: starting service VM", title) svm, err := d.startServiceVMIfNotRunning(id, []hcsshim.MappedVirtualDisk{mvd}, fmt.Sprintf("diff %s", id)) if err != nil { return nil, err } logrus.Debugf("lcowdriver: diff: waiting for svm to finish booting") err = svm.getStartError() if err != nil { d.terminateServiceVM(id, fmt.Sprintf("diff %s", id), false) return nil, fmt.Errorf("lcowdriver: diff: svm failed to boot: %s", err) } // Obtain the tar stream for it // The actual container path will have be remapped to a short name, so use that. actualContainerPath := svm.getShortContainerPath(&mvd) if actualContainerPath == "" { return nil, fmt.Errorf("failed to get short container path for %+v in SVM %s", mvd, svm.config.Name) } logrus.Debugf("%s: %s %s, size %d, ReadOnly %t", title, ld.filename, actualContainerPath, ld.size, ld.isSandbox) tarReadCloser, err := svm.config.VhdToTar(mvd.HostPath, actualContainerPath, ld.isSandbox, ld.size) if err != nil { svm.hotRemoveVHDs(mvd) d.terminateServiceVM(id, fmt.Sprintf("diff %s", id), false) return nil, fmt.Errorf("%s failed to export layer to tar stream for id: %s, parent: %s : %s", title, id, parent, err) } logrus.Debugf("%s id %s parent %s completed successfully", title, id, parent) // In safe/non-global mode, we can't tear down the service VM until things have been read. return ioutils.NewReadCloserWrapper(tarReadCloser, func() error { tarReadCloser.Close() svm.hotRemoveVHDs(mvd) d.terminateServiceVM(id, fmt.Sprintf("diff %s", id), false) return nil }), nil } // ApplyDiff extracts the changeset from the given diff into the // layer with the specified id and parent, returning the size of the // new layer in bytes. The layer should not be mounted when calling // this function. Another way of describing this is that ApplyDiff writes // to a new layer (a VHD in LCOW) the contents of a tarstream it's given. func (d *Driver) ApplyDiff(id, parent string, diff io.Reader) (int64, error) { logrus.Debugf("lcowdriver: applydiff: id %s", id) // Log failures here as it's undiagnosable sometimes, due to a possible panic. // See https://github.com/moby/moby/issues/37955 for more information. dest := filepath.Join(d.dataRoot, id, layerFilename) if !noreexec { cmd := reexec.Command([]string{"docker-lcow-tar2ext4", dest}...) stdout := bytes.NewBuffer(nil) stderr := bytes.NewBuffer(nil) cmd.Stdin = diff cmd.Stdout = stdout cmd.Stderr = stderr if err := cmd.Start(); err != nil { logrus.Warnf("lcowdriver: applydiff: id %s failed to start re-exec: %s", id, err) return 0, err } if err := cmd.Wait(); err != nil { logrus.Warnf("lcowdriver: applydiff: id %s failed %s", id, err) return 0, fmt.Errorf("re-exec error: %v: stderr: %s", err, stderr) } size, err := strconv.ParseInt(stdout.String(), 10, 64) if err != nil { logrus.Warnf("lcowdriver: applydiff: id %s failed to parse output %s", id, err) return 0, fmt.Errorf("re-exec error: %v: stdout: %s", err, stdout) } return applySID(id, size, dest) } // The inline case size, err := tar2ext4Actual(dest, diff) if err != nil { logrus.Warnf("lcowdriver: applydiff: id %s failed %s", id, err) } return applySID(id, size, dest) } // applySID adds the VM Group SID read-only access. func applySID(id string, size int64, dest string) (int64, error) { if err := security.GrantVmGroupAccess(dest); err != nil { logrus.Warnf("lcowdriver: applySIDs: id %s failed %s", id, err) return 0, err } return size, nil } // tar2ext4Reexec is the re-exec entry point for writing a layer from a tar file func tar2ext4Reexec() { size, err := tar2ext4Actual(os.Args[1], os.Stdin) if err != nil { fmt.Fprint(os.Stderr, err) os.Exit(1) } fmt.Fprint(os.Stdout, size) } // tar2ext4Actual is the implementation of tar2ext to write a layer from a tar file. // It can be called through re-exec (default), or inline for debugging. func tar2ext4Actual(dest string, diff io.Reader) (int64, error) { // maxDiskSize is not relating to the sandbox size - this is the // maximum possible size a layer VHD generated can be from an EXT4 // layout perspective. const maxDiskSize = 128 * 1024 * 1024 * 1024 // 128GB out, err := os.Create(dest) if err != nil { return 0, err } defer out.Close() if err := tar2ext4.Convert( diff, out, tar2ext4.AppendVhdFooter, tar2ext4.ConvertWhiteout, tar2ext4.MaximumDiskSize(maxDiskSize)); err != nil { return 0, err } fi, err := os.Stat(dest) if err != nil { return 0, err } return fi.Size(), nil } // Changes produces a list of changes between the specified layer // and its parent layer. If parent is "", then all changes will be ADD changes. // The layer should not be mounted when calling this function. func (d *Driver) Changes(id, parent string) ([]archive.Change, error) { logrus.Debugf("lcowdriver: changes: id %s parent %s", id, parent) // TODO @gupta-ak. Needs implementation with assistance from service VM return nil, nil } // DiffSize calculates the changes between the specified layer // and its parent and returns the size in bytes of the changes // relative to its base filesystem directory. func (d *Driver) DiffSize(id, parent string) (size int64, err error) { logrus.Debugf("lcowdriver: diffsize: id %s", id) // TODO @gupta-ak. Needs implementation with assistance from service VM return 0, nil } // GetMetadata returns custom driver information. func (d *Driver) GetMetadata(id string) (map[string]string, error) { logrus.Debugf("lcowdriver: getmetadata: id %s", id) m := make(map[string]string) m["dir"] = d.dir(id) return m, nil } // GetLayerPath gets the layer path on host (path to VHD/VHDX) func (d *Driver) GetLayerPath(id string) (string, error) { return d.dir(id), nil } // dir returns the absolute path to the layer. func (d *Driver) dir(id string) string { return filepath.Join(d.dataRoot, filepath.Base(id)) } // getLayerChain returns the layer chain information. func (d *Driver) getLayerChain(id string) ([]string, error) { jPath := filepath.Join(d.dir(id), "layerchain.json") logrus.Debugf("lcowdriver: getlayerchain: id %s json %s", id, jPath) content, err := ioutil.ReadFile(jPath) if os.IsNotExist(err) { return nil, nil } else if err != nil { return nil, fmt.Errorf("lcowdriver: getlayerchain: %s unable to read layerchain file %s: %s", id, jPath, err) } var layerChain []string err = json.Unmarshal(content, &layerChain) if err != nil { return nil, fmt.Errorf("lcowdriver: getlayerchain: %s failed to unmarshall layerchain file %s: %s", id, jPath, err) } return layerChain, nil } // setLayerChain stores the layer chain information on disk. func (d *Driver) setLayerChain(id string, chain []string) error { content, err := json.Marshal(&chain) if err != nil { return fmt.Errorf("lcowdriver: setlayerchain: %s failed to marshall layerchain json: %s", id, err) } jPath := filepath.Join(d.dir(id), "layerchain.json") logrus.Debugf("lcowdriver: setlayerchain: id %s json %s", id, jPath) err = ioutil.WriteFile(jPath, content, 0600) if err != nil { return fmt.Errorf("lcowdriver: setlayerchain: %s failed to write layerchain file: %s", id, err) } return nil } // getLayerDetails is a utility for getting a file name, size and indication of // sandbox for a VHD(x) in a folder. A read-only layer will be layer.vhd. A // read-write layer will be sandbox.vhdx. func getLayerDetails(folder string) (*layerDetails, error) { var fileInfo os.FileInfo ld := &layerDetails{ isSandbox: false, filename: filepath.Join(folder, layerFilename), } fileInfo, err := os.Stat(ld.filename) if err != nil { ld.filename = filepath.Join(folder, sandboxFilename) if fileInfo, err = os.Stat(ld.filename); err != nil { return nil, fmt.Errorf("failed to locate layer or sandbox in %s", folder) } ld.isSandbox = true } ld.size = fileInfo.Size() return ld, nil } func (d *Driver) getAllMounts(id string) ([]hcsshim.MappedVirtualDisk, error) { layerChain, err := d.getLayerChain(id) if err != nil { return nil, err } layerChain = append([]string{d.dir(id)}, layerChain...) logrus.Debugf("getting all layers: %v", layerChain) disks := make([]hcsshim.MappedVirtualDisk, len(layerChain), len(layerChain)) for i := range layerChain { ld, err := getLayerDetails(layerChain[i]) if err != nil { logrus.Debugf("Failed to get LayerVhdDetails from %s: %s", layerChain[i], err) return nil, err } disks[i].HostPath = ld.filename disks[i].ContainerPath = hostToGuest(ld.filename) disks[i].CreateInUtilityVM = true disks[i].ReadOnly = !ld.isSandbox } return disks, nil } func hostToGuest(hostpath string) string { // This is the "long" container path. At the point of which we are // calculating this, we don't know which service VM we're going to be // using, so we can't translate this to a short path yet, instead // deferring until the point of which it's added to an SVM. We don't // use long container paths in SVMs for SCSI disks, otherwise it can cause // command line operations that we invoke to fail due to being over ~4200 // characters when there are ~47 layers involved. An example of this is // the mount call to create the overlay across multiple SCSI-attached disks. // It doesn't affect VPMem attached layers during container creation as // these get mapped by openGCS to /tmp/N/M where N is a container instance // number, and M is a layer number. return fmt.Sprintf("/tmp/%s", filepath.Base(filepath.Dir(hostpath))) } func unionMountName(disks []hcsshim.MappedVirtualDisk) string { return fmt.Sprintf("%s-mount", disks[0].ContainerPath) } type nopCloser struct { io.Reader } func (nopCloser) Close() error { return nil } type fileGetCloserFromSVM struct { id string svm *serviceVM mvd *hcsshim.MappedVirtualDisk d *Driver } func (fgc *fileGetCloserFromSVM) Close() error { if fgc.svm != nil { if fgc.mvd != nil { if err := fgc.svm.hotRemoveVHDs(*fgc.mvd); err != nil { // We just log this as we're going to tear down the SVM imminently unless in global mode logrus.Errorf("failed to remove mvd %s: %s", fgc.mvd.ContainerPath, err) } } } if fgc.d != nil && fgc.svm != nil && fgc.id != "" { if err := fgc.d.terminateServiceVM(fgc.id, fmt.Sprintf("diffgetter %s", fgc.id), false); err != nil { return err } } return nil } func (fgc *fileGetCloserFromSVM) Get(filename string) (io.ReadCloser, error) { errOut := &bytes.Buffer{} outOut := &bytes.Buffer{} // Must map to the actual "short" container path where the SCSI disk was mounted actualContainerPath := fgc.svm.getShortContainerPath(fgc.mvd) if actualContainerPath == "" { return nil, fmt.Errorf("inconsistency detected: couldn't get short container path for %+v in utility VM %s", fgc.mvd, fgc.svm.config.Name) } file := path.Join(actualContainerPath, filename) // Ugly fix for MSFT internal bug VSO#19696554 // If a file name contains a space, pushing an image fails. // Using solution from https://groups.google.com/forum/#!topic/Golang-Nuts/DpldsmrhPio to escape for shell execution file = "'" + strings.Join(strings.Split(file, "'"), `'"'"'`) + "'" if err := fgc.svm.runProcess(fmt.Sprintf("cat %s", file), nil, outOut, errOut); err != nil { logrus.Debugf("cat %s failed: %s", file, errOut.String()) return nil, err } return nopCloser{bytes.NewReader(outOut.Bytes())}, nil } // DiffGetter returns a FileGetCloser that can read files from the directory that // contains files for the layer differences. Used for direct access for tar-split. func (d *Driver) DiffGetter(id string) (graphdriver.FileGetCloser, error) { title := fmt.Sprintf("lcowdriver: diffgetter: %s", id) logrus.Debugf(title) ld, err := getLayerDetails(d.dir(id)) if err != nil { logrus.Debugf("%s: failed to get vhdx information of %s: %s", title, d.dir(id), err) return nil, err } // Start the SVM with a mapped virtual disk. Note that if the SVM is // already running and we are in global mode, this will be hot-added. mvd := hcsshim.MappedVirtualDisk{ HostPath: ld.filename, ContainerPath: hostToGuest(ld.filename), CreateInUtilityVM: true, ReadOnly: true, } logrus.Debugf("%s: starting service VM", title) svm, err := d.startServiceVMIfNotRunning(id, []hcsshim.MappedVirtualDisk{mvd}, fmt.Sprintf("diffgetter %s", id)) if err != nil { return nil, err } logrus.Debugf("%s: waiting for svm to finish booting", title) err = svm.getStartError() if err != nil { d.terminateServiceVM(id, fmt.Sprintf("diff %s", id), false) return nil, fmt.Errorf("%s: svm failed to boot: %s", title, err) } return &fileGetCloserFromSVM{ id: id, svm: svm, mvd: &mvd, d: d}, nil }