LCOW: Graphdriver dynamic sandbox management

Signed-off-by: John Howard <jhoward@microsoft.com>

This changes the graphdriver to perform dynamic sandbox management.
Previously, as a temporary 'hack', the service VM had a prebuilt
sandbox in it. With this change, management is under the control
of the client (docker) and executes a mkfs.ext4 on it. This enables
sandboxes of non-default sizes too (a TODO previously in the code).

It also addresses https://github.com/moby/moby/pull/33969#discussion_r127287887

Requires:
- go-winio: v0.4.3
- opengcs:  v0.0.12
- hcsshim:  v0.6.x
This commit is contained in:
John Howard 2017-07-13 16:15:43 -07:00
parent 0bd1cf2517
commit 8c279ef3ad
1 changed files with 169 additions and 125 deletions

View File

@ -45,7 +45,7 @@
//
// * lcow.vhdx - Specifies a custom vhdx file to boot (instead of a kernel+initrd)
// -- Possible values: Any valid filename
// -- Default if ommitted: C:\Program Files\Linux Containers\uvm.vhdx
// -- Default if ommitted: uvm.vhdx under `lcow.kirdpath`
//
// * lcow.timeout - Specifies a timeout for utility VM operations in seconds
// -- Possible values: >=0
@ -120,6 +120,34 @@ type cacheItem struct {
isMounted bool // True when mounted in a service VM
}
// setIsMounted is a helper function for a cacheItem which does exactly what it says
func (ci *cacheItem) setIsMounted() {
logrus.Debugf("locking cache item for set isMounted")
ci.Lock()
defer ci.Unlock()
ci.isMounted = true
logrus.Debugf("set isMounted on cache item")
}
// incrementRefCount is a helper function for a cacheItem which does exactly what it says
func (ci *cacheItem) incrementRefCount() {
logrus.Debugf("locking cache item for increment")
ci.Lock()
defer ci.Unlock()
ci.refCount++
logrus.Debugf("incremented refcount on cache item %+v", ci)
}
// decrementRefCount is a helper function for a cacheItem which does exactly what it says
func (ci *cacheItem) decrementRefCount() int {
logrus.Debugf("locking cache item for decrement")
ci.Lock()
defer ci.Unlock()
ci.refCount--
logrus.Debugf("decremented refcount on cache item %+v", ci)
return ci.refCount
}
// serviceVMItem is our internal structure representing an item in our
// map of service VMs we are maintaining.
type serviceVMItem struct {
@ -147,6 +175,14 @@ type Driver struct {
cache map[string]*cacheItem // Map holding a cache of all the IDs we've mounted/unmounted.
}
// layerDetails is the structure returned by a helper function `getLayerDetails`
// for getting information about a layer folder
type layerDetails struct {
filename string // \path\to\sandbox.vhdx or \path\to\layer.vhd
size int64 // size of the above file
isSandbox bool // true if sandbox.vhdx
}
// deletefiles is a helper function for initialisation where we delete any
// left-over scratch files in case we were previously forcibly terminated.
func deletefiles(path string, f os.FileInfo, err error) error {
@ -240,7 +276,7 @@ func (d *Driver) startServiceVMIfNotRunning(id string, mvdToAdd *hcsshim.MappedV
logrus.Debugf("%s locking serviceVmItem %s", title, svm.config.Name)
svm.Lock()
if err := svm.config.HotAddVhd(mvdToAdd.HostPath, mvdToAdd.ContainerPath); err != nil {
if err := svm.config.HotAddVhd(mvdToAdd.HostPath, mvdToAdd.ContainerPath, false, true); err != nil {
logrus.Debugf("%s releasing serviceVmItem %s on hot-add failure %s", title, svm.config.Name, err)
svm.Unlock()
return nil, fmt.Errorf("%s hot add %s to %s failed: %s", title, mvdToAdd.HostPath, mvdToAdd.ContainerPath, err)
@ -310,7 +346,7 @@ func (d *Driver) startServiceVMIfNotRunning(id string, mvdToAdd *hcsshim.MappedV
// Start it.
logrus.Debugf("lcowdriver: startServiceVmIfNotRunning: (%s) starting %s", context, svm.config.Name)
if err := svm.config.Create(); err != nil {
if err := svm.config.StartUtilityVM(); err != nil {
return nil, fmt.Errorf("failed to start service utility VM (%s): %s", context, err)
}
@ -331,18 +367,22 @@ func (d *Driver) startServiceVMIfNotRunning(id string, mvdToAdd *hcsshim.MappedV
logrus.Debugf("%s locking cachedScratchMutex", title)
d.cachedScratchMutex.Lock()
if _, err := os.Stat(d.cachedScratchFile); err != nil {
// TODO: Not a typo, but needs fixing when the platform sandbox stuff has been sorted out.
logrus.Debugf("%s (%s): creating an SVM scratch - locking serviceVM", title, context)
svm.Lock()
if err := svm.config.CreateSandbox(d.cachedScratchFile, client.DefaultSandboxSizeMB, d.cachedSandboxFile); err != nil {
logrus.Debugf("%s (%s): releasing serviceVM on error path", title, context)
if err := svm.config.CreateExt4Vhdx(scratchTargetFile, client.DefaultVhdxSizeGB, d.cachedScratchFile); err != nil {
logrus.Debugf("%s (%s): releasing serviceVM on error path from CreateExt4Vhdx: %s", title, context, err)
svm.Unlock()
logrus.Debugf("%s (%s): releasing cachedScratchMutex on error path", title, context)
d.cachedScratchMutex.Unlock()
// TODO: NEED TO REMOVE FROM MAP HERE AND STOP IT
// Do a force terminate and remove it from the map on failure, ignoring any errors
if err2 := d.terminateServiceVM(id, "error path from CreateExt4Vhdx", true); err2 != nil {
logrus.Warnf("failed to terminate service VM on error path from CreateExt4Vhdx: %s", err2)
}
return nil, fmt.Errorf("failed to create SVM scratch VHDX (%s): %s", context, err)
}
logrus.Debugf("%s (%s): releasing serviceVM on error path", title, context)
logrus.Debugf("%s (%s): releasing serviceVM after %s created and cached to %s", title, context, scratchTargetFile, d.cachedScratchFile)
svm.Unlock()
}
logrus.Debugf("%s (%s): releasing cachedScratchMutex", title, context)
@ -350,19 +390,17 @@ func (d *Driver) startServiceVMIfNotRunning(id string, mvdToAdd *hcsshim.MappedV
// Hot-add the scratch-space if not already attached
if !svm.scratchAttached {
// Make a copy of it to the layer directory
logrus.Debugf("lcowdriver: startServiceVmIfNotRunning: (%s) cloning cached scratch for hot-add", context)
if err := client.CopyFile(d.cachedScratchFile, scratchTargetFile, true); err != nil {
// TODO: NEED TO REMOVE FROM MAP HERE AND STOP IT
return nil, err
}
logrus.Debugf("lcowdriver: startServiceVmIfNotRunning: (%s) hot-adding scratch %s - locking serviceVM", context, scratchTargetFile)
svm.Lock()
if err := svm.config.HotAddVhd(scratchTargetFile, toolsScratchPath); err != nil {
logrus.Debugf("%s (%s): releasing serviceVM on error path", title, context)
if err := svm.config.HotAddVhd(scratchTargetFile, toolsScratchPath, false, true); err != nil {
logrus.Debugf("%s (%s): releasing serviceVM on error path of HotAddVhd: %s", title, context, err)
svm.Unlock()
// TODOL NEED TO REMOVE FROM MAP HERE AND STOP IT
// Do a force terminate and remove it from the map on failure, ignoring any errors
if err2 := d.terminateServiceVM(id, "error path from HotAddVhd", true); err2 != nil {
logrus.Warnf("failed to terminate service VM on error path from HotAddVhd: %s", err2)
}
return nil, fmt.Errorf("failed to hot-add %s failed: %s", scratchTargetFile, err)
}
logrus.Debugf("%s (%s): releasing serviceVM", title, context)
@ -477,26 +515,43 @@ func (d *Driver) CreateReadWrite(id, parent string, opts *graphdriver.CreateOpts
return err
}
// Look for an explicit sandbox size option.
sandboxSize := uint64(client.DefaultVhdxSizeGB)
for k, v := range opts.StorageOpt {
switch strings.ToLower(k) {
case "lcow.sandboxsize":
var err error
sandboxSize, err = strconv.ParseUint(v, 10, 32)
if err != nil {
return fmt.Errorf("%s failed to parse value '%s' for 'lcow.sandboxsize'", title, v)
}
if sandboxSize < client.DefaultVhdxSizeGB {
return fmt.Errorf("%s 'lcow.sandboxsize' option cannot be less than %d", title, client.DefaultVhdxSizeGB)
}
break
}
}
// Massive perf optimisation here. If we know that the RW layer is the default size,
// and that the cached sandbox already exists, and we are running in safe mode, we
// can just do a simple copy into the layers sandbox file without needing to start a
// unique service VM. For a global service VM, it doesn't really matter.
// unique service VM. For a global service VM, it doesn't really matter. Of course,
// this is only the case where the sandbox is the default size.
//
// TODO: @jhowardmsft Where are we going to get the required size from?
// We need to look at the CreateOpts for that, I think....
// Make sure we have the sandbox mutex taken while we are examining it.
logrus.Debugf("%s: locking cachedSandboxMutex", title)
d.cachedSandboxMutex.Lock()
_, err := os.Stat(d.cachedSandboxFile)
logrus.Debugf("%s: releasing cachedSandboxMutex", title)
d.cachedSandboxMutex.Unlock()
if err == nil {
logrus.Debugf("%s: using cached sandbox to populate", title)
if err := client.CopyFile(d.cachedSandboxFile, filepath.Join(d.dir(id), sandboxFilename), true); err != nil {
return err
if sandboxSize == client.DefaultVhdxSizeGB {
logrus.Debugf("%s: locking cachedSandboxMutex", title)
d.cachedSandboxMutex.Lock()
_, err := os.Stat(d.cachedSandboxFile)
logrus.Debugf("%s: releasing cachedSandboxMutex", title)
d.cachedSandboxMutex.Unlock()
if err == nil {
logrus.Debugf("%s: using cached sandbox to populate", title)
if err := client.CopyFile(d.cachedSandboxFile, filepath.Join(d.dir(id), sandboxFilename), true); err != nil {
return err
}
return nil
}
return nil
}
logrus.Debugf("%s: creating SVM to create sandbox", title)
@ -506,13 +561,16 @@ func (d *Driver) CreateReadWrite(id, parent string, opts *graphdriver.CreateOpts
}
defer d.terminateServiceVM(id, "createreadwrite", false)
// So the cached sandbox needs creating. Ensure we are the only thread creating it.
logrus.Debugf("%s: locking cachedSandboxMutex for creation", title)
d.cachedSandboxMutex.Lock()
defer func() {
logrus.Debugf("%s: releasing cachedSandboxMutex for creation", title)
d.cachedSandboxMutex.Unlock()
}()
// So the sandbox needs creating. If default size ensure we are the only thread populating the cache.
// Non-default size we don't store, just create them one-off so no need to lock the cachedSandboxMutex.
if sandboxSize == client.DefaultVhdxSizeGB {
logrus.Debugf("%s: locking cachedSandboxMutex for creation", title)
d.cachedSandboxMutex.Lock()
defer func() {
logrus.Debugf("%s: releasing cachedSandboxMutex for creation", title)
d.cachedSandboxMutex.Unlock()
}()
}
// Synchronise the operation in the service VM.
logrus.Debugf("%s: locking svm for sandbox creation", title)
@ -521,7 +579,15 @@ func (d *Driver) CreateReadWrite(id, parent string, opts *graphdriver.CreateOpts
logrus.Debugf("%s: releasing svm for sandbox creation", title)
svm.Unlock()
}()
if err := svm.config.CreateSandbox(filepath.Join(d.dir(id), sandboxFilename), client.DefaultSandboxSizeMB, d.cachedSandboxFile); err != nil {
// Make sure we don't write to our local cached copy if this is for a non-default size request.
targetCacheFile := d.cachedSandboxFile
if sandboxSize != client.DefaultVhdxSizeGB {
targetCacheFile = ""
}
// Actually do the creation.
if err := svm.config.CreateExt4Vhdx(filepath.Join(d.dir(id), sandboxFilename), uint32(sandboxSize), targetCacheFile); err != nil {
return err
}
@ -599,42 +665,37 @@ func (d *Driver) Get(id, mountLabel string) (string, error) {
logrus.Debugf(title)
// Work out what we are working on
vhdFilename, vhdSize, isSandbox, err := getLayerDetails(d.dir(id))
ld, err := getLayerDetails(d.dir(id))
if err != nil {
logrus.Debugf("%s failed to get layer details from %s: %s", title, d.dir(id), err)
return "", fmt.Errorf("%s failed to open layer or sandbox VHD to open in %s: %s", title, d.dir(id), err)
}
logrus.Debugf("%s %s, size %d, isSandbox %t", title, vhdFilename, vhdSize, isSandbox)
logrus.Debugf("%s %s, size %d, isSandbox %t", title, ld.filename, ld.size, ld.isSandbox)
// Add item to cache, or update existing item, but ensure we have the
// lock while updating items.
logrus.Debugf("%s: locking cacheMutex", title)
d.cacheMutex.Lock()
var cacheEntry *cacheItem
if entry, ok := d.cache[id]; !ok {
var ci *cacheItem
if item, ok := d.cache[id]; !ok {
// The item is not currently in the cache.
cacheEntry = &cacheItem{
ci = &cacheItem{
refCount: 1,
isSandbox: isSandbox,
hostPath: vhdFilename,
isSandbox: ld.isSandbox,
hostPath: ld.filename,
uvmPath: fmt.Sprintf("/mnt/%s", id),
isMounted: false, // we defer this as an optimisation
}
d.cache[id] = cacheEntry
logrus.Debugf("%s: added cache entry %+v", title, cacheEntry)
d.cache[id] = ci
logrus.Debugf("%s: added cache item %+v", title, ci)
} else {
// Increment the reference counter in the cache.
logrus.Debugf("%s: locking cache item for increment", title)
entry.Lock()
entry.refCount++
logrus.Debugf("%s: releasing cache item for increment", title)
entry.Unlock()
logrus.Debugf("%s: incremented refcount on cache entry %+v", title, cacheEntry)
item.incrementRefCount()
}
logrus.Debugf("%s: releasing cacheMutex", title)
d.cacheMutex.Unlock()
logrus.Debugf("%s %s success. %s: %+v: size %d", title, id, d.dir(id), cacheEntry, vhdSize)
logrus.Debugf("%s %s success. %s: %+v: size %d", title, id, d.dir(id), ci, ld.size)
return d.dir(id), nil
}
@ -645,67 +706,53 @@ func (d *Driver) Put(id string) error {
logrus.Debugf("%s: locking cacheMutex", title)
d.cacheMutex.Lock()
entry, ok := d.cache[id]
item, ok := d.cache[id]
if !ok {
logrus.Debugf("%s: releasing cacheMutex on error path", title)
d.cacheMutex.Unlock()
return fmt.Errorf("%s possible ref-count error, or invalid id was passed to the graphdriver. Cannot handle id %s as it's not in the cache", title, id)
}
// Are we just decrementing the reference count?
logrus.Debugf("%s: locking cache item for possible decrement", title)
entry.Lock()
if entry.refCount > 1 {
entry.refCount--
logrus.Debugf("%s: releasing cache item for decrement and early get-out as refCount is now %d", title, entry.refCount)
entry.Unlock()
logrus.Debugf("%s: refCount decremented to %d. Releasing cacheMutex", title, entry.refCount)
// Decrement the ref-count, and nothing more to do if still in use.
if item.decrementRefCount() > 0 {
logrus.Debugf("%s: releasing cacheMutex. Cache item is still in use", title)
d.cacheMutex.Unlock()
return nil
}
logrus.Debugf("%s: releasing cache item", title)
entry.Unlock()
logrus.Debugf("%s: releasing cacheMutex. Ref count has dropped to zero", title)
// Remove from the cache map.
delete(d.cache, id)
logrus.Debugf("%s: releasing cacheMutex. Ref count on cache item has dropped to zero, removed from cache", title)
d.cacheMutex.Unlock()
// To reach this point, the reference count has dropped to zero. If we have
// done a mount and we are in global mode, then remove it. We don't
// need to remove in safe mode as the service VM is going to be torn down
// anyway.
// If we have done a mount and we are in global mode, then remove it. We don't
// need to remove in safe mode as the service VM is going to be torn down anyway.
if d.globalMode {
logrus.Debugf("%s: locking cache item at zero ref-count", title)
entry.Lock()
item.Lock()
defer func() {
logrus.Debugf("%s: releasing cache item at zero ref-count", title)
entry.Unlock()
item.Unlock()
}()
if entry.isMounted {
if item.isMounted {
svm, err := d.getServiceVM(id, false)
if err != nil {
return err
}
logrus.Debugf("%s: Hot-Removing %s. Locking svm", title, entry.hostPath)
logrus.Debugf("%s: Hot-Removing %s. Locking svm", title, item.hostPath)
svm.Lock()
if err := svm.config.HotRemoveVhd(entry.hostPath); err != nil {
if err := svm.config.HotRemoveVhd(item.hostPath); err != nil {
logrus.Debugf("%s: releasing svm on error path", title)
svm.Unlock()
return fmt.Errorf("%s failed to hot-remove %s from global service utility VM: %s", title, entry.hostPath, err)
return fmt.Errorf("%s failed to hot-remove %s from global service utility VM: %s", title, item.hostPath, err)
}
logrus.Debugf("%s: releasing svm", title)
svm.Unlock()
}
}
// Remove from the cache map.
logrus.Debugf("%s: Locking cacheMutex to delete item from cache", title)
d.cacheMutex.Lock()
delete(d.cache, id)
logrus.Debugf("%s: releasing cacheMutex after item deleted from cache", title)
d.cacheMutex.Unlock()
logrus.Debugf("%s %s: refCount 0. %s (%s) completed successfully", title, id, entry.hostPath, entry.uvmPath)
logrus.Debugf("%s %s: refCount 0. %s (%s) completed successfully", title, id, item.hostPath, item.uvmPath)
return nil
}
@ -717,7 +764,7 @@ func (d *Driver) Cleanup() error {
d.cacheMutex.Lock()
for k, v := range d.cache {
logrus.Debugf("%s cache entry: %s: %+v", title, k, v)
logrus.Debugf("%s cache item: %s: %+v", title, k, v)
if v.refCount > 0 {
logrus.Warnf("%s leaked %s: %+v", title, k, v)
}
@ -749,7 +796,7 @@ func (d *Driver) Cleanup() error {
// Cleanup any service VMs we have running, along with their scratch spaces.
// We don't take the lock for this as it's taken in terminateServiceVm.
for k, v := range d.serviceVms {
logrus.Debugf("%s svm entry: %s: %+v", title, k, v)
logrus.Debugf("%s svm: %s: %+v", title, k, v)
d.terminateServiceVM(k, "cleanup", true)
}
@ -773,28 +820,28 @@ func (d *Driver) Diff(id, parent string) (io.ReadCloser, error) {
d.cacheMutex.Unlock()
return nil, fmt.Errorf("%s fail as %s is not in the cache", title, id)
}
cacheEntry := d.cache[id]
ci := d.cache[id]
logrus.Debugf("%s: releasing cacheMutex", title)
d.cacheMutex.Unlock()
// Stat to get size
logrus.Debugf("%s: locking cacheEntry", title)
cacheEntry.Lock()
fileInfo, err := os.Stat(cacheEntry.hostPath)
logrus.Debugf("%s: locking cacheItem", title)
ci.Lock()
fileInfo, err := os.Stat(ci.hostPath)
if err != nil {
logrus.Debugf("%s: releasing cacheEntry on error path", title)
cacheEntry.Unlock()
return nil, fmt.Errorf("%s failed to stat %s: %s", title, cacheEntry.hostPath, err)
logrus.Debugf("%s: releasing cacheItem on error path", title)
ci.Unlock()
return nil, fmt.Errorf("%s failed to stat %s: %s", title, ci.hostPath, err)
}
logrus.Debugf("%s: releasing cacheEntry", title)
cacheEntry.Unlock()
logrus.Debugf("%s: releasing cacheItem", title)
ci.Unlock()
// Start the SVM with a mapped virtual disk. Note that if the SVM is
// already runing and we are in global mode, this will be
// hot-added.
mvd := &hcsshim.MappedVirtualDisk{
HostPath: cacheEntry.hostPath,
ContainerPath: cacheEntry.uvmPath,
HostPath: ci.hostPath,
ContainerPath: ci.uvmPath,
CreateInUtilityVM: true,
ReadOnly: true,
}
@ -805,8 +852,8 @@ func (d *Driver) Diff(id, parent string) (io.ReadCloser, error) {
return nil, err
}
// Set `isMounted` for the cache entry. Note that we re-scan the cache
// at this point as it's possible the cacheEntry changed during the long-
// Set `isMounted` for the cache item. Note that we re-scan the cache
// at this point as it's possible the cacheItem changed during the long-
// running operation above when we weren't holding the cacheMutex lock.
logrus.Debugf("%s: locking cacheMutex for updating isMounted", title)
d.cacheMutex.Lock()
@ -816,18 +863,14 @@ func (d *Driver) Diff(id, parent string) (io.ReadCloser, error) {
d.terminateServiceVM(id, fmt.Sprintf("diff %s", id), false)
return nil, fmt.Errorf("%s fail as %s is not in the cache when updating isMounted", title, id)
}
cacheEntry = d.cache[id]
logrus.Debugf("%s: locking cacheEntry for updating isMounted", title)
cacheEntry.Lock()
cacheEntry.isMounted = true
logrus.Debugf("%s: releasing cacheEntry for updating isMounted", title)
cacheEntry.Unlock()
ci = d.cache[id]
ci.setIsMounted()
logrus.Debugf("%s: releasing cacheMutex for updating isMounted", title)
d.cacheMutex.Unlock()
// Obtain the tar stream for it
logrus.Debugf("%s %s, size %d, isSandbox %t", title, cacheEntry.hostPath, fileInfo.Size(), cacheEntry.isSandbox)
tarReadCloser, err := svm.config.VhdToTar(cacheEntry.hostPath, cacheEntry.uvmPath, cacheEntry.isSandbox, fileInfo.Size())
logrus.Debugf("%s %s, size %d, isSandbox %t", title, ci.hostPath, fileInfo.Size(), ci.isSandbox)
tarReadCloser, err := svm.config.VhdToTar(ci.hostPath, ci.uvmPath, ci.isSandbox, fileInfo.Size())
if err != nil {
d.terminateServiceVM(id, fmt.Sprintf("diff %s", id), false)
return nil, fmt.Errorf("%s failed to export layer to tar stream for id: %s, parent: %s : %s", title, id, parent, err)
@ -945,21 +988,22 @@ func (d *Driver) setLayerChain(id string, chain []string) error {
// getLayerDetails is a utility for getting a file name, size and indication of
// sandbox for a VHD(x) in a folder. A read-only layer will be layer.vhd. A
// read-write layer will be sandbox.vhdx.
func getLayerDetails(folder string) (string, int64, bool, error) {
func getLayerDetails(folder string) (*layerDetails, error) {
var fileInfo os.FileInfo
isSandbox := false
filename := filepath.Join(folder, layerFilename)
var err error
if fileInfo, err = os.Stat(filename); err != nil {
filename = filepath.Join(folder, sandboxFilename)
if fileInfo, err = os.Stat(filename); err != nil {
if os.IsNotExist(err) {
return "", 0, isSandbox, fmt.Errorf("could not find layer or sandbox in %s", folder)
}
return "", 0, isSandbox, fmt.Errorf("error locating layer or sandbox in %s: %s", folder, err)
}
isSandbox = true
ld := &layerDetails{
isSandbox: false,
filename: filepath.Join(folder, layerFilename),
}
return filename, fileInfo.Size(), isSandbox, nil
fileInfo, err := os.Stat(ld.filename)
if err != nil {
ld.filename = filepath.Join(folder, sandboxFilename)
if fileInfo, err = os.Stat(ld.filename); err != nil {
return nil, fmt.Errorf("failed to locate layer or sandbox in %s", folder)
}
ld.isSandbox = true
}
ld.size = fileInfo.Size()
return ld, nil
}