moby--moby/daemon/graphdriver/lcow/lcow.go

1164 lines
42 KiB
Go
Raw Permalink Normal View History

//go:build windows
// +build windows
// Locale: en-gb
// About: Graph-driver for Linux Containers On Windows (LCOW)
//
// This graphdriver runs in two modes. Yet to be determined which one will
// be the shipping mode. The global mode is where a single utility VM
// is used for all service VM tool operations. This isn't safe security-wise
// as it's attaching a sandbox of multiple containers to it, containing
// untrusted data. This may be fine for client devops scenarios. In
// safe mode, a unique utility VM is instantiated for all service VM tool
// operations. The downside of safe-mode is that operations are slower as
// a new service utility VM has to be started and torn-down when needed.
//
// Options:
//
// The following options are read by the graphdriver itself:
//
// * lcow.globalmode - Enables global service VM Mode
// -- Possible values: true/false
// -- Default if omitted: false
//
// * lcow.sandboxsize - Specifies a custom sandbox size in GB for starting a container
// -- Possible values: >= default sandbox size (opengcs defined, currently 20)
// -- Default if omitted: 20
//
// The following options are read by opengcs:
//
// * lcow.kirdpath - Specifies a custom path to a kernel/initrd pair
// -- Possible values: Any local path that is not a mapped drive
// -- Default if omitted: %ProgramFiles%\Linux Containers
//
// * lcow.bootparameters - Specifies additional boot parameters for booting in kernel+initrd mode
// -- Possible values: Any valid linux kernel boot options
// -- Default if omitted: <nil>
//
// * lcow.timeout - Specifies a timeout for utility VM operations in seconds
// -- Possible values: >=0
// -- Default if omitted: 300
// TODO: Grab logs from SVM at terminate or errors
package lcow // import "github.com/docker/docker/daemon/graphdriver/lcow"
import (
"bytes"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"os"
"path"
"path/filepath"
"strconv"
"strings"
"sync"
"syscall"
"time"
"github.com/Microsoft/go-winio/pkg/security"
"github.com/Microsoft/hcsshim"
"github.com/Microsoft/hcsshim/ext4/tar2ext4"
"github.com/Microsoft/opengcs/client"
"github.com/docker/docker/daemon/graphdriver"
"github.com/docker/docker/pkg/archive"
"github.com/docker/docker/pkg/containerfs"
"github.com/docker/docker/pkg/idtools"
"github.com/docker/docker/pkg/ioutils"
"github.com/docker/docker/pkg/reexec"
"github.com/sirupsen/logrus"
)
// noreexec controls reexec functionality. Off by default, on for debugging purposes.
// It is flipped to true at init time when the DOCKER_LCOW_NOREEXEC environment
// variable is set, causing tar2ext4 processing to run inline in the daemon
// process instead of in a re-exec'd child.
var noreexec = false
// init registers this driver to the register. It gets initialised by the
// function passed in the second parameter, implemented in this file.
func init() {
	graphdriver.Register("lcow", InitDriver)

	// DOCKER_LCOW_NOREEXEC allows for inline processing which makes
	// debugging issues in the re-exec codepath significantly easier.
	if os.Getenv("DOCKER_LCOW_NOREEXEC") == "" {
		reexec.Register("docker-lcow-tar2ext4", tar2ext4Reexec)
		return
	}
	logrus.Warnf("LCOW Graphdriver is set to not re-exec. This is intended for debugging purposes only.")
	noreexec = true
}
const (
	// sandboxFilename is the name of the file containing a layer's sandbox (read-write layer).
	sandboxFilename = "sandbox.vhdx"
	// scratchFilename is the name of the scratch-space used by an SVM to avoid running out of memory.
	scratchFilename = "scratch.vhdx"
	// layerFilename is the name of the file containing a layer's read-only contents.
	// Note this really is VHD format, not VHDX.
	layerFilename = "layer.vhd"
	// toolsScratchPath is a location in a service utility VM that the tools can use as a
	// scratch space to avoid running out of memory. Paths equal to this value are
	// exempt from long-to-short path remapping (see remapLongToShortContainerPath).
	toolsScratchPath = "/tmp/scratch"
	// svmGlobalID is the ID used in the serviceVMs map for the global service VM when running in "global" mode.
	svmGlobalID = "_lcow_global_svm_"
	// cacheDirectory is the sub-folder under the driver's data-root used to cache blank sandbox and scratch VHDs.
	cacheDirectory = "cache"
	// scratchDirectory is the sub-folder under the driver's data-root used for scratch VHDs in service VMs.
	// Stale files here are deleted at driver init (see deletefiles).
	scratchDirectory = "scratch"
	// errOperationPending is the HRESULT returned by the HCS when the VM termination operation is still pending.
	errOperationPending syscall.Errno = 0xc0370103
)
// Driver represents an LCOW graph driver.
type Driver struct {
	dataRoot           string     // Root path on the host where we are storing everything.
	cachedSandboxFile  string     // Location of the local default-sized cached sandbox.
	cachedSandboxMutex sync.Mutex // Protects race conditions from multiple threads creating the cached sandbox.
	cachedScratchFile  string     // Location of the local cached empty scratch space.
	cachedScratchMutex sync.Mutex // Protects race conditions from multiple threads creating the cached scratch.
	options            []string   // Graphdriver options we are initialised with.
	globalMode         bool       // Indicates if running in an unsafe/global service VM mode (single shared SVM).

	// NOTE: It is OK to use a cache here because Windows does not support
	// restoring containers when the daemon dies.
	serviceVms *serviceVMMap // Map of the configs representing the service VM(s) we are running.
}
// layerDetails is the structure returned by a helper function `getLayerDetails`
// for getting information about a layer folder.
type layerDetails struct {
	filename  string // \path\to\sandbox.vhdx or \path\to\layer.vhd
	size      int64  // size of the above file in bytes
	isSandbox bool   // true if the layer is a read-write sandbox.vhdx rather than a read-only layer.vhd
}
// deletefiles is a helper function for initialisation where we delete any
// left-over scratch files in case we were previously forcibly terminated.
// It is used as the filepath.WalkFunc passed to filepath.Walk over the
// scratch directory.
func deletefiles(path string, f os.FileInfo, err error) error {
	if err != nil {
		// When Walk reports an error for this path, f may be nil, so
		// propagate the error rather than dereference it (the original
		// code would panic here on a nil FileInfo).
		return err
	}
	if strings.HasSuffix(f.Name(), ".vhdx") {
		logrus.Warnf("lcowdriver: init: deleting stale scratch file %s", path)
		return os.Remove(path)
	}
	return nil
}
// InitDriver returns a new LCOW storage driver.
//
// dataRoot is the host directory under which all LCOW state is kept: cached
// blank sandbox/scratch VHDs go under "cache", and per-SVM scratch VHDs under
// "scratch". options are the raw storage-opt values; only lcow.globalmode is
// interpreted here — the full set is retained on the driver and passed to
// opengcs when service VM configurations are generated. The two ID-mapping
// parameters are unused on Windows.
func InitDriver(dataRoot string, options []string, _, _ []idtools.IDMap) (graphdriver.Driver, error) {
	title := "lcowdriver: init:"

	cd := filepath.Join(dataRoot, cacheDirectory)
	sd := filepath.Join(dataRoot, scratchDirectory)

	d := &Driver{
		dataRoot:          dataRoot,
		options:           options,
		cachedSandboxFile: filepath.Join(cd, sandboxFilename),
		cachedScratchFile: filepath.Join(cd, scratchFilename),
		serviceVms: &serviceVMMap{
			svms: make(map[string]*serviceVMMapItem),
		},
		globalMode: false,
	}

	// Looks for relevant options
	for _, v := range options {
		opt := strings.SplitN(v, "=", 2)
		if len(opt) == 2 {
			switch strings.ToLower(opt[0]) {
			case "lcow.globalmode":
				var err error
				d.globalMode, err = strconv.ParseBool(opt[1])
				if err != nil {
					return nil, fmt.Errorf("%s failed to parse value for 'lcow.globalmode' - must be 'true' or 'false'", title)
				}
				// Note: no `break` needed — Go switch cases do not fall through.
			}
		}
	}

	// Make sure the dataRoot directory is created
	if err := idtools.MkdirAllAndChown(dataRoot, 0700, idtools.Identity{UID: 0, GID: 0}); err != nil {
		return nil, fmt.Errorf("%s failed to create '%s': %v", title, dataRoot, err)
	}

	// Make sure the cache directory is created under dataRoot
	if err := idtools.MkdirAllAndChown(cd, 0700, idtools.Identity{UID: 0, GID: 0}); err != nil {
		return nil, fmt.Errorf("%s failed to create '%s': %v", title, cd, err)
	}

	// Make sure the scratch directory is created under dataRoot
	if err := idtools.MkdirAllAndChown(sd, 0700, idtools.Identity{UID: 0, GID: 0}); err != nil {
		return nil, fmt.Errorf("%s failed to create '%s': %v", title, sd, err)
	}

	// Delete any items in the scratch directory. Cleanup is best-effort:
	// log and continue rather than fail initialisation (the original code
	// silently discarded this error).
	if err := filepath.Walk(sd, deletefiles); err != nil {
		logrus.Warnf("%s failed to clean stale scratch files in '%s': %v", title, sd, err)
	}

	logrus.Infof("%s dataRoot: %s globalMode: %t", title, dataRoot, d.globalMode)

	return d, nil
}
// getVMID returns the service-VM identifier to use for the given layer id:
// the shared global SVM id when the driver runs in global mode, otherwise
// the layer id itself (one service VM per id in safe mode).
func (d *Driver) getVMID(id string) string {
	if !d.globalMode {
		return id
	}
	return svmGlobalID
}
LCOW: Mount to short container paths to avoid command-line length limit Signed-off-by: John Howard <jhoward@microsoft.com> Fixes #36764 @johnstep PTAL. @jterry75 FYI. There are two commits in this PR. The first ensure that errors are actually returned to the caller - it was being thrown away. The second commit changes the LCOW driver to map, on a per service VM basis, "long" container paths such as `/tmp/c8fa0ae1b348f505df2707060f6a49e63280d71b83b7936935c827e2e9bde16d` to much shorter paths, based on a per-service VM counter, so something more like /tmp/d3. This means that the root cause of the failure where the mount call to create the overlay was failing due to command line length becomes something much shorter such as below. `mount -t overlay overlay -olowerdir=/tmp/d3:/tmp/d4:/tmp/d5:/tmp/d6:/tmp/d7:/tmp/d8:/tmp/d9:/tmp/d10:/tmp/d11:/tmp/d12:/tmp/d13:/tmp/d14:/tmp/d15:/tmp/d16:/tmp/d17:/tmp/d18:/tmp/d19:/tmp/d20:/tmp/d21:/tmp/d22:/tmp/d23:/tmp/d24:/tmp/d25:/tmp/d26:/tmp/d27:/tmp/d28:/tmp/d29:/tmp/d30:/tmp/d31:/tmp/d32:/tmp/d33:/tmp/d34:/tmp/d35:/tmp/d36:/tmp/d37:/tmp/d38:/tmp/d39:/tmp/d40:/tmp/d41:/tmp/d42:/tmp/d43:/tmp/d44:/tmp/d45:/tmp/d46:/tmp/d47:/tmp/d48:/tmp/d49:/tmp/d50:/tmp/d51:/tmp/d52:/tmp/d53:/tmp/d54:/tmp/d55:/tmp/d56:/tmp/d57:/tmp/d58:/tmp/d59:/tmp/d60:/tmp/d61:/tmp/d62,upperdir=/tmp/d2/upper,workdir=/tmp/d2/work /tmp/c8fa0ae1b348f505df2707060f6a49e63280d71b83b7936935c827e2e9bde16d-mount` For those worrying about overflow (which I'm sure @thaJeztah will mention...): It's safe to use a counter here as SVMs are disposable in the default configuration. The exception is when running the daemon in unsafe LCOW "global" mode (ie `--storage-opt lcow.globalmode=1`) where the SVMs aren't disposed of, but a single one is reused. 
However, to overflow the command line length, it would require several hundred-thousand trillion (conservative, I should sit down and work it out accurately if I get -really- bored) of SCSI hot-add operations, and even to hit that would be hard as just running containers normally uses the VPMEM path for the containers UVM, not to the global SVM on SCSI. It gets incremented by one per build step (commit more accurately) as a general rule. Hence it would be necessary to have to be doing automated builds without restarting the daemon for literally years on end in unsafe mode. :innocent: Note that in reality, the previous limit of ~47 layers before hitting the command line length limit is close to what is possible in the platform, at least as of RS5/Windows Server 2019 where, in the HCS v1 schema, a single SCSI controller is used, and that can only support 64 disks per controller per the Hyper-V VDEV. And remember we have one slot taken up for the SVMs scratch, and another for the containers scratch when committing a layer. So the best you can architecturally get on the platform is around the following (it's also different by 1 depending on whether in unsafe or default mode) ``` PS E:\docker\build\36764\short> docker build --no-cache . Sending build context to Docker daemon 2.048kB Step 1/4 : FROM alpine as first ---> 11cd0b38bc3c Step 2/4 : RUN echo test > /test ---> Running in 8ddfe20e5bfb Removing intermediate container 8ddfe20e5bfb ---> b0103a00b1c9 Step 3/4 : FROM alpine ---> 11cd0b38bc3c Step 4/4 : COPY --from=first /test /test ---> 54bfae391eba Successfully built 54bfae391eba PS E:\docker\build\36764\short> cd .. PS E:\docker\build\36764> docker build --no-cache . 
Sending build context to Docker daemon 4.689MB Step 1/61 : FROM alpine as first ---> 11cd0b38bc3c Step 2/61 : RUN echo test > /test ---> Running in 02597ff870db Removing intermediate container 02597ff870db ---> 3096de6fc454 Step 3/61 : RUN echo test > /test ---> Running in 9a8110f4ff19 Removing intermediate container 9a8110f4ff19 ---> 7691808cf28e Step 4/61 : RUN echo test > /test ---> Running in 9afb8f51510b Removing intermediate container 9afb8f51510b ---> e42a0df2bb1c Step 5/61 : RUN echo test > /test ---> Running in fe977ed6804e Removing intermediate container fe977ed6804e ---> 55850c9b0479 Step 6/61 : RUN echo test > /test ---> Running in be65cbfad172 Removing intermediate container be65cbfad172 ---> 0cf8acba70f0 Step 7/61 : RUN echo test > /test ---> Running in fd5b0907b6a9 Removing intermediate container fd5b0907b6a9 ---> 257a4493d85d Step 8/61 : RUN echo test > /test ---> Running in f7ca0ffd9076 Removing intermediate container f7ca0ffd9076 ---> 3baa6f4fa2d5 Step 9/61 : RUN echo test > /test ---> Running in 5146814d4727 Removing intermediate container 5146814d4727 ---> 485b9d5cf228 Step 10/61 : RUN echo test > /test ---> Running in a090eec1b743 Removing intermediate container a090eec1b743 ---> a7eb10155b51 Step 11/61 : RUN echo test > /test ---> Running in 942660b288df Removing intermediate container 942660b288df ---> 9d286a1e2133 Step 12/61 : RUN echo test > /test ---> Running in c3d369aa91df Removing intermediate container c3d369aa91df ---> f78be4788992 Step 13/61 : RUN echo test > /test ---> Running in a03c3ac6888f Removing intermediate container a03c3ac6888f ---> 6504363f61ab Step 14/61 : RUN echo test > /test ---> Running in 0c3c2fca3f90 Removing intermediate container 0c3c2fca3f90 ---> fe3448b8bb29 Step 15/61 : RUN echo test > /test ---> Running in 828d51c76d3b Removing intermediate container 828d51c76d3b ---> 870684e3aea0 Step 16/61 : RUN echo test > /test ---> Running in 59a2f7c5f3ad Removing intermediate container 59a2f7c5f3ad ---> cf84556ca5c0 Step 
17/61 : RUN echo test > /test ---> Running in bfb4e088eeb3 Removing intermediate container bfb4e088eeb3 ---> 9c8f9f652cef Step 18/61 : RUN echo test > /test ---> Running in f1b88bb5a2d7 Removing intermediate container f1b88bb5a2d7 ---> a6233ad21648 Step 19/61 : RUN echo test > /test ---> Running in 45f70577d709 Removing intermediate container 45f70577d709 ---> 1b5cc52d370d Step 20/61 : RUN echo test > /test ---> Running in 2ce231d5043d Removing intermediate container 2ce231d5043d ---> 4a0e17cbebaa Step 21/61 : RUN echo test > /test ---> Running in 52e4b0928f1f Removing intermediate container 52e4b0928f1f ---> 99b50e989bcb Step 22/61 : RUN echo test > /test ---> Running in f7ba3da7460d Removing intermediate container f7ba3da7460d ---> bfa3cad88285 Step 23/61 : RUN echo test > /test ---> Running in 60180bf60f88 Removing intermediate container 60180bf60f88 ---> fe7271988bcb Step 24/61 : RUN echo test > /test ---> Running in 20324d396531 Removing intermediate container 20324d396531 ---> e930bc039128 Step 25/61 : RUN echo test > /test ---> Running in b3ac70fd4404 Removing intermediate container b3ac70fd4404 ---> 39d0a11ea6d8 Step 26/61 : RUN echo test > /test ---> Running in 0193267d3787 Removing intermediate container 0193267d3787 ---> 8062d7aab0a5 Step 27/61 : RUN echo test > /test ---> Running in f41f45fb7985 Removing intermediate container f41f45fb7985 ---> 1f5f18f2315b Step 28/61 : RUN echo test > /test ---> Running in 90dd09c63d6e Removing intermediate container 90dd09c63d6e ---> 02f0a1141f11 Step 29/61 : RUN echo test > /test ---> Running in c557e5386e0a Removing intermediate container c557e5386e0a ---> dbcd6fb1f6f4 Step 30/61 : RUN echo test > /test ---> Running in 65369385d855 Removing intermediate container 65369385d855 ---> e6e9058a0650 Step 31/61 : RUN echo test > /test ---> Running in d861fcc388fd Removing intermediate container d861fcc388fd ---> 6e4c2c0f741f Step 32/61 : RUN echo test > /test ---> Running in 1483962b7e1c Removing intermediate container 
1483962b7e1c ---> cf8f142aa055 Step 33/61 : RUN echo test > /test ---> Running in 5868934816c1 Removing intermediate container 5868934816c1 ---> d5ff87cdc204 Step 34/61 : RUN echo test > /test ---> Running in e057f3201f3a Removing intermediate container e057f3201f3a ---> b4031b7ab4ac Step 35/61 : RUN echo test > /test ---> Running in 22b769b9079c Removing intermediate container 22b769b9079c ---> 019d898510b6 Step 36/61 : RUN echo test > /test ---> Running in f1d364ef4ff8 Removing intermediate container f1d364ef4ff8 ---> 9525cafdf04d Step 37/61 : RUN echo test > /test ---> Running in 5bf505b8bdcc Removing intermediate container 5bf505b8bdcc ---> cd5002b33bfd Step 38/61 : RUN echo test > /test ---> Running in be24a921945c Removing intermediate container be24a921945c ---> 8675db44d1b7 Step 39/61 : RUN echo test > /test ---> Running in 352dc6beef3d Removing intermediate container 352dc6beef3d ---> 0ab0ece43c71 Step 40/61 : RUN echo test > /test ---> Running in eebde33e5d9b Removing intermediate container eebde33e5d9b ---> 46ca4b0dfc03 Step 41/61 : RUN echo test > /test ---> Running in f920313a1e85 Removing intermediate container f920313a1e85 ---> 7f3888414d58 Step 42/61 : RUN echo test > /test ---> Running in 10e2f4dc1ac7 Removing intermediate container 10e2f4dc1ac7 ---> 14db9e15f2dc Step 43/61 : RUN echo test > /test ---> Running in c849d6e89aa5 Removing intermediate container c849d6e89aa5 ---> fdb770494dd6 Step 44/61 : RUN echo test > /test ---> Running in 419d1a8353db Removing intermediate container 419d1a8353db ---> d12e9cf078be Step 45/61 : RUN echo test > /test ---> Running in 0f1805263e4c Removing intermediate container 0f1805263e4c ---> cd005e7b08a4 Step 46/61 : RUN echo test > /test ---> Running in 5bde05b46441 Removing intermediate container 5bde05b46441 ---> 05aa426a3d4a Step 47/61 : RUN echo test > /test ---> Running in 01ebc84bd1bc Removing intermediate container 01ebc84bd1bc ---> 35d371fa4342 Step 48/61 : RUN echo test > /test ---> Running in 49f6c2f51dd4 
Removing intermediate container 49f6c2f51dd4 ---> 1090b5dfa130 Step 49/61 : RUN echo test > /test ---> Running in f8a9089cd725 Removing intermediate container f8a9089cd725 ---> b2d0eec0716d Step 50/61 : RUN echo test > /test ---> Running in a1697a0b2db0 Removing intermediate container a1697a0b2db0 ---> 10d96ac8f497 Step 51/61 : RUN echo test > /test ---> Running in 33a2332c06eb Removing intermediate container 33a2332c06eb ---> ba5bf5609c1c Step 52/61 : RUN echo test > /test ---> Running in e8920392be0d Removing intermediate container e8920392be0d ---> 5b3a95685c7e Step 53/61 : RUN echo test > /test ---> Running in 4b9298587c65 Removing intermediate container 4b9298587c65 ---> d4961a349141 Step 54/61 : RUN echo test > /test ---> Running in 8a0c960c2ba1 Removing intermediate container 8a0c960c2ba1 ---> b413197fcfa2 Step 55/61 : RUN echo test > /test ---> Running in 536ee3b9596b Removing intermediate container 536ee3b9596b ---> fc16b69b224a Step 56/61 : RUN echo test > /test ---> Running in 8b817b8d7b59 Removing intermediate container 8b817b8d7b59 ---> 2f0896400ff9 Step 57/61 : RUN echo test > /test ---> Running in ab0ed79ec3d4 Removing intermediate container ab0ed79ec3d4 ---> b4fb420e736c Step 58/61 : RUN echo test > /test ---> Running in 8548d7eead1f Removing intermediate container 8548d7eead1f ---> 745103fd5a38 Step 59/61 : RUN echo test > /test ---> Running in 1980559ad5d6 Removing intermediate container 1980559ad5d6 ---> 08c1c74a5618 Step 60/61 : FROM alpine ---> 11cd0b38bc3c Step 61/61 : COPY --from=first /test /test ---> 67f053c66c27 Successfully built 67f053c66c27 PS E:\docker\build\36764> ``` Note also that subsequent error messages once you go beyond current platform limitations kind of suck (such as insufficient resources with a bunch of spew which is incomprehensible to most) and we could do better to detect this earlier in the daemon. That'll be for a (reasonably low-priority) follow-up though as and when I have time. 
Theoretically we *may*, if the platform doesn't require additional changes for RS5, be able to have bigger platform limits using the v2 schema with up to 127 VPMem devices, and the possibility to have multiple SCSI controllers per SVM/UVM. However, currently LCOW is using HCS v1 schema calls, and there's no plans to rewrite the graphdriver/libcontainerd components outside of the moving LCOW fully over to the containerd runtime/snapshotter using HCS v2 schema, which is still some time off fruition. PS OK, while waiting for a full run to complete, I did get bored. Turns out it won't overflow line length as max(uint64) is 18446744073709551616 which would still be short enough at 127 layers, double the current platform limit. And I could always change it to hex or base36 to make it even shorter, or remove the 'd' from /tmp/dN. IOW, pretty sure no-one is going to hit the limit even if we could get the platform to 256 which is the current Hyper-V SCSI limit per VM (4x64), although PMEM at 127 would be the next immediate limit.
2018-08-15 20:56:28 +00:00
// remapLongToShortContainerPath does the mapping of a long container path for a
// SCSI attached disk, to a short container path where it's actually mounted.
// Empty paths and the SVM tools scratch path are returned unchanged; every
// other path is replaced by /tmp/d<attachCounter>.
func remapLongToShortContainerPath(longContainerPath string, attachCounter uint64, svmName string) string {
	switch longContainerPath {
	case "", toolsScratchPath:
		// Nothing to remap for these.
		return longContainerPath
	}
	remapped := fmt.Sprintf("/tmp/d%d", attachCounter)
	logrus.Debugf("lcowdriver: UVM %s: remapping %s --> %s", svmName, longContainerPath, remapped)
	return remapped
}
// startServiceVMIfNotRunning starts a service utility VM if it is not currently running.
// It can optionally be started with a mapped virtual disk. Returns a opengcs config structure
// representing the VM.
func (d *Driver) startServiceVMIfNotRunning(id string, mvdToAdd []hcsshim.MappedVirtualDisk, context string) (_ *serviceVM, err error) {
// Use the global ID if in global mode
id = d.getVMID(id)
title := "lcowdriver: startServiceVMIfNotRunning " + id
// Attempt to add ID to the service vm map
logrus.Debugf("%s: adding entry to service vm map", title)
svm, exists, err := d.serviceVms.add(id)
if err != nil && err == errVMisTerminating {
// VM is in the process of terminating. Wait until it's done and then try again
logrus.Debugf("%s: VM with current ID still in the process of terminating", title)
if err := svm.getStopError(); err != nil {
logrus.Debugf("%s: VM did not stop successfully: %s", title, err)
return nil, err
}
return d.startServiceVMIfNotRunning(id, mvdToAdd, context)
} else if err != nil {
logrus.Debugf("%s: failed to add service vm to map: %s", title, err)
return nil, fmt.Errorf("%s: failed to add to service vm map: %s", title, err)
}
if exists {
// Service VM is already up and running. In this case, just hot add the vhds.
LCOW: Mount to short container paths to avoid command-line length limit Signed-off-by: John Howard <jhoward@microsoft.com> Fixes #36764 @johnstep PTAL. @jterry75 FYI. There are two commits in this PR. The first ensure that errors are actually returned to the caller - it was being thrown away. The second commit changes the LCOW driver to map, on a per service VM basis, "long" container paths such as `/tmp/c8fa0ae1b348f505df2707060f6a49e63280d71b83b7936935c827e2e9bde16d` to much shorter paths, based on a per-service VM counter, so something more like /tmp/d3. This means that the root cause of the failure where the mount call to create the overlay was failing due to command line length becomes something much shorter such as below. `mount -t overlay overlay -olowerdir=/tmp/d3:/tmp/d4:/tmp/d5:/tmp/d6:/tmp/d7:/tmp/d8:/tmp/d9:/tmp/d10:/tmp/d11:/tmp/d12:/tmp/d13:/tmp/d14:/tmp/d15:/tmp/d16:/tmp/d17:/tmp/d18:/tmp/d19:/tmp/d20:/tmp/d21:/tmp/d22:/tmp/d23:/tmp/d24:/tmp/d25:/tmp/d26:/tmp/d27:/tmp/d28:/tmp/d29:/tmp/d30:/tmp/d31:/tmp/d32:/tmp/d33:/tmp/d34:/tmp/d35:/tmp/d36:/tmp/d37:/tmp/d38:/tmp/d39:/tmp/d40:/tmp/d41:/tmp/d42:/tmp/d43:/tmp/d44:/tmp/d45:/tmp/d46:/tmp/d47:/tmp/d48:/tmp/d49:/tmp/d50:/tmp/d51:/tmp/d52:/tmp/d53:/tmp/d54:/tmp/d55:/tmp/d56:/tmp/d57:/tmp/d58:/tmp/d59:/tmp/d60:/tmp/d61:/tmp/d62,upperdir=/tmp/d2/upper,workdir=/tmp/d2/work /tmp/c8fa0ae1b348f505df2707060f6a49e63280d71b83b7936935c827e2e9bde16d-mount` For those worrying about overflow (which I'm sure @thaJeztah will mention...): It's safe to use a counter here as SVMs are disposable in the default configuration. The exception is when running the daemon in unsafe LCOW "global" mode (ie `--storage-opt lcow.globalmode=1`) where the SVMs aren't disposed of, but a single one is reused. 
However, to overflow the command line length, it would require several hundred-thousand trillion (conservative, I should sit down and work it out accurately if I get -really- bored) of SCSI hot-add operations, and even to hit that would be hard as just running containers normally uses the VPMEM path for the containers UVM, not to the global SVM on SCSI. It gets incremented by one per build step (commit more accurately) as a general rule. Hence it would be necessary to have to be doing automated builds without restarting the daemon for literally years on end in unsafe mode. :innocent: Note that in reality, the previous limit of ~47 layers before hitting the command line length limit is close to what is possible in the platform, at least as of RS5/Windows Server 2019 where, in the HCS v1 schema, a single SCSI controller is used, and that can only support 64 disks per controller per the Hyper-V VDEV. And remember we have one slot taken up for the SVMs scratch, and another for the containers scratch when committing a layer. So the best you can architecturally get on the platform is around the following (it's also different by 1 depending on whether in unsafe or default mode) ``` PS E:\docker\build\36764\short> docker build --no-cache . Sending build context to Docker daemon 2.048kB Step 1/4 : FROM alpine as first ---> 11cd0b38bc3c Step 2/4 : RUN echo test > /test ---> Running in 8ddfe20e5bfb Removing intermediate container 8ddfe20e5bfb ---> b0103a00b1c9 Step 3/4 : FROM alpine ---> 11cd0b38bc3c Step 4/4 : COPY --from=first /test /test ---> 54bfae391eba Successfully built 54bfae391eba PS E:\docker\build\36764\short> cd .. PS E:\docker\build\36764> docker build --no-cache . 
Sending build context to Docker daemon 4.689MB Step 1/61 : FROM alpine as first ---> 11cd0b38bc3c Step 2/61 : RUN echo test > /test ---> Running in 02597ff870db Removing intermediate container 02597ff870db ---> 3096de6fc454 Step 3/61 : RUN echo test > /test ---> Running in 9a8110f4ff19 Removing intermediate container 9a8110f4ff19 ---> 7691808cf28e Step 4/61 : RUN echo test > /test ---> Running in 9afb8f51510b Removing intermediate container 9afb8f51510b ---> e42a0df2bb1c Step 5/61 : RUN echo test > /test ---> Running in fe977ed6804e Removing intermediate container fe977ed6804e ---> 55850c9b0479 Step 6/61 : RUN echo test > /test ---> Running in be65cbfad172 Removing intermediate container be65cbfad172 ---> 0cf8acba70f0 Step 7/61 : RUN echo test > /test ---> Running in fd5b0907b6a9 Removing intermediate container fd5b0907b6a9 ---> 257a4493d85d Step 8/61 : RUN echo test > /test ---> Running in f7ca0ffd9076 Removing intermediate container f7ca0ffd9076 ---> 3baa6f4fa2d5 Step 9/61 : RUN echo test > /test ---> Running in 5146814d4727 Removing intermediate container 5146814d4727 ---> 485b9d5cf228 Step 10/61 : RUN echo test > /test ---> Running in a090eec1b743 Removing intermediate container a090eec1b743 ---> a7eb10155b51 Step 11/61 : RUN echo test > /test ---> Running in 942660b288df Removing intermediate container 942660b288df ---> 9d286a1e2133 Step 12/61 : RUN echo test > /test ---> Running in c3d369aa91df Removing intermediate container c3d369aa91df ---> f78be4788992 Step 13/61 : RUN echo test > /test ---> Running in a03c3ac6888f Removing intermediate container a03c3ac6888f ---> 6504363f61ab Step 14/61 : RUN echo test > /test ---> Running in 0c3c2fca3f90 Removing intermediate container 0c3c2fca3f90 ---> fe3448b8bb29 Step 15/61 : RUN echo test > /test ---> Running in 828d51c76d3b Removing intermediate container 828d51c76d3b ---> 870684e3aea0 Step 16/61 : RUN echo test > /test ---> Running in 59a2f7c5f3ad Removing intermediate container 59a2f7c5f3ad ---> cf84556ca5c0 Step 
17/61 : RUN echo test > /test ---> Running in bfb4e088eeb3 Removing intermediate container bfb4e088eeb3 ---> 9c8f9f652cef Step 18/61 : RUN echo test > /test ---> Running in f1b88bb5a2d7 Removing intermediate container f1b88bb5a2d7 ---> a6233ad21648 Step 19/61 : RUN echo test > /test ---> Running in 45f70577d709 Removing intermediate container 45f70577d709 ---> 1b5cc52d370d Step 20/61 : RUN echo test > /test ---> Running in 2ce231d5043d Removing intermediate container 2ce231d5043d ---> 4a0e17cbebaa Step 21/61 : RUN echo test > /test ---> Running in 52e4b0928f1f Removing intermediate container 52e4b0928f1f ---> 99b50e989bcb Step 22/61 : RUN echo test > /test ---> Running in f7ba3da7460d Removing intermediate container f7ba3da7460d ---> bfa3cad88285 Step 23/61 : RUN echo test > /test ---> Running in 60180bf60f88 Removing intermediate container 60180bf60f88 ---> fe7271988bcb Step 24/61 : RUN echo test > /test ---> Running in 20324d396531 Removing intermediate container 20324d396531 ---> e930bc039128 Step 25/61 : RUN echo test > /test ---> Running in b3ac70fd4404 Removing intermediate container b3ac70fd4404 ---> 39d0a11ea6d8 Step 26/61 : RUN echo test > /test ---> Running in 0193267d3787 Removing intermediate container 0193267d3787 ---> 8062d7aab0a5 Step 27/61 : RUN echo test > /test ---> Running in f41f45fb7985 Removing intermediate container f41f45fb7985 ---> 1f5f18f2315b Step 28/61 : RUN echo test > /test ---> Running in 90dd09c63d6e Removing intermediate container 90dd09c63d6e ---> 02f0a1141f11 Step 29/61 : RUN echo test > /test ---> Running in c557e5386e0a Removing intermediate container c557e5386e0a ---> dbcd6fb1f6f4 Step 30/61 : RUN echo test > /test ---> Running in 65369385d855 Removing intermediate container 65369385d855 ---> e6e9058a0650 Step 31/61 : RUN echo test > /test ---> Running in d861fcc388fd Removing intermediate container d861fcc388fd ---> 6e4c2c0f741f Step 32/61 : RUN echo test > /test ---> Running in 1483962b7e1c Removing intermediate container 
1483962b7e1c ---> cf8f142aa055 Step 33/61 : RUN echo test > /test ---> Running in 5868934816c1 Removing intermediate container 5868934816c1 ---> d5ff87cdc204 Step 34/61 : RUN echo test > /test ---> Running in e057f3201f3a Removing intermediate container e057f3201f3a ---> b4031b7ab4ac Step 35/61 : RUN echo test > /test ---> Running in 22b769b9079c Removing intermediate container 22b769b9079c ---> 019d898510b6 Step 36/61 : RUN echo test > /test ---> Running in f1d364ef4ff8 Removing intermediate container f1d364ef4ff8 ---> 9525cafdf04d Step 37/61 : RUN echo test > /test ---> Running in 5bf505b8bdcc Removing intermediate container 5bf505b8bdcc ---> cd5002b33bfd Step 38/61 : RUN echo test > /test ---> Running in be24a921945c Removing intermediate container be24a921945c ---> 8675db44d1b7 Step 39/61 : RUN echo test > /test ---> Running in 352dc6beef3d Removing intermediate container 352dc6beef3d ---> 0ab0ece43c71 Step 40/61 : RUN echo test > /test ---> Running in eebde33e5d9b Removing intermediate container eebde33e5d9b ---> 46ca4b0dfc03 Step 41/61 : RUN echo test > /test ---> Running in f920313a1e85 Removing intermediate container f920313a1e85 ---> 7f3888414d58 Step 42/61 : RUN echo test > /test ---> Running in 10e2f4dc1ac7 Removing intermediate container 10e2f4dc1ac7 ---> 14db9e15f2dc Step 43/61 : RUN echo test > /test ---> Running in c849d6e89aa5 Removing intermediate container c849d6e89aa5 ---> fdb770494dd6 Step 44/61 : RUN echo test > /test ---> Running in 419d1a8353db Removing intermediate container 419d1a8353db ---> d12e9cf078be Step 45/61 : RUN echo test > /test ---> Running in 0f1805263e4c Removing intermediate container 0f1805263e4c ---> cd005e7b08a4 Step 46/61 : RUN echo test > /test ---> Running in 5bde05b46441 Removing intermediate container 5bde05b46441 ---> 05aa426a3d4a Step 47/61 : RUN echo test > /test ---> Running in 01ebc84bd1bc Removing intermediate container 01ebc84bd1bc ---> 35d371fa4342 Step 48/61 : RUN echo test > /test ---> Running in 49f6c2f51dd4 
Removing intermediate container 49f6c2f51dd4 ---> 1090b5dfa130 Step 49/61 : RUN echo test > /test ---> Running in f8a9089cd725 Removing intermediate container f8a9089cd725 ---> b2d0eec0716d Step 50/61 : RUN echo test > /test ---> Running in a1697a0b2db0 Removing intermediate container a1697a0b2db0 ---> 10d96ac8f497 Step 51/61 : RUN echo test > /test ---> Running in 33a2332c06eb Removing intermediate container 33a2332c06eb ---> ba5bf5609c1c Step 52/61 : RUN echo test > /test ---> Running in e8920392be0d Removing intermediate container e8920392be0d ---> 5b3a95685c7e Step 53/61 : RUN echo test > /test ---> Running in 4b9298587c65 Removing intermediate container 4b9298587c65 ---> d4961a349141 Step 54/61 : RUN echo test > /test ---> Running in 8a0c960c2ba1 Removing intermediate container 8a0c960c2ba1 ---> b413197fcfa2 Step 55/61 : RUN echo test > /test ---> Running in 536ee3b9596b Removing intermediate container 536ee3b9596b ---> fc16b69b224a Step 56/61 : RUN echo test > /test ---> Running in 8b817b8d7b59 Removing intermediate container 8b817b8d7b59 ---> 2f0896400ff9 Step 57/61 : RUN echo test > /test ---> Running in ab0ed79ec3d4 Removing intermediate container ab0ed79ec3d4 ---> b4fb420e736c Step 58/61 : RUN echo test > /test ---> Running in 8548d7eead1f Removing intermediate container 8548d7eead1f ---> 745103fd5a38 Step 59/61 : RUN echo test > /test ---> Running in 1980559ad5d6 Removing intermediate container 1980559ad5d6 ---> 08c1c74a5618 Step 60/61 : FROM alpine ---> 11cd0b38bc3c Step 61/61 : COPY --from=first /test /test ---> 67f053c66c27 Successfully built 67f053c66c27 PS E:\docker\build\36764> ``` Note also that subsequent error messages once you go beyond current platform limitations kind of suck (such as insufficient resources with a bunch of spew which is incomprehensible to most) and we could do better to detect this earlier in the daemon. That'll be for a (reasonably low-priority) follow-up though as and when I have time. 
Theoretically we *may*, if the platform doesn't require additional changes for RS5, be able to have bigger platform limits using the v2 schema with up to 127 VPMem devices, and the possibility to have multiple SCSI controllers per SVM/UVM. However, currently LCOW is using HCS v1 schema calls, and there's no plans to rewrite the graphdriver/libcontainerd components outside of the moving LCOW fully over to the containerd runtime/snapshotter using HCS v2 schema, which is still some time off fruition. PS OK, while waiting for a full run to complete, I did get bored. Turns out it won't overflow line length as max(uint64) is 18446744073709551616 which would still be short enough at 127 layers, double the current platform limit. And I could always change it to hex or base36 to make it even shorter, or remove the 'd' from /tmp/dN. IOW, pretty sure no-one is going to hit the limit even if we could get the platform to 256 which is the current Hyper-V SCSI limit per VM (4x64), although PMEM at 127 would be the next immediate limit.
2018-08-15 20:56:28 +00:00
// Note that hotAddVHDs will remap long to short container paths, so no need
// for us to do that here.
logrus.Debugf("%s: service vm already exists. Just hot adding: %+v", title, mvdToAdd)
if err := svm.hotAddVHDs(mvdToAdd...); err != nil {
logrus.Debugf("%s: failed to hot add vhds on service vm creation: %s", title, err)
return nil, fmt.Errorf("%s: failed to hot add vhds on service vm: %s", title, err)
}
return svm, nil
}
// We are the first service for this id, so we need to start it
logrus.Debugf("%s: service vm doesn't exist. Now starting it up", title)
defer func() {
// Signal that start has finished, passing in the error if any.
svm.signalStartFinished(err)
if err != nil {
// We added a ref to the VM, since we failed, we should delete the ref.
d.terminateServiceVM(id, "error path on startServiceVMIfNotRunning", false)
}
}()
// Generate a default configuration
if err := svm.config.GenerateDefault(d.options); err != nil {
return nil, fmt.Errorf("%s: failed to generate default gogcs configuration for global svm (%s): %s", title, context, err)
}
// For the name, we deliberately suffix if safe-mode to ensure that it doesn't
// clash with another utility VM which may be running for the container itself.
// This also makes it easier to correlate through Get-ComputeProcess.
if id == svmGlobalID {
svm.config.Name = svmGlobalID
} else {
svm.config.Name = fmt.Sprintf("%s_svm", id)
}
// Ensure we take the cached scratch mutex around the check to ensure the file is complete
// and not in the process of being created by another thread.
scratchTargetFile := filepath.Join(d.dataRoot, scratchDirectory, fmt.Sprintf("%s.vhdx", id))
logrus.Debugf("%s: locking cachedScratchMutex", title)
d.cachedScratchMutex.Lock()
if _, err := os.Stat(d.cachedScratchFile); err == nil {
// Make a copy of cached scratch to the scratch directory
logrus.Debugf("%s: (%s) cloning cached scratch for mvd", title, context)
if err := client.CopyFile(d.cachedScratchFile, scratchTargetFile, true); err != nil {
logrus.Debugf("%s: releasing cachedScratchMutex on err: %s", title, err)
d.cachedScratchMutex.Unlock()
return nil, err
}
// Add the cached clone as a mapped virtual disk
logrus.Debugf("%s: (%s) adding cloned scratch as mvd", title, context)
mvd := hcsshim.MappedVirtualDisk{
HostPath: scratchTargetFile,
ContainerPath: toolsScratchPath,
CreateInUtilityVM: true,
}
svm.config.MappedVirtualDisks = append(svm.config.MappedVirtualDisks, mvd)
svm.scratchAttached = true
}
logrus.Debugf("%s: releasing cachedScratchMutex", title)
d.cachedScratchMutex.Unlock()
LCOW: Mount to short container paths to avoid command-line length limit Signed-off-by: John Howard <jhoward@microsoft.com> Fixes #36764 @johnstep PTAL. @jterry75 FYI. There are two commits in this PR. The first ensure that errors are actually returned to the caller - it was being thrown away. The second commit changes the LCOW driver to map, on a per service VM basis, "long" container paths such as `/tmp/c8fa0ae1b348f505df2707060f6a49e63280d71b83b7936935c827e2e9bde16d` to much shorter paths, based on a per-service VM counter, so something more like /tmp/d3. This means that the root cause of the failure where the mount call to create the overlay was failing due to command line length becomes something much shorter such as below. `mount -t overlay overlay -olowerdir=/tmp/d3:/tmp/d4:/tmp/d5:/tmp/d6:/tmp/d7:/tmp/d8:/tmp/d9:/tmp/d10:/tmp/d11:/tmp/d12:/tmp/d13:/tmp/d14:/tmp/d15:/tmp/d16:/tmp/d17:/tmp/d18:/tmp/d19:/tmp/d20:/tmp/d21:/tmp/d22:/tmp/d23:/tmp/d24:/tmp/d25:/tmp/d26:/tmp/d27:/tmp/d28:/tmp/d29:/tmp/d30:/tmp/d31:/tmp/d32:/tmp/d33:/tmp/d34:/tmp/d35:/tmp/d36:/tmp/d37:/tmp/d38:/tmp/d39:/tmp/d40:/tmp/d41:/tmp/d42:/tmp/d43:/tmp/d44:/tmp/d45:/tmp/d46:/tmp/d47:/tmp/d48:/tmp/d49:/tmp/d50:/tmp/d51:/tmp/d52:/tmp/d53:/tmp/d54:/tmp/d55:/tmp/d56:/tmp/d57:/tmp/d58:/tmp/d59:/tmp/d60:/tmp/d61:/tmp/d62,upperdir=/tmp/d2/upper,workdir=/tmp/d2/work /tmp/c8fa0ae1b348f505df2707060f6a49e63280d71b83b7936935c827e2e9bde16d-mount` For those worrying about overflow (which I'm sure @thaJeztah will mention...): It's safe to use a counter here as SVMs are disposable in the default configuration. The exception is when running the daemon in unsafe LCOW "global" mode (ie `--storage-opt lcow.globalmode=1`) where the SVMs aren't disposed of, but a single one is reused. 
However, to overflow the command line length, it would require several hundred-thousand trillion (conservative, I should sit down and work it out accurately if I get -really- bored) of SCSI hot-add operations, and even to hit that would be hard as just running containers normally uses the VPMEM path for the containers UVM, not to the global SVM on SCSI. It gets incremented by one per build step (commit more accurately) as a general rule. Hence it would be necessary to have to be doing automated builds without restarting the daemon for literally years on end in unsafe mode. :innocent: Note that in reality, the previous limit of ~47 layers before hitting the command line length limit is close to what is possible in the platform, at least as of RS5/Windows Server 2019 where, in the HCS v1 schema, a single SCSI controller is used, and that can only support 64 disks per controller per the Hyper-V VDEV. And remember we have one slot taken up for the SVMs scratch, and another for the containers scratch when committing a layer. So the best you can architecturally get on the platform is around the following (it's also different by 1 depending on whether in unsafe or default mode) ``` PS E:\docker\build\36764\short> docker build --no-cache . Sending build context to Docker daemon 2.048kB Step 1/4 : FROM alpine as first ---> 11cd0b38bc3c Step 2/4 : RUN echo test > /test ---> Running in 8ddfe20e5bfb Removing intermediate container 8ddfe20e5bfb ---> b0103a00b1c9 Step 3/4 : FROM alpine ---> 11cd0b38bc3c Step 4/4 : COPY --from=first /test /test ---> 54bfae391eba Successfully built 54bfae391eba PS E:\docker\build\36764\short> cd .. PS E:\docker\build\36764> docker build --no-cache . 
Sending build context to Docker daemon 4.689MB Step 1/61 : FROM alpine as first ---> 11cd0b38bc3c Step 2/61 : RUN echo test > /test ---> Running in 02597ff870db Removing intermediate container 02597ff870db ---> 3096de6fc454 Step 3/61 : RUN echo test > /test ---> Running in 9a8110f4ff19 Removing intermediate container 9a8110f4ff19 ---> 7691808cf28e Step 4/61 : RUN echo test > /test ---> Running in 9afb8f51510b Removing intermediate container 9afb8f51510b ---> e42a0df2bb1c Step 5/61 : RUN echo test > /test ---> Running in fe977ed6804e Removing intermediate container fe977ed6804e ---> 55850c9b0479 Step 6/61 : RUN echo test > /test ---> Running in be65cbfad172 Removing intermediate container be65cbfad172 ---> 0cf8acba70f0 Step 7/61 : RUN echo test > /test ---> Running in fd5b0907b6a9 Removing intermediate container fd5b0907b6a9 ---> 257a4493d85d Step 8/61 : RUN echo test > /test ---> Running in f7ca0ffd9076 Removing intermediate container f7ca0ffd9076 ---> 3baa6f4fa2d5 Step 9/61 : RUN echo test > /test ---> Running in 5146814d4727 Removing intermediate container 5146814d4727 ---> 485b9d5cf228 Step 10/61 : RUN echo test > /test ---> Running in a090eec1b743 Removing intermediate container a090eec1b743 ---> a7eb10155b51 Step 11/61 : RUN echo test > /test ---> Running in 942660b288df Removing intermediate container 942660b288df ---> 9d286a1e2133 Step 12/61 : RUN echo test > /test ---> Running in c3d369aa91df Removing intermediate container c3d369aa91df ---> f78be4788992 Step 13/61 : RUN echo test > /test ---> Running in a03c3ac6888f Removing intermediate container a03c3ac6888f ---> 6504363f61ab Step 14/61 : RUN echo test > /test ---> Running in 0c3c2fca3f90 Removing intermediate container 0c3c2fca3f90 ---> fe3448b8bb29 Step 15/61 : RUN echo test > /test ---> Running in 828d51c76d3b Removing intermediate container 828d51c76d3b ---> 870684e3aea0 Step 16/61 : RUN echo test > /test ---> Running in 59a2f7c5f3ad Removing intermediate container 59a2f7c5f3ad ---> cf84556ca5c0 Step 
17/61 : RUN echo test > /test ---> Running in bfb4e088eeb3 Removing intermediate container bfb4e088eeb3 ---> 9c8f9f652cef Step 18/61 : RUN echo test > /test ---> Running in f1b88bb5a2d7 Removing intermediate container f1b88bb5a2d7 ---> a6233ad21648 Step 19/61 : RUN echo test > /test ---> Running in 45f70577d709 Removing intermediate container 45f70577d709 ---> 1b5cc52d370d Step 20/61 : RUN echo test > /test ---> Running in 2ce231d5043d Removing intermediate container 2ce231d5043d ---> 4a0e17cbebaa Step 21/61 : RUN echo test > /test ---> Running in 52e4b0928f1f Removing intermediate container 52e4b0928f1f ---> 99b50e989bcb Step 22/61 : RUN echo test > /test ---> Running in f7ba3da7460d Removing intermediate container f7ba3da7460d ---> bfa3cad88285 Step 23/61 : RUN echo test > /test ---> Running in 60180bf60f88 Removing intermediate container 60180bf60f88 ---> fe7271988bcb Step 24/61 : RUN echo test > /test ---> Running in 20324d396531 Removing intermediate container 20324d396531 ---> e930bc039128 Step 25/61 : RUN echo test > /test ---> Running in b3ac70fd4404 Removing intermediate container b3ac70fd4404 ---> 39d0a11ea6d8 Step 26/61 : RUN echo test > /test ---> Running in 0193267d3787 Removing intermediate container 0193267d3787 ---> 8062d7aab0a5 Step 27/61 : RUN echo test > /test ---> Running in f41f45fb7985 Removing intermediate container f41f45fb7985 ---> 1f5f18f2315b Step 28/61 : RUN echo test > /test ---> Running in 90dd09c63d6e Removing intermediate container 90dd09c63d6e ---> 02f0a1141f11 Step 29/61 : RUN echo test > /test ---> Running in c557e5386e0a Removing intermediate container c557e5386e0a ---> dbcd6fb1f6f4 Step 30/61 : RUN echo test > /test ---> Running in 65369385d855 Removing intermediate container 65369385d855 ---> e6e9058a0650 Step 31/61 : RUN echo test > /test ---> Running in d861fcc388fd Removing intermediate container d861fcc388fd ---> 6e4c2c0f741f Step 32/61 : RUN echo test > /test ---> Running in 1483962b7e1c Removing intermediate container 
1483962b7e1c ---> cf8f142aa055 Step 33/61 : RUN echo test > /test ---> Running in 5868934816c1 Removing intermediate container 5868934816c1 ---> d5ff87cdc204 Step 34/61 : RUN echo test > /test ---> Running in e057f3201f3a Removing intermediate container e057f3201f3a ---> b4031b7ab4ac Step 35/61 : RUN echo test > /test ---> Running in 22b769b9079c Removing intermediate container 22b769b9079c ---> 019d898510b6 Step 36/61 : RUN echo test > /test ---> Running in f1d364ef4ff8 Removing intermediate container f1d364ef4ff8 ---> 9525cafdf04d Step 37/61 : RUN echo test > /test ---> Running in 5bf505b8bdcc Removing intermediate container 5bf505b8bdcc ---> cd5002b33bfd Step 38/61 : RUN echo test > /test ---> Running in be24a921945c Removing intermediate container be24a921945c ---> 8675db44d1b7 Step 39/61 : RUN echo test > /test ---> Running in 352dc6beef3d Removing intermediate container 352dc6beef3d ---> 0ab0ece43c71 Step 40/61 : RUN echo test > /test ---> Running in eebde33e5d9b Removing intermediate container eebde33e5d9b ---> 46ca4b0dfc03 Step 41/61 : RUN echo test > /test ---> Running in f920313a1e85 Removing intermediate container f920313a1e85 ---> 7f3888414d58 Step 42/61 : RUN echo test > /test ---> Running in 10e2f4dc1ac7 Removing intermediate container 10e2f4dc1ac7 ---> 14db9e15f2dc Step 43/61 : RUN echo test > /test ---> Running in c849d6e89aa5 Removing intermediate container c849d6e89aa5 ---> fdb770494dd6 Step 44/61 : RUN echo test > /test ---> Running in 419d1a8353db Removing intermediate container 419d1a8353db ---> d12e9cf078be Step 45/61 : RUN echo test > /test ---> Running in 0f1805263e4c Removing intermediate container 0f1805263e4c ---> cd005e7b08a4 Step 46/61 : RUN echo test > /test ---> Running in 5bde05b46441 Removing intermediate container 5bde05b46441 ---> 05aa426a3d4a Step 47/61 : RUN echo test > /test ---> Running in 01ebc84bd1bc Removing intermediate container 01ebc84bd1bc ---> 35d371fa4342 Step 48/61 : RUN echo test > /test ---> Running in 49f6c2f51dd4 
Removing intermediate container 49f6c2f51dd4 ---> 1090b5dfa130 Step 49/61 : RUN echo test > /test ---> Running in f8a9089cd725 Removing intermediate container f8a9089cd725 ---> b2d0eec0716d Step 50/61 : RUN echo test > /test ---> Running in a1697a0b2db0 Removing intermediate container a1697a0b2db0 ---> 10d96ac8f497 Step 51/61 : RUN echo test > /test ---> Running in 33a2332c06eb Removing intermediate container 33a2332c06eb ---> ba5bf5609c1c Step 52/61 : RUN echo test > /test ---> Running in e8920392be0d Removing intermediate container e8920392be0d ---> 5b3a95685c7e Step 53/61 : RUN echo test > /test ---> Running in 4b9298587c65 Removing intermediate container 4b9298587c65 ---> d4961a349141 Step 54/61 : RUN echo test > /test ---> Running in 8a0c960c2ba1 Removing intermediate container 8a0c960c2ba1 ---> b413197fcfa2 Step 55/61 : RUN echo test > /test ---> Running in 536ee3b9596b Removing intermediate container 536ee3b9596b ---> fc16b69b224a Step 56/61 : RUN echo test > /test ---> Running in 8b817b8d7b59 Removing intermediate container 8b817b8d7b59 ---> 2f0896400ff9 Step 57/61 : RUN echo test > /test ---> Running in ab0ed79ec3d4 Removing intermediate container ab0ed79ec3d4 ---> b4fb420e736c Step 58/61 : RUN echo test > /test ---> Running in 8548d7eead1f Removing intermediate container 8548d7eead1f ---> 745103fd5a38 Step 59/61 : RUN echo test > /test ---> Running in 1980559ad5d6 Removing intermediate container 1980559ad5d6 ---> 08c1c74a5618 Step 60/61 : FROM alpine ---> 11cd0b38bc3c Step 61/61 : COPY --from=first /test /test ---> 67f053c66c27 Successfully built 67f053c66c27 PS E:\docker\build\36764> ``` Note also that subsequent error messages once you go beyond current platform limitations kind of suck (such as insufficient resources with a bunch of spew which is incomprehensible to most) and we could do better to detect this earlier in the daemon. That'll be for a (reasonably low-priority) follow-up though as and when I have time. 
Theoretically we *may*, if the platform doesn't require additional changes for RS5, be able to have bigger platform limits using the v2 schema with up to 127 VPMem devices, and the possibility to have multiple SCSI controllers per SVM/UVM. However, currently LCOW is using HCS v1 schema calls, and there are no plans to rewrite the graphdriver/libcontainerd components outside of moving LCOW fully over to the containerd runtime/snapshotter using HCS v2 schema, which is still some time from fruition. PS OK, while waiting for a full run to complete, I did get bored. Turns out it won't overflow line length as max(uint64) is 18446744073709551615 which would still be short enough at 127 layers, double the current platform limit. And I could always change it to hex or base36 to make it even shorter, or remove the 'd' from /tmp/dN. IOW, pretty sure no-one is going to hit the limit even if we could get the platform to 256 which is the current Hyper-V SCSI limit per VM (4x64), although PMEM at 127 would be the next immediate limit.
2018-08-15 20:56:28 +00:00
// Add mapped virtual disks. First those that are already in the configuration. Generally,
// the only one that will be here is the service VMs scratch. The exception is when invoked
// via the graphdrivers DiffGetter implementation.
for i, mvd := range svm.config.MappedVirtualDisks {
svm.attachCounter++
svm.attachedVHDs[mvd.HostPath] = &attachedVHD{refCount: 1, attachCounter: svm.attachCounter}
// No-op for the service VMs scratch disk. Only applicable in the DiffGetter interface invocation.
svm.config.MappedVirtualDisks[i].ContainerPath = remapLongToShortContainerPath(mvd.ContainerPath, svm.attachCounter, svm.config.Name)
}
// Then the remaining ones to add, and adding them to the startup configuration.
for _, mvd := range mvdToAdd {
svm.attachCounter++
svm.attachedVHDs[mvd.HostPath] = &attachedVHD{refCount: 1, attachCounter: svm.attachCounter}
mvd.ContainerPath = remapLongToShortContainerPath(mvd.ContainerPath, svm.attachCounter, svm.config.Name)
svm.config.MappedVirtualDisks = append(svm.config.MappedVirtualDisks, mvd)
}
// Start it.
logrus.Debugf("%s: (%s) starting %s", title, context, svm.config.Name)
if err := svm.config.StartUtilityVM(); err != nil {
return nil, fmt.Errorf("failed to start service utility VM (%s): %s", context, err)
}
// defer function to terminate the VM if the next steps fail
defer func() {
if err != nil {
waitTerminate(svm, fmt.Sprintf("%s: (%s)", title, context))
}
}()
// Now we have a running service VM, we can create the cached scratch file if it doesn't exist.
logrus.Debugf("%s: locking cachedScratchMutex", title)
d.cachedScratchMutex.Lock()
if _, err := os.Stat(d.cachedScratchFile); err != nil {
logrus.Debugf("%s: (%s) creating an SVM scratch", title, context)
// Don't use svm.CreateExt4Vhdx since that only works when the service vm is setup,
// but we're still in that process right now.
if err := svm.config.CreateExt4Vhdx(scratchTargetFile, client.DefaultVhdxSizeGB, d.cachedScratchFile); err != nil {
logrus.Debugf("%s: (%s) releasing cachedScratchMutex on error path", title, context)
d.cachedScratchMutex.Unlock()
logrus.Debugf("%s: failed to create vm scratch %s: %s", title, scratchTargetFile, err)
return nil, fmt.Errorf("failed to create SVM scratch VHDX (%s): %s", context, err)
}
}
logrus.Debugf("%s: (%s) releasing cachedScratchMutex", title, context)
d.cachedScratchMutex.Unlock()
// Hot-add the scratch-space if not already attached
if !svm.scratchAttached {
logrus.Debugf("%s: (%s) hot-adding scratch %s", title, context, scratchTargetFile)
if err := svm.hotAddVHDsAtStart(hcsshim.MappedVirtualDisk{
HostPath: scratchTargetFile,
ContainerPath: toolsScratchPath,
CreateInUtilityVM: true,
}); err != nil {
logrus.Debugf("%s: failed to hot-add scratch %s: %s", title, scratchTargetFile, err)
return nil, fmt.Errorf("failed to hot-add %s failed: %s", scratchTargetFile, err)
}
svm.scratchAttached = true
LCOW: Mount to short container paths to avoid command-line length limit Signed-off-by: John Howard <jhoward@microsoft.com> Fixes #36764 @johnstep PTAL. @jterry75 FYI. There are two commits in this PR. The first ensure that errors are actually returned to the caller - it was being thrown away. The second commit changes the LCOW driver to map, on a per service VM basis, "long" container paths such as `/tmp/c8fa0ae1b348f505df2707060f6a49e63280d71b83b7936935c827e2e9bde16d` to much shorter paths, based on a per-service VM counter, so something more like /tmp/d3. This means that the root cause of the failure where the mount call to create the overlay was failing due to command line length becomes something much shorter such as below. `mount -t overlay overlay -olowerdir=/tmp/d3:/tmp/d4:/tmp/d5:/tmp/d6:/tmp/d7:/tmp/d8:/tmp/d9:/tmp/d10:/tmp/d11:/tmp/d12:/tmp/d13:/tmp/d14:/tmp/d15:/tmp/d16:/tmp/d17:/tmp/d18:/tmp/d19:/tmp/d20:/tmp/d21:/tmp/d22:/tmp/d23:/tmp/d24:/tmp/d25:/tmp/d26:/tmp/d27:/tmp/d28:/tmp/d29:/tmp/d30:/tmp/d31:/tmp/d32:/tmp/d33:/tmp/d34:/tmp/d35:/tmp/d36:/tmp/d37:/tmp/d38:/tmp/d39:/tmp/d40:/tmp/d41:/tmp/d42:/tmp/d43:/tmp/d44:/tmp/d45:/tmp/d46:/tmp/d47:/tmp/d48:/tmp/d49:/tmp/d50:/tmp/d51:/tmp/d52:/tmp/d53:/tmp/d54:/tmp/d55:/tmp/d56:/tmp/d57:/tmp/d58:/tmp/d59:/tmp/d60:/tmp/d61:/tmp/d62,upperdir=/tmp/d2/upper,workdir=/tmp/d2/work /tmp/c8fa0ae1b348f505df2707060f6a49e63280d71b83b7936935c827e2e9bde16d-mount` For those worrying about overflow (which I'm sure @thaJeztah will mention...): It's safe to use a counter here as SVMs are disposable in the default configuration. The exception is when running the daemon in unsafe LCOW "global" mode (ie `--storage-opt lcow.globalmode=1`) where the SVMs aren't disposed of, but a single one is reused. 
However, to overflow the command line length, it would require several hundred-thousand trillion (conservative, I should sit down and work it out accurately if I get -really- bored) of SCSI hot-add operations, and even to hit that would be hard as just running containers normally uses the VPMEM path for the containers UVM, not to the global SVM on SCSI. It gets incremented by one per build step (commit more accurately) as a general rule. Hence it would be necessary to have to be doing automated builds without restarting the daemon for literally years on end in unsafe mode. :innocent: Note that in reality, the previous limit of ~47 layers before hitting the command line length limit is close to what is possible in the platform, at least as of RS5/Windows Server 2019 where, in the HCS v1 schema, a single SCSI controller is used, and that can only support 64 disks per controller per the Hyper-V VDEV. And remember we have one slot taken up for the SVMs scratch, and another for the containers scratch when committing a layer. So the best you can architecturally get on the platform is around the following (it's also different by 1 depending on whether in unsafe or default mode) ``` PS E:\docker\build\36764\short> docker build --no-cache . Sending build context to Docker daemon 2.048kB Step 1/4 : FROM alpine as first ---> 11cd0b38bc3c Step 2/4 : RUN echo test > /test ---> Running in 8ddfe20e5bfb Removing intermediate container 8ddfe20e5bfb ---> b0103a00b1c9 Step 3/4 : FROM alpine ---> 11cd0b38bc3c Step 4/4 : COPY --from=first /test /test ---> 54bfae391eba Successfully built 54bfae391eba PS E:\docker\build\36764\short> cd .. PS E:\docker\build\36764> docker build --no-cache . 
Sending build context to Docker daemon 4.689MB Step 1/61 : FROM alpine as first ---> 11cd0b38bc3c Step 2/61 : RUN echo test > /test ---> Running in 02597ff870db Removing intermediate container 02597ff870db ---> 3096de6fc454 Step 3/61 : RUN echo test > /test ---> Running in 9a8110f4ff19 Removing intermediate container 9a8110f4ff19 ---> 7691808cf28e Step 4/61 : RUN echo test > /test ---> Running in 9afb8f51510b Removing intermediate container 9afb8f51510b ---> e42a0df2bb1c Step 5/61 : RUN echo test > /test ---> Running in fe977ed6804e Removing intermediate container fe977ed6804e ---> 55850c9b0479 Step 6/61 : RUN echo test > /test ---> Running in be65cbfad172 Removing intermediate container be65cbfad172 ---> 0cf8acba70f0 Step 7/61 : RUN echo test > /test ---> Running in fd5b0907b6a9 Removing intermediate container fd5b0907b6a9 ---> 257a4493d85d Step 8/61 : RUN echo test > /test ---> Running in f7ca0ffd9076 Removing intermediate container f7ca0ffd9076 ---> 3baa6f4fa2d5 Step 9/61 : RUN echo test > /test ---> Running in 5146814d4727 Removing intermediate container 5146814d4727 ---> 485b9d5cf228 Step 10/61 : RUN echo test > /test ---> Running in a090eec1b743 Removing intermediate container a090eec1b743 ---> a7eb10155b51 Step 11/61 : RUN echo test > /test ---> Running in 942660b288df Removing intermediate container 942660b288df ---> 9d286a1e2133 Step 12/61 : RUN echo test > /test ---> Running in c3d369aa91df Removing intermediate container c3d369aa91df ---> f78be4788992 Step 13/61 : RUN echo test > /test ---> Running in a03c3ac6888f Removing intermediate container a03c3ac6888f ---> 6504363f61ab Step 14/61 : RUN echo test > /test ---> Running in 0c3c2fca3f90 Removing intermediate container 0c3c2fca3f90 ---> fe3448b8bb29 Step 15/61 : RUN echo test > /test ---> Running in 828d51c76d3b Removing intermediate container 828d51c76d3b ---> 870684e3aea0 Step 16/61 : RUN echo test > /test ---> Running in 59a2f7c5f3ad Removing intermediate container 59a2f7c5f3ad ---> cf84556ca5c0 Step 
17/61 : RUN echo test > /test ---> Running in bfb4e088eeb3 Removing intermediate container bfb4e088eeb3 ---> 9c8f9f652cef Step 18/61 : RUN echo test > /test ---> Running in f1b88bb5a2d7 Removing intermediate container f1b88bb5a2d7 ---> a6233ad21648 Step 19/61 : RUN echo test > /test ---> Running in 45f70577d709 Removing intermediate container 45f70577d709 ---> 1b5cc52d370d Step 20/61 : RUN echo test > /test ---> Running in 2ce231d5043d Removing intermediate container 2ce231d5043d ---> 4a0e17cbebaa Step 21/61 : RUN echo test > /test ---> Running in 52e4b0928f1f Removing intermediate container 52e4b0928f1f ---> 99b50e989bcb Step 22/61 : RUN echo test > /test ---> Running in f7ba3da7460d Removing intermediate container f7ba3da7460d ---> bfa3cad88285 Step 23/61 : RUN echo test > /test ---> Running in 60180bf60f88 Removing intermediate container 60180bf60f88 ---> fe7271988bcb Step 24/61 : RUN echo test > /test ---> Running in 20324d396531 Removing intermediate container 20324d396531 ---> e930bc039128 Step 25/61 : RUN echo test > /test ---> Running in b3ac70fd4404 Removing intermediate container b3ac70fd4404 ---> 39d0a11ea6d8 Step 26/61 : RUN echo test > /test ---> Running in 0193267d3787 Removing intermediate container 0193267d3787 ---> 8062d7aab0a5 Step 27/61 : RUN echo test > /test ---> Running in f41f45fb7985 Removing intermediate container f41f45fb7985 ---> 1f5f18f2315b Step 28/61 : RUN echo test > /test ---> Running in 90dd09c63d6e Removing intermediate container 90dd09c63d6e ---> 02f0a1141f11 Step 29/61 : RUN echo test > /test ---> Running in c557e5386e0a Removing intermediate container c557e5386e0a ---> dbcd6fb1f6f4 Step 30/61 : RUN echo test > /test ---> Running in 65369385d855 Removing intermediate container 65369385d855 ---> e6e9058a0650 Step 31/61 : RUN echo test > /test ---> Running in d861fcc388fd Removing intermediate container d861fcc388fd ---> 6e4c2c0f741f Step 32/61 : RUN echo test > /test ---> Running in 1483962b7e1c Removing intermediate container 
1483962b7e1c ---> cf8f142aa055 Step 33/61 : RUN echo test > /test ---> Running in 5868934816c1 Removing intermediate container 5868934816c1 ---> d5ff87cdc204 Step 34/61 : RUN echo test > /test ---> Running in e057f3201f3a Removing intermediate container e057f3201f3a ---> b4031b7ab4ac Step 35/61 : RUN echo test > /test ---> Running in 22b769b9079c Removing intermediate container 22b769b9079c ---> 019d898510b6 Step 36/61 : RUN echo test > /test ---> Running in f1d364ef4ff8 Removing intermediate container f1d364ef4ff8 ---> 9525cafdf04d Step 37/61 : RUN echo test > /test ---> Running in 5bf505b8bdcc Removing intermediate container 5bf505b8bdcc ---> cd5002b33bfd Step 38/61 : RUN echo test > /test ---> Running in be24a921945c Removing intermediate container be24a921945c ---> 8675db44d1b7 Step 39/61 : RUN echo test > /test ---> Running in 352dc6beef3d Removing intermediate container 352dc6beef3d ---> 0ab0ece43c71 Step 40/61 : RUN echo test > /test ---> Running in eebde33e5d9b Removing intermediate container eebde33e5d9b ---> 46ca4b0dfc03 Step 41/61 : RUN echo test > /test ---> Running in f920313a1e85 Removing intermediate container f920313a1e85 ---> 7f3888414d58 Step 42/61 : RUN echo test > /test ---> Running in 10e2f4dc1ac7 Removing intermediate container 10e2f4dc1ac7 ---> 14db9e15f2dc Step 43/61 : RUN echo test > /test ---> Running in c849d6e89aa5 Removing intermediate container c849d6e89aa5 ---> fdb770494dd6 Step 44/61 : RUN echo test > /test ---> Running in 419d1a8353db Removing intermediate container 419d1a8353db ---> d12e9cf078be Step 45/61 : RUN echo test > /test ---> Running in 0f1805263e4c Removing intermediate container 0f1805263e4c ---> cd005e7b08a4 Step 46/61 : RUN echo test > /test ---> Running in 5bde05b46441 Removing intermediate container 5bde05b46441 ---> 05aa426a3d4a Step 47/61 : RUN echo test > /test ---> Running in 01ebc84bd1bc Removing intermediate container 01ebc84bd1bc ---> 35d371fa4342 Step 48/61 : RUN echo test > /test ---> Running in 49f6c2f51dd4 
Removing intermediate container 49f6c2f51dd4 ---> 1090b5dfa130 Step 49/61 : RUN echo test > /test ---> Running in f8a9089cd725 Removing intermediate container f8a9089cd725 ---> b2d0eec0716d Step 50/61 : RUN echo test > /test ---> Running in a1697a0b2db0 Removing intermediate container a1697a0b2db0 ---> 10d96ac8f497 Step 51/61 : RUN echo test > /test ---> Running in 33a2332c06eb Removing intermediate container 33a2332c06eb ---> ba5bf5609c1c Step 52/61 : RUN echo test > /test ---> Running in e8920392be0d Removing intermediate container e8920392be0d ---> 5b3a95685c7e Step 53/61 : RUN echo test > /test ---> Running in 4b9298587c65 Removing intermediate container 4b9298587c65 ---> d4961a349141 Step 54/61 : RUN echo test > /test ---> Running in 8a0c960c2ba1 Removing intermediate container 8a0c960c2ba1 ---> b413197fcfa2 Step 55/61 : RUN echo test > /test ---> Running in 536ee3b9596b Removing intermediate container 536ee3b9596b ---> fc16b69b224a Step 56/61 : RUN echo test > /test ---> Running in 8b817b8d7b59 Removing intermediate container 8b817b8d7b59 ---> 2f0896400ff9 Step 57/61 : RUN echo test > /test ---> Running in ab0ed79ec3d4 Removing intermediate container ab0ed79ec3d4 ---> b4fb420e736c Step 58/61 : RUN echo test > /test ---> Running in 8548d7eead1f Removing intermediate container 8548d7eead1f ---> 745103fd5a38 Step 59/61 : RUN echo test > /test ---> Running in 1980559ad5d6 Removing intermediate container 1980559ad5d6 ---> 08c1c74a5618 Step 60/61 : FROM alpine ---> 11cd0b38bc3c Step 61/61 : COPY --from=first /test /test ---> 67f053c66c27 Successfully built 67f053c66c27 PS E:\docker\build\36764> ``` Note also that subsequent error messages once you go beyond current platform limitations kind of suck (such as insufficient resources with a bunch of spew which is incomprehensible to most) and we could do better to detect this earlier in the daemon. That'll be for a (reasonably low-priority) follow-up though as and when I have time. 
Theoretically we *may*, if the platform doesn't require additional changes for RS5, be able to have bigger platform limits using the v2 schema with up to 127 VPMem devices, and the possibility to have multiple SCSI controllers per SVM/UVM. However, currently LCOW is using HCS v1 schema calls, and there are no plans to rewrite the graphdriver/libcontainerd components outside of moving LCOW fully over to the containerd runtime/snapshotter using HCS v2 schema, which is still some time from fruition. PS OK, while waiting for a full run to complete, I did get bored. Turns out it won't overflow line length as max(uint64) is 18446744073709551615 which would still be short enough at 127 layers, double the current platform limit. And I could always change it to hex or base36 to make it even shorter, or remove the 'd' from /tmp/dN. IOW, pretty sure no-one is going to hit the limit even if we could get the platform to 256 which is the current Hyper-V SCSI limit per VM (4x64), although PMEM at 127 would be the next immediate limit.
2018-08-15 20:56:28 +00:00
// Don't need to ref-count here as it will be done via hotAddVHDsAtStart() call above.
}
logrus.Debugf("%s: (%s) success", title, context)
return svm, nil
}
// terminateServiceVM terminates a service utility VM if it's running and it's
// not being used by any goroutine, but does nothing when in global mode as its
// lifetime is limited to that of the daemon. If the force flag is set, the VM
// will be killed regardless of the ref count or if it's global.
//
// id is the (possibly un-prefixed) service VM identifier; context is a short
// caller description used only for logging. On the last reference (or force),
// the VM is terminated, its scratch VHDX deleted, and it is removed from the
// serviceVms map.
func (d *Driver) terminateServiceVM(id, context string, force bool) (err error) {

	// We don't do anything in global mode unless the force flag has been passed,
	// which is only the case for cleanup at driver termination.
	if d.globalMode && !force {
		logrus.Debugf("lcowdriver: terminateservicevm: %s (%s) - doing nothing as in global mode", id, context)
		return nil
	}

	id = d.getVMID(id)

	var svm *serviceVM
	var lastRef bool
	if !force {
		// In the not-force case, we ref count and only tear down on the last reference.
		svm, lastRef, err = d.serviceVms.decrementRefCount(id)
	} else {
		// In the force case, we ignore the ref count and just set it to 0.
		svm, err = d.serviceVms.setRefCountZero(id)
		lastRef = true
	}

	if err == errVMUnknown {
		// Nothing to do - the VM isn't tracked, so treat as already terminated.
		return nil
	} else if err == errVMisTerminating {
		// Another goroutine is already terminating it; wait on its outcome.
		return svm.getStopError()
	} else if !lastRef {
		// Still in use by other goroutines - leave it running.
		return nil
	}

	// We run the deletion of the scratch as a deferred function to at least attempt
	// clean-up in case of errors. Note this deliberately assigns to the named
	// return value err so a scratch-removal failure is surfaced to the caller.
	defer func() {
		if svm.scratchAttached {
			scratchTargetFile := filepath.Join(d.dataRoot, scratchDirectory, fmt.Sprintf("%s.vhdx", id))
			logrus.Debugf("lcowdriver: terminateservicevm: %s (%s) - deleting scratch %s", id, context, scratchTargetFile)
			if errRemove := os.Remove(scratchTargetFile); errRemove != nil {
				logrus.Warnf("failed to remove scratch file %s (%s): %s", scratchTargetFile, context, errRemove)
				err = errRemove
			}
		}

		// This function shouldn't actually return error unless there is a bug
		if errDelete := d.serviceVms.deleteID(id); errDelete != nil {
			logrus.Warnf("failed to remove service vm from svm map %s (%s): %s", id, context, errDelete)
		}

		// Signal that this VM has stopped, relaying any error to waiters.
		svm.signalStopFinished(err)
	}()

	// Now it's possible that the service VM failed to start and now we are trying to terminate it.
	// In this case, we will relay the error to the goroutines waiting for this vm to stop.
	if err := svm.getStartError(); err != nil {
		logrus.Debugf("lcowdriver: terminateservicevm: %s had failed to start up: %s", id, err)
		return err
	}

	if err := waitTerminate(svm, fmt.Sprintf("terminateservicevm: %s (%s)", id, context)); err != nil {
		return err
	}

	logrus.Debugf("lcowdriver: terminateservicevm: %s (%s) - success", id, context)
	return nil
}
// waitTerminate issues a terminate request against the utility VM backing the
// supplied service VM and blocks until it has shut down, or until the
// configured timeout elapses. An "operation still pending" response from the
// HCS is not treated as an error, as the subsequent wait covers that case.
func waitTerminate(svm *serviceVM, context string) error {
	if svm.config == nil {
		// Fixed typo in the original error message ("waitTermiante").
		return fmt.Errorf("lcowdriver: waitTerminate: Nil utility VM. %s", context)
	}
	logrus.Debugf("lcowdriver: waitTerminate: Calling terminate: %s", context)
	if err := svm.config.Uvm.Terminate(); err != nil {
		// We might get operation still pending from the HCS. In that case, we shouldn't return
		// an error since we call wait right after.
		// Unwrap a hcsshim.ContainerError and any syscall.Errno it carries so
		// we can compare against errOperationPending.
		underlyingError := err
		if conterr, ok := err.(*hcsshim.ContainerError); ok {
			underlyingError = conterr.Err
		}
		if syscallErr, ok := underlyingError.(syscall.Errno); ok {
			underlyingError = syscallErr
		}
		if underlyingError != errOperationPending {
			return fmt.Errorf("failed to terminate utility VM (%s): %s", context, err)
		}
		logrus.Debugf("lcowdriver: waitTerminate: uvm.Terminate() returned operation pending (%s)", context)
	}
	logrus.Debugf("lcowdriver: waitTerminate: (%s) - waiting for utility VM to terminate", context)
	if err := svm.config.Uvm.WaitTimeout(time.Duration(svm.config.UvmTimeoutSeconds) * time.Second); err != nil {
		return fmt.Errorf("failed waiting for utility VM to terminate (%s): %s", context, err)
	}
	return nil
}
// String returns the string representation of the driver. This matches the
// name under which the graph driver has been registered.
func (d *Driver) String() string {
	const driverName = "lcow"
	return driverName
}
// Status returns the status of the driver as a list of key/value pairs.
func (d *Driver) Status() [][2]string {
	status := [][2]string{
		{"LCOW", ""},
	}
	// TODO: Add some more info here - mode, home, ....
	return status
}
// Exists returns true if the given id is registered with this driver,
// determined by the presence of the layer's directory on disk.
func (d *Driver) Exists(id string) bool {
	_, statErr := os.Lstat(d.dir(id))
	found := statErr == nil
	logrus.Debugf("lcowdriver: exists: id %s %t", id, found)
	return found
}
// CreateReadWrite creates a layer that is writable for use as a container
// file system. That equates to creating a sandbox VHDx.
func (d *Driver) CreateReadWrite(id, parent string, opts *graphdriver.CreateOpts) error {
	title := fmt.Sprintf("lcowdriver: createreadwrite: id %s", id)
	logrus.Debugf(title)

	// First we need to create the folder
	if err := d.Create(id, parent, opts); err != nil {
		return err
	}

	// Look for an explicit sandbox size option.
	sandboxSize := uint64(client.DefaultVhdxSizeGB)
	for k, v := range opts.StorageOpt {
		switch strings.ToLower(k) {
		case "lcow.sandboxsize":
			var err error
			// Parsed with a 32-bit limit as the size is later narrowed to
			// uint32 for createExt4VHDX.
			sandboxSize, err = strconv.ParseUint(v, 10, 32)
			if err != nil {
				return fmt.Errorf("%s failed to parse value '%s' for 'lcow.sandboxsize'", title, v)
			}
			if sandboxSize < client.DefaultVhdxSizeGB {
				return fmt.Errorf("%s 'lcow.sandboxsize' option cannot be less than %d", title, client.DefaultVhdxSizeGB)
			}
			// Note: the redundant trailing `break` has been removed - Go
			// switch cases do not fall through.
		}
	}

	// Massive perf optimisation here. If we know that the RW layer is the default size,
	// and that the cached sandbox already exists, and we are running in safe mode, we
	// can just do a simple copy into the layers sandbox file without needing to start a
	// unique service VM. For a global service VM, it doesn't really matter. Of course,
	// this is only the case where the sandbox is the default size.
	//
	// Make sure we have the sandbox mutex taken while we are examining it.
	if sandboxSize == client.DefaultVhdxSizeGB {
		logrus.Debugf("%s: locking cachedSandboxMutex", title)
		d.cachedSandboxMutex.Lock()
		_, err := os.Stat(d.cachedSandboxFile)
		logrus.Debugf("%s: releasing cachedSandboxMutex", title)
		d.cachedSandboxMutex.Unlock()
		if err == nil {
			logrus.Debugf("%s: using cached sandbox to populate", title)
			if err := client.CopyFile(d.cachedSandboxFile, filepath.Join(d.dir(id), sandboxFilename), true); err != nil {
				return err
			}
			return nil
		}
	}

	logrus.Debugf("%s: creating SVM to create sandbox", title)
	svm, err := d.startServiceVMIfNotRunning(id, nil, "createreadwrite")
	if err != nil {
		return err
	}
	defer d.terminateServiceVM(id, "createreadwrite", false)

	// So the sandbox needs creating. If default size ensure we are the only thread populating the cache.
	// Non-default size we don't store, just create them one-off so no need to lock the cachedSandboxMutex.
	if sandboxSize == client.DefaultVhdxSizeGB {
		logrus.Debugf("%s: locking cachedSandboxMutex for creation", title)
		d.cachedSandboxMutex.Lock()
		defer func() {
			logrus.Debugf("%s: releasing cachedSandboxMutex for creation", title)
			d.cachedSandboxMutex.Unlock()
		}()
	}

	// Make sure we don't write to our local cached copy if this is for a non-default size request.
	targetCacheFile := d.cachedSandboxFile
	if sandboxSize != client.DefaultVhdxSizeGB {
		targetCacheFile = ""
	}

	// Create the ext4 vhdx
	logrus.Debugf("%s: creating sandbox ext4 vhdx", title)
	if err := svm.createExt4VHDX(filepath.Join(d.dir(id), sandboxFilename), uint32(sandboxSize), targetCacheFile); err != nil {
		logrus.Debugf("%s: failed to create sandbox vhdx for %s: %s", title, id, err)
		return err
	}
	return nil
}
// Create creates the folder for the layer with the given id, and
// adds it to the layer chain.
func (d *Driver) Create(id, parent string, opts *graphdriver.CreateOpts) error {
	logrus.Debugf("lcowdriver: create: id %s parent: %s", id, parent)

	parentChain, err := d.getLayerChain(parent)
	if err != nil {
		return err
	}

	// The new layer's chain is its immediate parent followed by the
	// parent's own chain.
	var chain []string
	if parent != "" {
		if !d.Exists(parent) {
			return fmt.Errorf("lcowdriver: cannot create layer folder with missing parent %s", parent)
		}
		chain = append(chain, d.dir(parent))
	}
	chain = append(chain, parentChain...)

	layerPath := d.dir(id)
	logrus.Debugf("lcowdriver: create: id %s: creating %s", id, layerPath)
	// Standard mkdir here, not with SDDL as the dataroot was created with
	// inheritance to just local system and administrators.
	if err := os.MkdirAll(layerPath, 0700); err != nil {
		return err
	}

	// If persisting the chain fails, roll back the folder we just created.
	if err := d.setLayerChain(id, chain); err != nil {
		if cleanupErr := os.RemoveAll(layerPath); cleanupErr != nil {
			logrus.Warnf("failed to remove layer %s: %s", layerPath, cleanupErr)
		}
		return err
	}
	logrus.Debugf("lcowdriver: create: id %s: success", id)
	return nil
}
// Remove unmounts and removes the dir information for a layer.
func (d *Driver) Remove(id string) error {
	logrus.Debugf("lcowdriver: remove: id %s", id)

	layerPath := d.dir(id)
	tmpID := fmt.Sprintf("%s-removing", id)
	tmpLayerPath := d.dir(tmpID)
	logrus.Debugf("lcowdriver: remove: id %s: layerPath %s", id, layerPath)

	// Unmount all the layers
	if err := d.Put(id); err != nil {
		logrus.Debugf("lcowdriver: remove id %s: failed to unmount: %s", id, err)
		return err
	}

	// for non-global case just kill the vm
	if !d.globalMode {
		if err := d.terminateServiceVM(id, fmt.Sprintf("Remove %s", id), true); err != nil {
			return err
		}
	}

	// Rename to a "-removing" name first so a partially-deleted layer can be
	// detected and cleaned up on the next daemon start (see Cleanup).
	if err := os.Rename(layerPath, tmpLayerPath); err != nil && !os.IsNotExist(err) {
		return err
	}
	if err := os.RemoveAll(tmpLayerPath); err != nil {
		return err
	}
	logrus.Debugf("lcowdriver: remove: id %s: layerPath %s succeeded", id, layerPath)
	return nil
}
// Get returns the rootfs path for the id. It is reference counted and
// effectively can be thought of as "mount the layer into the utility
// VM if it isn't already". The contract from the caller of this is that
// all Gets and Puts are matched. It -should- be the case that on cleanup,
// nothing is mounted.
//
// For optimisation, we don't actually mount the filesystem (which in our
// case means [hot-]adding it to a service VM); we track what would be
// mounted and defer the actual adding to the point we need to access it.
func (d *Driver) Get(id, mountLabel string) (containerfs.ContainerFS, error) {
	title := fmt.Sprintf("lcowdriver: get: %s", id)
	logrus.Debugf(title)

	// Work out which disks the deferred mount operation will need.
	disks, err := d.getAllMounts(id)
	if err != nil {
		logrus.Debugf("%s failed to get all layer details for %s: %s", title, d.dir(id), err)
		return nil, fmt.Errorf("%s failed to get layer details for %s: %s", title, d.dir(id), err)
	}

	logrus.Debugf("%s: got layer mounts: %+v", title, disks)
	fs := &lcowfs{
		root:        unionMountName(disks),
		d:           d,
		mappedDisks: disks,
		vmID:        d.getVMID(id),
	}
	return fs, nil
}
// Put does the reverse of Get. If there are no more references to
// the layer, it unmounts it from the utility VM.
func (d *Driver) Put(id string) error {
	title := fmt.Sprintf("lcowdriver: put: %s", id)

	// Get the service VM that we need to remove from
	svm, err := d.serviceVms.get(d.getVMID(id))
	if err == errVMUnknown {
		return nil
	} else if err == errVMisTerminating {
		return svm.getStopError()
	}

	// Generate the mounts that Get() might have mounted
	disks, err := d.getAllMounts(id)
	if err != nil {
		logrus.Debugf("%s failed to get all layer details for %s: %s", title, d.dir(id), err)
		return fmt.Errorf("%s failed to get layer details for %s: %s", title, d.dir(id), err)
	}

	// Now, we want to perform the unmounts, hot-remove and stop the service vm.
	// We want to go though all the steps even if we have an error to clean up
	// properly, returning the first error encountered.
	err = svm.deleteUnionMount(unionMountName(disks), disks...)
	if err != nil {
		logrus.Debugf("%s failed to delete union mount %s: %s", title, id, err)
	}

	err1 := svm.hotRemoveVHDs(disks...)
	if err1 != nil {
		// Log the hot-remove error itself (was previously logging the earlier
		// union-mount error variable by mistake).
		logrus.Debugf("%s failed to hot remove vhds %s: %s", title, id, err1)
		if err == nil {
			err = err1
		}
	}

	err1 = d.terminateServiceVM(id, fmt.Sprintf("Put %s", id), false)
	if err1 != nil {
		logrus.Debugf("%s failed to terminate service vm %s: %s", title, id, err1)
		if err == nil {
			err = err1
		}
	}
	logrus.Debugf("Put succeeded on id %s", id)
	return err
}
// Cleanup ensures the information the driver stores is properly removed.
// We use this opportunity to cleanup any -removing folders which may be
// still left if the daemon was killed while it was removing a layer.
func (d *Driver) Cleanup() error {
	title := "lcowdriver: cleanup"

	items, err := ioutil.ReadDir(d.dataRoot)
	if err != nil {
		if os.IsNotExist(err) {
			return nil
		}
		return err
	}

	// Note we don't return an error below - it's possible the files
	// are locked. However, next time around after the daemon exits,
	// we likely will be able to cleanup successfully. Instead we log
	// warnings if there are errors.
	for _, entry := range items {
		if !entry.IsDir() || !strings.HasSuffix(entry.Name(), "-removing") {
			continue
		}
		if err := os.RemoveAll(filepath.Join(d.dataRoot, entry.Name())); err != nil {
			logrus.Warnf("%s failed to cleanup %s: %s", title, entry.Name(), err)
		} else {
			logrus.Infof("%s cleaned up %s", title, entry.Name())
		}
	}

	// Cleanup any service VMs we have running, along with their scratch spaces.
	// We don't take the lock for this as it's taken in terminateServiceVm.
	for k, v := range d.serviceVms.svms {
		logrus.Debugf("%s svm entry: %s: %+v", title, k, v)
		d.terminateServiceVM(k, "cleanup", true)
	}
	return nil
}
// Diff takes a layer (and it's parent layer which may be null, but
// is ignored by this implementation below) and returns a reader for
// a tarstream representing the layers contents. The id could be
// a read-only "layer.vhd" or a read-write "sandbox.vhdx". The semantics
// of this function dictate that the layer is already mounted.
// However, as we do lazy mounting as a performance optimisation,
// this will likely not be the case.
func (d *Driver) Diff(id, parent string) (io.ReadCloser, error) {
title := fmt.Sprintf("lcowdriver: diff: %s", id)
// Get VHDX info
ld, err := getLayerDetails(d.dir(id))
if err != nil {
logrus.Debugf("%s: failed to get vhdx information of %s: %s", title, d.dir(id), err)
return nil, err
}
// Start the SVM with a mapped virtual disk. Note that if the SVM is
// already running and we are in global mode, this will be
// hot-added.
mvd := hcsshim.MappedVirtualDisk{
HostPath: ld.filename,
ContainerPath: hostToGuest(ld.filename),
CreateInUtilityVM: true,
ReadOnly: true,
}
logrus.Debugf("%s: starting service VM", title)
svm, err := d.startServiceVMIfNotRunning(id, []hcsshim.MappedVirtualDisk{mvd}, fmt.Sprintf("diff %s", id))
if err != nil {
return nil, err
}
logrus.Debugf("lcowdriver: diff: waiting for svm to finish booting")
err = svm.getStartError()
if err != nil {
d.terminateServiceVM(id, fmt.Sprintf("diff %s", id), false)
return nil, fmt.Errorf("lcowdriver: diff: svm failed to boot: %s", err)
}
// Obtain the tar stream for it
LCOW: Mount to short container paths to avoid command-line length limit Signed-off-by: John Howard <jhoward@microsoft.com> Fixes #36764 @johnstep PTAL. @jterry75 FYI. There are two commits in this PR. The first ensure that errors are actually returned to the caller - it was being thrown away. The second commit changes the LCOW driver to map, on a per service VM basis, "long" container paths such as `/tmp/c8fa0ae1b348f505df2707060f6a49e63280d71b83b7936935c827e2e9bde16d` to much shorter paths, based on a per-service VM counter, so something more like /tmp/d3. This means that the root cause of the failure where the mount call to create the overlay was failing due to command line length becomes something much shorter such as below. `mount -t overlay overlay -olowerdir=/tmp/d3:/tmp/d4:/tmp/d5:/tmp/d6:/tmp/d7:/tmp/d8:/tmp/d9:/tmp/d10:/tmp/d11:/tmp/d12:/tmp/d13:/tmp/d14:/tmp/d15:/tmp/d16:/tmp/d17:/tmp/d18:/tmp/d19:/tmp/d20:/tmp/d21:/tmp/d22:/tmp/d23:/tmp/d24:/tmp/d25:/tmp/d26:/tmp/d27:/tmp/d28:/tmp/d29:/tmp/d30:/tmp/d31:/tmp/d32:/tmp/d33:/tmp/d34:/tmp/d35:/tmp/d36:/tmp/d37:/tmp/d38:/tmp/d39:/tmp/d40:/tmp/d41:/tmp/d42:/tmp/d43:/tmp/d44:/tmp/d45:/tmp/d46:/tmp/d47:/tmp/d48:/tmp/d49:/tmp/d50:/tmp/d51:/tmp/d52:/tmp/d53:/tmp/d54:/tmp/d55:/tmp/d56:/tmp/d57:/tmp/d58:/tmp/d59:/tmp/d60:/tmp/d61:/tmp/d62,upperdir=/tmp/d2/upper,workdir=/tmp/d2/work /tmp/c8fa0ae1b348f505df2707060f6a49e63280d71b83b7936935c827e2e9bde16d-mount` For those worrying about overflow (which I'm sure @thaJeztah will mention...): It's safe to use a counter here as SVMs are disposable in the default configuration. The exception is when running the daemon in unsafe LCOW "global" mode (ie `--storage-opt lcow.globalmode=1`) where the SVMs aren't disposed of, but a single one is reused. 
However, to overflow the command line length, it would require several hundred-thousand trillion (conservative, I should sit down and work it out accurately if I get -really- bored) of SCSI hot-add operations, and even to hit that would be hard as just running containers normally uses the VPMEM path for the containers UVM, not to the global SVM on SCSI. It gets incremented by one per build step (commit more accurately) as a general rule. Hence it would be necessary to have to be doing automated builds without restarting the daemon for literally years on end in unsafe mode. :innocent: Note that in reality, the previous limit of ~47 layers before hitting the command line length limit is close to what is possible in the platform, at least as of RS5/Windows Server 2019 where, in the HCS v1 schema, a single SCSI controller is used, and that can only support 64 disks per controller per the Hyper-V VDEV. And remember we have one slot taken up for the SVMs scratch, and another for the containers scratch when committing a layer. So the best you can architecturally get on the platform is around the following (it's also different by 1 depending on whether in unsafe or default mode) ``` PS E:\docker\build\36764\short> docker build --no-cache . Sending build context to Docker daemon 2.048kB Step 1/4 : FROM alpine as first ---> 11cd0b38bc3c Step 2/4 : RUN echo test > /test ---> Running in 8ddfe20e5bfb Removing intermediate container 8ddfe20e5bfb ---> b0103a00b1c9 Step 3/4 : FROM alpine ---> 11cd0b38bc3c Step 4/4 : COPY --from=first /test /test ---> 54bfae391eba Successfully built 54bfae391eba PS E:\docker\build\36764\short> cd .. PS E:\docker\build\36764> docker build --no-cache . 
Sending build context to Docker daemon 4.689MB Step 1/61 : FROM alpine as first ---> 11cd0b38bc3c Step 2/61 : RUN echo test > /test ---> Running in 02597ff870db Removing intermediate container 02597ff870db ---> 3096de6fc454 Step 3/61 : RUN echo test > /test ---> Running in 9a8110f4ff19 Removing intermediate container 9a8110f4ff19 ---> 7691808cf28e Step 4/61 : RUN echo test > /test ---> Running in 9afb8f51510b Removing intermediate container 9afb8f51510b ---> e42a0df2bb1c Step 5/61 : RUN echo test > /test ---> Running in fe977ed6804e Removing intermediate container fe977ed6804e ---> 55850c9b0479 Step 6/61 : RUN echo test > /test ---> Running in be65cbfad172 Removing intermediate container be65cbfad172 ---> 0cf8acba70f0 Step 7/61 : RUN echo test > /test ---> Running in fd5b0907b6a9 Removing intermediate container fd5b0907b6a9 ---> 257a4493d85d Step 8/61 : RUN echo test > /test ---> Running in f7ca0ffd9076 Removing intermediate container f7ca0ffd9076 ---> 3baa6f4fa2d5 Step 9/61 : RUN echo test > /test ---> Running in 5146814d4727 Removing intermediate container 5146814d4727 ---> 485b9d5cf228 Step 10/61 : RUN echo test > /test ---> Running in a090eec1b743 Removing intermediate container a090eec1b743 ---> a7eb10155b51 Step 11/61 : RUN echo test > /test ---> Running in 942660b288df Removing intermediate container 942660b288df ---> 9d286a1e2133 Step 12/61 : RUN echo test > /test ---> Running in c3d369aa91df Removing intermediate container c3d369aa91df ---> f78be4788992 Step 13/61 : RUN echo test > /test ---> Running in a03c3ac6888f Removing intermediate container a03c3ac6888f ---> 6504363f61ab Step 14/61 : RUN echo test > /test ---> Running in 0c3c2fca3f90 Removing intermediate container 0c3c2fca3f90 ---> fe3448b8bb29 Step 15/61 : RUN echo test > /test ---> Running in 828d51c76d3b Removing intermediate container 828d51c76d3b ---> 870684e3aea0 Step 16/61 : RUN echo test > /test ---> Running in 59a2f7c5f3ad Removing intermediate container 59a2f7c5f3ad ---> cf84556ca5c0 Step 
17/61 : RUN echo test > /test ---> Running in bfb4e088eeb3 Removing intermediate container bfb4e088eeb3 ---> 9c8f9f652cef Step 18/61 : RUN echo test > /test ---> Running in f1b88bb5a2d7 Removing intermediate container f1b88bb5a2d7 ---> a6233ad21648 Step 19/61 : RUN echo test > /test ---> Running in 45f70577d709 Removing intermediate container 45f70577d709 ---> 1b5cc52d370d Step 20/61 : RUN echo test > /test ---> Running in 2ce231d5043d Removing intermediate container 2ce231d5043d ---> 4a0e17cbebaa Step 21/61 : RUN echo test > /test ---> Running in 52e4b0928f1f Removing intermediate container 52e4b0928f1f ---> 99b50e989bcb Step 22/61 : RUN echo test > /test ---> Running in f7ba3da7460d Removing intermediate container f7ba3da7460d ---> bfa3cad88285 Step 23/61 : RUN echo test > /test ---> Running in 60180bf60f88 Removing intermediate container 60180bf60f88 ---> fe7271988bcb Step 24/61 : RUN echo test > /test ---> Running in 20324d396531 Removing intermediate container 20324d396531 ---> e930bc039128 Step 25/61 : RUN echo test > /test ---> Running in b3ac70fd4404 Removing intermediate container b3ac70fd4404 ---> 39d0a11ea6d8 Step 26/61 : RUN echo test > /test ---> Running in 0193267d3787 Removing intermediate container 0193267d3787 ---> 8062d7aab0a5 Step 27/61 : RUN echo test > /test ---> Running in f41f45fb7985 Removing intermediate container f41f45fb7985 ---> 1f5f18f2315b Step 28/61 : RUN echo test > /test ---> Running in 90dd09c63d6e Removing intermediate container 90dd09c63d6e ---> 02f0a1141f11 Step 29/61 : RUN echo test > /test ---> Running in c557e5386e0a Removing intermediate container c557e5386e0a ---> dbcd6fb1f6f4 Step 30/61 : RUN echo test > /test ---> Running in 65369385d855 Removing intermediate container 65369385d855 ---> e6e9058a0650 Step 31/61 : RUN echo test > /test ---> Running in d861fcc388fd Removing intermediate container d861fcc388fd ---> 6e4c2c0f741f Step 32/61 : RUN echo test > /test ---> Running in 1483962b7e1c Removing intermediate container 
1483962b7e1c ---> cf8f142aa055 Step 33/61 : RUN echo test > /test ---> Running in 5868934816c1 Removing intermediate container 5868934816c1 ---> d5ff87cdc204 Step 34/61 : RUN echo test > /test ---> Running in e057f3201f3a Removing intermediate container e057f3201f3a ---> b4031b7ab4ac Step 35/61 : RUN echo test > /test ---> Running in 22b769b9079c Removing intermediate container 22b769b9079c ---> 019d898510b6 Step 36/61 : RUN echo test > /test ---> Running in f1d364ef4ff8 Removing intermediate container f1d364ef4ff8 ---> 9525cafdf04d Step 37/61 : RUN echo test > /test ---> Running in 5bf505b8bdcc Removing intermediate container 5bf505b8bdcc ---> cd5002b33bfd Step 38/61 : RUN echo test > /test ---> Running in be24a921945c Removing intermediate container be24a921945c ---> 8675db44d1b7 Step 39/61 : RUN echo test > /test ---> Running in 352dc6beef3d Removing intermediate container 352dc6beef3d ---> 0ab0ece43c71 Step 40/61 : RUN echo test > /test ---> Running in eebde33e5d9b Removing intermediate container eebde33e5d9b ---> 46ca4b0dfc03 Step 41/61 : RUN echo test > /test ---> Running in f920313a1e85 Removing intermediate container f920313a1e85 ---> 7f3888414d58 Step 42/61 : RUN echo test > /test ---> Running in 10e2f4dc1ac7 Removing intermediate container 10e2f4dc1ac7 ---> 14db9e15f2dc Step 43/61 : RUN echo test > /test ---> Running in c849d6e89aa5 Removing intermediate container c849d6e89aa5 ---> fdb770494dd6 Step 44/61 : RUN echo test > /test ---> Running in 419d1a8353db Removing intermediate container 419d1a8353db ---> d12e9cf078be Step 45/61 : RUN echo test > /test ---> Running in 0f1805263e4c Removing intermediate container 0f1805263e4c ---> cd005e7b08a4 Step 46/61 : RUN echo test > /test ---> Running in 5bde05b46441 Removing intermediate container 5bde05b46441 ---> 05aa426a3d4a Step 47/61 : RUN echo test > /test ---> Running in 01ebc84bd1bc Removing intermediate container 01ebc84bd1bc ---> 35d371fa4342 Step 48/61 : RUN echo test > /test ---> Running in 49f6c2f51dd4 
Removing intermediate container 49f6c2f51dd4 ---> 1090b5dfa130 Step 49/61 : RUN echo test > /test ---> Running in f8a9089cd725 Removing intermediate container f8a9089cd725 ---> b2d0eec0716d Step 50/61 : RUN echo test > /test ---> Running in a1697a0b2db0 Removing intermediate container a1697a0b2db0 ---> 10d96ac8f497 Step 51/61 : RUN echo test > /test ---> Running in 33a2332c06eb Removing intermediate container 33a2332c06eb ---> ba5bf5609c1c Step 52/61 : RUN echo test > /test ---> Running in e8920392be0d Removing intermediate container e8920392be0d ---> 5b3a95685c7e Step 53/61 : RUN echo test > /test ---> Running in 4b9298587c65 Removing intermediate container 4b9298587c65 ---> d4961a349141 Step 54/61 : RUN echo test > /test ---> Running in 8a0c960c2ba1 Removing intermediate container 8a0c960c2ba1 ---> b413197fcfa2 Step 55/61 : RUN echo test > /test ---> Running in 536ee3b9596b Removing intermediate container 536ee3b9596b ---> fc16b69b224a Step 56/61 : RUN echo test > /test ---> Running in 8b817b8d7b59 Removing intermediate container 8b817b8d7b59 ---> 2f0896400ff9 Step 57/61 : RUN echo test > /test ---> Running in ab0ed79ec3d4 Removing intermediate container ab0ed79ec3d4 ---> b4fb420e736c Step 58/61 : RUN echo test > /test ---> Running in 8548d7eead1f Removing intermediate container 8548d7eead1f ---> 745103fd5a38 Step 59/61 : RUN echo test > /test ---> Running in 1980559ad5d6 Removing intermediate container 1980559ad5d6 ---> 08c1c74a5618 Step 60/61 : FROM alpine ---> 11cd0b38bc3c Step 61/61 : COPY --from=first /test /test ---> 67f053c66c27 Successfully built 67f053c66c27 PS E:\docker\build\36764> ``` Note also that subsequent error messages once you go beyond current platform limitations kind of suck (such as insufficient resources with a bunch of spew which is incomprehensible to most) and we could do better to detect this earlier in the daemon. That'll be for a (reasonably low-priority) follow-up though as and when I have time. 
Theoretically we *may*, if the platform doesn't require additional changes for RS5, be able to have bigger platform limits using the v2 schema with up to 127 VPMem devices, and the possibility to have multiple SCSI controllers per SVM/UVM. However, currently LCOW is using HCS v1 schema calls, and there's no plans to rewrite the graphdriver/libcontainerd components outside of the moving LCOW fully over to the containerd runtime/snapshotter using HCS v2 schema, which is still some time off fruition. PS OK, while waiting for a full run to complete, I did get bored. Turns out it won't overflow line length as max(uint64) is 18446744073709551616 which would still be short enough at 127 layers, double the current platform limit. And I could always change it to hex or base36 to make it even shorter, or remove the 'd' from /tmp/dN. IOW, pretty sure no-one is going to hit the limit even if we could get the platform to 256 which is the current Hyper-V SCSI limit per VM (4x64), although PMEM at 127 would be the next immediate limit.
2018-08-15 20:56:28 +00:00
// The actual container path will have be remapped to a short name, so use that.
actualContainerPath := svm.getShortContainerPath(&mvd)
if actualContainerPath == "" {
return nil, fmt.Errorf("failed to get short container path for %+v in SVM %s", mvd, svm.config.Name)
}
logrus.Debugf("%s: %s %s, size %d, ReadOnly %t", title, ld.filename, actualContainerPath, ld.size, ld.isSandbox)
tarReadCloser, err := svm.config.VhdToTar(mvd.HostPath, actualContainerPath, ld.isSandbox, ld.size)
if err != nil {
svm.hotRemoveVHDs(mvd)
d.terminateServiceVM(id, fmt.Sprintf("diff %s", id), false)
return nil, fmt.Errorf("%s failed to export layer to tar stream for id: %s, parent: %s : %s", title, id, parent, err)
}
logrus.Debugf("%s id %s parent %s completed successfully", title, id, parent)
// In safe/non-global mode, we can't tear down the service VM until things have been read.
return ioutils.NewReadCloserWrapper(tarReadCloser, func() error {
tarReadCloser.Close()
svm.hotRemoveVHDs(mvd)
d.terminateServiceVM(id, fmt.Sprintf("diff %s", id), false)
return nil
}), nil
}
// ApplyDiff extracts the changeset from the given diff into the
// layer with the specified id and parent, returning the size of the
// new layer in bytes. The layer should not be mounted when calling
// this function. Another way of describing this is that ApplyDiff writes
// to a new layer (a VHD in LCOW) the contents of a tarstream it's given.
func (d *Driver) ApplyDiff(id, parent string, diff io.Reader) (int64, error) {
	logrus.Debugf("lcowdriver: applydiff: id %s", id)

	// Log failures here as it's undiagnosable sometimes, due to a possible panic.
	// See https://github.com/moby/moby/issues/37955 for more information.

	dest := filepath.Join(d.dataRoot, id, layerFilename)
	if !noreexec {
		// Default path: run the tar-to-ext4 conversion in a re-exec'd child
		// process so a conversion panic cannot take down the daemon.
		cmd := reexec.Command([]string{"docker-lcow-tar2ext4", dest}...)
		stdout := bytes.NewBuffer(nil)
		stderr := bytes.NewBuffer(nil)
		cmd.Stdin = diff
		cmd.Stdout = stdout
		cmd.Stderr = stderr

		if err := cmd.Start(); err != nil {
			logrus.Warnf("lcowdriver: applydiff: id %s failed to start re-exec: %s", id, err)
			return 0, err
		}

		if err := cmd.Wait(); err != nil {
			logrus.Warnf("lcowdriver: applydiff: id %s failed %s", id, err)
			return 0, fmt.Errorf("re-exec error: %v: stderr: %s", err, stderr)
		}

		// The child process prints the resulting layer size on stdout.
		size, err := strconv.ParseInt(stdout.String(), 10, 64)
		if err != nil {
			logrus.Warnf("lcowdriver: applydiff: id %s failed to parse output %s", id, err)
			return 0, fmt.Errorf("re-exec error: %v: stdout: %s", err, stdout)
		}
		return applySID(id, size, dest)
	}

	// The inline case (debugging aid). Previously a conversion failure here
	// was logged but swallowed, allowing a bogus success to be returned;
	// propagate the error instead.
	size, err := tar2ext4Actual(dest, diff)
	if err != nil {
		logrus.Warnf("lcowdriver: applydiff: id %s failed %s", id, err)
		return 0, err
	}
	return applySID(id, size, dest)
}
// applySID grants the VM Group SID read-only access to the layer file at
// dest, returning the supplied size on success.
func applySID(id string, size int64, dest string) (int64, error) {
	err := security.GrantVmGroupAccess(dest)
	if err == nil {
		return size, nil
	}
	logrus.Warnf("lcowdriver: applySIDs: id %s failed %s", id, err)
	return 0, err
}
// tar2ext4Reexec is the re-exec entry point for writing a layer from a tar
// file. It reads the tar from stdin, writes the layer to the path in
// os.Args[1], and prints the resulting size (or an error) for the parent.
func tar2ext4Reexec() {
	size, err := tar2ext4Actual(os.Args[1], os.Stdin)
	if err == nil {
		fmt.Fprint(os.Stdout, size)
		return
	}
	fmt.Fprint(os.Stderr, err)
	os.Exit(1)
}
// tar2ext4Actual is the implementation of tar2ext to write a layer from a tar
// file. It can be called through re-exec (default), or inline for debugging.
// It returns the size in bytes of the generated VHD.
func tar2ext4Actual(dest string, diff io.Reader) (int64, error) {
	// maxDiskSize is not relating to the sandbox size - this is the
	// maximum possible size a layer VHD generated can be from an EXT4
	// layout perspective.
	const maxDiskSize = 128 * 1024 * 1024 * 1024 // 128GB

	f, err := os.Create(dest)
	if err != nil {
		return 0, err
	}
	defer f.Close()

	err = tar2ext4.Convert(
		diff,
		f,
		tar2ext4.AppendVhdFooter,
		tar2ext4.ConvertWhiteout,
		tar2ext4.MaximumDiskSize(maxDiskSize))
	if err != nil {
		return 0, err
	}

	// Report the size of what was just written.
	info, err := os.Stat(dest)
	if err != nil {
		return 0, err
	}
	return info.Size(), nil
}
// Changes produces a list of changes between the specified layer
// and its parent layer. If parent is "", then all changes will be ADD changes.
// The layer should not be mounted when calling this function.
func (d *Driver) Changes(id, parent string) ([]archive.Change, error) {
	logrus.Debugf("lcowdriver: changes: id %s parent %s", id, parent)
	// TODO @gupta-ak. Needs implementation with assistance from service VM
	var changes []archive.Change
	return changes, nil
}
// DiffSize calculates the changes between the specified layer
// and its parent and returns the size in bytes of the changes
// relative to its base filesystem directory.
func (d *Driver) DiffSize(id, parent string) (size int64, err error) {
	logrus.Debugf("lcowdriver: diffsize: id %s", id)
	// TODO @gupta-ak. Needs implementation with assistance from service VM
	return size, nil
}
// GetMetadata returns custom driver information, currently just the
// on-host directory of the layer.
func (d *Driver) GetMetadata(id string) (map[string]string, error) {
	logrus.Debugf("lcowdriver: getmetadata: id %s", id)
	return map[string]string{
		"dir": d.dir(id),
	}, nil
}
// GetLayerPath gets the layer path on host (the folder containing the VHD/VHDX).
func (d *Driver) GetLayerPath(id string) (string, error) {
	layerPath := d.dir(id)
	return layerPath, nil
}
// dir returns the absolute path to the layer. filepath.Base strips any
// directory components from the id so the result stays under the data root.
func (d *Driver) dir(id string) string {
	safeID := filepath.Base(id)
	return filepath.Join(d.dataRoot, safeID)
}
// getLayerChain returns the layer chain information read from the layer's
// layerchain.json, or nil if no such file exists.
func (d *Driver) getLayerChain(id string) ([]string, error) {
	jPath := filepath.Join(d.dir(id), "layerchain.json")
	logrus.Debugf("lcowdriver: getlayerchain: id %s json %s", id, jPath)

	content, err := ioutil.ReadFile(jPath)
	switch {
	case os.IsNotExist(err):
		// A missing file simply means an empty chain.
		return nil, nil
	case err != nil:
		return nil, fmt.Errorf("lcowdriver: getlayerchain: %s unable to read layerchain file %s: %s", id, jPath, err)
	}

	var chain []string
	if err := json.Unmarshal(content, &chain); err != nil {
		return nil, fmt.Errorf("lcowdriver: getlayerchain: %s failed to unmarshall layerchain file %s: %s", id, jPath, err)
	}
	return chain, nil
}
// setLayerChain stores the layer chain information on disk as
// layerchain.json in the layer's folder.
func (d *Driver) setLayerChain(id string, chain []string) error {
	content, err := json.Marshal(&chain)
	if err != nil {
		return fmt.Errorf("lcowdriver: setlayerchain: %s failed to marshall layerchain json: %s", id, err)
	}

	jPath := filepath.Join(d.dir(id), "layerchain.json")
	logrus.Debugf("lcowdriver: setlayerchain: id %s json %s", id, jPath)
	if err := ioutil.WriteFile(jPath, content, 0600); err != nil {
		return fmt.Errorf("lcowdriver: setlayerchain: %s failed to write layerchain file: %s", id, err)
	}
	return nil
}
// getLayerDetails is a utility for getting a file name, size and indication of
// sandbox for a VHD(x) in a folder. A read-only layer will be layer.vhd. A
// read-write layer will be sandbox.vhdx.
func getLayerDetails(folder string) (*layerDetails, error) {
	// Prefer the read-only layer VHD; fall back to a read-write sandbox.
	ld := &layerDetails{
		filename: filepath.Join(folder, layerFilename),
	}

	fi, err := os.Stat(ld.filename)
	if err != nil {
		ld.filename = filepath.Join(folder, sandboxFilename)
		ld.isSandbox = true
		if fi, err = os.Stat(ld.filename); err != nil {
			return nil, fmt.Errorf("failed to locate layer or sandbox in %s", folder)
		}
	}

	ld.size = fi.Size()
	return ld, nil
}
// getAllMounts returns the mapped virtual disks for the layer with the
// given id and all of its parents. The layer itself is first in the
// returned slice, followed by its layer chain. Only the topmost entry may
// be writable (a sandbox); all parent layers are mapped read-only.
func (d *Driver) getAllMounts(id string) ([]hcsshim.MappedVirtualDisk, error) {
	layerChain, err := d.getLayerChain(id)
	if err != nil {
		return nil, err
	}
	layerChain = append([]string{d.dir(id)}, layerChain...)

	logrus.Debugf("getting all  layers: %v", layerChain)
	// Note: make with a single length argument; the previous explicit
	// capacity argument was redundant (staticcheck S1019).
	disks := make([]hcsshim.MappedVirtualDisk, len(layerChain))
	for i := range layerChain {
		ld, err := getLayerDetails(layerChain[i])
		if err != nil {
			logrus.Debugf("Failed to get LayerVhdDetails from %s: %s", layerChain[i], err)
			return nil, err
		}
		disks[i].HostPath = ld.filename
		disks[i].ContainerPath = hostToGuest(ld.filename)
		disks[i].CreateInUtilityVM = true
		disks[i].ReadOnly = !ld.isSandbox
	}
	return disks, nil
}
// hostToGuest returns the "long" in-guest path at which the given host
// VHD(x) file will be requested to be mounted: /tmp/<layer directory name>.
func hostToGuest(hostpath string) string {
	// At the point this is calculated we don't yet know which service VM
	// will be used, so translation to a short per-SVM path is deferred
	// until the disk is actually added to an SVM. Long container paths are
	// not used for SCSI disks inside SVMs: with ~47 layers the command
	// lines we invoke (for example the mount call creating the overlay
	// across multiple SCSI-attached disks) would exceed ~4200 characters
	// and fail. VPMem-attached layers during container creation are
	// unaffected, as openGCS maps those to /tmp/N/M (N being a container
	// instance number and M a layer number).
	layerDir := filepath.Base(filepath.Dir(hostpath))
	return fmt.Sprintf("/tmp/%s", layerDir)
}
// unionMountName derives the union mount point name from the container
// path of the first (topmost) disk in the stack, suffixed with "-mount".
func unionMountName(disks []hcsshim.MappedVirtualDisk) string {
	return disks[0].ContainerPath + "-mount"
}
// nopCloser adapts an io.Reader into an io.ReadCloser whose Close is a
// no-op. Used to return in-memory buffers where a ReadCloser is required.
type nopCloser struct {
	io.Reader
}

// Close implements io.Closer; it does nothing and always returns nil.
func (nopCloser) Close() error {
	return nil
}
// fileGetCloserFromSVM is the graphdriver.FileGetCloser implementation
// returned by DiffGetter. It reads files out of a layer disk that has been
// hot-added to a service VM.
type fileGetCloserFromSVM struct {
	// id is the layer id; also used when terminating the service VM in Close.
	id string
	// svm is the service VM the layer disk has been hot-added to.
	svm *serviceVM
	// mvd is the mapped virtual disk for the layer; hot-removed in Close.
	mvd *hcsshim.MappedVirtualDisk
	// d is the driver, used to terminate the service VM in Close.
	d *Driver
}
// Close hot-removes the layer disk from the service VM and then tears the
// service VM down (a no-op teardown when running in global mode).
func (fgc *fileGetCloserFromSVM) Close() error {
	if fgc.svm != nil && fgc.mvd != nil {
		if err := fgc.svm.hotRemoveVHDs(*fgc.mvd); err != nil {
			// Removal failure is only logged: the SVM is about to be torn
			// down anyway unless running in global mode.
			logrus.Errorf("failed to remove mvd %s: %s", fgc.mvd.ContainerPath, err)
		}
	}

	if fgc.d == nil || fgc.svm == nil || fgc.id == "" {
		return nil
	}
	return fgc.d.terminateServiceVM(fgc.id, fmt.Sprintf("diffgetter %s", fgc.id), false)
}
func (fgc *fileGetCloserFromSVM) Get(filename string) (io.ReadCloser, error) {
errOut := &bytes.Buffer{}
outOut := &bytes.Buffer{}
LCOW: Mount to short container paths to avoid command-line length limit Signed-off-by: John Howard <jhoward@microsoft.com> Fixes #36764 @johnstep PTAL. @jterry75 FYI. There are two commits in this PR. The first ensure that errors are actually returned to the caller - it was being thrown away. The second commit changes the LCOW driver to map, on a per service VM basis, "long" container paths such as `/tmp/c8fa0ae1b348f505df2707060f6a49e63280d71b83b7936935c827e2e9bde16d` to much shorter paths, based on a per-service VM counter, so something more like /tmp/d3. This means that the root cause of the failure where the mount call to create the overlay was failing due to command line length becomes something much shorter such as below. `mount -t overlay overlay -olowerdir=/tmp/d3:/tmp/d4:/tmp/d5:/tmp/d6:/tmp/d7:/tmp/d8:/tmp/d9:/tmp/d10:/tmp/d11:/tmp/d12:/tmp/d13:/tmp/d14:/tmp/d15:/tmp/d16:/tmp/d17:/tmp/d18:/tmp/d19:/tmp/d20:/tmp/d21:/tmp/d22:/tmp/d23:/tmp/d24:/tmp/d25:/tmp/d26:/tmp/d27:/tmp/d28:/tmp/d29:/tmp/d30:/tmp/d31:/tmp/d32:/tmp/d33:/tmp/d34:/tmp/d35:/tmp/d36:/tmp/d37:/tmp/d38:/tmp/d39:/tmp/d40:/tmp/d41:/tmp/d42:/tmp/d43:/tmp/d44:/tmp/d45:/tmp/d46:/tmp/d47:/tmp/d48:/tmp/d49:/tmp/d50:/tmp/d51:/tmp/d52:/tmp/d53:/tmp/d54:/tmp/d55:/tmp/d56:/tmp/d57:/tmp/d58:/tmp/d59:/tmp/d60:/tmp/d61:/tmp/d62,upperdir=/tmp/d2/upper,workdir=/tmp/d2/work /tmp/c8fa0ae1b348f505df2707060f6a49e63280d71b83b7936935c827e2e9bde16d-mount` For those worrying about overflow (which I'm sure @thaJeztah will mention...): It's safe to use a counter here as SVMs are disposable in the default configuration. The exception is when running the daemon in unsafe LCOW "global" mode (ie `--storage-opt lcow.globalmode=1`) where the SVMs aren't disposed of, but a single one is reused. 
However, to overflow the command line length, it would require several hundred-thousand trillion (conservative, I should sit down and work it out accurately if I get -really- bored) of SCSI hot-add operations, and even to hit that would be hard as just running containers normally uses the VPMEM path for the containers UVM, not to the global SVM on SCSI. It gets incremented by one per build step (commit more accurately) as a general rule. Hence it would be necessary to have to be doing automated builds without restarting the daemon for literally years on end in unsafe mode. :innocent: Note that in reality, the previous limit of ~47 layers before hitting the command line length limit is close to what is possible in the platform, at least as of RS5/Windows Server 2019 where, in the HCS v1 schema, a single SCSI controller is used, and that can only support 64 disks per controller per the Hyper-V VDEV. And remember we have one slot taken up for the SVMs scratch, and another for the containers scratch when committing a layer. So the best you can architecturally get on the platform is around the following (it's also different by 1 depending on whether in unsafe or default mode) ``` PS E:\docker\build\36764\short> docker build --no-cache . Sending build context to Docker daemon 2.048kB Step 1/4 : FROM alpine as first ---> 11cd0b38bc3c Step 2/4 : RUN echo test > /test ---> Running in 8ddfe20e5bfb Removing intermediate container 8ddfe20e5bfb ---> b0103a00b1c9 Step 3/4 : FROM alpine ---> 11cd0b38bc3c Step 4/4 : COPY --from=first /test /test ---> 54bfae391eba Successfully built 54bfae391eba PS E:\docker\build\36764\short> cd .. PS E:\docker\build\36764> docker build --no-cache . 
Sending build context to Docker daemon 4.689MB Step 1/61 : FROM alpine as first ---> 11cd0b38bc3c Step 2/61 : RUN echo test > /test ---> Running in 02597ff870db Removing intermediate container 02597ff870db ---> 3096de6fc454 Step 3/61 : RUN echo test > /test ---> Running in 9a8110f4ff19 Removing intermediate container 9a8110f4ff19 ---> 7691808cf28e Step 4/61 : RUN echo test > /test ---> Running in 9afb8f51510b Removing intermediate container 9afb8f51510b ---> e42a0df2bb1c Step 5/61 : RUN echo test > /test ---> Running in fe977ed6804e Removing intermediate container fe977ed6804e ---> 55850c9b0479 Step 6/61 : RUN echo test > /test ---> Running in be65cbfad172 Removing intermediate container be65cbfad172 ---> 0cf8acba70f0 Step 7/61 : RUN echo test > /test ---> Running in fd5b0907b6a9 Removing intermediate container fd5b0907b6a9 ---> 257a4493d85d Step 8/61 : RUN echo test > /test ---> Running in f7ca0ffd9076 Removing intermediate container f7ca0ffd9076 ---> 3baa6f4fa2d5 Step 9/61 : RUN echo test > /test ---> Running in 5146814d4727 Removing intermediate container 5146814d4727 ---> 485b9d5cf228 Step 10/61 : RUN echo test > /test ---> Running in a090eec1b743 Removing intermediate container a090eec1b743 ---> a7eb10155b51 Step 11/61 : RUN echo test > /test ---> Running in 942660b288df Removing intermediate container 942660b288df ---> 9d286a1e2133 Step 12/61 : RUN echo test > /test ---> Running in c3d369aa91df Removing intermediate container c3d369aa91df ---> f78be4788992 Step 13/61 : RUN echo test > /test ---> Running in a03c3ac6888f Removing intermediate container a03c3ac6888f ---> 6504363f61ab Step 14/61 : RUN echo test > /test ---> Running in 0c3c2fca3f90 Removing intermediate container 0c3c2fca3f90 ---> fe3448b8bb29 Step 15/61 : RUN echo test > /test ---> Running in 828d51c76d3b Removing intermediate container 828d51c76d3b ---> 870684e3aea0 Step 16/61 : RUN echo test > /test ---> Running in 59a2f7c5f3ad Removing intermediate container 59a2f7c5f3ad ---> cf84556ca5c0 Step 
17/61 : RUN echo test > /test ---> Running in bfb4e088eeb3 Removing intermediate container bfb4e088eeb3 ---> 9c8f9f652cef Step 18/61 : RUN echo test > /test ---> Running in f1b88bb5a2d7 Removing intermediate container f1b88bb5a2d7 ---> a6233ad21648 Step 19/61 : RUN echo test > /test ---> Running in 45f70577d709 Removing intermediate container 45f70577d709 ---> 1b5cc52d370d Step 20/61 : RUN echo test > /test ---> Running in 2ce231d5043d Removing intermediate container 2ce231d5043d ---> 4a0e17cbebaa Step 21/61 : RUN echo test > /test ---> Running in 52e4b0928f1f Removing intermediate container 52e4b0928f1f ---> 99b50e989bcb Step 22/61 : RUN echo test > /test ---> Running in f7ba3da7460d Removing intermediate container f7ba3da7460d ---> bfa3cad88285 Step 23/61 : RUN echo test > /test ---> Running in 60180bf60f88 Removing intermediate container 60180bf60f88 ---> fe7271988bcb Step 24/61 : RUN echo test > /test ---> Running in 20324d396531 Removing intermediate container 20324d396531 ---> e930bc039128 Step 25/61 : RUN echo test > /test ---> Running in b3ac70fd4404 Removing intermediate container b3ac70fd4404 ---> 39d0a11ea6d8 Step 26/61 : RUN echo test > /test ---> Running in 0193267d3787 Removing intermediate container 0193267d3787 ---> 8062d7aab0a5 Step 27/61 : RUN echo test > /test ---> Running in f41f45fb7985 Removing intermediate container f41f45fb7985 ---> 1f5f18f2315b Step 28/61 : RUN echo test > /test ---> Running in 90dd09c63d6e Removing intermediate container 90dd09c63d6e ---> 02f0a1141f11 Step 29/61 : RUN echo test > /test ---> Running in c557e5386e0a Removing intermediate container c557e5386e0a ---> dbcd6fb1f6f4 Step 30/61 : RUN echo test > /test ---> Running in 65369385d855 Removing intermediate container 65369385d855 ---> e6e9058a0650 Step 31/61 : RUN echo test > /test ---> Running in d861fcc388fd Removing intermediate container d861fcc388fd ---> 6e4c2c0f741f Step 32/61 : RUN echo test > /test ---> Running in 1483962b7e1c Removing intermediate container 
1483962b7e1c ---> cf8f142aa055 Step 33/61 : RUN echo test > /test ---> Running in 5868934816c1 Removing intermediate container 5868934816c1 ---> d5ff87cdc204 Step 34/61 : RUN echo test > /test ---> Running in e057f3201f3a Removing intermediate container e057f3201f3a ---> b4031b7ab4ac Step 35/61 : RUN echo test > /test ---> Running in 22b769b9079c Removing intermediate container 22b769b9079c ---> 019d898510b6 Step 36/61 : RUN echo test > /test ---> Running in f1d364ef4ff8 Removing intermediate container f1d364ef4ff8 ---> 9525cafdf04d Step 37/61 : RUN echo test > /test ---> Running in 5bf505b8bdcc Removing intermediate container 5bf505b8bdcc ---> cd5002b33bfd Step 38/61 : RUN echo test > /test ---> Running in be24a921945c Removing intermediate container be24a921945c ---> 8675db44d1b7 Step 39/61 : RUN echo test > /test ---> Running in 352dc6beef3d Removing intermediate container 352dc6beef3d ---> 0ab0ece43c71 Step 40/61 : RUN echo test > /test ---> Running in eebde33e5d9b Removing intermediate container eebde33e5d9b ---> 46ca4b0dfc03 Step 41/61 : RUN echo test > /test ---> Running in f920313a1e85 Removing intermediate container f920313a1e85 ---> 7f3888414d58 Step 42/61 : RUN echo test > /test ---> Running in 10e2f4dc1ac7 Removing intermediate container 10e2f4dc1ac7 ---> 14db9e15f2dc Step 43/61 : RUN echo test > /test ---> Running in c849d6e89aa5 Removing intermediate container c849d6e89aa5 ---> fdb770494dd6 Step 44/61 : RUN echo test > /test ---> Running in 419d1a8353db Removing intermediate container 419d1a8353db ---> d12e9cf078be Step 45/61 : RUN echo test > /test ---> Running in 0f1805263e4c Removing intermediate container 0f1805263e4c ---> cd005e7b08a4 Step 46/61 : RUN echo test > /test ---> Running in 5bde05b46441 Removing intermediate container 5bde05b46441 ---> 05aa426a3d4a Step 47/61 : RUN echo test > /test ---> Running in 01ebc84bd1bc Removing intermediate container 01ebc84bd1bc ---> 35d371fa4342 Step 48/61 : RUN echo test > /test ---> Running in 49f6c2f51dd4 
Removing intermediate container 49f6c2f51dd4 ---> 1090b5dfa130 Step 49/61 : RUN echo test > /test ---> Running in f8a9089cd725 Removing intermediate container f8a9089cd725 ---> b2d0eec0716d Step 50/61 : RUN echo test > /test ---> Running in a1697a0b2db0 Removing intermediate container a1697a0b2db0 ---> 10d96ac8f497 Step 51/61 : RUN echo test > /test ---> Running in 33a2332c06eb Removing intermediate container 33a2332c06eb ---> ba5bf5609c1c Step 52/61 : RUN echo test > /test ---> Running in e8920392be0d Removing intermediate container e8920392be0d ---> 5b3a95685c7e Step 53/61 : RUN echo test > /test ---> Running in 4b9298587c65 Removing intermediate container 4b9298587c65 ---> d4961a349141 Step 54/61 : RUN echo test > /test ---> Running in 8a0c960c2ba1 Removing intermediate container 8a0c960c2ba1 ---> b413197fcfa2 Step 55/61 : RUN echo test > /test ---> Running in 536ee3b9596b Removing intermediate container 536ee3b9596b ---> fc16b69b224a Step 56/61 : RUN echo test > /test ---> Running in 8b817b8d7b59 Removing intermediate container 8b817b8d7b59 ---> 2f0896400ff9 Step 57/61 : RUN echo test > /test ---> Running in ab0ed79ec3d4 Removing intermediate container ab0ed79ec3d4 ---> b4fb420e736c Step 58/61 : RUN echo test > /test ---> Running in 8548d7eead1f Removing intermediate container 8548d7eead1f ---> 745103fd5a38 Step 59/61 : RUN echo test > /test ---> Running in 1980559ad5d6 Removing intermediate container 1980559ad5d6 ---> 08c1c74a5618 Step 60/61 : FROM alpine ---> 11cd0b38bc3c Step 61/61 : COPY --from=first /test /test ---> 67f053c66c27 Successfully built 67f053c66c27 PS E:\docker\build\36764> ``` Note also that subsequent error messages once you go beyond current platform limitations kind of suck (such as insufficient resources with a bunch of spew which is incomprehensible to most) and we could do better to detect this earlier in the daemon. That'll be for a (reasonably low-priority) follow-up though as and when I have time. 
Theoretically we *may*, if the platform doesn't require additional changes for RS5, be able to have bigger platform limits using the v2 schema with up to 127 VPMem devices, and the possibility to have multiple SCSI controllers per SVM/UVM. However, currently LCOW is using HCS v1 schema calls, and there's no plans to rewrite the graphdriver/libcontainerd components outside of the moving LCOW fully over to the containerd runtime/snapshotter using HCS v2 schema, which is still some time off fruition. PS OK, while waiting for a full run to complete, I did get bored. Turns out it won't overflow line length as max(uint64) is 18446744073709551616 which would still be short enough at 127 layers, double the current platform limit. And I could always change it to hex or base36 to make it even shorter, or remove the 'd' from /tmp/dN. IOW, pretty sure no-one is going to hit the limit even if we could get the platform to 256 which is the current Hyper-V SCSI limit per VM (4x64), although PMEM at 127 would be the next immediate limit.
2018-08-15 20:56:28 +00:00
// Must map to the actual "short" container path where the SCSI disk was mounted
actualContainerPath := fgc.svm.getShortContainerPath(fgc.mvd)
if actualContainerPath == "" {
return nil, fmt.Errorf("inconsistency detected: couldn't get short container path for %+v in utility VM %s", fgc.mvd, fgc.svm.config.Name)
}
file := path.Join(actualContainerPath, filename)
LCOW:Enable image push when files have spaces Signed-off-by: John Howard <jhoward@microsoft.com> Reported internally at Microsoft through VSO#19696554. Using the solution from https://groups.google.com/forum/#!topic/Golang-Nuts/DpldsmrhPio to quote file name and escape single quotes (https://play.golang.org/p/ntk8EEGjfk) Simple repro steps are something like: On an ubuntu box run something like ``` docker run -d --rm -p 5000:5000 registry:latest hostname-I to get the ip address ``` On Windows start the daemon adding `--insecure-registry 10.124.186.18:5000` (or whatever the IP address from above was) ``` docker run -it alpine sh / # echo bar > "with space"​ / # echo foo > 'single quote space' / # exit docker ps -a docker commit <containerid> (note the first few of the image id) docker tag <first few> 10.124.186.18:5000/test docker push 10.124.186.18:5000/test ``` Resulting error when pushing the image: ``` PS E:\docker\build\19696554> docker push 10.124.186.18:5000/simpletest2 The push refers to repository [10.124.186.18:5000/simpletest2]​ d328d7f5f277: Pushing [==================================================>] 74.24kB/74.24kB​ 503e53e365f3: Layer already exists​ svm.runProcess: command cat /tmp/d59/single quote space failed with exit code 1​ PS E:\docker\build\19696554> ``` After this change pushing the image: ``` PS E:\docker\build\19696554> docker push 10.124.186.18:5000/simpletest2 The push refers to repository [10.124.186.18:5000/simpletest2] d328d7f5f277: Pushing [==================================================>] 74.24kB/74.24kB 503e53e365f3: Layer already exists latest: digest: sha256:b9828a2d2a3d2421a4c342f48b7936714b3d8409dc32c103da5f3fb13b54bdbf size: 735 PS E:\docker\build\19696554> ```
2019-02-04 20:07:49 +00:00
// Ugly fix for MSFT internal bug VSO#19696554
// If a file name contains a space, pushing an image fails.
// Using solution from https://groups.google.com/forum/#!topic/Golang-Nuts/DpldsmrhPio to escape for shell execution
file = "'" + strings.Join(strings.Split(file, "'"), `'"'"'`) + "'"
if err := fgc.svm.runProcess(fmt.Sprintf("cat %s", file), nil, outOut, errOut); err != nil {
logrus.Debugf("cat %s failed: %s", file, errOut.String())
return nil, err
}
return nopCloser{bytes.NewReader(outOut.Bytes())}, nil
}
// DiffGetter returns a FileGetCloser that can read files from the directory that
// contains files for the layer differences. Used for direct access for tar-split.
//
// It hot-adds (or mounts on a fresh SVM) the layer's read-only VHD and
// hands back a fileGetCloserFromSVM; the caller must Close it to release
// the disk and, unless in global mode, the service VM.
func (d *Driver) DiffGetter(id string) (graphdriver.FileGetCloser, error) {
	title := fmt.Sprintf("lcowdriver: diffgetter: %s", id)
	// Debug, not Debugf: title is already formatted and must not be
	// interpreted as a format string (go vet printf).
	logrus.Debug(title)

	ld, err := getLayerDetails(d.dir(id))
	if err != nil {
		logrus.Debugf("%s: failed to get vhdx information of %s: %s", title, d.dir(id), err)
		return nil, err
	}

	// Start the SVM with a mapped virtual disk. Note that if the SVM is
	// already running and we are in global mode, this will be hot-added.
	mvd := hcsshim.MappedVirtualDisk{
		HostPath:          ld.filename,
		ContainerPath:     hostToGuest(ld.filename),
		CreateInUtilityVM: true,
		ReadOnly:          true,
	}

	logrus.Debugf("%s: starting service VM", title)
	svm, err := d.startServiceVMIfNotRunning(id, []hcsshim.MappedVirtualDisk{mvd}, fmt.Sprintf("diffgetter %s", id))
	if err != nil {
		return nil, err
	}

	logrus.Debugf("%s: waiting for svm to finish booting", title)
	if err := svm.getStartError(); err != nil {
		// Best-effort cleanup; the boot failure is the error we report.
		// Use the same "diffgetter" context string as the start call above
		// (and as fileGetCloserFromSVM.Close) so the references pair up.
		if terr := d.terminateServiceVM(id, fmt.Sprintf("diffgetter %s", id), false); terr != nil {
			logrus.Debugf("%s: failed to terminate service VM: %s", title, terr)
		}
		return nil, fmt.Errorf("%s: svm failed to boot: %s", title, err)
	}

	return &fileGetCloserFromSVM{
		id:  id,
		svm: svm,
		mvd: &mvd,
		d:   d}, nil
}