From bde99960657818efabf313991867c42921882275 Mon Sep 17 00:00:00 2001 From: John Howard Date: Tue, 2 Oct 2018 16:22:08 -0700 Subject: [PATCH 1/2] LCOW: ApplyDiff() use tar2ext4, not SVM Signed-off-by: John Howard This removes the need for an SVM in the LCOW driver to ApplyDiff. This change relates to a fix for https://github.com/moby/moby/issues/36353 However, it found another issue, tracked by https://github.com/moby/moby/issues/37955 --- daemon/graphdriver/lcow/lcow.go | 95 +++++++++++++++++++++++++-------- 1 file changed, 74 insertions(+), 21 deletions(-) diff --git a/daemon/graphdriver/lcow/lcow.go b/daemon/graphdriver/lcow/lcow.go index a26d13ede1..321d3a36f5 100644 --- a/daemon/graphdriver/lcow/lcow.go +++ b/daemon/graphdriver/lcow/lcow.go @@ -71,20 +71,33 @@ import ( "time" "github.com/Microsoft/hcsshim" + "github.com/Microsoft/hcsshim/ext4/tar2ext4" "github.com/Microsoft/opengcs/client" "github.com/docker/docker/daemon/graphdriver" "github.com/docker/docker/pkg/archive" "github.com/docker/docker/pkg/containerfs" "github.com/docker/docker/pkg/idtools" "github.com/docker/docker/pkg/ioutils" + "github.com/docker/docker/pkg/reexec" "github.com/docker/docker/pkg/system" "github.com/sirupsen/logrus" ) +// noreexec controls reexec functionality. Off by default, on for debugging purposes. +var noreexec = false + // init registers this driver to the register. It gets initialised by the // function passed in the second parameter, implemented in this file. func init() { graphdriver.Register("lcow", InitDriver) + // DOCKER_LCOW_NOREEXEC allows for inline processing which makes + // debugging issues in the re-exec codepath significantly easier. + if os.Getenv("DOCKER_LCOW_NOREEXEC") != "" { + logrus.Warnf("LCOW Graphdriver is set to not re-exec. This is intended for debugging purposes only.") + noreexec = true + } else { + reexec.Register("docker-lcow-tar2ext4", tar2ext4Reexec) + } } const ( @@ -846,32 +859,72 @@ func (d *Driver) Diff(id, parent string) (io.ReadCloser, error) { func (d *Driver) ApplyDiff(id, parent string, diff io.Reader) (int64, error) { logrus.Debugf("lcowdriver: applydiff: id %s", id) - svm, err := d.startServiceVMIfNotRunning(id, nil, fmt.Sprintf("applydiff %s", id)) + // Log failures here as it's undiagnosable sometimes, due to a possible panic. + // See https://github.com/moby/moby/issues/37955 for more information. + + dest := filepath.Join(d.dataRoot, id, layerFilename) + if !noreexec { + cmd := reexec.Command([]string{"docker-lcow-tar2ext4", dest}...) + stdout := bytes.NewBuffer(nil) + stderr := bytes.NewBuffer(nil) + cmd.Stdin = diff + cmd.Stdout = stdout + cmd.Stderr = stderr + + if err := cmd.Start(); err != nil { + logrus.Warnf("lcowdriver: applydiff: id %s failed to start re-exec: %s", id, err) + return 0, err + } + + if err := cmd.Wait(); err != nil { + logrus.Warnf("lcowdriver: applydiff: id %s failed %s", id, err) + return 0, fmt.Errorf("re-exec error: %v: stderr: %s", err, stderr) + } + return strconv.ParseInt(stdout.String(), 10, 64) + } + // The inline case + size, err := tar2ext4Actual(dest, diff) + if err != nil { + logrus.Warnf("lcowdriver: applydiff: id %s failed %s", id, err) + } + return size, err +} + +// tar2ext4Reexec is the re-exec entry point for writing a layer from a tar file +func tar2ext4Reexec() { + size, err := tar2ext4Actual(os.Args[1], os.Stdin) + if err != nil { + fmt.Fprint(os.Stderr, err) + os.Exit(1) + } + fmt.Fprint(os.Stdout, size) +} + +// tar2ext4Actual is the implementation of tar2ext to write a layer from a tar file. +// It can be called through re-exec (default), or inline for debugging. +func tar2ext4Actual(dest string, diff io.Reader) (int64, error) { + // maxDiskSize is not relating to the sandbox size - this is the + // maximum possible size a layer VHD generated can be from an EXT4 + // layout perspective. + const maxDiskSize = 128 * 1024 * 1024 * 1024 // 128GB + out, err := os.Create(dest) if err != nil { return 0, err } - defer d.terminateServiceVM(id, fmt.Sprintf("applydiff %s", id), false) - - logrus.Debugf("lcowdriver: applydiff: waiting for svm to finish booting") - err = svm.getStartError() + defer out.Close() + if err := tar2ext4.Convert( + diff, + out, + tar2ext4.AppendVhdFooter, + tar2ext4.ConvertWhiteout, + tar2ext4.MaximumDiskSize(maxDiskSize)); err != nil { + return 0, err + } + fi, err := os.Stat(dest) if err != nil { - return 0, fmt.Errorf("lcowdriver: applydiff: svm failed to boot: %s", err) - } - - // TODO @jhowardmsft - the retries are temporary to overcome platform reliability issues. - // Obviously this will be removed as platform bugs are fixed. - retries := 0 - for { - retries++ - size, err := svm.config.TarToVhd(filepath.Join(d.dataRoot, id, layerFilename), diff) - if err != nil { - if retries <= 10 { - continue - } - return 0, err - } - return size, err + return 0, err } + return fi.Size(), nil } // Changes produces a list of changes between the specified layer From d03ab106624ebc30b69080d0092702ad1fe1285c Mon Sep 17 00:00:00 2001 From: John Howard Date: Tue, 2 Oct 2018 15:25:41 -0700 Subject: [PATCH 2/2] Vendor Microsoft/hcsshim @ v0.7.9 Signed-off-by: John Howard --- vendor.conf | 2 +- .../ext4/internal/compactext4/compact.go | 1263 +++++++++++++++++ .../hcsshim/ext4/internal/format/format.go | 411 ++++++ .../hcsshim/ext4/tar2ext4/tar2ext4.go | 174 +++ .../hcsshim/ext4/tar2ext4/vhdfooter.go | 76 + .../github.com/Microsoft/hcsshim/version.go | 6 - 6 files changed, 1925 insertions(+), 7 deletions(-) create mode 100644 vendor/github.com/Microsoft/hcsshim/ext4/internal/compactext4/compact.go create mode 100644 vendor/github.com/Microsoft/hcsshim/ext4/internal/format/format.go create mode 100644 vendor/github.com/Microsoft/hcsshim/ext4/tar2ext4/tar2ext4.go create mode 100644 vendor/github.com/Microsoft/hcsshim/ext4/tar2ext4/vhdfooter.go delete mode 100644 vendor/github.com/Microsoft/hcsshim/version.go diff --git a/vendor.conf b/vendor.conf index 64541e9ca1..f0b13afca3 100644 --- a/vendor.conf +++ b/vendor.conf @@ -1,6 +1,6 @@ # the following lines are in sorted order, FYI github.com/Azure/go-ansiterm d6e3b3328b783f23731bc4d058875b0371ff8109 -github.com/Microsoft/hcsshim v0.7.6 +github.com/Microsoft/hcsshim v0.7.9 github.com/Microsoft/go-winio v0.4.11 github.com/docker/libtrust 9cbd2a1374f46905c68a4eb3694a130610adc62a github.com/go-check/check 4ed411733c5785b40214c70bce814c3a3a689609 https://github.com/cpuguy83/check.git diff --git a/vendor/github.com/Microsoft/hcsshim/ext4/internal/compactext4/compact.go b/vendor/github.com/Microsoft/hcsshim/ext4/internal/compactext4/compact.go new file mode 100644 index 0000000000..856173751f --- /dev/null +++ b/vendor/github.com/Microsoft/hcsshim/ext4/internal/compactext4/compact.go @@ -0,0 +1,1263 @@ +package compactext4 + +import ( + "bufio" + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + "path" + "sort" + "strings" + "time" + + "github.com/Microsoft/hcsshim/ext4/internal/format" +) + +// Writer writes a compact ext4 file system. +type Writer struct { + f io.ReadWriteSeeker + bw *bufio.Writer + inodes []*inode + curName string + curInode *inode + pos int64 + dataWritten, dataMax int64 + err error + initialized bool + supportInlineData bool + maxDiskSize int64 + gdBlocks uint32 +} + +// Mode flags for Linux files. +const ( + S_IXOTH = format.S_IXOTH + S_IWOTH = format.S_IWOTH + S_IROTH = format.S_IROTH + S_IXGRP = format.S_IXGRP + S_IWGRP = format.S_IWGRP + S_IRGRP = format.S_IRGRP + S_IXUSR = format.S_IXUSR + S_IWUSR = format.S_IWUSR + S_IRUSR = format.S_IRUSR + S_ISVTX = format.S_ISVTX + S_ISGID = format.S_ISGID + S_ISUID = format.S_ISUID + S_IFIFO = format.S_IFIFO + S_IFCHR = format.S_IFCHR + S_IFDIR = format.S_IFDIR + S_IFBLK = format.S_IFBLK + S_IFREG = format.S_IFREG + S_IFLNK = format.S_IFLNK + S_IFSOCK = format.S_IFSOCK + + TypeMask = format.TypeMask +) + +type inode struct { + Size int64 + Atime, Ctime, Mtime, Crtime uint64 + Number format.InodeNumber + Mode uint16 + Uid, Gid uint32 + LinkCount uint32 + XattrBlock uint32 + BlockCount uint32 + Devmajor, Devminor uint32 + Flags format.InodeFlag + Data []byte + XattrInline []byte + Children directory +} + +func (node *inode) FileType() uint16 { + return node.Mode & format.TypeMask +} + +func (node *inode) IsDir() bool { + return node.FileType() == S_IFDIR +} + +// A File represents a file to be added to an ext4 file system. +type File struct { + Linkname string + Size int64 + Mode uint16 + Uid, Gid uint32 + Atime, Ctime, Mtime, Crtime time.Time + Devmajor, Devminor uint32 + Xattrs map[string][]byte +} + +const ( + inodeFirst = 11 + inodeLostAndFound = inodeFirst + + blockSize = 4096 + blocksPerGroup = blockSize * 8 + inodeSize = 256 + maxInodesPerGroup = blockSize * 8 // Limited by the inode bitmap + inodesPerGroupIncrement = blockSize / inodeSize + + defaultMaxDiskSize = 16 * 1024 * 1024 * 1024 // 16GB + maxMaxDiskSize = 16 * 1024 * 1024 * 1024 * 1024 // 16TB + + groupDescriptorSize = 32 // Use the small group descriptor + groupsPerDescriptorBlock = blockSize / groupDescriptorSize + + maxFileSize = 128 * 1024 * 1024 * 1024 // 128GB file size maximum for now + smallSymlinkSize = 59 // max symlink size that goes directly in the inode + maxBlocksPerExtent = 0x8000 // maximum number of blocks in an extent + inodeDataSize = 60 + inodeUsedSize = 152 // fields through CrtimeExtra + inodeExtraSize = inodeSize - inodeUsedSize + xattrInodeOverhead = 4 + 4 // magic number + empty next entry value + xattrBlockOverhead = 32 + 4 // header + empty next entry value + inlineDataXattrOverhead = xattrInodeOverhead + 16 + 4 // entry + "data" + inlineDataSize = inodeDataSize + inodeExtraSize - inlineDataXattrOverhead +) + +type exceededMaxSizeError struct { + Size int64 +} + +func (err exceededMaxSizeError) Error() string { + return fmt.Sprintf("disk exceeded maximum size of %d bytes", err.Size) +} + +var directoryEntrySize = binary.Size(format.DirectoryEntry{}) +var extraIsize = uint16(inodeUsedSize - 128) + +type directory map[string]*inode + +func splitFirst(p string) (string, string) { + n := strings.IndexByte(p, '/') + if n >= 0 { + return p[:n], p[n+1:] + } + return p, "" +} + +func (w *Writer) findPath(root *inode, p string) *inode { + inode := root + for inode != nil && len(p) != 0 { + name, rest := splitFirst(p) + p = rest + inode = inode.Children[name] + } + return inode +} + +func timeToFsTime(t time.Time) uint64 { + if t.IsZero() { + return 0 + } + s := t.Unix() + if s < -0x80000000 { + return 0x80000000 + } + if s > 0x37fffffff { + return 0x37fffffff + } + return uint64(s) | uint64(t.Nanosecond())<<34 +} + +func fsTimeToTime(t uint64) time.Time { + if t == 0 { + return time.Time{} + } + s := int64(t & 0x3ffffffff) + if s > 0x7fffffff && s < 0x100000000 { + s = int64(int32(uint32(s))) + } + return time.Unix(s, int64(t>>34)) +} + +func (w *Writer) getInode(i format.InodeNumber) *inode { + if i == 0 || int(i) > len(w.inodes) { + return nil + } + return w.inodes[i-1] +} + +var xattrPrefixes = []struct { + Index uint8 + Prefix string +}{ + {2, "system.posix_acl_access"}, + {3, "system.posix_acl_default"}, + {8, "system.richacl"}, + {7, "system."}, + {1, "user."}, + {4, "trusted."}, + {6, "security."}, +} + +func compressXattrName(name string) (uint8, string) { + for _, p := range xattrPrefixes { + if strings.HasPrefix(name, p.Prefix) { + return p.Index, name[len(p.Prefix):] + } + } + return 0, name +} + +func decompressXattrName(index uint8, name string) string { + for _, p := range xattrPrefixes { + if index == p.Index { + return p.Prefix + name + } + } + return name +} + +func hashXattrEntry(name string, value []byte) uint32 { + var hash uint32 + for i := 0; i < len(name); i++ { + hash = (hash << 5) ^ (hash >> 27) ^ uint32(name[i]) + } + + for i := 0; i+3 < len(value); i += 4 { + hash = (hash << 16) ^ (hash >> 16) ^ binary.LittleEndian.Uint32(value[i:i+4]) + } + + if len(value)%4 != 0 { + var last [4]byte + copy(last[:], value[len(value)&^3:]) + hash = (hash << 16) ^ (hash >> 16) ^ binary.LittleEndian.Uint32(last[:]) + } + return hash +} + +type xattr struct { + Name string + Index uint8 + Value []byte +} + +func (x *xattr) EntryLen() int { + return (len(x.Name)+3)&^3 + 16 +} + +func (x *xattr) ValueLen() int { + return (len(x.Value) + 3) &^ 3 +} + +type xattrState struct { + inode, block []xattr + inodeLeft, blockLeft int +} + +func (s *xattrState) init() { + s.inodeLeft = inodeExtraSize - xattrInodeOverhead + s.blockLeft = blockSize - xattrBlockOverhead +} + +func (s *xattrState) addXattr(name string, value []byte) bool { + index, name := compressXattrName(name) + x := xattr{ + Index: index, + Name: name, + Value: value, + } + length := x.EntryLen() + x.ValueLen() + if s.inodeLeft >= length { + s.inode = append(s.inode, x) + s.inodeLeft -= length + } else if s.blockLeft >= length { + s.block = append(s.block, x) + s.blockLeft -= length + } else { + return false + } + return true +} + +func putXattrs(xattrs []xattr, b []byte, offsetDelta uint16) { + offset := uint16(len(b)) + offsetDelta + eb := b + db := b + for _, xattr := range xattrs { + vl := xattr.ValueLen() + offset -= uint16(vl) + eb[0] = uint8(len(xattr.Name)) + eb[1] = xattr.Index + binary.LittleEndian.PutUint16(eb[2:], offset) + binary.LittleEndian.PutUint32(eb[8:], uint32(len(xattr.Value))) + binary.LittleEndian.PutUint32(eb[12:], hashXattrEntry(xattr.Name, xattr.Value)) + copy(eb[16:], xattr.Name) + eb = eb[xattr.EntryLen():] + copy(db[len(db)-vl:], xattr.Value) + db = db[:len(db)-vl] + } +} + +func getXattrs(b []byte, xattrs map[string][]byte, offsetDelta uint16) { + eb := b + for len(eb) != 0 { + nameLen := eb[0] + if nameLen == 0 { + break + } + index := eb[1] + offset := binary.LittleEndian.Uint16(eb[2:]) - offsetDelta + valueLen := binary.LittleEndian.Uint32(eb[8:]) + attr := xattr{ + Index: index, + Name: string(eb[16 : 16+nameLen]), + Value: b[offset : uint32(offset)+valueLen], + } + xattrs[decompressXattrName(index, attr.Name)] = attr.Value + eb = eb[attr.EntryLen():] + } +} + +func (w *Writer) writeXattrs(inode *inode, state *xattrState) error { + // Write the inline attributes. + if len(state.inode) != 0 { + inode.XattrInline = make([]byte, inodeExtraSize) + binary.LittleEndian.PutUint32(inode.XattrInline[0:], format.XAttrHeaderMagic) // Magic + putXattrs(state.inode, inode.XattrInline[4:], 0) + } + + // Write the block attributes. If there was previously an xattr block, then + // rewrite it even if it is now empty. + if len(state.block) != 0 || inode.XattrBlock != 0 { + sort.Slice(state.block, func(i, j int) bool { + return state.block[i].Index < state.block[j].Index || + len(state.block[i].Name) < len(state.block[j].Name) || + state.block[i].Name < state.block[j].Name + }) + + var b [blockSize]byte + binary.LittleEndian.PutUint32(b[0:], format.XAttrHeaderMagic) // Magic + binary.LittleEndian.PutUint32(b[4:], 1) // ReferenceCount + binary.LittleEndian.PutUint32(b[8:], 1) // Blocks + putXattrs(state.block, b[32:], 32) + + orig := w.block() + if inode.XattrBlock == 0 { + inode.XattrBlock = orig + inode.BlockCount++ + } else { + // Reuse the original block. + w.seekBlock(inode.XattrBlock) + defer w.seekBlock(orig) + } + + if _, err := w.write(b[:]); err != nil { + return err + } + } + + return nil +} + +func (w *Writer) write(b []byte) (int, error) { + if w.err != nil { + return 0, w.err + } + if w.pos+int64(len(b)) > w.maxDiskSize { + w.err = exceededMaxSizeError{w.maxDiskSize} + return 0, w.err + } + n, err := w.bw.Write(b) + w.pos += int64(n) + w.err = err + return n, err +} + +func (w *Writer) zero(n int64) (int64, error) { + if w.err != nil { + return 0, w.err + } + if w.pos+int64(n) > w.maxDiskSize { + w.err = exceededMaxSizeError{w.maxDiskSize} + return 0, w.err + } + n, err := io.CopyN(w.bw, zero, n) + w.pos += n + w.err = err + return n, err +} + +func (w *Writer) makeInode(f *File, node *inode) (*inode, error) { + mode := f.Mode + if mode&format.TypeMask == 0 { + mode |= format.S_IFREG + } + typ := mode & format.TypeMask + ino := format.InodeNumber(len(w.inodes) + 1) + if node == nil { + node = &inode{ + Number: ino, + } + if typ == S_IFDIR { + node.Children = make(directory) + node.LinkCount = 1 // A directory is linked to itself. + } + } else if node.Flags&format.InodeFlagExtents != 0 { + // Since we cannot deallocate or reuse blocks, don't allow updates that + // would invalidate data that has already been written. + return nil, errors.New("cannot overwrite file with non-inline data") + } + node.Mode = mode + node.Uid = f.Uid + node.Gid = f.Gid + node.Flags = format.InodeFlagHugeFile + node.Atime = timeToFsTime(f.Atime) + node.Ctime = timeToFsTime(f.Ctime) + node.Mtime = timeToFsTime(f.Mtime) + node.Crtime = timeToFsTime(f.Crtime) + node.Devmajor = f.Devmajor + node.Devminor = f.Devminor + node.Data = nil + node.XattrInline = nil + + var xstate xattrState + xstate.init() + + var size int64 + switch typ { + case format.S_IFREG: + size = f.Size + if f.Size > maxFileSize { + return nil, fmt.Errorf("file too big: %d > %d", f.Size, maxFileSize) + } + if f.Size <= inlineDataSize && w.supportInlineData { + node.Data = make([]byte, f.Size) + extra := 0 + if f.Size > inodeDataSize { + extra = int(f.Size - inodeDataSize) + } + // Add a dummy entry for now. + if !xstate.addXattr("system.data", node.Data[:extra]) { + panic("not enough room for inline data") + } + node.Flags |= format.InodeFlagInlineData + } + case format.S_IFLNK: + node.Mode |= 0777 // Symlinks should appear as ugw rwx + size = int64(len(f.Linkname)) + if size <= smallSymlinkSize { + // Special case: small symlinks go directly in Block without setting + // an inline data flag. + node.Data = make([]byte, len(f.Linkname)) + copy(node.Data, f.Linkname) + } + case format.S_IFDIR, format.S_IFIFO, format.S_IFSOCK, format.S_IFCHR, format.S_IFBLK: + default: + return nil, fmt.Errorf("invalid mode %o", mode) + } + + // Accumulate the extended attributes. + if len(f.Xattrs) != 0 { + // Sort the xattrs to avoid non-determinism in map iteration. + var xattrs []string + for name := range f.Xattrs { + xattrs = append(xattrs, name) + } + sort.Strings(xattrs) + for _, name := range xattrs { + if !xstate.addXattr(name, f.Xattrs[name]) { + return nil, fmt.Errorf("could not fit xattr %s", name) + } + } + } + + if err := w.writeXattrs(node, &xstate); err != nil { + return nil, err + } + + node.Size = size + if typ == format.S_IFLNK && size > smallSymlinkSize { + // Write the link name as data. + w.startInode("", node, size) + if _, err := w.Write([]byte(f.Linkname)); err != nil { + return nil, err + } + if err := w.finishInode(); err != nil { + return nil, err + } + } + + if int(node.Number-1) >= len(w.inodes) { + w.inodes = append(w.inodes, node) + } + return node, nil +} + +func (w *Writer) root() *inode { + return w.getInode(format.InodeRoot) +} + +func (w *Writer) lookup(name string, mustExist bool) (*inode, *inode, string, error) { + root := w.root() + cleanname := path.Clean("/" + name)[1:] + if len(cleanname) == 0 { + return root, root, "", nil + } + dirname, childname := path.Split(cleanname) + if len(childname) == 0 || len(childname) > 0xff { + return nil, nil, "", fmt.Errorf("%s: invalid name", name) + } + dir := w.findPath(root, dirname) + if dir == nil || !dir.IsDir() { + return nil, nil, "", fmt.Errorf("%s: path not found", name) + } + child := dir.Children[childname] + if child == nil && mustExist { + return nil, nil, "", fmt.Errorf("%s: file not found", name) + } + return dir, child, childname, nil +} + +// Create adds a file to the file system. +func (w *Writer) Create(name string, f *File) error { + if err := w.finishInode(); err != nil { + return err + } + dir, existing, childname, err := w.lookup(name, false) + if err != nil { + return err + } + var reuse *inode + if existing != nil { + if existing.IsDir() { + if f.Mode&TypeMask != S_IFDIR { + return fmt.Errorf("%s: cannot replace a directory with a file", name) + } + reuse = existing + } else if f.Mode&TypeMask == S_IFDIR { + return fmt.Errorf("%s: cannot replace a file with a directory", name) + } else if existing.LinkCount < 2 { + reuse = existing + } + } else { + if f.Mode&TypeMask == S_IFDIR && dir.LinkCount >= format.MaxLinks { + return fmt.Errorf("%s: exceeded parent directory maximum link count", name) + } + } + child, err := w.makeInode(f, reuse) + if err != nil { + return fmt.Errorf("%s: %s", name, err) + } + if existing != child { + if existing != nil { + existing.LinkCount-- + } + dir.Children[childname] = child + child.LinkCount++ + if child.IsDir() { + dir.LinkCount++ + } + } + if child.Mode&format.TypeMask == format.S_IFREG { + w.startInode(name, child, f.Size) + } + return nil +} + +// Link adds a hard link to the file system. +func (w *Writer) Link(oldname, newname string) error { + if err := w.finishInode(); err != nil { + return err + } + newdir, existing, newchildname, err := w.lookup(newname, false) + if err != nil { + return err + } + if existing != nil && (existing.IsDir() || existing.LinkCount < 2) { + return fmt.Errorf("%s: cannot orphan existing file or directory", newname) + } + + _, oldfile, _, err := w.lookup(oldname, true) + if err != nil { + return err + } + switch oldfile.Mode & format.TypeMask { + case format.S_IFDIR, format.S_IFLNK: + return fmt.Errorf("%s: link target cannot be a directory or symlink: %s", newname, oldname) + } + + if existing != oldfile && oldfile.LinkCount >= format.MaxLinks { + return fmt.Errorf("%s: link target would exceed maximum link count: %s", newname, oldname) + } + + if existing != nil { + existing.LinkCount-- + } + oldfile.LinkCount++ + newdir.Children[newchildname] = oldfile + return nil +} + +// Stat returns information about a file that has been written. +func (w *Writer) Stat(name string) (*File, error) { + if err := w.finishInode(); err != nil { + return nil, err + } + _, node, _, err := w.lookup(name, true) + if err != nil { + return nil, err + } + f := &File{ + Size: node.Size, + Mode: node.Mode, + Uid: node.Uid, + Gid: node.Gid, + Atime: fsTimeToTime(node.Atime), + Ctime: fsTimeToTime(node.Ctime), + Mtime: fsTimeToTime(node.Mtime), + Crtime: fsTimeToTime(node.Crtime), + Devmajor: node.Devmajor, + Devminor: node.Devminor, + } + f.Xattrs = make(map[string][]byte) + if node.XattrBlock != 0 || len(node.XattrInline) != 0 { + if node.XattrBlock != 0 { + orig := w.block() + w.seekBlock(node.XattrBlock) + if w.err != nil { + return nil, w.err + } + var b [blockSize]byte + _, err := w.f.Read(b[:]) + w.seekBlock(orig) + if err != nil { + return nil, err + } + getXattrs(b[32:], f.Xattrs, 32) + } + if len(node.XattrInline) != 0 { + getXattrs(node.XattrInline[4:], f.Xattrs, 0) + delete(f.Xattrs, "system.data") + } + } + if node.FileType() == S_IFLNK { + if node.Size > smallSymlinkSize { + return nil, fmt.Errorf("%s: cannot retrieve link information", name) + } + f.Linkname = string(node.Data) + } + return f, nil +} + +func (w *Writer) Write(b []byte) (int, error) { + if len(b) == 0 { + return 0, nil + } + if w.dataWritten+int64(len(b)) > w.dataMax { + return 0, fmt.Errorf("%s: wrote too much: %d > %d", w.curName, w.dataWritten+int64(len(b)), w.dataMax) + } + + if w.curInode.Flags&format.InodeFlagInlineData != 0 { + copy(w.curInode.Data[w.dataWritten:], b) + w.dataWritten += int64(len(b)) + return len(b), nil + } + + n, err := w.write(b) + w.dataWritten += int64(n) + return n, err +} + +func (w *Writer) startInode(name string, inode *inode, size int64) { + if w.curInode != nil { + panic("inode already in progress") + } + w.curName = name + w.curInode = inode + w.dataWritten = 0 + w.dataMax = size +} + +func (w *Writer) block() uint32 { + return uint32(w.pos / blockSize) +} + +func (w *Writer) seekBlock(block uint32) { + w.pos = int64(block) * blockSize + if w.err != nil { + return + } + w.err = w.bw.Flush() + if w.err != nil { + return + } + _, w.err = w.f.Seek(w.pos, io.SeekStart) +} + +func (w *Writer) nextBlock() { + if w.pos%blockSize != 0 { + // Simplify callers; w.err is updated on failure. + w.zero(blockSize - w.pos%blockSize) + } +} + +func fillExtents(hdr *format.ExtentHeader, extents []format.ExtentLeafNode, startBlock, offset, inodeSize uint32) { + *hdr = format.ExtentHeader{ + Magic: format.ExtentHeaderMagic, + Entries: uint16(len(extents)), + Max: uint16(cap(extents)), + Depth: 0, + } + for i := range extents { + block := offset + uint32(i)*maxBlocksPerExtent + length := inodeSize - block + if length > maxBlocksPerExtent { + length = maxBlocksPerExtent + } + start := startBlock + block + extents[i] = format.ExtentLeafNode{ + Block: block, + Length: uint16(length), + StartLow: start, + } + } +} + +func (w *Writer) writeExtents(inode *inode) error { + start := w.pos - w.dataWritten + if start%blockSize != 0 { + panic("unaligned") + } + w.nextBlock() + + startBlock := uint32(start / blockSize) + blocks := w.block() - startBlock + usedBlocks := blocks + + const extentNodeSize = 12 + const extentsPerBlock = blockSize/extentNodeSize - 1 + + extents := (blocks + maxBlocksPerExtent - 1) / maxBlocksPerExtent + var b bytes.Buffer + if extents == 0 { + // Nothing to do. + } else if extents <= 4 { + var root struct { + hdr format.ExtentHeader + extents [4]format.ExtentLeafNode + } + fillExtents(&root.hdr, root.extents[:extents], startBlock, 0, blocks) + binary.Write(&b, binary.LittleEndian, root) + } else if extents <= 4*extentsPerBlock { + const extentsPerBlock = blockSize/extentNodeSize - 1 + extentBlocks := extents/extentsPerBlock + 1 + usedBlocks += extentBlocks + var b2 bytes.Buffer + + var root struct { + hdr format.ExtentHeader + nodes [4]format.ExtentIndexNode + } + root.hdr = format.ExtentHeader{ + Magic: format.ExtentHeaderMagic, + Entries: uint16(extentBlocks), + Max: 4, + Depth: 1, + } + for i := uint32(0); i < extentBlocks; i++ { + root.nodes[i] = format.ExtentIndexNode{ + Block: i * extentsPerBlock * maxBlocksPerExtent, + LeafLow: w.block(), + } + extentsInBlock := extents - i*extentBlocks + if extentsInBlock > extentsPerBlock { + extentsInBlock = extentsPerBlock + } + + var node struct { + hdr format.ExtentHeader + extents [extentsPerBlock]format.ExtentLeafNode + _ [blockSize - (extentsPerBlock+1)*extentNodeSize]byte + } + + offset := i * extentsPerBlock * maxBlocksPerExtent + fillExtents(&node.hdr, node.extents[:extentsInBlock], startBlock+offset, offset, blocks) + binary.Write(&b2, binary.LittleEndian, node) + if _, err := w.write(b2.Next(blockSize)); err != nil { + return err + } + } + binary.Write(&b, binary.LittleEndian, root) + } else { + panic("file too big") + } + + inode.Data = b.Bytes() + inode.Flags |= format.InodeFlagExtents + inode.BlockCount += usedBlocks + return w.err +} + +func (w *Writer) finishInode() error { + if !w.initialized { + if err := w.init(); err != nil { + return err + } + } + if w.curInode == nil { + return nil + } + if w.dataWritten != w.dataMax { + return fmt.Errorf("did not write the right amount: %d != %d", w.dataWritten, w.dataMax) + } + + if w.dataMax != 0 && w.curInode.Flags&format.InodeFlagInlineData == 0 { + if err := w.writeExtents(w.curInode); err != nil { + return err + } + } + + w.dataWritten = 0 + w.dataMax = 0 + w.curInode = nil + return w.err +} + +func modeToFileType(mode uint16) format.FileType { + switch mode & format.TypeMask { + default: + return format.FileTypeUnknown + case format.S_IFREG: + return format.FileTypeRegular + case format.S_IFDIR: + return format.FileTypeDirectory + case format.S_IFCHR: + return format.FileTypeCharacter + case format.S_IFBLK: + return format.FileTypeBlock + case format.S_IFIFO: + return format.FileTypeFIFO + case format.S_IFSOCK: + return format.FileTypeSocket + case format.S_IFLNK: + return format.FileTypeSymbolicLink + } +} + +type constReader byte + +var zero = constReader(0) + +func (r constReader) Read(b []byte) (int, error) { + for i := range b { + b[i] = byte(r) + } + return len(b), nil +} + +func (w *Writer) writeDirectory(dir, parent *inode) error { + if err := w.finishInode(); err != nil { + return err + } + + // The size of the directory is not known yet. + w.startInode("", dir, 0x7fffffffffffffff) + left := blockSize + finishBlock := func() error { + if left > 0 { + e := format.DirectoryEntry{ + RecordLength: uint16(left), + } + err := binary.Write(w, binary.LittleEndian, e) + if err != nil { + return err + } + left -= directoryEntrySize + if left < 4 { + panic("not enough space for trailing entry") + } + _, err = io.CopyN(w, zero, int64(left)) + if err != nil { + return err + } + } + left = blockSize + return nil + } + + writeEntry := func(ino format.InodeNumber, name string) error { + rlb := directoryEntrySize + len(name) + rl := (rlb + 3) & ^3 + if left < rl+12 { + if err := finishBlock(); err != nil { + return err + } + } + e := format.DirectoryEntry{ + Inode: ino, + RecordLength: uint16(rl), + NameLength: uint8(len(name)), + FileType: modeToFileType(w.getInode(ino).Mode), + } + err := binary.Write(w, binary.LittleEndian, e) + if err != nil { + return err + } + _, err = w.Write([]byte(name)) + if err != nil { + return err + } + var zero [4]byte + _, err = w.Write(zero[:rl-rlb]) + if err != nil { + return err + } + left -= rl + return nil + } + if err := writeEntry(dir.Number, "."); err != nil { + return err + } + if err := writeEntry(parent.Number, ".."); err != nil { + return err + } + + // Follow e2fsck's convention and sort the children by inode number. + var children []string + for name := range dir.Children { + children = append(children, name) + } + sort.Slice(children, func(i, j int) bool { + return dir.Children[children[i]].Number < dir.Children[children[j]].Number + }) + + for _, name := range children { + child := dir.Children[name] + if err := writeEntry(child.Number, name); err != nil { + return err + } + } + if err := finishBlock(); err != nil { + return err + } + w.curInode.Size = w.dataWritten + w.dataMax = w.dataWritten + return nil +} + +func (w *Writer) writeDirectoryRecursive(dir, parent *inode) error { + if err := w.writeDirectory(dir, parent); err != nil { + return err + } + for _, child := range dir.Children { + if child.IsDir() { + if err := w.writeDirectoryRecursive(child, dir); err != nil { + return err + } + } + } + return nil +} + +func (w *Writer) writeInodeTable(tableSize uint32) error { + var b bytes.Buffer + for _, inode := range w.inodes { + if inode != nil { + binode := format.Inode{ + Mode: inode.Mode, + Uid: uint16(inode.Uid & 0xffff), + Gid: uint16(inode.Gid & 0xffff), + SizeLow: uint32(inode.Size & 0xffffffff), + SizeHigh: uint32(inode.Size >> 32), + LinksCount: uint16(inode.LinkCount), + BlocksLow: inode.BlockCount, + Flags: inode.Flags, + XattrBlockLow: inode.XattrBlock, + UidHigh: uint16(inode.Uid >> 16), + GidHigh: uint16(inode.Gid >> 16), + ExtraIsize: uint16(inodeUsedSize - 128), + Atime: uint32(inode.Atime), + AtimeExtra: uint32(inode.Atime >> 32), + Ctime: uint32(inode.Ctime), + CtimeExtra: uint32(inode.Ctime >> 32), + Mtime: uint32(inode.Mtime), + MtimeExtra: uint32(inode.Mtime >> 32), + Crtime: uint32(inode.Crtime), + CrtimeExtra: uint32(inode.Crtime >> 32), + } + switch inode.Mode & format.TypeMask { + case format.S_IFDIR, format.S_IFREG, format.S_IFLNK: + n := copy(binode.Block[:], inode.Data) + if n < len(inode.Data) { + // Rewrite the first xattr with the data. + xattr := [1]xattr{{ + Name: "data", + Index: 7, // "system." + Value: inode.Data[n:], + }} + putXattrs(xattr[:], inode.XattrInline[4:], 0) + } + case format.S_IFBLK, format.S_IFCHR: + dev := inode.Devminor&0xff | inode.Devmajor<<8 | (inode.Devminor&0xffffff00)<<12 + binary.LittleEndian.PutUint32(binode.Block[4:], dev) + } + + binary.Write(&b, binary.LittleEndian, binode) + b.Truncate(inodeUsedSize) + n, _ := b.Write(inode.XattrInline) + io.CopyN(&b, zero, int64(inodeExtraSize-n)) + } else { + io.CopyN(&b, zero, inodeSize) + } + if _, err := w.write(b.Next(inodeSize)); err != nil { + return err + } + } + rest := tableSize - uint32(len(w.inodes)*inodeSize) + if _, err := w.zero(int64(rest)); err != nil { + return err + } + return nil +} + +// NewWriter returns a Writer that writes an ext4 file system to the provided +// WriteSeeker. +func NewWriter(f io.ReadWriteSeeker, opts ...Option) *Writer { + w := &Writer{ + f: f, + bw: bufio.NewWriterSize(f, 65536*8), + maxDiskSize: defaultMaxDiskSize, + } + for _, opt := range opts { + opt(w) + } + return w +} + +// An Option provides extra options to NewWriter. +type Option func(*Writer) + +// InlineData instructs the Writer to write small files into the inode +// structures directly. This creates smaller images but currently is not +// compatible with DAX. +func InlineData(w *Writer) { + w.supportInlineData = true +} + +// MaximumDiskSize instructs the writer to reserve enough metadata space for the +// specified disk size. If not provided, then 16GB is the default. +func MaximumDiskSize(size int64) Option { + return func(w *Writer) { + if size < 0 || size > maxMaxDiskSize { + w.maxDiskSize = maxMaxDiskSize + } else if size == 0 { + w.maxDiskSize = defaultMaxDiskSize + } else { + w.maxDiskSize = (size + blockSize - 1) &^ (blockSize - 1) + } + } +} + +func (w *Writer) init() error { + // Skip the defective block inode. + w.inodes = make([]*inode, 1, 32) + // Create the root directory. + root, _ := w.makeInode(&File{ + Mode: format.S_IFDIR | 0755, + }, nil) + root.LinkCount++ // The root is linked to itself. + // Skip until the first non-reserved inode. + w.inodes = append(w.inodes, make([]*inode, inodeFirst-len(w.inodes)-1)...) + maxBlocks := (w.maxDiskSize-1)/blockSize + 1 + maxGroups := (maxBlocks-1)/blocksPerGroup + 1 + w.gdBlocks = uint32((maxGroups-1)/groupsPerDescriptorBlock + 1) + + // Skip past the superblock and block descriptor table. + w.seekBlock(1 + w.gdBlocks) + w.initialized = true + + // The lost+found directory is required to exist for e2fsck to pass. + if err := w.Create("lost+found", &File{Mode: format.S_IFDIR | 0700}); err != nil { + return err + } + return w.err +} + +func groupCount(blocks uint32, inodes uint32, inodesPerGroup uint32) uint32 { + inodeBlocksPerGroup := inodesPerGroup * inodeSize / blockSize + dataBlocksPerGroup := blocksPerGroup - inodeBlocksPerGroup - 2 // save room for the bitmaps + + // Increase the block count to ensure there are enough groups for all the + // inodes. + minBlocks := (inodes-1)/inodesPerGroup*dataBlocksPerGroup + 1 + if blocks < minBlocks { + blocks = minBlocks + } + + return (blocks + dataBlocksPerGroup - 1) / dataBlocksPerGroup +} + +func bestGroupCount(blocks uint32, inodes uint32) (groups uint32, inodesPerGroup uint32) { + groups = 0xffffffff + for ipg := uint32(inodesPerGroupIncrement); ipg <= maxInodesPerGroup; ipg += inodesPerGroupIncrement { + g := groupCount(blocks, inodes, ipg) + if g < groups { + groups = g + inodesPerGroup = ipg + } + } + return +} + +func (w *Writer) Close() error { + if err := w.finishInode(); err != nil { + return err + } + root := w.root() + if err := w.writeDirectoryRecursive(root, root); err != nil { + return err + } + // Finish the last inode (probably a directory). + if err := w.finishInode(); err != nil { + return err + } + + // Write the inode table + inodeTableOffset := w.block() + groups, inodesPerGroup := bestGroupCount(inodeTableOffset, uint32(len(w.inodes))) + err := w.writeInodeTable(groups * inodesPerGroup * inodeSize) + if err != nil { + return err + } + + // Write the bitmaps. + bitmapOffset := w.block() + bitmapSize := groups * 2 + validDataSize := bitmapOffset + bitmapSize + diskSize := validDataSize + minSize := (groups-1)*blocksPerGroup + 1 + if diskSize < minSize { + diskSize = minSize + } + + usedGdBlocks := (groups-1)/groupDescriptorSize + 1 + if usedGdBlocks > w.gdBlocks { + return exceededMaxSizeError{w.maxDiskSize} + } + + gds := make([]format.GroupDescriptor, w.gdBlocks*groupsPerDescriptorBlock) + inodeTableSizePerGroup := inodesPerGroup * inodeSize / blockSize + var totalUsedBlocks, totalUsedInodes uint32 + for g := uint32(0); g < groups; g++ { + var b [blockSize * 2]byte + var dirCount, usedInodeCount, usedBlockCount uint16 + + // Block bitmap + if (g+1)*blocksPerGroup <= validDataSize { + // This group is fully allocated. + for j := range b[:blockSize] { + b[j] = 0xff + } + usedBlockCount = blocksPerGroup + } else if g*blocksPerGroup < validDataSize { + for j := uint32(0); j < validDataSize-g*blocksPerGroup; j++ { + b[j/8] |= 1 << (j % 8) + usedBlockCount++ + } + } + if g == 0 { + // Unused group descriptor blocks should be cleared. + for j := 1 + usedGdBlocks; j < 1+w.gdBlocks; j++ { + b[j/8] &^= 1 << (j % 8) + usedBlockCount-- + } + } + if g == groups-1 && diskSize%blocksPerGroup != 0 { + // Blocks that aren't present in the disk should be marked as + // allocated. + for j := diskSize % blocksPerGroup; j < blocksPerGroup; j++ { + b[j/8] |= 1 << (j % 8) + usedBlockCount++ + } + } + // Inode bitmap + for j := uint32(0); j < inodesPerGroup; j++ { + ino := format.InodeNumber(1 + g*inodesPerGroup + j) + inode := w.getInode(ino) + if ino < inodeFirst || inode != nil { + b[blockSize+j/8] |= 1 << (j % 8) + usedInodeCount++ + } + if inode != nil && inode.Mode&format.TypeMask == format.S_IFDIR { + dirCount++ + } + } + _, err := w.write(b[:]) + if err != nil { + return err + } + gds[g] = format.GroupDescriptor{ + BlockBitmapLow: bitmapOffset + 2*g, + InodeBitmapLow: bitmapOffset + 2*g + 1, + InodeTableLow: inodeTableOffset + g*inodeTableSizePerGroup, + UsedDirsCountLow: dirCount, + FreeInodesCountLow: uint16(inodesPerGroup) - usedInodeCount, + FreeBlocksCountLow: blocksPerGroup - usedBlockCount, + } + + totalUsedBlocks += uint32(usedBlockCount) + totalUsedInodes += uint32(usedInodeCount) + } + + // Zero up to the disk size. + _, err = w.zero(int64(diskSize-bitmapOffset-bitmapSize) * blockSize) + if err != nil { + return err + } + + // Write the block descriptors + w.seekBlock(1) + if w.err != nil { + return w.err + } + err = binary.Write(w.bw, binary.LittleEndian, gds) + if err != nil { + return err + } + + // Write the super block + var blk [blockSize]byte + b := bytes.NewBuffer(blk[:1024]) + sb := &format.SuperBlock{ + InodesCount: inodesPerGroup * groups, + BlocksCountLow: diskSize, + FreeBlocksCountLow: blocksPerGroup*groups - totalUsedBlocks, + FreeInodesCount: inodesPerGroup*groups - totalUsedInodes, + FirstDataBlock: 0, + LogBlockSize: 2, // 2^(10 + 2) + LogClusterSize: 2, + BlocksPerGroup: blocksPerGroup, + ClustersPerGroup: blocksPerGroup, + InodesPerGroup: inodesPerGroup, + Magic: format.SuperBlockMagic, + State: 1, // cleanly unmounted + Errors: 1, // continue on error? + CreatorOS: 0, // Linux + RevisionLevel: 1, // dynamic inode sizes + FirstInode: inodeFirst, + LpfInode: inodeLostAndFound, + InodeSize: inodeSize, + FeatureCompat: format.CompatSparseSuper2 | format.CompatExtAttr, + FeatureIncompat: format.IncompatFiletype | format.IncompatExtents | format.IncompatFlexBg, + FeatureRoCompat: format.RoCompatLargeFile | format.RoCompatHugeFile | format.RoCompatExtraIsize | format.RoCompatReadonly, + MinExtraIsize: extraIsize, + WantExtraIsize: extraIsize, + LogGroupsPerFlex: 31, + } + if w.supportInlineData { + sb.FeatureIncompat |= format.IncompatInlineData + } + binary.Write(b, binary.LittleEndian, sb) + w.seekBlock(0) + if _, err := w.write(blk[:]); err != nil { + return err + } + w.seekBlock(diskSize) + return w.err +} diff --git a/vendor/github.com/Microsoft/hcsshim/ext4/internal/format/format.go b/vendor/github.com/Microsoft/hcsshim/ext4/internal/format/format.go new file mode 100644 index 0000000000..9dc4c4e164 --- /dev/null +++ b/vendor/github.com/Microsoft/hcsshim/ext4/internal/format/format.go @@ -0,0 +1,411 @@ +package format + +type SuperBlock struct { + InodesCount uint32 + BlocksCountLow uint32 + RootBlocksCountLow uint32 + FreeBlocksCountLow uint32 + FreeInodesCount uint32 + FirstDataBlock uint32 + LogBlockSize uint32 + LogClusterSize uint32 + BlocksPerGroup uint32 + ClustersPerGroup uint32 + InodesPerGroup uint32 + Mtime uint32 + Wtime uint32 + MountCount uint16 + MaxMountCount uint16 + Magic uint16 + State uint16 + Errors uint16 + MinorRevisionLevel uint16 + LastCheck uint32 + CheckInterval uint32 + CreatorOS uint32 + RevisionLevel uint32 + DefaultReservedUid uint16 + DefaultReservedGid uint16 + FirstInode uint32 + InodeSize uint16 + BlockGroupNr uint16 + FeatureCompat CompatFeature + FeatureIncompat IncompatFeature + FeatureRoCompat RoCompatFeature + UUID [16]uint8 + VolumeName [16]byte + LastMounted [64]byte + AlgorithmUsageBitmap uint32 + PreallocBlocks uint8 + PreallocDirBlocks uint8 + ReservedGdtBlocks uint16 + JournalUUID [16]uint8 + JournalInum uint32 + JournalDev uint32 + LastOrphan uint32 + HashSeed [4]uint32 + DefHashVersion uint8 + JournalBackupType uint8 + DescSize uint16 + DefaultMountOpts uint32 + FirstMetaBg uint32 + MkfsTime uint32 + JournalBlocks [17]uint32 + BlocksCountHigh uint32 + RBlocksCountHigh uint32 + FreeBlocksCountHigh uint32 + MinExtraIsize uint16 + WantExtraIsize uint16 + Flags uint32 + RaidStride uint16 + MmpInterval uint16 + MmpBlock uint64 + RaidStripeWidth uint32 + LogGroupsPerFlex uint8 + ChecksumType uint8 + ReservedPad uint16 + KbytesWritten uint64 + SnapshotInum uint32 + SnapshotID uint32 + SnapshotRBlocksCount uint64 + SnapshotList uint32 + ErrorCount uint32 + FirstErrorTime uint32 + FirstErrorInode uint32 + FirstErrorBlock uint64 + FirstErrorFunc [32]uint8 + FirstErrorLine uint32 + LastErrorTime uint32 + LastErrorInode uint32 + LastErrorLine uint32 + LastErrorBlock uint64 + LastErrorFunc [32]uint8 + MountOpts [64]uint8 + UserQuotaInum uint32 + GroupQuotaInum uint32 + OverheadBlocks uint32 + BackupBgs [2]uint32 + EncryptAlgos [4]uint8 + EncryptPwSalt [16]uint8 + LpfInode uint32 + ProjectQuotaInum uint32 + ChecksumSeed uint32 + WtimeHigh uint8 + MtimeHigh uint8 + MkfsTimeHigh uint8 + LastcheckHigh uint8 + FirstErrorTimeHigh uint8 + LastErrorTimeHigh uint8 + Pad [2]uint8 + Reserved [96]uint32 + Checksum uint32 +} + +const SuperBlockMagic uint16 = 0xef53 + +type CompatFeature uint32 +type IncompatFeature uint32 +type RoCompatFeature uint32 + +const ( + CompatDirPrealloc CompatFeature = 0x1 + CompatImagicInodes CompatFeature = 0x2 + CompatHasJournal CompatFeature = 0x4 + CompatExtAttr CompatFeature = 0x8 + CompatResizeInode CompatFeature = 0x10 + CompatDirIndex CompatFeature = 0x20 + CompatLazyBg CompatFeature = 0x40 + CompatExcludeInode CompatFeature = 0x80 + CompatExcludeBitmap CompatFeature = 0x100 + CompatSparseSuper2 CompatFeature = 0x200 + + IncompatCompression IncompatFeature = 0x1 + IncompatFiletype IncompatFeature = 0x2 + IncompatRecover IncompatFeature = 0x4 + IncompatJournalDev IncompatFeature = 0x8 + IncompatMetaBg IncompatFeature = 0x10 + IncompatExtents IncompatFeature = 0x40 + Incompat_64Bit IncompatFeature = 0x80 + IncompatMmp IncompatFeature = 0x100 + IncompatFlexBg IncompatFeature = 0x200 + IncompatEaInode IncompatFeature = 0x400 + IncompatDirdata IncompatFeature = 0x1000 + IncompatCsumSeed IncompatFeature = 0x2000 + IncompatLargedir IncompatFeature = 0x4000 + IncompatInlineData IncompatFeature = 0x8000 + IncompatEncrypt IncompatFeature = 0x10000 + + RoCompatSparseSuper RoCompatFeature = 0x1 + RoCompatLargeFile RoCompatFeature = 0x2 + RoCompatBtreeDir RoCompatFeature = 0x4 + RoCompatHugeFile RoCompatFeature = 0x8 + RoCompatGdtCsum RoCompatFeature = 0x10 + RoCompatDirNlink RoCompatFeature = 0x20 + RoCompatExtraIsize RoCompatFeature = 0x40 + RoCompatHasSnapshot RoCompatFeature = 0x80 + RoCompatQuota RoCompatFeature = 0x100 + RoCompatBigalloc RoCompatFeature = 0x200 + RoCompatMetadataCsum RoCompatFeature = 0x400 + RoCompatReplica RoCompatFeature = 0x800 + RoCompatReadonly RoCompatFeature = 0x1000 + RoCompatProject RoCompatFeature = 0x2000 +) + +type BlockGroupFlag uint16 + +const ( + BlockGroupInodeUninit BlockGroupFlag = 0x1 + BlockGroupBlockUninit BlockGroupFlag = 0x2 + BlockGroupInodeZeroed BlockGroupFlag = 0x4 +) + +type GroupDescriptor struct { + BlockBitmapLow uint32 + InodeBitmapLow uint32 + InodeTableLow uint32 + FreeBlocksCountLow uint16 + FreeInodesCountLow uint16 + UsedDirsCountLow uint16 + Flags BlockGroupFlag + ExcludeBitmapLow uint32 + BlockBitmapCsumLow uint16 + InodeBitmapCsumLow uint16 + ItableUnusedLow uint16 + Checksum uint16 +} + +type GroupDescriptor64 struct { + GroupDescriptor + BlockBitmapHigh uint32 + InodeBitmapHigh uint32 + InodeTableHigh uint32 + FreeBlocksCountHigh uint16 + FreeInodesCountHigh uint16 + UsedDirsCountHigh uint16 + ItableUnusedHigh uint16 + ExcludeBitmapHigh uint32 + BlockBitmapCsumHigh uint16 + InodeBitmapCsumHigh uint16 + Reserved uint32 +} + +const ( + S_IXOTH = 0x1 + S_IWOTH = 0x2 + S_IROTH = 0x4 + S_IXGRP = 0x8 + S_IWGRP = 0x10 + S_IRGRP = 0x20 + S_IXUSR = 0x40 + S_IWUSR = 0x80 + S_IRUSR = 0x100 + S_ISVTX = 0x200 + S_ISGID = 0x400 + S_ISUID = 0x800 + S_IFIFO = 0x1000 + S_IFCHR = 0x2000 + S_IFDIR = 0x4000 + S_IFBLK = 0x6000 + S_IFREG = 0x8000 + S_IFLNK = 0xA000 + S_IFSOCK = 0xC000 + + TypeMask uint16 = 0xF000 +) + +type InodeNumber uint32 + +const ( + InodeRoot = 2 +) + +type Inode struct { + Mode uint16 + Uid uint16 + SizeLow uint32 + Atime uint32 + Ctime uint32 + Mtime uint32 + Dtime uint32 + Gid uint16 + LinksCount uint16 + BlocksLow uint32 + Flags InodeFlag + Version uint32 + Block [60]byte + Generation uint32 + XattrBlockLow uint32 + SizeHigh uint32 + ObsoleteFragmentAddr uint32 + BlocksHigh uint16 + XattrBlockHigh uint16 + UidHigh uint16 + GidHigh uint16 + ChecksumLow uint16 + Reserved uint16 + ExtraIsize uint16 + ChecksumHigh uint16 + CtimeExtra uint32 + MtimeExtra uint32 + AtimeExtra uint32 + Crtime uint32 + CrtimeExtra uint32 + VersionHigh uint32 + Projid uint32 +} + +type InodeFlag uint32 + +const ( + InodeFlagSecRm InodeFlag = 0x1 + InodeFlagUnRm InodeFlag = 0x2 + InodeFlagCompressed InodeFlag = 0x4 + InodeFlagSync InodeFlag = 0x8 + InodeFlagImmutable InodeFlag = 0x10 + InodeFlagAppend InodeFlag = 0x20 + InodeFlagNoDump InodeFlag = 0x40 + InodeFlagNoAtime InodeFlag = 0x80 + InodeFlagDirtyCompressed InodeFlag = 0x100 + InodeFlagCompressedClusters InodeFlag = 0x200 + InodeFlagNoCompress InodeFlag = 0x400 + InodeFlagEncrypted InodeFlag = 0x800 + InodeFlagHashedIndex InodeFlag = 0x1000 + InodeFlagMagic InodeFlag = 0x2000 + InodeFlagJournalData InodeFlag = 0x4000 + InodeFlagNoTail InodeFlag = 0x8000 + InodeFlagDirSync InodeFlag = 0x10000 + InodeFlagTopDir InodeFlag = 0x20000 + InodeFlagHugeFile InodeFlag = 0x40000 + InodeFlagExtents InodeFlag = 0x80000 + InodeFlagEaInode InodeFlag = 0x200000 + InodeFlagEOFBlocks InodeFlag = 0x400000 + InodeFlagSnapfile InodeFlag = 0x01000000 + InodeFlagSnapfileDeleted InodeFlag = 0x04000000 + InodeFlagSnapfileShrunk InodeFlag = 0x08000000 + InodeFlagInlineData InodeFlag = 0x10000000 + InodeFlagProjectIDInherit InodeFlag = 0x20000000 + InodeFlagReserved InodeFlag = 0x80000000 +) + +const ( + MaxLinks = 65000 +) + +type ExtentHeader struct { + Magic uint16 + Entries uint16 + Max uint16 + Depth uint16 + Generation uint32 +} + +const ExtentHeaderMagic uint16 = 0xf30a + +type ExtentIndexNode struct { + Block uint32 + LeafLow uint32 + LeafHigh uint16 + Unused uint16 +} + +type ExtentLeafNode struct { + Block uint32 + Length uint16 + StartHigh uint16 + StartLow uint32 +} + +type ExtentTail struct { + Checksum uint32 +} + +type DirectoryEntry struct { + Inode InodeNumber + RecordLength uint16 + NameLength uint8 + FileType FileType + //Name []byte +} + +type FileType uint8 + +const ( + FileTypeUnknown FileType = 0x0 + FileTypeRegular FileType = 0x1 + FileTypeDirectory FileType = 0x2 + FileTypeCharacter FileType = 0x3 + FileTypeBlock FileType = 0x4 + FileTypeFIFO FileType = 0x5 + FileTypeSocket FileType = 0x6 + FileTypeSymbolicLink FileType = 0x7 +) + +type DirectoryEntryTail struct { + ReservedZero1 uint32 + RecordLength uint16 + ReservedZero2 uint8 + FileType uint8 + Checksum uint32 +} + +type DirectoryTreeRoot struct { + Dot DirectoryEntry + DotName [4]byte + DotDot DirectoryEntry + DotDotName [4]byte + ReservedZero uint32 + HashVersion uint8 + InfoLength uint8 + IndirectLevels uint8 + UnusedFlags uint8 + Limit uint16 + Count uint16 + Block uint32 + //Entries []DirectoryTreeEntry +} + +type DirectoryTreeNode struct { + FakeInode uint32 + FakeRecordLength uint16 + NameLength uint8 + FileType uint8 + Limit uint16 + Count uint16 + Block uint32 + //Entries []DirectoryTreeEntry +} + +type DirectoryTreeEntry struct { + Hash uint32 + Block uint32 +} + +type DirectoryTreeTail struct { + Reserved uint32 + Checksum uint32 +} + +type XAttrInodeBodyHeader struct { + Magic uint32 +} + +type XAttrHeader struct { + Magic uint32 + ReferenceCount uint32 + Blocks uint32 + Hash uint32 + Checksum uint32 + Reserved [3]uint32 +} + +const XAttrHeaderMagic uint32 = 0xea020000 + +type XAttrEntry struct { + NameLength uint8 + NameIndex uint8 + ValueOffset uint16 + ValueInum uint32 + ValueSize uint32 + Hash uint32 + //Name []byte +} diff --git a/vendor/github.com/Microsoft/hcsshim/ext4/tar2ext4/tar2ext4.go b/vendor/github.com/Microsoft/hcsshim/ext4/tar2ext4/tar2ext4.go new file mode 100644 index 0000000000..ad09210469 --- /dev/null +++ b/vendor/github.com/Microsoft/hcsshim/ext4/tar2ext4/tar2ext4.go @@ -0,0 +1,174 @@ +package tar2ext4 + +import ( + "archive/tar" + "bufio" + "encoding/binary" + "io" + "path" + "strings" + + "github.com/Microsoft/hcsshim/ext4/internal/compactext4" +) + +type params struct { + convertWhiteout bool + appendVhdFooter bool + ext4opts []compactext4.Option +} + +// Option is the type for optional parameters to Convert. +type Option func(*params) + +// ConvertWhiteout instructs the converter to convert OCI-style whiteouts +// (beginning with .wh.) to overlay-style whiteouts. +func ConvertWhiteout(p *params) { + p.convertWhiteout = true +} + +// AppendVhdFooter instructs the converter to add a fixed VHD footer to the +// file. +func AppendVhdFooter(p *params) { + p.appendVhdFooter = true +} + +// InlineData instructs the converter to write small files into the inode +// structures directly. This creates smaller images but currently is not +// compatible with DAX. +func InlineData(p *params) { + p.ext4opts = append(p.ext4opts, compactext4.InlineData) +} + +// MaximumDiskSize instructs the writer to limit the disk size to the specified +// value. This also reserves enough metadata space for the specified disk size. +// If not provided, then 16GB is the default. +func MaximumDiskSize(size int64) Option { + return func(p *params) { + p.ext4opts = append(p.ext4opts, compactext4.MaximumDiskSize(size)) + } +} + +const ( + whiteoutPrefix = ".wh." + opaqueWhiteout = ".wh..wh..opq" +) + +// Convert writes a compact ext4 file system image that contains the files in the +// input tar stream. +func Convert(r io.Reader, w io.ReadWriteSeeker, options ...Option) error { + var p params + for _, opt := range options { + opt(&p) + } + t := tar.NewReader(bufio.NewReader(r)) + fs := compactext4.NewWriter(w, p.ext4opts...) + for { + hdr, err := t.Next() + if err == io.EOF { + break + } + if err != nil { + return err + } + + if p.convertWhiteout { + dir, name := path.Split(hdr.Name) + if strings.HasPrefix(name, whiteoutPrefix) { + if name == opaqueWhiteout { + // Update the directory with the appropriate xattr. + f, err := fs.Stat(dir) + if err != nil { + return err + } + f.Xattrs["trusted.overlay.opaque"] = []byte("y") + err = fs.Create(dir, f) + if err != nil { + return err + } + } else { + // Create an overlay-style whiteout. + f := &compactext4.File{ + Mode: compactext4.S_IFCHR, + Devmajor: 0, + Devminor: 0, + } + err = fs.Create(path.Join(dir, name[len(whiteoutPrefix):]), f) + if err != nil { + return err + } + } + + continue + } + } + + if hdr.Typeflag == tar.TypeLink { + err = fs.Link(hdr.Linkname, hdr.Name) + if err != nil { + return err + } + } else { + f := &compactext4.File{ + Mode: uint16(hdr.Mode), + Atime: hdr.AccessTime, + Mtime: hdr.ModTime, + Ctime: hdr.ChangeTime, + Crtime: hdr.ModTime, + Size: hdr.Size, + Uid: uint32(hdr.Uid), + Gid: uint32(hdr.Gid), + Linkname: hdr.Linkname, + Devmajor: uint32(hdr.Devmajor), + Devminor: uint32(hdr.Devminor), + Xattrs: make(map[string][]byte), + } + for key, value := range hdr.PAXRecords { + const xattrPrefix = "SCHILY.xattr." + if strings.HasPrefix(key, xattrPrefix) { + f.Xattrs[key[len(xattrPrefix):]] = []byte(value) + } + } + + var typ uint16 + switch hdr.Typeflag { + case tar.TypeReg, tar.TypeRegA: + typ = compactext4.S_IFREG + case tar.TypeSymlink: + typ = compactext4.S_IFLNK + case tar.TypeChar: + typ = compactext4.S_IFCHR + case tar.TypeBlock: + typ = compactext4.S_IFBLK + case tar.TypeDir: + typ = compactext4.S_IFDIR + case tar.TypeFifo: + typ = compactext4.S_IFIFO + } + f.Mode &= ^compactext4.TypeMask + f.Mode |= typ + err = fs.Create(hdr.Name, f) + if err != nil { + return err + } + _, err = io.Copy(fs, t) + if err != nil { + return err + } + } + } + err := fs.Close() + if err != nil { + return err + } + if p.appendVhdFooter { + size, err := w.Seek(0, io.SeekEnd) + if err != nil { + return err + } + err = binary.Write(w, binary.BigEndian, makeFixedVHDFooter(size)) + if err != nil { + return err + } + } + return nil +} diff --git a/vendor/github.com/Microsoft/hcsshim/ext4/tar2ext4/vhdfooter.go b/vendor/github.com/Microsoft/hcsshim/ext4/tar2ext4/vhdfooter.go new file mode 100644 index 0000000000..c98740302f --- /dev/null +++ b/vendor/github.com/Microsoft/hcsshim/ext4/tar2ext4/vhdfooter.go @@ -0,0 +1,76 @@ +package tar2ext4 + +import ( + "bytes" + "crypto/rand" + "encoding/binary" +) + +// Constants for the VHD footer +const ( + cookieMagic = "conectix" + featureMask = 0x2 + fileFormatVersionMagic = 0x00010000 + fixedDataOffset = -1 + creatorVersionMagic = 0x000a0000 + diskTypeFixed = 2 +) + +type vhdFooter struct { + Cookie [8]byte + Features uint32 + FileFormatVersion uint32 + DataOffset int64 + TimeStamp uint32 + CreatorApplication [4]byte + CreatorVersion uint32 + CreatorHostOS [4]byte + OriginalSize int64 + CurrentSize int64 + DiskGeometry uint32 + DiskType uint32 + Checksum uint32 + UniqueID [16]uint8 + SavedState uint8 + Reserved [427]uint8 +} + +func makeFixedVHDFooter(size int64) *vhdFooter { + footer := &vhdFooter{ + Features: featureMask, + FileFormatVersion: fileFormatVersionMagic, + DataOffset: fixedDataOffset, + CreatorVersion: creatorVersionMagic, + OriginalSize: size, + CurrentSize: size, + DiskType: diskTypeFixed, + UniqueID: generateUUID(), + } + copy(footer.Cookie[:], cookieMagic) + footer.Checksum = calculateCheckSum(footer) + return footer +} + +func calculateCheckSum(footer *vhdFooter) uint32 { + oldchk := footer.Checksum + footer.Checksum = 0 + + buf := &bytes.Buffer{} + binary.Write(buf, binary.BigEndian, footer) + + var chk uint32 + bufBytes := buf.Bytes() + for i := 0; i < len(bufBytes); i++ { + chk += uint32(bufBytes[i]) + } + footer.Checksum = oldchk + return uint32(^chk) +} + +func generateUUID() [16]byte { + res := [16]byte{} + if _, err := rand.Read(res[:]); err != nil { + panic(err) + } + return res +} diff --git a/vendor/github.com/Microsoft/hcsshim/version.go b/vendor/github.com/Microsoft/hcsshim/version.go deleted file mode 100644 index 9ebb257b3e..0000000000 --- a/vendor/github.com/Microsoft/hcsshim/version.go +++ /dev/null @@ -1,6 +0,0 @@ -package hcsshim - -// IsTP4 returns whether the currently running Windows build is at least TP4. -func IsTP4() bool { - return false -}