mirror of
https://github.com/moby/moby.git
synced 2022-11-09 12:21:53 -05:00
3ec4ec2857
This changeset allows Docker's VFS, and Overlay to take advantage of Linux's zerocopy APIs. The copy function first tries to use the ficlone ioctl. Reason being: - they do not allow partial success (aka short writes) - clones are expected to be a fast metadata operation See: http://oss.sgi.com/archives/xfs/2015-12/msg00356.html If the clone fails, we fall back to copy_file_range, which internally may fall back to splice, which has an upper limit on the size of copy it can perform. Given that, we have to loop until the copy is done. For a given dirCopy operation, if the clone fails, we will not try it again during any other file copy. Same is true with copy_file_range. If all else fails, we fall back to traditional copy. Signed-off-by: Sargun Dhillon <sargun@sargun.me>
234 lines
5.5 KiB
Go
234 lines
5.5 KiB
Go
// +build linux
|
|
|
|
package copy
|
|
|
|
/*
|
|
#include <linux/fs.h>
|
|
|
|
#ifndef FICLONE
|
|
#define FICLONE _IOW(0x94, 9, int)
|
|
#endif
|
|
*/
|
|
import "C"
|
|
import (
|
|
"fmt"
|
|
"io"
|
|
"os"
|
|
"path/filepath"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/docker/docker/pkg/pools"
|
|
"github.com/docker/docker/pkg/system"
|
|
rsystem "github.com/opencontainers/runc/libcontainer/system"
|
|
"golang.org/x/sys/unix"
|
|
)
|
|
|
|
// Mode indicates whether to use hardlink or copy content
|
|
type Mode int
|
|
|
|
const (
|
|
// Content creates a new file, and copies the content of the file
|
|
Content Mode = iota
|
|
// Hardlink creates a new hardlink to the existing file
|
|
Hardlink
|
|
)
|
|
|
|
func copyRegular(srcPath, dstPath string, fileinfo os.FileInfo, copyWithFileRange, copyWithFileClone *bool) error {
|
|
srcFile, err := os.Open(srcPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer srcFile.Close()
|
|
|
|
// If the destination file already exists, we shouldn't blow it away
|
|
dstFile, err := os.OpenFile(dstPath, os.O_WRONLY|os.O_CREATE|os.O_EXCL, fileinfo.Mode())
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer dstFile.Close()
|
|
|
|
if *copyWithFileClone {
|
|
_, _, err = unix.Syscall(unix.SYS_IOCTL, dstFile.Fd(), C.FICLONE, srcFile.Fd())
|
|
if err == nil {
|
|
return nil
|
|
}
|
|
|
|
*copyWithFileClone = false
|
|
if err == unix.EXDEV {
|
|
*copyWithFileRange = false
|
|
}
|
|
}
|
|
if *copyWithFileRange {
|
|
err = doCopyWithFileRange(srcFile, dstFile, fileinfo)
|
|
// Trying the file_clone may not have caught the exdev case
|
|
// as the ioctl may not have been available (therefore EINVAL)
|
|
if err == unix.EXDEV || err == unix.ENOSYS {
|
|
*copyWithFileRange = false
|
|
} else if err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return legacyCopy(srcFile, dstFile)
|
|
}
|
|
|
|
func doCopyWithFileRange(srcFile, dstFile *os.File, fileinfo os.FileInfo) error {
|
|
amountLeftToCopy := fileinfo.Size()
|
|
|
|
for amountLeftToCopy > 0 {
|
|
n, err := unix.CopyFileRange(int(srcFile.Fd()), nil, int(dstFile.Fd()), nil, int(amountLeftToCopy), 0)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
amountLeftToCopy = amountLeftToCopy - int64(n)
|
|
}
|
|
|
|
return nil
|
|
}
|
|
|
|
func legacyCopy(srcFile io.Reader, dstFile io.Writer) error {
|
|
_, err := pools.Copy(dstFile, srcFile)
|
|
|
|
return err
|
|
}
|
|
|
|
func copyXattr(srcPath, dstPath, attr string) error {
|
|
data, err := system.Lgetxattr(srcPath, attr)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
if data != nil {
|
|
if err := system.Lsetxattr(dstPath, attr, data, 0); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// DirCopy copies or hardlinks the contents of one directory to another,
|
|
// properly handling xattrs, and soft links
|
|
func DirCopy(srcDir, dstDir string, copyMode Mode) error {
|
|
copyWithFileRange := true
|
|
copyWithFileClone := true
|
|
err := filepath.Walk(srcDir, func(srcPath string, f os.FileInfo, err error) error {
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
// Rebase path
|
|
relPath, err := filepath.Rel(srcDir, srcPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
dstPath := filepath.Join(dstDir, relPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
stat, ok := f.Sys().(*syscall.Stat_t)
|
|
if !ok {
|
|
return fmt.Errorf("Unable to get raw syscall.Stat_t data for %s", srcPath)
|
|
}
|
|
|
|
isHardlink := false
|
|
|
|
switch f.Mode() & os.ModeType {
|
|
case 0: // Regular file
|
|
if copyMode == Hardlink {
|
|
isHardlink = true
|
|
if err2 := os.Link(srcPath, dstPath); err2 != nil {
|
|
return err2
|
|
}
|
|
} else {
|
|
if err2 := copyRegular(srcPath, dstPath, f, ©WithFileRange, ©WithFileClone); err2 != nil {
|
|
return err2
|
|
}
|
|
}
|
|
|
|
case os.ModeDir:
|
|
if err := os.Mkdir(dstPath, f.Mode()); err != nil && !os.IsExist(err) {
|
|
return err
|
|
}
|
|
|
|
case os.ModeSymlink:
|
|
link, err := os.Readlink(srcPath)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := os.Symlink(link, dstPath); err != nil {
|
|
return err
|
|
}
|
|
|
|
case os.ModeNamedPipe:
|
|
fallthrough
|
|
case os.ModeSocket:
|
|
if rsystem.RunningInUserNS() {
|
|
// cannot create a device if running in user namespace
|
|
return nil
|
|
}
|
|
if err := unix.Mkfifo(dstPath, stat.Mode); err != nil {
|
|
return err
|
|
}
|
|
|
|
case os.ModeDevice:
|
|
if err := unix.Mknod(dstPath, stat.Mode, int(stat.Rdev)); err != nil {
|
|
return err
|
|
}
|
|
|
|
default:
|
|
return fmt.Errorf("unknown file type for %s", srcPath)
|
|
}
|
|
|
|
// Everything below is copying metadata from src to dst. All this metadata
|
|
// already shares an inode for hardlinks.
|
|
if isHardlink {
|
|
return nil
|
|
}
|
|
|
|
if err := os.Lchown(dstPath, int(stat.Uid), int(stat.Gid)); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := copyXattr(srcPath, dstPath, "security.capability"); err != nil {
|
|
return err
|
|
}
|
|
|
|
// We need to copy this attribute if it appears in an overlay upper layer, as
|
|
// this function is used to copy those. It is set by overlay if a directory
|
|
// is removed and then re-created and should not inherit anything from the
|
|
// same dir in the lower dir.
|
|
if err := copyXattr(srcPath, dstPath, "trusted.overlay.opaque"); err != nil {
|
|
return err
|
|
}
|
|
|
|
isSymlink := f.Mode()&os.ModeSymlink != 0
|
|
|
|
// There is no LChmod, so ignore mode for symlink. Also, this
|
|
// must happen after chown, as that can modify the file mode
|
|
if !isSymlink {
|
|
if err := os.Chmod(dstPath, f.Mode()); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
|
|
// system.Chtimes doesn't support a NOFOLLOW flag atm
|
|
// nolint: unconvert
|
|
if !isSymlink {
|
|
aTime := time.Unix(int64(stat.Atim.Sec), int64(stat.Atim.Nsec))
|
|
mTime := time.Unix(int64(stat.Mtim.Sec), int64(stat.Mtim.Nsec))
|
|
if err := system.Chtimes(dstPath, aTime, mTime); err != nil {
|
|
return err
|
|
}
|
|
} else {
|
|
ts := []syscall.Timespec{stat.Atim, stat.Mtim}
|
|
if err := system.LUtimesNano(dstPath, ts); err != nil {
|
|
return err
|
|
}
|
|
}
|
|
return nil
|
|
})
|
|
return err
|
|
}
|