From 6c266c4b42eeabe2d433a994753d86637fe52a0b Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Mon, 3 Mar 2014 16:15:29 +0100 Subject: [PATCH] Move all bind-mounts in the container inside the namespace This moves the bind mounts like /.dockerinit, /etc/hostname, volumes, etc into the container namespace, by setting them up using lxc. This is useful to avoid littering the global namespace with a lot of mounts that are internal to each container and are not generally needed on the outside. In particular, it seems that having a lot of mounts is problematic wrt scaling to a lot of containers on systems where the root filesystem is mounted --rshared. Note that the "private" option is only supported by the native driver, as lxc doesn't support setting this. This is not a huge problem, but it does mean that some mounts are unnecessarily shared inside the container if you're using the lxc driver. Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- execdriver/driver.go | 8 ++++ execdriver/execdrivers/execdrivers.go | 4 +- execdriver/lxc/lxc_template.go | 8 ++++ execdriver/native/default_template.go | 4 ++ execdriver/native/driver.go | 10 ++-- pkg/libcontainer/container.go | 10 ++++ pkg/libcontainer/nsinit/init.go | 2 +- pkg/libcontainer/nsinit/mount.go | 20 +++++++- runtime/container.go | 6 +-- runtime/runtime.go | 2 +- runtime/volumes.go | 68 +++++++-------------------- 11 files changed, 78 insertions(+), 64 deletions(-) diff --git a/execdriver/driver.go b/execdriver/driver.go index ec8f48f52d..ff37b6bc5b 100644 --- a/execdriver/driver.go +++ b/execdriver/driver.go @@ -97,6 +97,13 @@ type Resources struct { CpuShares int64 `json:"cpu_shares"` } +type Mount struct { + Source string `json:"source"` + Destination string `json:"destination"` + Writable bool `json:"writable"` + Private bool `json:"private"` +} + // Process wrapps an os/exec.Cmd to add more metadata type Command struct { exec.Cmd `json:"-"` @@ -114,6 +121,7 @@ type Command struct { 
Network *Network `json:"network"` // if network is nil then networking is disabled Config []string `json:"config"` // generic values that specific drivers can consume Resources *Resources `json:"resources"` + Mounts []Mount `json:"mounts"` Terminal Terminal `json:"-"` // standard or tty terminal Console string `json:"-"` // dev/console path diff --git a/execdriver/execdrivers/execdrivers.go b/execdriver/execdrivers/execdrivers.go index 95b2fc634d..7486d649c1 100644 --- a/execdriver/execdrivers/execdrivers.go +++ b/execdriver/execdrivers/execdrivers.go @@ -9,7 +9,7 @@ import ( "path" ) -func NewDriver(name, root string, sysInfo *sysinfo.SysInfo) (execdriver.Driver, error) { +func NewDriver(name, root, initPath string, sysInfo *sysinfo.SysInfo) (execdriver.Driver, error) { switch name { case "lxc": // we want to five the lxc driver the full docker root because it needs @@ -17,7 +17,7 @@ func NewDriver(name, root string, sysInfo *sysinfo.SysInfo) (execdriver.Driver, // to be backwards compatible return lxc.NewDriver(root, sysInfo.AppArmor) case "native": - return native.NewDriver(path.Join(root, "execdriver", "native")) + return native.NewDriver(path.Join(root, "execdriver", "native"), initPath) } return nil, fmt.Errorf("unknown exec driver %s", name) } diff --git a/execdriver/lxc/lxc_template.go b/execdriver/lxc/lxc_template.go index 1181396a18..84cd4e442e 100644 --- a/execdriver/lxc/lxc_template.go +++ b/execdriver/lxc/lxc_template.go @@ -88,6 +88,14 @@ lxc.mount.entry = {{.Console}} {{escapeFstabSpaces $ROOTFS}}/dev/console none bi lxc.mount.entry = devpts {{escapeFstabSpaces $ROOTFS}}/dev/pts devpts newinstance,ptmxmode=0666,nosuid,noexec 0 0 lxc.mount.entry = shm {{escapeFstabSpaces $ROOTFS}}/dev/shm tmpfs size=65536k,nosuid,nodev,noexec 0 0 +{{range $value := .Mounts}} +{{if $value.Writable}} +lxc.mount.entry = {{escapeFstabSpaces $value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabSpaces $value.Destination}} none bind,rw 0 0 +{{else}} +lxc.mount.entry = {{escapeFstabSpaces $value.Source}} 
{{escapeFstabSpaces $ROOTFS}}/{{escapeFstabSpaces $value.Destination}} none bind,ro 0 0 +{{end}} +{{end}} + {{if .Privileged}} {{if .AppArmor}} lxc.aa_profile = unconfined diff --git a/execdriver/native/default_template.go b/execdriver/native/default_template.go index 6e7d597b7b..2798f3b084 100644 --- a/execdriver/native/default_template.go +++ b/execdriver/native/default_template.go @@ -48,6 +48,10 @@ func createContainer(c *execdriver.Command) *libcontainer.Container { // check to see if we are running in ramdisk to disable pivot root container.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != "" + for _, m := range c.Mounts { + container.Mounts = append(container.Mounts, libcontainer.Mount{m.Source, m.Destination, m.Writable, m.Private}) + } + return container } diff --git a/execdriver/native/driver.go b/execdriver/native/driver.go index 452e802523..f6c7242620 100644 --- a/execdriver/native/driver.go +++ b/execdriver/native/driver.go @@ -55,10 +55,11 @@ func init() { } type driver struct { - root string + root string + initPath string } -func NewDriver(root string) (*driver, error) { +func NewDriver(root, initPath string) (*driver, error) { if err := os.MkdirAll(root, 0700); err != nil { return nil, err } @@ -66,7 +67,8 @@ func NewDriver(root string) (*driver, error) { return nil, err } return &driver{ - root: root, + root: root, + initPath: initPath, }, nil } @@ -210,7 +212,7 @@ func (d *dockerCommandFactory) Create(container *libcontainer.Container, console // we need to join the rootfs because nsinit will setup the rootfs and chroot initPath := filepath.Join(d.c.Rootfs, d.c.InitPath) - d.c.Path = initPath + d.c.Path = d.driver.initPath d.c.Args = append([]string{ initPath, "-driver", DriverName, diff --git a/pkg/libcontainer/container.go b/pkg/libcontainer/container.go index a777da58a4..14b4b65db7 100644 --- a/pkg/libcontainer/container.go +++ b/pkg/libcontainer/container.go @@ -23,6 +23,7 @@ type Container struct { Networks []*Network `json:"networks,omitempty"` 
// nil for host's network stack Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` // cgroups Context Context `json:"context,omitempty"` // generic context for specific options (apparmor, selinux) + Mounts []Mount `json:"mounts,omitempty"` } // Network defines configuration for a container's networking stack @@ -36,3 +37,12 @@ type Network struct { Gateway string `json:"gateway,omitempty"` Mtu int `json:"mtu,omitempty"` } + +// Bind mounts from the host system to the container +// +type Mount struct { + Source string `json:"source"` // Source path, in the host namespace + Destination string `json:"destination"` // Destination path, in the container + Writable bool `json:"writable"` + Private bool `json:"private"` +} diff --git a/pkg/libcontainer/nsinit/init.go b/pkg/libcontainer/nsinit/init.go index 336fc1eaaf..5d47b95057 100644 --- a/pkg/libcontainer/nsinit/init.go +++ b/pkg/libcontainer/nsinit/init.go @@ -51,7 +51,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol if err := system.ParentDeathSignal(); err != nil { return fmt.Errorf("parent death signal %s", err) } - if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs, container.NoPivotRoot); err != nil { + if err := setupNewMountNamespace(rootfs, container.Mounts, console, container.ReadonlyFs, container.NoPivotRoot); err != nil { return fmt.Errorf("setup mount namespace %s", err) } if err := setupNetwork(container, context); err != nil { diff --git a/pkg/libcontainer/nsinit/mount.go b/pkg/libcontainer/nsinit/mount.go index 83577cfa8c..562ae25a59 100644 --- a/pkg/libcontainer/nsinit/mount.go +++ b/pkg/libcontainer/nsinit/mount.go @@ -4,6 +4,7 @@ package nsinit import ( "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" "github.com/dotcloud/docker/pkg/system" "io/ioutil" "os" @@ -19,7 +20,7 @@ const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NOD // // There is no need to unmount the new mounts because as soon as the mount 
namespace // is no longer in use, the mounts will be removed automatically -func setupNewMountNamespace(rootfs, console string, readonly, noPivotRoot bool) error { +func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, console string, readonly, noPivotRoot bool) error { flag := syscall.MS_PRIVATE if noPivotRoot { flag = syscall.MS_SLAVE @@ -38,6 +39,30 @@ func setupNewMountNamespace(rootfs, console string, readonly, noPivotRoot bool) if err := mountSystem(rootfs); err != nil { return fmt.Errorf("mount system %s", err) } + + for _, m := range bindMounts { + flags := syscall.MS_BIND | syscall.MS_REC + if !m.Writable { + flags = flags | syscall.MS_RDONLY + } + dest := filepath.Join(rootfs, m.Destination) + if err := system.Mount(m.Source, dest, "bind", uintptr(flags), ""); err != nil { + return fmt.Errorf("mounting %s into %s %s", m.Source, dest, err) + } + // The initial MS_BIND mount ignores MS_RDONLY, so a read-only bind + // only becomes read-only after an explicit MS_REMOUNT pass. + if !m.Writable { + if err := system.Mount(m.Source, dest, "bind", uintptr(flags|syscall.MS_REMOUNT), ""); err != nil { + return fmt.Errorf("remounting %s into %s read-only %s", m.Source, dest, err) + } + } + if m.Private { + if err := system.Mount("", dest, "none", uintptr(syscall.MS_PRIVATE), ""); err != nil { + return fmt.Errorf("mounting %s private %s", dest, err) + } + } + } + if err := copyDevNodes(rootfs); err != nil { return fmt.Errorf("copy dev nodes %s", err) } diff --git a/runtime/container.go b/runtime/container.go index 813147e508..2a98149f27 100644 --- a/runtime/container.go +++ b/runtime/container.go @@ -529,13 +529,13 @@ func (container *Container) Start() (err error) { return err } + populateCommand(container) + container.command.Env = env + if err := mountVolumesForContainer(container, envPath); err != nil { return err } - populateCommand(container) - container.command.Env = env - // Setup logging of stdout and stderr to disk if err := container.runtime.LogToDisk(container.stdout, container.logPath("json"), "stdout"); err != nil { return err diff --git a/runtime/runtime.go b/runtime/runtime.go index 72245a4555..28e7bbd1e4 100644 --- a/runtime/runtime.go +++ b/runtime/runtime.go @@ -733,7 +733,7 @@ func NewRuntimeFromDirectory(config *daemonconfig.Config, eng *engine.Engine) (* } 
sysInfo := sysinfo.New(false) - ed, err := execdrivers.NewDriver(config.ExecDriver, config.Root, sysInfo) + ed, err := execdrivers.NewDriver(config.ExecDriver, config.Root, sysInitPath, sysInfo) if err != nil { return nil, err } diff --git a/runtime/volumes.go b/runtime/volumes.go index 1a548eca47..81a305f72c 100644 --- a/runtime/volumes.go +++ b/runtime/volumes.go @@ -3,6 +3,7 @@ package runtime import ( "fmt" "github.com/dotcloud/docker/archive" + "github.com/dotcloud/docker/execdriver" "github.com/dotcloud/docker/pkg/mount" "github.com/dotcloud/docker/utils" "io/ioutil" @@ -55,70 +56,33 @@ func mountVolumesForContainer(container *Container, envPath string) error { return err } - // Mount docker specific files into the containers root fs - if err := mount.Mount(runtime.sysInitPath, filepath.Join(root, "/.dockerinit"), "none", "bind,ro"); err != nil { - return err - } - if err := mount.Mount(envPath, filepath.Join(root, "/.dockerenv"), "none", "bind,ro"); err != nil { - return err - } - if err := mount.Mount(container.ResolvConfPath, filepath.Join(root, "/etc/resolv.conf"), "none", "bind,ro"); err != nil { - return err + mounts := []execdriver.Mount{ + {runtime.sysInitPath, "/.dockerinit", false, true}, + {envPath, "/.dockerenv", false, true}, + {container.ResolvConfPath, "/etc/resolv.conf", false, true}, } if container.HostnamePath != "" && container.HostsPath != "" { - if err := mount.Mount(container.HostnamePath, filepath.Join(root, "/etc/hostname"), "none", "bind,ro"); err != nil { - return err - } - if err := mount.Mount(container.HostsPath, filepath.Join(root, "/etc/hosts"), "none", "bind,ro"); err != nil { - return err - } + mounts = append(mounts, execdriver.Mount{container.HostnamePath, "/etc/hostname", false, true}) + mounts = append(mounts, execdriver.Mount{container.HostsPath, "/etc/hosts", false, true}) } // Mount user specified volumes + // Note, these are not private because you may want propagation of (un)mounts from host + // volumes. 
For instance if you use -v /usr:/usr and the host later mounts /usr/share you + // want this new mount in the container for r, v := range container.Volumes { - mountAs := "ro" - if container.VolumesRW[r] { - mountAs = "rw" - } - - r = filepath.Join(root, r) - if p, err := utils.FollowSymlinkInScope(r, root); err != nil { - return err - } else { - r = p - } - - if err := mount.Mount(v, r, "none", fmt.Sprintf("bind,%s", mountAs)); err != nil { - return err - } + mounts = append(mounts, execdriver.Mount{v, r, container.VolumesRW[r], false}) } + + container.command.Mounts = mounts + return nil } func unmountVolumesForContainer(container *Container) { - var ( - root = container.RootfsPath() - mounts = []string{ - root, - filepath.Join(root, "/.dockerinit"), - filepath.Join(root, "/.dockerenv"), - filepath.Join(root, "/etc/resolv.conf"), - } - ) - - if container.HostnamePath != "" && container.HostsPath != "" { - mounts = append(mounts, filepath.Join(root, "/etc/hostname"), filepath.Join(root, "/etc/hosts")) - } - - for r := range container.Volumes { - mounts = append(mounts, filepath.Join(root, r)) - } - - for i := len(mounts) - 1; i >= 0; i-- { - if lastError := mount.Unmount(mounts[i]); lastError != nil { - log.Printf("Failed to umount %v: %v", mounts[i], lastError) - } + if err := mount.Unmount(container.RootfsPath()); err != nil { + log.Printf("Failed to umount container: %v", err) } }