diff --git a/daemon/execdriver/lxc/driver.go b/daemon/execdriver/lxc/driver.go index 1ebb73e807..1232d608a3 100644 --- a/daemon/execdriver/lxc/driver.go +++ b/daemon/execdriver/lxc/driver.go @@ -59,9 +59,10 @@ func init() { } type driver struct { - root string // root path for the driver to use - apparmor bool - sharedRoot bool + root string // root path for the driver to use + apparmor bool + sharedRoot bool + restrictionPath string } func NewDriver(root string, apparmor bool) (*driver, error) { @@ -69,10 +70,15 @@ func NewDriver(root string, apparmor bool) (*driver, error) { if err := linkLxcStart(root); err != nil { return nil, err } + restrictionPath := filepath.Join(root, "empty") + if err := os.MkdirAll(restrictionPath, 0700); err != nil { + return nil, err + } return &driver{ - apparmor: apparmor, - root: root, - sharedRoot: rootIsShared(), + apparmor: apparmor, + root: root, + sharedRoot: rootIsShared(), + restrictionPath: restrictionPath, }, nil } @@ -403,14 +409,16 @@ func (d *driver) generateLXCConfig(c *execdriver.Command) (string, error) { if err := LxcTemplateCompiled.Execute(fo, struct { *execdriver.Command - AppArmor bool - ProcessLabel string - MountLabel string + AppArmor bool + ProcessLabel string + MountLabel string + RestrictionSource string }{ - Command: c, - AppArmor: d.apparmor, - ProcessLabel: process, - MountLabel: mount, + Command: c, + AppArmor: d.apparmor, + ProcessLabel: process, + MountLabel: mount, + RestrictionSource: d.restrictionPath, }); err != nil { return "", err } diff --git a/daemon/execdriver/lxc/lxc_template.go b/daemon/execdriver/lxc/lxc_template.go index f4cb3d19eb..bc94e7a19d 100644 --- a/daemon/execdriver/lxc/lxc_template.go +++ b/daemon/execdriver/lxc/lxc_template.go @@ -88,7 +88,9 @@ lxc.mount.entry = proc {{escapeFstabSpaces $ROOTFS}}/proc proc nosuid,nodev,noex # WARNING: sysfs is a known attack vector and should probably be disabled # if your userspace allows it. eg. see http://bit.ly/T9CkqJ +{{if .Privileged}} lxc.mount.entry = sysfs {{escapeFstabSpaces $ROOTFS}}/sys sysfs nosuid,nodev,noexec 0 0 +{{end}} {{if .Tty}} lxc.mount.entry = {{.Console}} {{escapeFstabSpaces $ROOTFS}}/dev/console none bind,rw 0 0 @@ -109,8 +111,15 @@ lxc.mount.entry = {{$value.Source}} {{escapeFstabSpaces $ROOTFS}}/{{escapeFstabS {{if .AppArmor}} lxc.aa_profile = unconfined {{else}} -#lxc.aa_profile = unconfined +# not unconfined {{end}} +{{else}} +# restrict access to proc +lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/sys none bind,ro 0 0 +lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/irq none bind,ro 0 0 +lxc.mount.entry = {{.RestrictionSource}} {{escapeFstabSpaces $ROOTFS}}/proc/acpi none bind,ro 0 0 +lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/sysrq-trigger none bind,ro 0 0 +lxc.mount.entry = {{escapeFstabSpaces $ROOTFS}}/dev/null {{escapeFstabSpaces $ROOTFS}}/proc/kcore none bind,ro 0 0 {{end}} # limits diff --git a/daemon/execdriver/native/create.go b/daemon/execdriver/native/create.go index 334a97ad4b..f724ef67e6 100644 --- a/daemon/execdriver/native/create.go +++ b/daemon/execdriver/native/create.go @@ -25,6 +25,7 @@ func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Container container.Cgroups.Name = c.ID // check to see if we are running in ramdisk to disable pivot root container.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != "" + container.Context["restriction_path"] = d.restrictionPath if err := d.createNetwork(container, c); err != nil { return nil, err @@ -33,6 +34,8 @@ func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Container if err := d.setPrivileged(container); err != nil { return nil, err } + } else { + container.Mounts = append(container.Mounts, libcontainer.Mount{Type: "devtmpfs"}) } if err := d.setupCgroups(container, c); err != nil { return nil, err @@ -81,6 +84,11 @@ func (d *driver) setPrivileged(container *libcontainer.Container) error { c.Enabled = true } container.Cgroups.DeviceAccess = true + + // add sysfs as a mount for privileged containers + container.Mounts = append(container.Mounts, libcontainer.Mount{Type: "sysfs"}) + delete(container.Context, "restriction_path") + if apparmor.IsEnabled() { container.Context["apparmor_profile"] = "unconfined" } @@ -99,7 +107,13 @@ func (d *driver) setupCgroups(container *libcontainer.Container, c *execdriver.C func (d *driver) setupMounts(container *libcontainer.Container, c *execdriver.Command) error { for _, m := range c.Mounts { - container.Mounts = append(container.Mounts, libcontainer.Mount{m.Source, m.Destination, m.Writable, m.Private}) + container.Mounts = append(container.Mounts, libcontainer.Mount{ + Type: "bind", + Source: m.Source, + Destination: m.Destination, + Writable: m.Writable, + Private: m.Private, + }) } return nil } diff --git a/daemon/execdriver/native/driver.go b/daemon/execdriver/native/driver.go index ab82cdcc65..8b374d9938 100644 --- a/daemon/execdriver/native/driver.go +++ b/daemon/execdriver/native/driver.go @@ -23,7 +23,7 @@ import ( const ( DriverName = "native" - Version = "0.1" + Version = "0.2" BackupApparmorProfilePath = "apparmor/docker.back" // relative to docker root ) @@ -62,6 +62,7 @@ type driver struct { root string initPath string activeContainers map[string]*exec.Cmd + restrictionPath string } func NewDriver(root, initPath string) (*driver, error) { @@ -72,8 +73,14 @@ func NewDriver(root, initPath string) (*driver, error) { if err := apparmor.InstallDefaultProfile(filepath.Join(root, "../..", BackupApparmorProfilePath)); err != nil { return nil, err } + restrictionPath := filepath.Join(root, "empty") + if err := os.MkdirAll(restrictionPath, 0700); err != nil { + return nil, err + } + return &driver{ root: root, + restrictionPath: restrictionPath, initPath: initPath, activeContainers: make(map[string]*exec.Cmd), }, nil diff --git a/integration-cli/docker_cli_run_test.go b/integration-cli/docker_cli_run_test.go index d356f5f4de..5973f2fe1b 100644 --- a/integration-cli/docker_cli_run_test.go +++ b/integration-cli/docker_cli_run_test.go @@ -665,3 +665,25 @@ func TestUnPrivilegedCannotMount(t *testing.T) { logDone("run - test un-privileged cannot mount") } + +func TestSysNotAvaliableInNonPrivilegedContainers(t *testing.T) { + cmd := exec.Command(dockerBinary, "run", "busybox", "ls", "/sys/kernel") + if code, err := runCommand(cmd); err == nil || code == 0 { + t.Fatal("sys should not be available in a non privileged container") + } + + deleteAllContainers() + + logDone("run - sys not avaliable in non privileged container") +} + +func TestSysAvaliableInPrivilegedContainers(t *testing.T) { + cmd := exec.Command(dockerBinary, "run", "--privileged", "busybox", "ls", "/sys/kernel") + if code, err := runCommand(cmd); err != nil || code != 0 { + t.Fatalf("sys should be available in privileged container") + } + + deleteAllContainers() + + logDone("run - sys avaliable in privileged container") +} diff --git a/pkg/libcontainer/README.md b/pkg/libcontainer/README.md index d6d0fbae44..31031b26cd 100644 --- a/pkg/libcontainer/README.md +++ b/pkg/libcontainer/README.md @@ -16,76 +16,149 @@ process are specified in this file. The configuration is used for each process Sample `container.json` file: ```json { + "mounts" : [ + { + "type" : "devtmpfs" + } + ], + "tty" : true, + "environment" : [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm-256color" + ], "hostname" : "koye", + "cgroups" : { + "parent" : "docker", + "name" : "docker-koye" + }, + "capabilities_mask" : [ + { + "value" : 8, + "key" : "SETPCAP", + "enabled" : false + }, + { + "enabled" : false, + "value" : 16, + "key" : "SYS_MODULE" + }, + { + "value" : 17, + "key" : "SYS_RAWIO", + "enabled" : false + }, + { + "key" : "SYS_PACCT", + "value" : 20, + "enabled" : false + }, + { + "value" : 21, + "key" : "SYS_ADMIN", + "enabled" : false + }, + { + "value" : 23, + "key" : "SYS_NICE", + "enabled" : false + }, + { + "value" : 24, + "key" : "SYS_RESOURCE", + "enabled" : false + }, + { + "key" : "SYS_TIME", + "value" : 25, + "enabled" : false + }, + { + "enabled" : false, + "value" : 26, + "key" : "SYS_TTY_CONFIG" + }, + { + "key" : "AUDIT_WRITE", + "value" : 29, + "enabled" : false + }, + { + "value" : 30, + "key" : "AUDIT_CONTROL", + "enabled" : false + }, + { + "enabled" : false, + "key" : "MAC_OVERRIDE", + "value" : 32 + }, + { + "enabled" : false, + "key" : "MAC_ADMIN", + "value" : 33 + }, + { + "key" : "NET_ADMIN", + "value" : 12, + "enabled" : false + }, + { + "value" : 27, + "key" : "MKNOD", + "enabled" : true + } + ], "networks" : [ { - "gateway" : "172.17.42.1", + "mtu" : 1500, + "address" : "127.0.0.1/0", + "type" : "loopback", + "gateway" : "localhost" + }, + { + "mtu" : 1500, + "address" : "172.17.42.2/16", + "type" : "veth", "context" : { "bridge" : "docker0", "prefix" : "veth" }, - "address" : "172.17.0.2/16", - "type" : "veth", - "mtu" : 1500 - } - ], - "cgroups" : { - "parent" : "docker", - "name" : "11bb30683fb0bdd57fab4d3a8238877f1e4395a2cfc7320ea359f7a02c1a5620" - }, - "tty" : true, - "environment" : [ - "HOME=/", - "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", - "HOSTNAME=11bb30683fb0", - "TERM=xterm" - ], - "capabilities_mask" : [ - "SETPCAP", - "SYS_MODULE", - "SYS_RAWIO", - "SYS_PACCT", - "SYS_ADMIN", - "SYS_NICE", - "SYS_RESOURCE", - "SYS_TIME", - "SYS_TTY_CONFIG", - "MKNOD", - "AUDIT_WRITE", - "AUDIT_CONTROL", - "MAC_OVERRIDE", - "MAC_ADMIN", - "NET_ADMIN" - ], - "context" : { - "apparmor_profile" : "docker-default" - }, - "mounts" : [ - { - "source" : "/var/lib/docker/containers/11bb30683fb0bdd57fab4d3a8238877f1e4395a2cfc7320ea359f7a02c1a5620/resolv.conf", - "writable" : false, - "destination" : "/etc/resolv.conf", - "private" : true - }, - { - "source" : "/var/lib/docker/containers/11bb30683fb0bdd57fab4d3a8238877f1e4395a2cfc7320ea359f7a02c1a5620/hostname", - "writable" : false, - "destination" : "/etc/hostname", - "private" : true - }, - { - "source" : "/var/lib/docker/containers/11bb30683fb0bdd57fab4d3a8238877f1e4395a2cfc7320ea359f7a02c1a5620/hosts", - "writable" : false, - "destination" : "/etc/hosts", - "private" : true + "gateway" : "172.17.42.1" } ], "namespaces" : [ - "NEWNS", - "NEWUTS", - "NEWIPC", - "NEWPID", - "NEWNET" + { + "key" : "NEWNS", + "value" : 131072, + "enabled" : true, + "file" : "mnt" + }, + { + "key" : "NEWUTS", + "value" : 67108864, + "enabled" : true, + "file" : "uts" + }, + { + "enabled" : true, + "file" : "ipc", + "key" : "NEWIPC", + "value" : 134217728 + }, + { + "file" : "pid", + "enabled" : true, + "value" : 536870912, + "key" : "NEWPID" + }, + { + "enabled" : true, + "file" : "net", + "key" : "NEWNET", + "value" : 1073741824 + } ] } ``` diff --git a/pkg/libcontainer/console/console.go b/pkg/libcontainer/console/console.go new file mode 100644 index 0000000000..05cd08a92e --- /dev/null +++ b/pkg/libcontainer/console/console.go @@ -0,0 +1,60 @@ +// +build linux + +package console + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/label" + "github.com/dotcloud/docker/pkg/system" + "os" + "path/filepath" + "syscall" +) + +// Setup initializes the proper /dev/console inside the rootfs path +func Setup(rootfs, consolePath, mountLabel string) error { + oldMask := system.Umask(0000) + defer system.Umask(oldMask) + + stat, err := os.Stat(consolePath) + if err != nil { + return fmt.Errorf("stat console %s %s", consolePath, err) + } + var ( + st = stat.Sys().(*syscall.Stat_t) + dest = filepath.Join(rootfs, "dev/console") + ) + if err := os.Remove(dest); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("remove %s %s", dest, err) + } + if err := os.Chmod(consolePath, 0600); err != nil { + return err + } + if err := os.Chown(consolePath, 0, 0); err != nil { + return err + } + if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { + return fmt.Errorf("mknod %s %s", dest, err) + } + if err := label.SetFileLabel(consolePath, mountLabel); err != nil { + return fmt.Errorf("set file label %s %s", dest, err) + } + if err := system.Mount(consolePath, dest, "bind", syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("bind %s to %s %s", consolePath, dest, err) + } + return nil +} + +func OpenAndDup(consolePath string) error { + slave, err := system.OpenTerminal(consolePath, syscall.O_RDWR) + if err != nil { + return fmt.Errorf("open terminal %s", err) + } + if err := system.Dup2(slave.Fd(), 0); err != nil { + return err + } + if err := system.Dup2(slave.Fd(), 1); err != nil { + return err + } + return system.Dup2(slave.Fd(), 2) +} diff --git a/pkg/libcontainer/container.go b/pkg/libcontainer/container.go index c7cac35428..ddcc6cab70 100644 --- a/pkg/libcontainer/container.go +++ b/pkg/libcontainer/container.go @@ -23,7 +23,7 @@ type Container struct { Networks []*Network `json:"networks,omitempty"` // nil for host's network stack Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` // cgroups Context Context `json:"context,omitempty"` // generic context for specific options (apparmor, selinux) - Mounts []Mount `json:"mounts,omitempty"` + Mounts Mounts `json:"mounts,omitempty"` } // Network defines configuration for a container's networking stack @@ -37,12 +37,3 @@ type Network struct { Gateway string `json:"gateway,omitempty"` Mtu int `json:"mtu,omitempty"` } - -// Bind mounts from the host system to the container -// -type Mount struct { - Source string `json:"source"` // Source path, in the host namespace - Destination string `json:"destination"` // Destination path, in the container - Writable bool `json:"writable"` - Private bool `json:"private"` -} diff --git a/pkg/libcontainer/container.json b/pkg/libcontainer/container.json index f045315a41..f15a49ab05 100644 --- a/pkg/libcontainer/container.json +++ b/pkg/libcontainer/container.json @@ -1,50 +1,146 @@ { - "hostname": "koye", - "tty": true, - "environment": [ - "HOME=/", - "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", - "container=docker", - "TERM=xterm-256color" - ], - "namespaces": [ - "NEWIPC", - "NEWNS", - "NEWPID", - "NEWUTS", - "NEWNET" - ], - "capabilities_mask": [ - "SETPCAP", - "SYS_MODULE", - "SYS_RAWIO", - "SYS_PACCT", - "SYS_ADMIN", - "SYS_NICE", - "SYS_RESOURCE", - "SYS_TIME", - "SYS_TTY_CONFIG", - "MKNOD", - "AUDIT_WRITE", - "AUDIT_CONTROL", - "MAC_OVERRIDE", - "MAC_ADMIN", - "NET_ADMIN" - ], - "networks": [{ - "type": "veth", - "context": { - "bridge": "docker0", - "prefix": "dock" - }, - "address": "172.17.0.100/16", - "gateway": "172.17.42.1", - "mtu": 1500 - } - ], - "cgroups": { - "name": "docker-koye", - "parent": "docker", - "memory": 5248000 - } + "mounts" : [ + { + "type" : "devtmpfs" + } + ], + "tty" : true, + "environment" : [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm-256color" + ], + "hostname" : "koye", + "cgroups" : { + "parent" : "docker", + "name" : "docker-koye" + }, + "capabilities_mask" : [ + { + "value" : 8, + "key" : "SETPCAP", + "enabled" : false + }, + { + "enabled" : false, + "value" : 16, + "key" : "SYS_MODULE" + }, + { + "value" : 17, + "key" : "SYS_RAWIO", + "enabled" : false + }, + { + "key" : "SYS_PACCT", + "value" : 20, + "enabled" : false + }, + { + "value" : 21, + "key" : "SYS_ADMIN", + "enabled" : false + }, + { + "value" : 23, + "key" : "SYS_NICE", + "enabled" : false + }, + { + "value" : 24, + "key" : "SYS_RESOURCE", + "enabled" : false + }, + { + "key" : "SYS_TIME", + "value" : 25, + "enabled" : false + }, + { + "enabled" : false, + "value" : 26, + "key" : "SYS_TTY_CONFIG" + }, + { + "key" : "AUDIT_WRITE", + "value" : 29, + "enabled" : false + }, + { + "value" : 30, + "key" : "AUDIT_CONTROL", + "enabled" : false + }, + { + "enabled" : false, + "key" : "MAC_OVERRIDE", + "value" : 32 + }, + { + "enabled" : false, + "key" : "MAC_ADMIN", + "value" : 33 + }, + { + "key" : "NET_ADMIN", + "value" : 12, + "enabled" : false + }, + { + "value" : 27, + "key" : "MKNOD", + "enabled" : true + } + ], + "networks" : [ + { + "mtu" : 1500, + "address" : "127.0.0.1/0", + "type" : "loopback", + "gateway" : "localhost" + }, + { + "mtu" : 1500, + "address" : "172.17.42.2/16", + "type" : "veth", + "context" : { + "bridge" : "docker0", + "prefix" : "veth" + }, + "gateway" : "172.17.42.1" + } + ], + "namespaces" : [ + { + "key" : "NEWNS", + "value" : 131072, + "enabled" : true, + "file" : "mnt" + }, + { + "key" : "NEWUTS", + "value" : 67108864, + "enabled" : true, + "file" : "uts" + }, + { + "enabled" : true, + "file" : "ipc", + "key" : "NEWIPC", + "value" : 134217728 + }, + { + "file" : "pid", + "enabled" : true, + "value" : 536870912, + "key" : "NEWPID" + }, + { + "enabled" : true, + "file" : "net", + "key" : "NEWNET", + "value" : 1073741824 + } + ] } diff --git a/pkg/libcontainer/mount/init.go b/pkg/libcontainer/mount/init.go new file mode 100644 index 0000000000..06b2c82f56 --- /dev/null +++ b/pkg/libcontainer/mount/init.go @@ -0,0 +1,143 @@ +// +build linux + +package mount + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/label" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/mount/nodes" + "github.com/dotcloud/docker/pkg/libcontainer/security/restrict" + "github.com/dotcloud/docker/pkg/system" + "os" + "path/filepath" + "syscall" +) + +// default mount point flags +const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV + +type mount struct { + source string + path string + device string + flags int + data string +} + +// InitializeMountNamespace setups up the devices, mount points, and filesystems for use inside a +// new mount namepsace +func InitializeMountNamespace(rootfs, console string, container *libcontainer.Container) error { + var ( + err error + flag = syscall.MS_PRIVATE + ) + if container.NoPivotRoot { + flag = syscall.MS_SLAVE + } + if err := system.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil { + return fmt.Errorf("mounting / as slave %s", err) + } + if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mouting %s as bind %s", rootfs, err) + } + if err := mountSystem(rootfs, container); err != nil { + return fmt.Errorf("mount system %s", err) + } + if err := setupBindmounts(rootfs, container.Mounts); err != nil { + return fmt.Errorf("bind mounts %s", err) + } + if err := nodes.CopyN(rootfs, nodes.DefaultNodes); err != nil { + return fmt.Errorf("copy dev nodes %s", err) + } + if restrictionPath := container.Context["restriction_path"]; restrictionPath != "" { + if err := restrict.Restrict(rootfs, restrictionPath); err != nil { + return fmt.Errorf("restrict %s", err) + } + } + if err := SetupPtmx(rootfs, console, container.Context["mount_label"]); err != nil { + return err + } + if err := system.Chdir(rootfs); err != nil { + return fmt.Errorf("chdir into %s %s", rootfs, err) + } + + if container.NoPivotRoot { + err = MsMoveRoot(rootfs) + } else { + err = PivotRoot(rootfs) + } + if err != nil { + return err + } + + if container.ReadonlyFs { + if err := SetReadonly(); err != nil { + return fmt.Errorf("set readonly %s", err) + } + } + + system.Umask(0022) + + return nil +} + +// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts +// inside the mount namespace +func mountSystem(rootfs string, container *libcontainer.Container) error { + for _, m := range newSystemMounts(rootfs, container.Context["mount_label"], container.Mounts) { + if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { + return fmt.Errorf("mkdirall %s %s", m.path, err) + } + if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { + return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err) + } + } + return nil +} + +func setupBindmounts(rootfs string, bindMounts libcontainer.Mounts) error { + for _, m := range bindMounts.OfType("bind") { + var ( + flags = syscall.MS_BIND | syscall.MS_REC + dest = filepath.Join(rootfs, m.Destination) + ) + if !m.Writable { + flags = flags | syscall.MS_RDONLY + } + if err := system.Mount(m.Source, dest, "bind", uintptr(flags), ""); err != nil { + return fmt.Errorf("mounting %s into %s %s", m.Source, dest, err) + } + if !m.Writable { + if err := system.Mount(m.Source, dest, "bind", uintptr(flags|syscall.MS_REMOUNT), ""); err != nil { + return fmt.Errorf("remounting %s into %s %s", m.Source, dest, err) + } + } + if m.Private { + if err := system.Mount("", dest, "none", uintptr(syscall.MS_PRIVATE), ""); err != nil { + return fmt.Errorf("mounting %s private %s", dest, err) + } + } + } + return nil +} + +// TODO: this is crappy right now and should be cleaned up with a better way of handling system and +// standard bind mounts allowing them to be more dymanic +func newSystemMounts(rootfs, mountLabel string, mounts libcontainer.Mounts) []mount { + systemMounts := []mount{ + {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, + } + + if len(mounts.OfType("devtmpfs")) == 1 { + systemMounts = append(systemMounts, mount{source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"}) + } + systemMounts = append(systemMounts, + mount{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)}, + mount{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)}) + + if len(mounts.OfType("sysfs")) == 1 { + systemMounts = append(systemMounts, mount{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}) + } + return systemMounts +} diff --git a/pkg/libcontainer/mount/msmoveroot.go b/pkg/libcontainer/mount/msmoveroot.go new file mode 100644 index 0000000000..b336c86495 --- /dev/null +++ b/pkg/libcontainer/mount/msmoveroot.go @@ -0,0 +1,19 @@ +// +build linux + +package mount + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/system" + "syscall" +) + +func MsMoveRoot(rootfs string) error { + if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + return fmt.Errorf("mount move %s into / %s", rootfs, err) + } + if err := system.Chroot("."); err != nil { + return fmt.Errorf("chroot . %s", err) + } + return system.Chdir("/") +} diff --git a/pkg/libcontainer/mount/nodes/nodes.go b/pkg/libcontainer/mount/nodes/nodes.go new file mode 100644 index 0000000000..5022f85b0b --- /dev/null +++ b/pkg/libcontainer/mount/nodes/nodes.go @@ -0,0 +1,49 @@ +// +build linux + +package nodes + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/system" + "os" + "path/filepath" + "syscall" +) + +// Default list of device nodes to copy +var DefaultNodes = []string{ + "null", + "zero", + "full", + "random", + "urandom", + "tty", +} + +// CopyN copies the device node from the host into the rootfs +func CopyN(rootfs string, nodesToCopy []string) error { + oldMask := system.Umask(0000) + defer system.Umask(oldMask) + + for _, node := range nodesToCopy { + if err := Copy(rootfs, node); err != nil { + return err + } + } + return nil +} + +func Copy(rootfs, node string) error { + stat, err := os.Stat(filepath.Join("/dev", node)) + if err != nil { + return err + } + var ( + dest = filepath.Join(rootfs, "dev", node) + st = stat.Sys().(*syscall.Stat_t) + ) + if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { + return fmt.Errorf("copy %s %s", node, err) + } + return nil +} diff --git a/pkg/libcontainer/mount/pivotroot.go b/pkg/libcontainer/mount/pivotroot.go new file mode 100644 index 0000000000..447f5904b2 --- /dev/null +++ b/pkg/libcontainer/mount/pivotroot.go @@ -0,0 +1,31 @@ +// +build linux + +package mount + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/system" + "io/ioutil" + "os" + "path/filepath" + "syscall" +) + +func PivotRoot(rootfs string) error { + pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root") + if err != nil { + return fmt.Errorf("can't create pivot_root dir %s", pivotDir, err) + } + if err := system.Pivotroot(rootfs, pivotDir); err != nil { + return fmt.Errorf("pivot_root %s", err) + } + if err := system.Chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) + } + // path to pivot dir now changed, update + pivotDir = filepath.Join("/", filepath.Base(pivotDir)) + if err := system.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { + return fmt.Errorf("unmount pivot_root dir %s", err) + } + return os.Remove(pivotDir) +} diff --git a/pkg/libcontainer/mount/ptmx.go b/pkg/libcontainer/mount/ptmx.go new file mode 100644 index 0000000000..f6ca534637 --- /dev/null +++ b/pkg/libcontainer/mount/ptmx.go @@ -0,0 +1,26 @@ +// +build linux + +package mount + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer/console" + "os" + "path/filepath" +) + +func SetupPtmx(rootfs, consolePath, mountLabel string) error { + ptmx := filepath.Join(rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + if consolePath != "" { + if err := console.Setup(rootfs, consolePath, mountLabel); err != nil { + return err + } + } + return nil +} diff --git a/pkg/libcontainer/mount/readonly.go b/pkg/libcontainer/mount/readonly.go new file mode 100644 index 0000000000..0658358ad6 --- /dev/null +++ b/pkg/libcontainer/mount/readonly.go @@ -0,0 +1,12 @@ +// +build linux + +package mount + +import ( + "github.com/dotcloud/docker/pkg/system" + "syscall" +) + +func SetReadonly() error { + return system.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, "") +} diff --git a/pkg/libcontainer/mount/remount.go b/pkg/libcontainer/mount/remount.go new file mode 100644 index 0000000000..3e00509ae0 --- /dev/null +++ b/pkg/libcontainer/mount/remount.go @@ -0,0 +1,31 @@ +// +build linux + +package mount + +import ( + "github.com/dotcloud/docker/pkg/system" + "syscall" +) + +func RemountProc() error { + if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil { + return err + } + if err := system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil { + return err + } + return nil +} + +func RemountSys() error { + if err := system.Unmount("/sys", syscall.MNT_DETACH); err != nil { + if err != syscall.EINVAL { + return err + } + } else { + if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil { + return err + } + } + return nil +} diff --git a/pkg/libcontainer/nsinit/execin.go b/pkg/libcontainer/nsinit/execin.go index 9017af06e9..b79881015f 100644 --- a/pkg/libcontainer/nsinit/execin.go +++ b/pkg/libcontainer/nsinit/execin.go @@ -6,6 +6,7 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/label" "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/mount" "github.com/dotcloud/docker/pkg/system" "os" "path/filepath" @@ -63,10 +64,10 @@ func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []s if err := system.Unshare(syscall.CLONE_NEWNS); err != nil { return -1, err } - if err := remountProc(); err != nil { + if err := mount.RemountProc(); err != nil { return -1, fmt.Errorf("remount proc %s", err) } - if err := remountSys(); err != nil { + if err := mount.RemountSys(); err != nil { return -1, fmt.Errorf("remount sys %s", err) } goto dropAndExec diff --git a/pkg/libcontainer/nsinit/init.go b/pkg/libcontainer/nsinit/init.go index 0e85c0e4be..67095fdba1 100644 --- a/pkg/libcontainer/nsinit/init.go +++ b/pkg/libcontainer/nsinit/init.go @@ -11,8 +11,10 @@ import ( "github.com/dotcloud/docker/pkg/apparmor" "github.com/dotcloud/docker/pkg/label" "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/libcontainer/capabilities" + "github.com/dotcloud/docker/pkg/libcontainer/console" + "github.com/dotcloud/docker/pkg/libcontainer/mount" "github.com/dotcloud/docker/pkg/libcontainer/network" + "github.com/dotcloud/docker/pkg/libcontainer/security/capabilities" "github.com/dotcloud/docker/pkg/libcontainer/utils" "github.com/dotcloud/docker/pkg/system" "github.com/dotcloud/docker/pkg/user" @@ -20,7 +22,7 @@ import ( // Init is the init process that first runs inside a new namespace to setup mounts, users, networking, // and other options required for the new container. -func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { +func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consolePath string, syncPipe *SyncPipe, args []string) error { rootfs, err := utils.ResolveRootfs(uncleanRootfs) if err != nil { return err @@ -36,20 +38,16 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol ns.logger.Println("received context from parent") syncPipe.Close() - if console != "" { - ns.logger.Printf("setting up %s as console\n", console) - slave, err := system.OpenTerminal(console, syscall.O_RDWR) - if err != nil { - return fmt.Errorf("open terminal %s", err) - } - if err := dupSlave(slave); err != nil { - return fmt.Errorf("dup2 slave %s", err) + if consolePath != "" { + ns.logger.Printf("setting up %s as console\n", consolePath) + if err := console.OpenAndDup(consolePath); err != nil { + return err } } if _, err := system.Setsid(); err != nil { return fmt.Errorf("setsid %s", err) } - if console != "" { + if consolePath != "" { if err := system.Setctty(); err != nil { return fmt.Errorf("setctty %s", err) } @@ -60,7 +58,7 @@ func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, consol label.Init() ns.logger.Println("setup mount namespace") - if err := setupNewMountNamespace(rootfs, container.Mounts, console, container.ReadonlyFs, container.NoPivotRoot, container.Context["mount_label"]); err != nil { + if err := mount.InitializeMountNamespace(rootfs, consolePath, container); err != nil { return fmt.Errorf("setup mount namespace %s", err) } if err := system.Sethostname(container.Hostname); err != nil { @@ -114,21 +112,6 @@ func setupUser(container *libcontainer.Container) error { return nil } -// dupSlave dup2 the pty slave's fd into stdout and stdin and ensures that -// the slave's fd is 0, or stdin -func dupSlave(slave *os.File) error { - if err := system.Dup2(slave.Fd(), 0); err != nil { - return err - } - if err := system.Dup2(slave.Fd(), 1); err != nil { - return err - } - if err := system.Dup2(slave.Fd(), 2); err != nil { - return err - } - return nil -} - // setupVethNetwork uses the Network config if it is not nil to initialize // the new veth interface inside the container for use by changing the name to eth0 // setting the MTU and IP address along with the default gateway diff --git a/pkg/libcontainer/nsinit/mount.go b/pkg/libcontainer/nsinit/mount.go deleted file mode 100644 index dd6b1c8a43..0000000000 --- a/pkg/libcontainer/nsinit/mount.go +++ /dev/null @@ -1,265 +0,0 @@ -// +build linux - -package nsinit - -import ( - "fmt" - "github.com/dotcloud/docker/pkg/label" - "github.com/dotcloud/docker/pkg/libcontainer" - "github.com/dotcloud/docker/pkg/system" - "io/ioutil" - "os" - "path/filepath" - "syscall" -) - -// default mount point flags -const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV - -// setupNewMountNamespace is used to initialize a new mount namespace for an new -// container in the rootfs that is specified. -// -// There is no need to unmount the new mounts because as soon as the mount namespace -// is no longer in use, the mounts will be removed automatically -func setupNewMountNamespace(rootfs string, bindMounts []libcontainer.Mount, console string, readonly, noPivotRoot bool, mountLabel string) error { - flag := syscall.MS_PRIVATE - if noPivotRoot { - flag = syscall.MS_SLAVE - } - if err := system.Mount("", "/", "", uintptr(flag|syscall.MS_REC), ""); err != nil { - return fmt.Errorf("mounting / as slave %s", err) - } - if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { - return fmt.Errorf("mouting %s as bind %s", rootfs, err) - } - if err := mountSystem(rootfs, mountLabel); err != nil { - return fmt.Errorf("mount system %s", err) - } - - for _, m := range bindMounts { - var ( - flags = syscall.MS_BIND | syscall.MS_REC - dest = filepath.Join(rootfs, m.Destination) - ) - if !m.Writable { - flags = flags | syscall.MS_RDONLY - } - if err := system.Mount(m.Source, dest, "bind", uintptr(flags), ""); err != nil { - return fmt.Errorf("mounting %s into %s %s", m.Source, dest, err) - } - if !m.Writable { - if err := system.Mount(m.Source, dest, "bind", uintptr(flags|syscall.MS_REMOUNT), ""); err != nil { - return fmt.Errorf("remounting %s into %s %s", m.Source, dest, err) - } - } - if m.Private { - if err := system.Mount("", dest, "none", uintptr(syscall.MS_PRIVATE), ""); err != nil { - return fmt.Errorf("mounting %s private %s", dest, err) - } - } - } - - if err := copyDevNodes(rootfs); err != nil { - return fmt.Errorf("copy dev nodes %s", err) - } - if err := setupPtmx(rootfs, console, mountLabel); err != nil { - return err - } - if err := system.Chdir(rootfs); err != nil { - return fmt.Errorf("chdir into %s %s", rootfs, err) - } - - if noPivotRoot { - if err := rootMsMove(rootfs); err != nil { - return err - } - } else { - if err := rootPivot(rootfs); err != nil { - return err - } - } - - if readonly { - if err := system.Mount("/", "/", "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { - return fmt.Errorf("mounting %s as readonly %s", rootfs, err) - } - } - - system.Umask(0022) - - return nil -} - -// use a pivot root to setup the rootfs -func rootPivot(rootfs string) error { - pivotDir, err := ioutil.TempDir(rootfs, ".pivot_root") - if err != nil { - return fmt.Errorf("can't create pivot_root dir %s", pivotDir, err) - } - if err := system.Pivotroot(rootfs, pivotDir); err != nil { - return fmt.Errorf("pivot_root %s", err) - } - if err := system.Chdir("/"); err != nil { - return fmt.Errorf("chdir / %s", err) - } - // path to pivot dir now changed, update - pivotDir = filepath.Join("/", filepath.Base(pivotDir)) - if err := system.Unmount(pivotDir, syscall.MNT_DETACH); err != nil { - return fmt.Errorf("unmount pivot_root dir %s", err) - } - if err := os.Remove(pivotDir); err != nil { - return fmt.Errorf("remove pivot_root dir %s", err) - } - return nil -} - -// use MS_MOVE and chroot to setup the rootfs -func rootMsMove(rootfs string) error { - if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { - return fmt.Errorf("mount move %s into / %s", rootfs, err) - } - if err := system.Chroot("."); err != nil { - return fmt.Errorf("chroot . %s", err) - } - if err := system.Chdir("/"); err != nil { - return fmt.Errorf("chdir / %s", err) - } - return nil -} - -// copyDevNodes mknods the hosts devices so the new container has access to them -func copyDevNodes(rootfs string) error { - oldMask := system.Umask(0000) - defer system.Umask(oldMask) - - for _, node := range []string{ - "null", - "zero", - "full", - "random", - "urandom", - "tty", - } { - if err := copyDevNode(rootfs, node); err != nil { - return err - } - } - return nil -} - -func copyDevNode(rootfs, node string) error { - stat, err := os.Stat(filepath.Join("/dev", node)) - if err != nil { - return err - } - var ( - dest = filepath.Join(rootfs, "dev", node) - st = stat.Sys().(*syscall.Stat_t) - ) - if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { - return fmt.Errorf("copy %s %s", node, err) - } - return nil -} - -// setupConsole ensures that the container has a proper /dev/console setup -func setupConsole(rootfs, console string, mountLabel string) error { - oldMask := system.Umask(0000) - defer system.Umask(oldMask) - - stat, err := os.Stat(console) - if err != nil { - return fmt.Errorf("stat console %s %s", console, err) - } - var ( - st = stat.Sys().(*syscall.Stat_t) - dest = filepath.Join(rootfs, "dev/console") - ) - if err := os.Remove(dest); err != nil && !os.IsNotExist(err) { - return fmt.Errorf("remove %s %s", dest, err) - } - if err := os.Chmod(console, 0600); err != nil { - return err - } - if err := os.Chown(console, 0, 0); err != nil { - return err - } - if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { - return fmt.Errorf("mknod %s %s", dest, err) - } - if err := label.SetFileLabel(console, mountLabel); err != nil { - return fmt.Errorf("SetFileLabel Failed %s %s", dest, err) - } - if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { - return fmt.Errorf("bind %s to %s %s", console, dest, err) - } - return nil -} - -// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts -// inside the mount namespace -func mountSystem(rootfs string, mountLabel string) error { - for _, m := range []struct { - source string - path string - device string - flags int - data string - }{ - {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, - {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, - {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: label.FormatMountLabel("mode=1777,size=65536k", mountLabel)}, - {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: label.FormatMountLabel("newinstance,ptmxmode=0666,mode=620,gid=5", mountLabel)}, - } { - if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { - return fmt.Errorf("mkdirall %s %s", m.path, err) - } - if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { - return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err) - } - } - return nil -} - -// setupPtmx adds a symlink to pts/ptmx for /dev/ptmx and -// finishes setting up /dev/console -func setupPtmx(rootfs, console string, mountLabel string) error { - ptmx := filepath.Join(rootfs, "dev/ptmx") - if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { - return err - } - if err := os.Symlink("pts/ptmx", ptmx); err != nil { - return fmt.Errorf("symlink dev ptmx %s", err) - } - if console != "" { - if err := setupConsole(rootfs, console, mountLabel); err != nil { - return err - } - } - return nil -} - -// remountProc is used to detach and remount the proc filesystem -// commonly needed with running a new process inside an existing container -func remountProc() error { - if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil { - return err - } - if err := system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil { - return err - } - return nil -} - -func remountSys() error { - if err := system.Unmount("/sys", syscall.MNT_DETACH); err != nil { - if err != syscall.EINVAL { - return err - } - } else { - if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil { - return err - } - } - return nil -} diff --git a/pkg/libcontainer/capabilities/capabilities.go b/pkg/libcontainer/security/capabilities/capabilities.go similarity index 100% rename from pkg/libcontainer/capabilities/capabilities.go rename to pkg/libcontainer/security/capabilities/capabilities.go diff --git a/pkg/libcontainer/security/restrict/restrict.go b/pkg/libcontainer/security/restrict/restrict.go new file mode 100644 index 0000000000..291d6ca5dc --- /dev/null +++ b/pkg/libcontainer/security/restrict/restrict.go @@ -0,0 +1,51 @@ +package restrict + +import ( + "fmt" + "os" + "path/filepath" + "syscall" + + "github.com/dotcloud/docker/pkg/system" +) + +const flags = syscall.MS_BIND | syscall.MS_REC | syscall.MS_RDONLY + +var restrictions = map[string]string{ + // dirs + "/proc/sys": "", + "/proc/irq": "", + "/proc/acpi": "", + + // files + "/proc/sysrq-trigger": "/dev/null", + "/proc/kcore": "/dev/null", +} + +// Restrict locks down access to many areas of proc +// by using the asumption that the user does not have mount caps to +// revert the changes made here +func Restrict(rootfs, empty string) error { + for dest, source := range restrictions { + dest = filepath.Join(rootfs, dest) + + // we don't have a "/dev/null" for dirs so have the requester pass a dir + // for us to bind mount + switch source { + case "": + source = empty + default: + source = filepath.Join(rootfs, source) + } + if err := system.Mount(source, dest, "bind", flags, ""); err != nil { + if os.IsNotExist(err) { + continue + } + return fmt.Errorf("unable to mount %s over %s %s", source, dest, err) + } + if err := system.Mount("", dest, "bind", flags|syscall.MS_REMOUNT, ""); err != nil { + return fmt.Errorf("unable to mount %s over %s %s", source, dest, err) + } + } + return nil +} diff --git a/pkg/libcontainer/types.go b/pkg/libcontainer/types.go index d4818c3ffe..ade3c32f1d 100644 --- a/pkg/libcontainer/types.go +++ b/pkg/libcontainer/types.go @@ -11,6 +11,26 @@ var ( ErrUnsupported = errors.New("Unsupported method") ) +type Mounts []Mount + +func (s Mounts) OfType(t string) Mounts { + out := Mounts{} + for _, m := range s { + if m.Type == t { + out = append(out, m) + } + } + return out +} + +type Mount struct { + Type string `json:"type,omitempty"` + Source string `json:"source,omitempty"` // Source path, in the host namespace + Destination string `json:"destination,omitempty"` // Destination path, in the container + Writable bool `json:"writable,omitempty"` + Private bool `json:"private,omitempty"` +} + // namespaceList is used to convert the libcontainer types // into the names of the files located in /proc//ns/* for // each namespace