From 68493e2f7f9cb8303302e1098e3293b521ace243 Mon Sep 17 00:00:00 2001 From: Alexander Larsson Date: Mon, 19 May 2014 09:51:20 +0200 Subject: [PATCH] Make /proc writable, but not /proc/sys and /proc/sysrq-trigger Some applications want to write to /proc. For instance: docker run -it centos groupadd foo Gives: groupadd: failure while writing changes to /etc/group And strace reveals why: open("/proc/self/task/13/attr/fscreate", O_RDWR) = -1 EROFS (Read-only file system) I've looked at what other systems do, and systemd-nspawn makes /proc read-write and /proc/sys readonly, while lxc allows "proc:mixed" which does the same, plus it makes /proc/sysrq-trigger also readonly. The later seems like a prudent idea, so we follows lxc proc:mixed. Additionally we make /proc/irq and /proc/bus, as these seem to let you control various hardware things. Docker-DCO-1.1-Signed-off-by: Alexander Larsson (github: alexlarsson) --- pkg/libcontainer/nsinit/init.go | 2 +- .../security/restrict/restrict.go | 21 ++++++++++++++++++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/pkg/libcontainer/nsinit/init.go b/pkg/libcontainer/nsinit/init.go index af60d739ca..25b5fc4334 100644 --- a/pkg/libcontainer/nsinit/init.go +++ b/pkg/libcontainer/nsinit/init.go @@ -81,7 +81,7 @@ func Init(container *libcontainer.Container, uncleanRootfs, consolePath string, return fmt.Errorf("set process label %s", err) } if container.Context["restrictions"] != "" { - if err := restrict.Restrict("proc", "sys"); err != nil { + if err := restrict.Restrict("proc/sys", "proc/sysrq-trigger", "proc/irq", "proc/bus", "sys"); err != nil { return err } } diff --git a/pkg/libcontainer/security/restrict/restrict.go b/pkg/libcontainer/security/restrict/restrict.go index 361d0764a0..a22a1aa73e 100644 --- a/pkg/libcontainer/security/restrict/restrict.go +++ b/pkg/libcontainer/security/restrict/restrict.go @@ -10,12 +10,31 @@ import ( "github.com/dotcloud/docker/pkg/system" ) +const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV + +func mountReadonly(path string) error { + if err := system.Mount("", path, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil { + if err == syscall.EINVAL { + // Probably not a mountpoint, use bind-mount + if err := system.Mount(path, path, "", syscall.MS_BIND, ""); err != nil { + return err + } + if err := system.Mount(path, path, "", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC|defaultMountFlags, ""); err != nil { + return err + } + } else { + return err + } + } + return nil +} + // This has to be called while the container still has CAP_SYS_ADMIN (to be able to perform mounts). // However, afterwards, CAP_SYS_ADMIN should be dropped (otherwise the user will be able to revert those changes). func Restrict(mounts ...string) error { // remount proc and sys as readonly for _, dest := range mounts { - if err := system.Mount("", dest, "", syscall.MS_REMOUNT|syscall.MS_RDONLY, ""); err != nil { + if err := mountReadonly(dest); err != nil { return fmt.Errorf("unable to remount %s readonly: %s", dest, err) } }