diff --git a/pkg/libcontainer/MAINTAINERS b/pkg/libcontainer/MAINTAINERS
new file mode 100644
index 0000000000..e53d933d47
--- /dev/null
+++ b/pkg/libcontainer/MAINTAINERS
@@ -0,0 +1,2 @@
+Michael Crosby (@crosbymichael)
+Guillaume Charmes (@creack)
diff --git a/pkg/libcontainer/README.md b/pkg/libcontainer/README.md
new file mode 100644
index 0000000000..91d747863c
--- /dev/null
+++ b/pkg/libcontainer/README.md
@@ -0,0 +1,63 @@
+## libcontainer - reference implementation for containers
+
+#### playground
+
+
+Use the cli package to test out functionality.
+
+First set up a container configuration. You will need a root fs; the easiest option is to take
+the path to a stopped docker container's root filesystem and use that.
+
+
+```json
+{
+    "id": "koye",
+    "namespace_pid": 12265,
+    "command": {
+        "args": [
+            "/bin/bash"
+        ],
+        "environment": [
+            "HOME=/",
+            "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
+            "container=docker",
+            "TERM=xterm"
+        ]
+    },
+    "rootfs": "/root/development/gocode/src/github.com/docker/libcontainer/namespaces/ubuntu",
+    "network": null,
+    "user": "",
+    "working_dir": "",
+    "namespaces": [
+        "NEWNET",
+        "NEWIPC",
+        "NEWNS",
+        "NEWPID",
+        "NEWUTS"
+    ],
+    "capabilities": [
+        "SETPCAP",
+        "SYS_MODULE",
+        "SYS_RAWIO",
+        "SYS_PACCT",
+        "SYS_ADMIN",
+        "SYS_NICE",
+        "SYS_RESOURCE",
+        "SYS_TIME",
+        "SYS_TTY_CONFIG",
+        "MKNOD",
+        "AUDIT_WRITE",
+        "AUDIT_CONTROL",
+        "MAC_OVERRIDE",
+        "MAC_ADMIN"
+    ]
+}
+```
+
+After you have a json file and a rootfs path to use, just run:
+`./cli exec container.json`
+
+
+If you want to attach to an existing namespace, just use the same json
+file with the container still running and do:
+`./cli execin container.json`
diff --git a/pkg/libcontainer/capabilities/capabilities.go b/pkg/libcontainer/capabilities/capabilities.go
new file mode 100644
index 0000000000..3301e10f7f
--- /dev/null
+++ b/pkg/libcontainer/capabilities/capabilities.go
@@ -0,0 +1,49 @@
+package capabilities
+
+import (
+	"github.com/dotcloud/docker/pkg/libcontainer"
+	
"github.com/syndtr/gocapability/capability"
+	"os"
+)
+
+// capMap translates each libcontainer capability identifier into the
+// matching constant of the gocapability package.
+var capMap = map[libcontainer.Capability]capability.Cap{
+	libcontainer.CAP_SETPCAP:        capability.CAP_SETPCAP,
+	libcontainer.CAP_SYS_MODULE:     capability.CAP_SYS_MODULE,
+	libcontainer.CAP_SYS_RAWIO:      capability.CAP_SYS_RAWIO,
+	libcontainer.CAP_SYS_PACCT:      capability.CAP_SYS_PACCT,
+	libcontainer.CAP_SYS_ADMIN:      capability.CAP_SYS_ADMIN,
+	libcontainer.CAP_SYS_NICE:       capability.CAP_SYS_NICE,
+	libcontainer.CAP_SYS_RESOURCE:   capability.CAP_SYS_RESOURCE,
+	libcontainer.CAP_SYS_TIME:       capability.CAP_SYS_TIME,
+	libcontainer.CAP_SYS_TTY_CONFIG: capability.CAP_SYS_TTY_CONFIG,
+	libcontainer.CAP_MKNOD:          capability.CAP_MKNOD,
+	libcontainer.CAP_AUDIT_WRITE:    capability.CAP_AUDIT_WRITE,
+	libcontainer.CAP_AUDIT_CONTROL:  capability.CAP_AUDIT_CONTROL,
+	libcontainer.CAP_MAC_OVERRIDE:   capability.CAP_MAC_OVERRIDE,
+	libcontainer.CAP_MAC_ADMIN:      capability.CAP_MAC_ADMIN,
+}
+
+// DropCapabilities drops capabilities for the current process based
+// on the container's configuration.
+func DropCapabilities(container *libcontainer.Container) error {
+	if drop := getCapabilities(container); len(drop) > 0 {
+		c, err := capability.NewPid(os.Getpid())
+		if err != nil {
+			return err
+		}
+		// clear the listed capabilities from both the CAPS and BOUNDS sets
+		c.Unset(capability.CAPS|capability.BOUNDS, drop...)
+
+		if err := c.Apply(capability.CAPS | capability.BOUNDS); err != nil {
+			return err
+		}
+	}
+	return nil
+}
+
+// getCapabilities returns the gocapability values for the capabilities listed
+// in the container configuration. Names that have no entry in capMap are
+// skipped: indexing the map with them would yield the zero capability.Cap
+// value and an unrelated capability would be dropped by mistake.
+func getCapabilities(container *libcontainer.Container) []capability.Cap {
+	drop := []capability.Cap{}
+	for _, c := range container.Capabilities {
+		if v, ok := capMap[c]; ok {
+			drop = append(drop, v)
+		}
+	}
+	return drop
+}
diff --git a/pkg/libcontainer/cli/main.go b/pkg/libcontainer/cli/main.go
new file mode 100644
index 0000000000..490135ef5a
--- /dev/null
+++ b/pkg/libcontainer/cli/main.go
@@ -0,0 +1,171 @@
+package main
+
+import (
+	"encoding/json"
+	"flag"
+	"fmt"
+	"github.com/dotcloud/docker/pkg/libcontainer"
+	"github.com/dotcloud/docker/pkg/libcontainer/namespaces"
+	"github.com/dotcloud/docker/pkg/libcontainer/network"
+	"github.com/dotcloud/docker/pkg/libcontainer/utils"
+	"os"
+)
+
+// command line options shared by the sub commands
+var (
+	displayPid bool
+	newCommand string
+	usrNet     bool
+)
+
+// init registers the command line flags and parses them so that main can
+// read the positional arguments with flag.Arg.
+func init() {
+	flag.BoolVar(&displayPid, "pid", false, "display the pid before waiting")
+	flag.StringVar(&newCommand, "cmd", "/bin/bash", "command to run in the existing namespace")
+	flag.BoolVar(&usrNet, "net", false, "user a net namespace")
+	flag.Parse()
+}
+
+// exec starts the container described by the configuration, waits for it and
+// exits the cli with the container's exit code. With -net the container joins
+// the network namespace bound at /root/nsroot/test, which is deleted afterwards.
+func exec(container *libcontainer.Container) error {
+	var (
+		netFile *os.File
+		err     error
+	)
+	container.NetNsFd = 0
+
+	if usrNet {
+		netFile, err = os.Open("/root/nsroot/test")
+		if err != nil {
+			return err
+		}
+		container.NetNsFd = netFile.Fd()
+	}
+
+	pid, err := namespaces.Exec(container)
+	if err != nil {
+		return fmt.Errorf("error exec container %s", err)
+	}
+
+	if displayPid {
+		fmt.Println(pid)
+	}
+
+	exitcode, err := utils.WaitOnPid(pid)
+	if err != nil {
+		return fmt.Errorf("error waiting on child %s", err)
+	}
+	fmt.Println(exitcode)
+	if usrNet {
+		netFile.Close()
+		if err := network.DeleteNetworkNamespace("/root/nsroot/test"); err != nil {
+			return err
+		}
+	}
+	os.Exit(exitcode)
+	return nil
+}
+
+func execIn(container *libcontainer.Container) error {
+	f, err := os.Open("/root/nsroot/test")
+	if err != nil {
+		return err
+	}
+	
container.NetNsFd = f.Fd()
+	pid, err := namespaces.ExecIn(container, &libcontainer.Command{
+		Env: container.Command.Env,
+		Args: []string{
+			newCommand,
+		},
+	})
+	if err != nil {
+		return fmt.Errorf("error exexin container %s", err)
+	}
+	exitcode, err := utils.WaitOnPid(pid)
+	if err != nil {
+		return fmt.Errorf("error waiting on child %s", err)
+	}
+	os.Exit(exitcode)
+	return nil
+}
+
+// createNet creates the network namespace bound at /root/nsroot/test, creates
+// a veth pair (veth0 / config.TempVethName), attaches veth0 to config.Bridge,
+// brings it up and moves the other end into the new namespace.
+func createNet(config *libcontainer.Network) error {
+	root := "/root/nsroot"
+	if err := network.SetupNamespaceMountDir(root); err != nil {
+		return err
+	}
+
+	nspath := root + "/test"
+	if err := network.CreateNetworkNamespace(nspath); err != nil {
+		// report the failure instead of pretending the namespace exists
+		return err
+	}
+	if err := network.CreateVethPair("veth0", config.TempVethName); err != nil {
+		return err
+	}
+	if err := network.SetInterfaceMaster("veth0", config.Bridge); err != nil {
+		return err
+	}
+	if err := network.InterfaceUp("veth0"); err != nil {
+		return err
+	}
+
+	f, err := os.Open(nspath)
+	if err != nil {
+		return err
+	}
+	defer f.Close()
+
+	// move the peer that was created above, whatever name the caller chose
+	if err := network.SetInterfaceInNamespaceFd(config.TempVethName, int(f.Fd())); err != nil {
+		return err
+	}
+
+	/*
+		if err := network.SetupVethInsideNamespace(f.Fd(), config); err != nil {
+			return err
+		}
+	*/
+	return nil
+}
+
+// printErr writes err to stderr and terminates the process with status 1.
+func printErr(err error) {
+	fmt.Fprintln(os.Stderr, err)
+	os.Exit(1)
+}
+
+// main decodes the container configuration named by the second positional
+// argument and dispatches on the first one (exec, execin or net).
+func main() {
+	var (
+		err    error
+		cliCmd = flag.Arg(0)
+		config = flag.Arg(1)
+	)
+	f, err := os.Open(config)
+	if err != nil {
+		printErr(err)
+	}
+
+	dec := json.NewDecoder(f)
+	var container *libcontainer.Container
+
+	if err := dec.Decode(&container); err != nil {
+		printErr(err)
+	}
+	f.Close()
+
+	switch cliCmd {
+	case "exec":
+		err = exec(container)
+	case "execin":
+		err = execIn(container)
+	case "net":
+		err = createNet(&libcontainer.Network{
+			TempVethName: "veth1",
+			IP:           "172.17.0.100/16",
+			Gateway:      "172.17.42.1",
+			Mtu:          1500,
+			Bridge:       "docker0",
+		})
+	default:
+		err = fmt.Errorf("command not supported: %s", cliCmd)
+	}
+
+	if err != nil {
+		printErr(err)
+	}
+}
diff --git a/pkg/libcontainer/container.go b/pkg/libcontainer/container.go
new file mode 100644
index 0000000000..b77890fb5c
--- /dev/null
+++ b/pkg/libcontainer/container.go
@@ -0,0 +1,27 @@
+package libcontainer
+
+// Container is the configuration of a single container as decoded from its
+// json description.
+type Container struct {
+	ID           string       `json:"id,omitempty"`                   // also used as the hostname by namespaces.Exec
+	NsPid        int          `json:"namespace_pid,omitempty"`        // pid whose /proc/<pid>/ns entries namespaces.ExecIn joins
+	Command      *Command     `json:"command,omitempty"`              // process to run inside the container
+	RootFs       string       `json:"rootfs,omitempty"`               // path of the container's root filesystem
+	ReadonlyFs   bool         `json:"readonly_fs,omitempty"`          // remount the rootfs read-only
+	NetNsFd      uintptr      `json:"network_namespace_fd,omitempty"` // fd of an existing network namespace to join, 0 for none
+	User         string       `json:"user,omitempty"`                 // NOTE(review): not honored yet, see the TODO in namespaces.setupUser
+	WorkingDir   string       `json:"working_dir,omitempty"`          // directory to chdir into before exec, if not empty
+	Namespaces   Namespaces   `json:"namespaces,omitempty"`           // namespaces to create for the container
+	Capabilities Capabilities `json:"capabilities,omitempty"`         // capabilities to drop inside the container
+}
+
+// Command is the argv and environment of a process to execute.
+type Command struct {
+	Args []string `json:"args,omitempty"`
+	Env  []string `json:"environment,omitempty"`
+}
+
+// Network is the configuration of the veth based networking of a container.
+type Network struct {
+	TempVethName string `json:"temp_veth,omitempty"` // name of the veth end that is moved into the namespace
+	IP           string `json:"ip,omitempty"`        // address in CIDR notation (parsed with net.ParseCIDR)
+	Gateway      string `json:"gateway,omitempty"`   // default gateway, skipped when empty
+	Bridge       string `json:"bridge,omitempty"`    // bridge the host side veth is attached to
+	Mtu          int    `json:"mtu,omitempty"`
+}
diff --git a/pkg/libcontainer/container.json b/pkg/libcontainer/container.json
new file mode 100644
index 0000000000..ed8eb1bd78
--- /dev/null
+++ b/pkg/libcontainer/container.json
@@ -0,0 +1,38 @@
+{
+    "id": "koye",
+    "namespace_pid": 3117,
+    "command": {
+        "args": [
+            "/bin/bash"
+        ],
+        "environment": [
+            "HOME=/",
+            "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin",
+            "container=docker",
+            "TERM=xterm"
+        ]
+    },
+    "rootfs": "/root/main/mycontainer",
+    "namespaces": [
+        "NEWIPC",
+        "NEWNS",
+        "NEWPID",
+        "NEWUTS"
+    ],
+    "capabilities": [
+        "SETPCAP",
+        "SYS_MODULE",
+        "SYS_RAWIO",
+        "SYS_PACCT",
+        "SYS_ADMIN",
+        "SYS_NICE",
+        "SYS_RESOURCE",
+        "SYS_TIME",
+        "SYS_TTY_CONFIG",
+        "MKNOD",
+        "AUDIT_WRITE",
+        "AUDIT_CONTROL",
+        "MAC_OVERRIDE",
+        "MAC_ADMIN"
+    ]
+}
diff --git a/pkg/libcontainer/errors.go b/pkg/libcontainer/errors.go
new file mode 100644
index 0000000000..c6964ee8e6
--- /dev/null
+++ 
b/pkg/libcontainer/errors.go
@@ -0,0 +1,9 @@
+package libcontainer
+
+import (
+	"errors"
+)
+
+var (
+	// ErrInvalidPid is returned when a container has no usable namespace pid.
+	ErrInvalidPid = errors.New("no ns pid found")
+)
diff --git a/pkg/libcontainer/namespaces/calls_linux.go b/pkg/libcontainer/namespaces/calls_linux.go
new file mode 100644
index 0000000000..793e940b6e
--- /dev/null
+++ b/pkg/libcontainer/namespaces/calls_linux.go
@@ -0,0 +1,164 @@
+package namespaces
+
+import (
+	"fmt"
+	"os"
+	"syscall"
+	"unsafe"
+)
+
+// ioctl request numbers used on the pty master by ptsname and unlockpt
+const (
+	TIOCGPTN   = 0x80045430
+	TIOCSPTLCK = 0x40045431
+)
+
+// chroot is a thin wrapper around syscall.Chroot.
+func chroot(dir string) error {
+	return syscall.Chroot(dir)
+}
+
+// chdir is a thin wrapper around syscall.Chdir.
+func chdir(dir string) error {
+	return syscall.Chdir(dir)
+}
+
+// exec replaces the current process image via syscall.Exec; it only returns
+// when the exec failed.
+func exec(cmd string, args []string, env []string) error {
+	return syscall.Exec(cmd, args, env)
+}
+
+// fork issues the raw fork system call while holding syscall.ForkLock and
+// returns the pid reported by the kernel (callers treat 0 as "in the child").
+func fork() (int, error) {
+	syscall.ForkLock.Lock()
+	pid, _, err := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0)
+	syscall.ForkLock.Unlock()
+	if err != 0 {
+		return -1, err
+	}
+	return int(pid), nil
+}
+
+// vfork is the same as fork but uses the vfork system call.
+func vfork() (int, error) {
+	syscall.ForkLock.Lock()
+	pid, _, err := syscall.Syscall(syscall.SYS_VFORK, 0, 0, 0)
+	syscall.ForkLock.Unlock()
+	if err != 0 {
+		return -1, err
+	}
+	return int(pid), nil
+}
+
+// mount is a thin wrapper around syscall.Mount.
+func mount(source, target, fstype string, flags uintptr, data string) error {
+	return syscall.Mount(source, target, fstype, flags, data)
+}
+
+// unmount is a thin wrapper around syscall.Unmount.
+func unmount(target string, flags int) error {
+	return syscall.Unmount(target, flags)
+}
+
+// pivotroot is a thin wrapper around syscall.PivotRoot.
+func pivotroot(newroot, putold string) error {
+	return syscall.PivotRoot(newroot, putold)
+}
+
+// unshare is a thin wrapper around syscall.Unshare.
+func unshare(flags int) error {
+	return syscall.Unshare(flags)
+}
+
+// clone issues the raw clone system call with the given flags and zero for
+// the remaining arguments, holding syscall.ForkLock around the call. It
+// returns the pid reported by the kernel (callers treat 0 as "in the child").
+func clone(flags uintptr) (int, error) {
+	syscall.ForkLock.Lock()
+	pid, _, err := syscall.RawSyscall(syscall.SYS_CLONE, flags, 0, 0)
+	syscall.ForkLock.Unlock()
+	if err != 0 {
+		return -1, err
+	}
+	return int(pid), nil
+}
+
+// setns invokes the setns system call (number SYS_SETNS, declared in another
+// file of this package) with the namespace fd and the namespace type flags.
+func setns(fd uintptr, flags uintptr) error {
+	_, _, err := syscall.RawSyscall(SYS_SETNS, fd, flags, 0)
+	if err != 0 {
+		return err
+	}
+	return nil
+}
+
+// usetCloseOnExec sets the descriptor flags of fd to 0 with fcntl(F_SETFD).
+func usetCloseOnExec(fd uintptr) 
error {
+	if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0); err != 0 {
+		return err
+	}
+	return nil
+}
+
+// setgroups is a thin wrapper around syscall.Setgroups.
+func setgroups(gids []int) error {
+	return syscall.Setgroups(gids)
+}
+
+// setresgid is a thin wrapper around syscall.Setresgid.
+func setresgid(rgid, egid, sgid int) error {
+	return syscall.Setresgid(rgid, egid, sgid)
+}
+
+// setresuid is a thin wrapper around syscall.Setresuid.
+func setresuid(ruid, euid, suid int) error {
+	return syscall.Setresuid(ruid, euid, suid)
+}
+
+// sethostname is a thin wrapper around syscall.Sethostname.
+func sethostname(name string) error {
+	return syscall.Sethostname([]byte(name))
+}
+
+// setsid is a thin wrapper around syscall.Setsid.
+func setsid() (int, error) {
+	return syscall.Setsid()
+}
+
+// ioctl performs the ioctl system call on fd and converts a non-zero errno
+// into an error.
+func ioctl(fd uintptr, flag, data uintptr) error {
+	if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 {
+		return err
+	}
+	return nil
+}
+
+// openpmtx opens the pty multiplexer /dev/ptmx.
+// NOTE(review): the master is opened O_RDONLY here while createMasterAndConsole
+// uses O_RDWR - confirm that read-only is intended.
+func openpmtx() (*os.File, error) {
+	return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
+}
+
+// unlockpt unlocks the slave side of the pty master f (TIOCSPTLCK with 0).
+func unlockpt(f *os.File) error {
+	// the kernel reads a C int (32 bits), so do not hand it a 64 bit Go int
+	var u int32
+	return ioctl(f.Fd(), TIOCSPTLCK, uintptr(unsafe.Pointer(&u)))
+}
+
+// ptsname returns the path of the slave device belonging to the pty master f.
+func ptsname(f *os.File) (string, error) {
+	// TIOCGPTN stores a C unsigned int (32 bits)
+	var n uint32
+	if err := ioctl(f.Fd(), TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil {
+		return "", err
+	}
+	return fmt.Sprintf("/dev/pts/%d", n), nil
+}
+
+// closefd is a thin wrapper around syscall.Close.
+func closefd(fd uintptr) error {
+	return syscall.Close(int(fd))
+}
+
+// dup2 is a thin wrapper around syscall.Dup2.
+func dup2(fd1, fd2 uintptr) error {
+	return syscall.Dup2(int(fd1), int(fd2))
+}
+
+// mknod is a thin wrapper around syscall.Mknod.
+func mknod(path string, mode uint32, dev int) error {
+	return syscall.Mknod(path, mode, dev)
+}
+
+// parentDeathSignal asks the kernel (prctl PR_SET_PDEATHSIG) to deliver
+// SIGKILL to the calling process when its parent dies.
+func parentDeathSignal() error {
+	if _, _, err := syscall.RawSyscall6(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0, 0, 0, 0); err != 0 {
+		return err
+	}
+	return nil
+}
+
+// setctty makes the terminal open on fd 0 the controlling terminal of the
+// calling process (TIOCSCTTY).
+func setctty() error {
+	if _, _, err := syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCSCTTY), 0); err != 0 {
+		return err
+	}
+	return nil
+}
+
+// mkfifo is a thin wrapper around syscall.Mkfifo.
+func mkfifo(name string, mode uint32) error {
+	return syscall.Mkfifo(name, mode)
+}
+
+// umask sets the file mode creation mask and returns the previous one.
+func umask(mask int) int {
+	return syscall.Umask(mask)
+}
diff --git a/pkg/libcontainer/namespaces/exec.go 
b/pkg/libcontainer/namespaces/exec.go
new file mode 100644
index 0000000000..893b302887
--- /dev/null
+++ b/pkg/libcontainer/namespaces/exec.go
@@ -0,0 +1,266 @@
+/*
+   Higher level convenience functions for setting up a container
+*/
+
+package namespaces
+
+import (
+	"errors"
+	"fmt"
+	"github.com/dotcloud/docker/pkg/libcontainer"
+	"github.com/dotcloud/docker/pkg/libcontainer/capabilities"
+	"github.com/dotcloud/docker/pkg/libcontainer/utils"
+	"io"
+	"log"
+	"os"
+	"path/filepath"
+	"syscall"
+)
+
+var (
+	ErrExistingNetworkNamespace = errors.New("specified both CLONE_NEWNET and an existing network namespace")
+)
+
+// Exec will spawn new namespaces with the specified Container configuration
+// in the RootFs path and return the pid of the new containerized process.
+//
+// If an existing network namespace is specified the container
+// will join that namespace. If an existing network namespace is not specified but CLONE_NEWNET is,
+// the container will be spawned with a new network namespace with no configuration. Omitting an
+// existing network namespace and the CLONE_NEWNET option in the container configuration will allow
+// the container to use the host's networking options and configuration.
+func Exec(container *libcontainer.Container) (pid int, err error) {
+	// a user cannot pass CLONE_NEWNET and an existing net namespace fd to join
+	if container.NetNsFd > 0 && container.Namespaces.Contains(libcontainer.CLONE_NEWNET) {
+		return -1, ErrExistingNetworkNamespace
+	}
+	// reject a missing command here; otherwise the child would panic on
+	// Args[0] after the clone
+	if container.Command == nil || len(container.Command.Args) == 0 {
+		return -1, errors.New("container has no command to execute")
+	}
+
+	rootfs, err := resolveRootfs(container)
+	if err != nil {
+		return -1, err
+	}
+
+	master, console, err := createMasterAndConsole()
+	if err != nil {
+		return -1, err
+	}
+
+	logger, err := os.OpenFile("/root/logs", os.O_RDWR|os.O_CREATE|os.O_APPEND, 0755)
+	if err != nil {
+		// do not leak the pty master when we bail out
+		master.Close()
+		return -1, err
+	}
+	log.SetOutput(logger)
+
+	// we need CLONE_VFORK so we can wait on the child
+	flag := getNamespaceFlags(container.Namespaces) | CLONE_VFORK
+
+	if pid, err = clone(uintptr(flag | SIGCHLD)); err != nil {
+		master.Close()
+		return -1, fmt.Errorf("error cloning process: %s", err)
+	}
+
+	if pid == 0 {
+		// welcome to your new namespace ;)
+		//
+		// any errors encountered inside the namespace we should write
+		// out to a log or a pipe to our parent and exit(1)
+		// because writing to stderr will not work after we close
+		if err := closeMasterAndStd(master); err != nil {
+			writeError("close master and std %s", err)
+		}
+		// fds 0-2 were just closed, so the slave is expected to get fd 0
+		slave, err := openTerminal(console, syscall.O_RDWR)
+		if err != nil {
+			writeError("open terminal %s", err)
+		}
+		if err := dupSlave(slave); err != nil {
+			writeError("dup2 slave %s", err)
+		}
+
+		if container.NetNsFd > 0 {
+			if err := JoinExistingNamespace(container.NetNsFd, libcontainer.CLONE_NEWNET); err != nil {
+				writeError("join existing net namespace %s", err)
+			}
+		}
+
+		if _, err := setsid(); err != nil {
+			writeError("setsid %s", err)
+		}
+		if err := setctty(); err != nil {
+			writeError("setctty %s", err)
+		}
+		if err := parentDeathSignal(); err != nil {
+			writeError("parent deth signal %s", err)
+		}
+		if err := SetupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil {
+			writeError("setup mount namespace %s", err)
+		}
+		if err := sethostname(container.ID); err != nil {
+			writeError("sethostname %s", err)
+		}
+		if err := capabilities.DropCapabilities(container); err != nil {
+			writeError("drop capabilities %s", err)
+		}
+		if err := setupUser(container); err != nil {
+			writeError("setup user %s", err)
+		}
+		if container.WorkingDir != "" {
+			if err := chdir(container.WorkingDir); err != nil {
+				writeError("chdir to %s %s", container.WorkingDir, err)
+			}
+		}
+		if err := exec(container.Command.Args[0], container.Command.Args[0:], container.Command.Env); err != nil {
+			writeError("exec %s", err)
+		}
+		panic("unreachable")
+	}
+
+	// parent: relay the container's terminal to our own stdio
+	go func() {
+		if _, err := io.Copy(os.Stdout, master); err != nil {
+			log.Println(err)
+		}
+	}()
+	go func() {
+		if _, err := io.Copy(master, os.Stdin); err != nil {
+			log.Println(err)
+		}
+	}()
+	return pid, nil
+}
+
+// ExecIn will spawn a new command inside an existing container's namespaces. The existing container's
+// pid and namespace configuration is needed along with the specific capabilities that should
+// be dropped once inside the namespace.
+func ExecIn(container *libcontainer.Container, cmd *libcontainer.Command) (int, error) {
+	if container.NsPid <= 0 {
+		return -1, libcontainer.ErrInvalidPid
+	}
+	// reject a missing command before any fd is opened or process forked;
+	// otherwise the child would panic on cmd.Args[0]
+	if cmd == nil || len(cmd.Args) == 0 {
+		return -1, errors.New("no command specified to execute in the container")
+	}
+
+	fds, err := getNsFds(container)
+	if err != nil {
+		return -1, err
+	}
+
+	if container.NetNsFd > 0 {
+		fds = append(fds, container.NetNsFd)
+	}
+
+	pid, err := fork()
+	if err != nil {
+		for _, fd := range fds {
+			syscall.Close(int(fd))
+		}
+		return -1, err
+	}
+
+	if pid == 0 {
+		// child: join every namespace, closing each fd once it has been used
+		for _, fd := range fds {
+			if fd > 0 {
+				if err := JoinExistingNamespace(fd, ""); err != nil {
+					for _, fd := range fds {
+						syscall.Close(int(fd))
+					}
+					writeError("join existing namespace for %d %s", fd, err)
+				}
+			}
+			syscall.Close(int(fd))
+		}
+
+		if container.Namespaces.Contains(libcontainer.CLONE_NEWNS) &&
+			container.Namespaces.Contains(libcontainer.CLONE_NEWPID) {
+			// important:
+			//
+			// we need to fork and unshare so that we can remount proc and sys within
+			// the namespace so the CLONE_NEWPID namespace will take effect
+			// if we don't fork we would end up unmounting proc and sys for the entire
+			// namespace
+			child, err := fork()
+			if err != nil {
+				writeError("fork child %s", err)
+			}
+
+			if child == 0 {
+				if err := unshare(CLONE_NEWNS); err != nil {
+					writeError("unshare newns %s", err)
+				}
+				if err := remountProc(); err != nil {
+					writeError("remount proc %s", err)
+				}
+				if err := remountSys(); err != nil {
+					writeError("remount sys %s", err)
+				}
+				if err := capabilities.DropCapabilities(container); err != nil {
+					writeError("drop caps %s", err)
+				}
+				if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil {
+					writeError("exec %s", err)
+				}
+				panic("unreachable")
+			}
+			// intermediate process: mirror the exit status of the real command
+			exit, err := utils.WaitOnPid(child)
+			if err != nil {
+				writeError("wait on child %s", err)
+			}
+			os.Exit(exit)
+		}
+		if err := exec(cmd.Args[0], cmd.Args[0:], cmd.Env); err != nil {
+			writeError("exec %s", err)
+		}
+		panic("unreachable")
+	}
+	// NOTE(review): the namespace fds stay open in the parent on this path - confirm
+	// who owns them before adding a close here.
+	return pid, nil
+}
+
+func resolveRootfs(container *libcontainer.Container) (string, error) {
+	rootfs, err := filepath.Abs(container.RootFs)
+	if err != nil {
+		return "", err
+	}
+	return filepath.EvalSymlinks(rootfs)
+}
+
+// createMasterAndConsole opens a new pty master, unlocks its slave and returns
+// the master together with the path of the slave device. The master is closed
+// again when any of the later steps fails.
+func createMasterAndConsole() (*os.File, string, error) {
+	master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0)
+	if err != nil {
+		return nil, "", err
+	}
+
+	console, err := ptsname(master)
+	if err != nil {
+		master.Close()
+		return nil, "", err
+	}
+
+	if err := unlockpt(master); err != nil {
+		master.Close()
+		return nil, "", err
+	}
+	return master, console, nil
+}
+
+// closeMasterAndStd closes the pty master and the three standard descriptors.
+// Close errors are ignored and nil is always returned.
+func closeMasterAndStd(master *os.File) error {
+	closefd(master.Fd())
+	closefd(0)
+	closefd(1)
+	closefd(2)
+
+	return nil
+}
+
+// dupSlave duplicates the pty slave, which must be fd 0, onto fds 1 and 2.
+func dupSlave(slave *os.File) error {
+	// we close Stdin,etc so our pty slave should have fd 0
+	if slave.Fd() != 0 {
+		return fmt.Errorf("slave fd not 0 %d", slave.Fd())
+	}
+	if err := dup2(slave.Fd(), 1); err != nil {
+		return err
+	}
+	if err := dup2(slave.Fd(), 2); err != nil {
+		return err
+	}
+	return nil
+}
+
+// openTerminal opens name with syscall.Open, i.e. without the O_CLOEXEC flag
+// that os.OpenFile would add, and wraps the descriptor in an *os.File.
+func openTerminal(name string, flag int) (*os.File, error) {
+	r, e := syscall.Open(name, flag, 0)
+	if e != nil {
+		return nil, &os.PathError{Op: "open", Path: name, Err: e}
+	}
+	return os.NewFile(uintptr(r), name), nil
+}
diff --git a/pkg/libcontainer/namespaces/linux_x86_64.go b/pkg/libcontainer/namespaces/linux_x86_64.go
new file mode 100644
index 0000000000..ac9a014763
--- /dev/null
+++ b/pkg/libcontainer/namespaces/linux_x86_64.go
@@ -0,0 +1,7 @@
+// +build linux,amd64
+
+package namespaces
+
+// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092
+const (
+	// SYS_SETNS is the setns system call number; this value is only valid on amd64.
+	SYS_SETNS = 308
+)
diff --git a/pkg/libcontainer/namespaces/mount.go b/pkg/libcontainer/namespaces/mount.go
new file mode 100644
index 0000000000..6d867c91ec
--- /dev/null
+++ b/pkg/libcontainer/namespaces/mount.go
@@ -0,0 +1,207 @@
+package namespaces
+
+import (
+	"fmt"
+	"log"
+	"os"
+	"path/filepath"
+	"syscall"
+)
+
+var (
+	// default mount point options
+	defaults = syscall.MS_NOEXEC | 
syscall.MS_NOSUID | syscall.MS_NODEV
+)
+
+// SetupNewMountNamespace prepares rootfs inside the current mount namespace
+// and makes it the root of the calling process: it bind mounts rootfs
+// (optionally read-only), mounts the system filesystems, populates /dev,
+// binds the console and finally moves rootfs onto / and chroots into it.
+func SetupNewMountNamespace(rootfs, console string, readonly bool) error {
+	if err := mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil {
+		return fmt.Errorf("mounting / as slave %s", err)
+	}
+
+	if err := mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil {
+		return fmt.Errorf("mouting %s as bind %s", rootfs, err)
+	}
+
+	if readonly {
+		if err := mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil {
+			return fmt.Errorf("mounting %s as readonly %s", rootfs, err)
+		}
+	}
+
+	if err := mountSystem(rootfs); err != nil {
+		return fmt.Errorf("mount system %s", err)
+	}
+
+	if err := copyDevNodes(rootfs); err != nil {
+		return fmt.Errorf("copy dev nodes %s", err)
+	}
+
+	ptmx := filepath.Join(rootfs, "dev/ptmx")
+	if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) {
+		return err
+	}
+	// the target is relative to /dev so the link resolves to the devpts
+	// instance mounted on dev/pts, both before and after the chroot
+	if err := os.Symlink("pts/ptmx", ptmx); err != nil {
+		return fmt.Errorf("symlink dev ptmx %s", err)
+	}
+
+	if err := setupDev(rootfs); err != nil {
+		return err
+	}
+
+	if err := setupConsole(rootfs, console); err != nil {
+		return err
+	}
+
+	if err := chdir(rootfs); err != nil {
+		return fmt.Errorf("chdir into %s %s", rootfs, err)
+	}
+
+	if err := mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil {
+		return fmt.Errorf("mount move %s into / %s", rootfs, err)
+	}
+
+	if err := chroot("."); err != nil {
+		return fmt.Errorf("chroot . %s", err)
+	}
+
+	if err := chdir("/"); err != nil {
+		return fmt.Errorf("chdir / %s", err)
+	}
+
+	// the helpers above cleared the umask, set a regular one for the container
+	umask(0022)
+
+	return nil
+}
+
+// copyDevNodes recreates a fixed list of the host's /dev nodes inside
+// rootfs/dev using the mode and device number of the host node.
+func copyDevNodes(rootfs string) error {
+	// clear the umask so the nodes get exactly the mode of the host nodes
+	umask(0000)
+
+	for _, node := range []string{
+		"null",
+		"zero",
+		"full",
+		"random",
+		"urandom",
+		"tty",
+	} {
+		stat, err := os.Stat(filepath.Join("/dev", node))
+		if err != nil {
+			return err
+		}
+
+		var (
+			dest = filepath.Join(rootfs, "dev", node)
+			st   = stat.Sys().(*syscall.Stat_t)
+		)
+
+		log.Printf("copy %s to %s %d\n", node, dest, st.Rdev)
+		if err := mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) {
+			return fmt.Errorf("copy %s %s", node, err)
+		}
+	}
+	return nil
+}
+
+// setupDev (re)creates the /dev symlinks that point into /proc.
+func setupDev(rootfs string) error {
+	for _, link := range []struct {
+		from string
+		to   string
+	}{
+		{"/proc/kcore", "/dev/core"},
+		{"/proc/self/fd", "/dev/fd"},
+		{"/proc/self/fd/0", "/dev/stdin"},
+		{"/proc/self/fd/1", "/dev/stdout"},
+		{"/proc/self/fd/2", "/dev/stderr"},
+	} {
+		dest := filepath.Join(rootfs, link.to)
+		if err := os.Remove(dest); err != nil && !os.IsNotExist(err) {
+			return fmt.Errorf("remove %s %s", dest, err)
+		}
+		if err := os.Symlink(link.from, dest); err != nil {
+			return fmt.Errorf("symlink %s %s", dest, err)
+		}
+	}
+	return nil
+}
+
+// setupConsole restricts the pty slave to root (mode 0600, owner 0:0), creates
+// rootfs/dev/console as a device node and bind mounts the slave on top of it.
+func setupConsole(rootfs, console string) error {
+	umask(0000)
+
+	stat, err := os.Stat(console)
+	if err != nil {
+		return fmt.Errorf("stat console %s %s", console, err)
+	}
+	st := stat.Sys().(*syscall.Stat_t)
+
+	dest := filepath.Join(rootfs, "dev/console")
+	if err := os.Remove(dest); err != nil && !os.IsNotExist(err) {
+		return fmt.Errorf("remove %s %s", dest, err)
+	}
+
+	if err := os.Chmod(console, 0600); err != nil {
+		return err
+	}
+	if err := os.Chown(console, 0, 0); err != nil {
+		return err
+	}
+
+	// keep the file type bits of the slave but force the permissions to 0600
+	if err := mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil {
+		return fmt.Errorf("mknod %s %s", dest, err)
+	}
+
+	if err := mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil {
+		return 
fmt.Errorf("bind %s to %s %s", console, dest, err)
+	}
+	return nil
+}
+
+// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts
+// inside the mount namespace
+func mountSystem(rootfs string) error {
+	// the order matters: dev/shm and dev/pts live on the tmpfs mounted on dev
+	mounts := []struct {
+		source string
+		path   string
+		device string
+		flags  int
+		data   string
+	}{
+		{source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaults},
+		{source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaults},
+		{source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"},
+		{source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaults, data: "mode=1777"},
+		{source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"},
+		{source: "tmpfs", path: filepath.Join(rootfs, "run"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, data: "mode=755"},
+	}
+	for _, m := range mounts {
+		// create the mount point first, it may not exist in the rootfs
+		if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) {
+			return fmt.Errorf("mkdirall %s %s", m.path, err)
+		}
+		if err := mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil {
+			return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err)
+		}
+	}
+	return nil
+}
+
+// remountProc lazily unmounts /proc and mounts a fresh proc filesystem in its
+// place using the default mount options.
+func remountProc() error {
+	if err := unmount("/proc", syscall.MNT_DETACH); err != nil {
+		return err
+	}
+	if err := mount("proc", "/proc", "proc", uintptr(defaults), ""); err != nil {
+		return err
+	}
+	return nil
+}
+
+// remountSys lazily unmounts /sys and mounts a fresh sysfs in its place. An
+// EINVAL from the unmount is ignored and no new sysfs is mounted in that case
+// (presumably /sys was not a mount point - TODO confirm).
+func remountSys() error {
+	if err := unmount("/sys", syscall.MNT_DETACH); err != nil {
+		if err != syscall.EINVAL {
+			return err
+		}
+	} else {
+		if err := mount("sysfs", "/sys", "sysfs", uintptr(defaults), ""); err != nil {
+			return err
+		}
+	}
+	return nil
+}
diff --git a/pkg/libcontainer/namespaces/namespaces.go 
b/pkg/libcontainer/namespaces/namespaces.go
new file mode 100644
index 0000000000..2a50847015
--- /dev/null
+++ b/pkg/libcontainer/namespaces/namespaces.go
@@ -0,0 +1,70 @@
+/*
+   TODO
+   pivot root
+   cgroups
+   more mount stuff that I probably am forgetting
+   apparmor
+*/
+
+package namespaces
+
+import (
+	"fmt"
+	"github.com/dotcloud/docker/pkg/libcontainer"
+	"github.com/dotcloud/docker/pkg/libcontainer/utils"
+	"os"
+	"path/filepath"
+	"syscall"
+)
+
+// CreateNewNamespace creates a new namespace and binds its fd to the specified path.
+// The work is done in a forked child that unshares the namespace and bind
+// mounts its /proc/self/ns entry onto bindTo; an error is returned when the
+// namespace is unknown or when the child exits with a non-zero status.
+func CreateNewNamespace(namespace libcontainer.Namespace, bindTo string) error {
+	// validate before forking: an unknown namespace would otherwise unshare
+	// nothing and bind mount the /proc/self/ns directory itself
+	flag, ok := namespaceMap[namespace]
+	if !ok {
+		return fmt.Errorf("unknown namespace %v", namespace)
+	}
+	name, ok := namespaceFileMap[namespace]
+	if !ok {
+		return fmt.Errorf("no namespace file known for %v", namespace)
+	}
+	nspath := filepath.Join("/proc/self/ns", name)
+
+	pid, err := fork()
+	if err != nil {
+		return err
+	}
+
+	if pid == 0 {
+		if err := unshare(flag); err != nil {
+			writeError("unshare %s", err)
+		}
+		if err := mount(nspath, bindTo, "none", syscall.MS_BIND, ""); err != nil {
+			writeError("bind mount %s", err)
+		}
+		os.Exit(0)
+	}
+	exit, err := utils.WaitOnPid(pid)
+	if err != nil {
+		return err
+	}
+	if exit != 0 {
+		return fmt.Errorf("exit status %d", exit)
+	}
+	return nil
+}
+
+// JoinExistingNamespace uses the fd of an existing linux namespace and
+// has the current process join that namespace. ns restricts the type of
+// namespace that fd may refer to; a name without an entry in namespaceMap
+// (including "") yields flag 0.
+func JoinExistingNamespace(fd uintptr, ns libcontainer.Namespace) error {
+	flag := namespaceMap[ns]
+	if err := setns(fd, uintptr(flag)); err != nil {
+		return err
+	}
+	return nil
+}
+
+// getNamespaceFlags parses the container's Namespaces options to set the correct
+// flags on clone, unshare, and setns
+func getNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) {
+	for _, ns := range namespaces {
+		flag |= namespaceMap[ns]
+	}
+	return
+}
diff --git a/pkg/libcontainer/namespaces/ns_linux.go b/pkg/libcontainer/namespaces/ns_linux.go
new file mode 100644
index 
0000000000..b0e5119130
--- /dev/null
+++ b/pkg/libcontainer/namespaces/ns_linux.go
@@ -0,0 +1,35 @@
+package namespaces
+
+import (
+	"github.com/dotcloud/docker/pkg/libcontainer"
+)
+
+const (
+	// SIGCHLD is or'ed into the clone flags by Exec. On Linux x86 SIGCHLD is
+	// signal 17 (0x11); 0x14 is SIGTSTP there.
+	SIGCHLD       = 0x11
+	CLONE_VFORK   = 0x00004000
+	CLONE_NEWNS   = 0x00020000
+	CLONE_NEWUTS  = 0x04000000
+	CLONE_NEWIPC  = 0x08000000
+	CLONE_NEWUSER = 0x10000000
+	CLONE_NEWPID  = 0x20000000
+	CLONE_NEWNET  = 0x40000000
+)
+
+// namespaceMap maps a libcontainer namespace to its clone/unshare/setns flag.
+// The empty namespace maps to 0.
+var namespaceMap = map[libcontainer.Namespace]int{
+	"": 0,
+	libcontainer.CLONE_NEWNS:   CLONE_NEWNS,
+	libcontainer.CLONE_NEWUTS:  CLONE_NEWUTS,
+	libcontainer.CLONE_NEWIPC:  CLONE_NEWIPC,
+	libcontainer.CLONE_NEWUSER: CLONE_NEWUSER,
+	libcontainer.CLONE_NEWPID:  CLONE_NEWPID,
+	libcontainer.CLONE_NEWNET:  CLONE_NEWNET,
+}
+
+// namespaceFileMap maps a libcontainer namespace to the name of its entry
+// below /proc/<pid>/ns.
+var namespaceFileMap = map[libcontainer.Namespace]string{
+	libcontainer.CLONE_NEWNS:   "mnt",
+	libcontainer.CLONE_NEWUTS:  "uts",
+	libcontainer.CLONE_NEWIPC:  "ipc",
+	libcontainer.CLONE_NEWUSER: "user",
+	libcontainer.CLONE_NEWPID:  "pid",
+	libcontainer.CLONE_NEWNET:  "net",
+}
diff --git a/pkg/libcontainer/namespaces/utils.go b/pkg/libcontainer/namespaces/utils.go
new file mode 100644
index 0000000000..438d896484
--- /dev/null
+++ b/pkg/libcontainer/namespaces/utils.go
@@ -0,0 +1,108 @@
+package namespaces
+
+import (
+	"fmt"
+	"github.com/dotcloud/docker/pkg/libcontainer"
+	"os"
+	"path/filepath"
+	"strconv"
+	"strings"
+	"syscall"
+)
+
+// addEnvIfNotSet appends key=value to the environment of the container's
+// command unless a variable named key is already present.
+func addEnvIfNotSet(container *libcontainer.Container, key, value string) {
+	jv := fmt.Sprintf("%s=%s", key, value)
+	if len(container.Command.Env) == 0 {
+		container.Command.Env = []string{jv}
+		return
+	}
+
+	for _, v := range container.Command.Env {
+		parts := strings.Split(v, "=")
+		if parts[0] == key {
+			return
+		}
+	}
+	container.Command.Env = append(container.Command.Env, jv)
+}
+
+// print and error to stderr and exit(1)
+func writeError(format string, v ...interface{}) {
+	fmt.Fprintf(os.Stderr, format, v...)
+	os.Exit(1)
+}
+
+// getNsFds inspects the container's namespace configuration and opens the fds to
+// each of the namespaces. Descriptors that were already opened are closed
+// again when a later one cannot be opened.
+func getNsFds(container *libcontainer.Container) ([]uintptr, error) {
+	var (
+		namespaces = []string{}
+		fds        = []uintptr{}
+	)
+
+	for _, ns := range container.Namespaces {
+		namespaces = append(namespaces, namespaceFileMap[ns])
+	}
+
+	for _, ns := range namespaces {
+		fd, err := getNsFd(container.NsPid, ns)
+		if err != nil {
+			for _, fd = range fds {
+				syscall.Close(int(fd))
+			}
+			return nil, err
+		}
+		fds = append(fds, fd)
+	}
+	return fds, nil
+}
+
+// getNsFd returns the fd for a specific pid and namespace option. The caller
+// owns the descriptor and has to close it.
+func getNsFd(pid int, ns string) (uintptr, error) {
+	nspath := filepath.Join("/proc", strconv.Itoa(pid), "ns", ns)
+	// Open the path directly instead of returning the Fd of an *os.File: such
+	// a file becomes unreachable on return and its finalizer would close the
+	// descriptor behind the caller's back. O_CLOEXEC is kept, as os.OpenFile
+	// set it before.
+	fd, err := syscall.Open(nspath, syscall.O_RDONLY|syscall.O_CLOEXEC, 0)
+	if err != nil {
+		return 0, &os.PathError{Op: "open", Path: nspath, Err: err}
+	}
+	return uintptr(fd), nil
+}
+
+// setupEnvironment adds additional environment variables to the container's
+// Command such as USER, LOGNAME, container, and TERM
+func setupEnvironment(container *libcontainer.Container) {
+	addEnvIfNotSet(container, "container", "docker")
+	// TODO: check if pty
+	addEnvIfNotSet(container, "TERM", "xterm")
+	// TODO: get username from container
+	addEnvIfNotSet(container, "USER", "root")
+	addEnvIfNotSet(container, "LOGNAME", "root")
+}
+
+// setupUser clears the supplementary groups and sets the real, effective and
+// saved gid and uid to 0. The User field of the container is not used yet.
+func setupUser(container *libcontainer.Container) error {
+	// TODO: honor user passed on container
+	if err := setgroups(nil); err != nil {
+		return err
+	}
+	if err := setresgid(0, 0, 0); err != nil {
+		return err
+	}
+	if err := setresuid(0, 0, 0); err != nil {
+		return err
+	}
+	return nil
+}
+
+// getMasterAndConsole opens a pty master and returns the path of its slave
+// device together with the master. The master is closed when the slave name
+// cannot be determined.
+func getMasterAndConsole(container *libcontainer.Container) (string, *os.File, error) {
+	master, err := openpmtx()
+	if err != nil {
+		return "", nil, err
+	}
+
+	console, err := ptsname(master)
+	if err != nil {
+		master.Close()
+		return "", nil, err
+	}
+	return console, master, nil
+}
diff --git 
a/pkg/libcontainer/network/network.go b/pkg/libcontainer/network/network.go new file mode 100644 index 0000000000..31c5d32492 --- /dev/null +++ b/pkg/libcontainer/network/network.go @@ -0,0 +1,118 @@ +package network + +import ( + "errors" + "github.com/dotcloud/docker/pkg/netlink" + "net" +) + +var ( + ErrNoDefaultRoute = errors.New("no default network route found") +) + +// InterfaceUp looks up the interface called name and brings its link up. +func InterfaceUp(name string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkLinkUp(iface) +} + +// InterfaceDown looks up the interface called name and takes its link down. +func InterfaceDown(name string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkLinkDown(iface) +} + +// ChangeInterfaceName renames the interface called old to newName. +func ChangeInterfaceName(old, newName string) error { + iface, err := net.InterfaceByName(old) + if err != nil { + return err + } + return netlink.NetworkChangeName(iface, newName) +} + +// CreateVethPair asks netlink to create a veth pair named name1 and name2. +func CreateVethPair(name1, name2 string) error { + return netlink.NetworkCreateVethPair(name1, name2) +} + +// SetInterfaceInNamespacePid hands the interface called name to netlink.NetworkSetNsPid together with nsPid. +func SetInterfaceInNamespacePid(name string, nsPid int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetNsPid(iface, nsPid) +} + +// SetInterfaceInNamespaceFd hands the interface called name to netlink.NetworkSetNsFd together with fd. +func SetInterfaceInNamespaceFd(name string, fd int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetNsFd(iface, fd) +} + +// SetInterfaceMaster looks up both interfaces and sets master as the master of name. +func SetInterfaceMaster(name, master string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + masterIface, err := net.InterfaceByName(master) + if err != nil { + return err + } + return netlink.NetworkSetMaster(iface, masterIface) +} + +// SetDefaultGateway parses ip and passes it to netlink.AddDefaultGw. +// It returns a *net.ParseError when ip is not a valid IP address. +func SetDefaultGateway(ip string) error { + gw := net.ParseIP(ip) + if gw == nil { + return &net.ParseError{Type: "IP address", Text: ip} + } + return netlink.AddDefaultGw(gw) +} + +// SetInterfaceIp parses rawIp as a CIDR address and adds it to the interface called name. +func SetInterfaceIp(name string, rawIp string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + ip, ipNet, err := net.ParseCIDR(rawIp) + if err != nil { + return err + } + return netlink.NetworkLinkAddIp(iface, ip, 
ipNet) +} + +func SetMtu(name string, mtu int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetMTU(iface, mtu) +} + +func GetDefaultMtu() (int, error) { + routes, err := netlink.NetworkGetRoutes() + if err != nil { + return -1, err + } + for _, r := range routes { + if r.Default { + return r.Iface.MTU, nil + } + } + return -1, ErrNoDefaultRoute +} diff --git a/pkg/libcontainer/network/veth.go b/pkg/libcontainer/network/veth.go new file mode 100644 index 0000000000..dc207b3394 --- /dev/null +++ b/pkg/libcontainer/network/veth.go @@ -0,0 +1,89 @@ +package network + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/namespaces" + "os" + "syscall" +) + +// SetupVeth sets up an existing network namespace with the specified +// network configuration. +func SetupVeth(config *libcontainer.Network) error { + // take the temporary veth down, rename it to eth0 and assign the configured address + if err := InterfaceDown(config.TempVethName); err != nil { + return fmt.Errorf("interface down %s %s", config.TempVethName, err) + } + if err := ChangeInterfaceName(config.TempVethName, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", config.TempVethName, err) + } + if err := SetInterfaceIp("eth0", config.IP); err != nil { + return fmt.Errorf("set eth0 ip %s", err) + } + + // apply the configured MTU before bringing eth0 up + if err := SetMtu("eth0", config.Mtu); err != nil { + return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("eth0"); err != nil { + return fmt.Errorf("eth0 up %s", err) + } + + // loopback gets the same MTU and is brought up as well + if err := SetMtu("lo", config.Mtu); err != nil { + return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("lo"); err != nil { + return fmt.Errorf("lo up %s", err) + } + + // the default gateway is optional and only set when configured + if config.Gateway != "" { + if err := SetDefaultGateway(config.Gateway); err != nil { + return fmt.Errorf("set gateway to %s %s", config.Gateway, err) + } + } + return nil +} + +// SetupNamespaceMountDir prepares a new root for use as a mount +// source for 
bind mounting namespace fd to an outside path +func SetupNamespaceMountDir(root string) error { + if err := os.MkdirAll(root, 0755); err != nil { // a directory needs the execute bit to be traversable + return err + } + // make sure mounts are not unmounted by other mnt namespaces + if err := syscall.Mount("", root, "none", syscall.MS_SHARED|syscall.MS_REC, ""); err != nil && err != syscall.EINVAL { + return err + } + if err := syscall.Mount(root, root, "none", syscall.MS_BIND, ""); err != nil { + return err + } + return nil +} + +// CreateNetworkNamespace creates a new network namespace and binds its fd +// at the binding path +func CreateNetworkNamespace(bindingPath string) error { + f, err := os.OpenFile(bindingPath, os.O_RDONLY|os.O_CREATE|os.O_EXCL, 0) + if err != nil { + return err + } + f.Close() + + if err := namespaces.CreateNewNamespace(libcontainer.CLONE_NEWNET, bindingPath); err != nil { + return err + } + return nil +} + +// DeleteNetworkNamespace unmounts the binding path and removes the +// file so that no references to the fd are present and the network +// namespace is automatically cleaned up +func DeleteNetworkNamespace(bindingPath string) error { + if err := syscall.Unmount(bindingPath, 0); err != nil { + return err + } + return os.Remove(bindingPath) +} diff --git a/pkg/libcontainer/privileged.json b/pkg/libcontainer/privileged.json new file mode 100644 index 0000000000..be877ad335 --- /dev/null +++ b/pkg/libcontainer/privileged.json @@ -0,0 +1,22 @@ +{ + "id": "koye", + "namespace_pid": 3745, + "command": { + "args": [ + "/usr/lib/systemd/systemd" + ], + "environment": [ + "HOME=/", + "PATH=/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=" + ] + }, + "rootfs": "/root/main/mycontainer", + "namespaces": [ + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS" + ] +} diff --git a/pkg/libcontainer/types.go b/pkg/libcontainer/types.go new file mode 100644 index 0000000000..db1c3b9738 --- /dev/null +++ b/pkg/libcontainer/types.go @@ -0,0 +1,49 @@ +package libcontainer + +type Namespace 
string +type Namespaces []Namespace + +func (n Namespaces) Contains(ns Namespace) bool { + for _, nns := range n { + if nns == ns { + return true + } + } + return false +} + +type Capability string +type Capabilities []Capability + +func (c Capabilities) Contains(capp Capability) bool { + for _, cc := range c { + if cc == capp { + return true + } + } + return false +} + +const ( + CAP_SETPCAP Capability = "SETPCAP" + CAP_SYS_MODULE Capability = "SYS_MODULE" + CAP_SYS_RAWIO Capability = "SYS_RAWIO" + CAP_SYS_PACCT Capability = "SYS_PACCT" + CAP_SYS_ADMIN Capability = "SYS_ADMIN" + CAP_SYS_NICE Capability = "SYS_NICE" + CAP_SYS_RESOURCE Capability = "SYS_RESOURCE" + CAP_SYS_TIME Capability = "SYS_TIME" + CAP_SYS_TTY_CONFIG Capability = "SYS_TTY_CONFIG" + CAP_MKNOD Capability = "MKNOD" + CAP_AUDIT_WRITE Capability = "AUDIT_WRITE" + CAP_AUDIT_CONTROL Capability = "AUDIT_CONTROL" + CAP_MAC_OVERRIDE Capability = "MAC_OVERRIDE" + CAP_MAC_ADMIN Capability = "MAC_ADMIN" + + CLONE_NEWNS Namespace = "NEWNS" // mount + CLONE_NEWUTS Namespace = "NEWUTS" // utsname + CLONE_NEWIPC Namespace = "NEWIPC" // ipc + CLONE_NEWUSER Namespace = "NEWUSER" // user + CLONE_NEWPID Namespace = "NEWPID" // pid + CLONE_NEWNET Namespace = "NEWNET" // network +) diff --git a/pkg/libcontainer/ubuntu.json b/pkg/libcontainer/ubuntu.json new file mode 100644 index 0000000000..0a450ae066 --- /dev/null +++ b/pkg/libcontainer/ubuntu.json @@ -0,0 +1,22 @@ +{ + "id": "koye", + "namespace_pid": 3745, + "command": { + "args": [ + "/sbin/init" + ], + "environment": [ + "HOME=/", + "PATH=/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm" + ] + }, + "rootfs": "/var/lib/docker/btrfs/subvolumes/7c0f15df1ad2e2fe04d7a6e079aec17406e9465a6a37dd16cb0dd754fc0167b3", + "namespaces": [ + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS" + ] +} diff --git a/pkg/libcontainer/utils/utils.go b/pkg/libcontainer/utils/utils.go new file mode 100644 index 0000000000..7289fecf2e --- /dev/null +++ 
b/pkg/libcontainer/utils/utils.go @@ -0,0 +1,38 @@ +package utils + +import ( + "crypto/rand" + "encoding/hex" + "io" + "os" + "syscall" +) + +// WaitOnPid waits for the process with the given pid and returns its exit code. +// On failure it returns -1 together with the error. +func WaitOnPid(pid int) (exitcode int, err error) { + child, err := os.FindProcess(pid) + if err != nil { + return -1, err + } + state, err := child.Wait() + if err != nil { + return -1, err + } + return getExitCode(state), nil +} + +// getExitCode extracts the exit status from the syscall.WaitStatus held by state. +func getExitCode(state *os.ProcessState) int { + return state.Sys().(syscall.WaitStatus).ExitStatus() +} + +// GenerateRandomName reads size bytes from crypto/rand and returns them hex encoded, +// so the resulting string is 2*size characters long. +func GenerateRandomName(size int) (string, error) { + id := make([]byte, size) + if _, err := io.ReadFull(rand.Reader, id); err != nil { + return "", err + } + return hex.EncodeToString(id), nil +}