diff --git a/container.go b/container.go index 8dd672d0f9..e0a037dc3a 100644 --- a/container.go +++ b/container.go @@ -532,6 +532,7 @@ func (container *Container) Start() (err error) { } populateCommand(container) + container.command.Env = env // Setup logging of stdout and stderr to disk if err := container.runtime.LogToDisk(container.stdout, container.logPath("json"), "stdout"); err != nil { diff --git a/docker/docker.go b/docker/docker.go index 5dda9d269e..3ea3d63027 100644 --- a/docker/docker.go +++ b/docker/docker.go @@ -17,7 +17,7 @@ import ( ) func main() { - if selfPath := utils.SelfPath(); selfPath == "/sbin/init" || selfPath == "/.dockerinit" { + if selfPath := utils.SelfPath(); strings.Contains(selfPath, ".dockerinit") { // Running in init mode sysinit.SysInit() return @@ -39,7 +39,7 @@ func main() { flDefaultIp = flag.String([]string{"#ip", "-ip"}, "0.0.0.0", "Default IP address to use when binding container ports") flInterContainerComm = flag.Bool([]string{"#icc", "-icc"}, true, "Enable inter-container communication") flGraphDriver = flag.String([]string{"s", "-storage-driver"}, "", "Force the docker runtime to use a specific storage driver") - flExecDriver = flag.String([]string{"e", "-exec-driver"}, "", "Force the docker runtime to use a specific exec driver") + flExecDriver = flag.String([]string{"e", "-exec-driver"}, "native", "Force the docker runtime to use a specific exec driver") flHosts = opts.NewListOpts(api.ValidateHost) flMtu = flag.Int([]string{"#mtu", "-mtu"}, 0, "Set the containers network MTU; if no value is provided: default to the default route MTU or 1500 if no default route is available") ) diff --git a/execdriver/chroot/driver.go b/execdriver/chroot/driver.go deleted file mode 100644 index dfec680d84..0000000000 --- a/execdriver/chroot/driver.go +++ /dev/null @@ -1,101 +0,0 @@ -package chroot - -import ( - "fmt" - "github.com/dotcloud/docker/execdriver" - "github.com/dotcloud/docker/pkg/mount" - "os" - "os/exec" - "syscall" -) - -const ( - DriverName = "chroot" - Version = "0.1" -) - -func init() { - execdriver.RegisterInitFunc(DriverName, func(args *execdriver.InitArgs) error { - if err := mount.ForceMount("proc", "proc", "proc", ""); err != nil { - return err - } - defer mount.ForceUnmount("proc") - cmd := exec.Command(args.Args[0], args.Args[1:]...) - - cmd.Stderr = os.Stderr - cmd.Stdout = os.Stdout - cmd.Stdin = os.Stdin - - return cmd.Run() - }) -} - -type driver struct { -} - -func NewDriver() (*driver, error) { - return &driver{}, nil -} - -func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) { - params := []string{ - "chroot", - c.Rootfs, - "/.dockerinit", - "-driver", - DriverName, - } - params = append(params, c.Entrypoint) - params = append(params, c.Arguments...) - - var ( - name = params[0] - arg = params[1:] - ) - aname, err := exec.LookPath(name) - if err != nil { - aname = name - } - c.Path = aname - c.Args = append([]string{name}, arg...) - - if err := c.Start(); err != nil { - return -1, err - } - - if startCallback != nil { - startCallback(c) - } - - err = c.Wait() - return getExitCode(c), err -} - -/// Return the exit code of the process -// if the process has not exited -1 will be returned -func getExitCode(c *execdriver.Command) int { - if c.ProcessState == nil { - return -1 - } - return c.ProcessState.Sys().(syscall.WaitStatus).ExitStatus() -} - -func (d *driver) Kill(p *execdriver.Command, sig int) error { - return p.Process.Kill() -} - -func (d *driver) Restore(c *execdriver.Command) error { - panic("Not Implemented") -} - -func (d *driver) Info(id string) execdriver.Info { - panic("Not implemented") -} - -func (d *driver) Name() string { - return fmt.Sprintf("%s-%s", DriverName, Version) -} - -func (d *driver) GetPidsForContainer(id string) ([]int, error) { - return nil, fmt.Errorf("Not supported") -} diff --git a/execdriver/driver.go b/execdriver/driver.go index a6d865caf3..d64c08fa6c 100644 --- a/execdriver/driver.go +++ b/execdriver/driver.go @@ -51,6 +51,9 @@ type InitArgs struct { Args []string Mtu int Driver string + Console string + Pipe int + Root string } // Driver specific information based on diff --git a/execdriver/native/default_template.go b/execdriver/native/default_template.go new file mode 100644 index 0000000000..91fd646c8e --- /dev/null +++ b/execdriver/native/default_template.go @@ -0,0 +1,82 @@ +package native + +import ( + "fmt" + "github.com/dotcloud/docker/execdriver" + "github.com/dotcloud/docker/pkg/cgroups" + "github.com/dotcloud/docker/pkg/libcontainer" +) + +// createContainer populates and configures the container type with the +// data provided by the execdriver.Command +func createContainer(c *execdriver.Command) *libcontainer.Container { + container := getDefaultTemplate() + + container.Hostname = getEnv("HOSTNAME", c.Env) + container.Tty = c.Tty + container.User = c.User + container.WorkingDir = c.WorkingDir + container.Env = c.Env + + if c.Network != nil { + container.Networks = []*libcontainer.Network{ + { + Mtu: c.Network.Mtu, + Address: fmt.Sprintf("%s/%d", c.Network.IPAddress, c.Network.IPPrefixLen), + Gateway: c.Network.Gateway, + Type: "veth", + Context: libcontainer.Context{ + "prefix": "dock", + "bridge": c.Network.Bridge, + }, + }, + } + } + + container.Cgroups.Name = c.ID + if c.Privileged { + container.Capabilities = nil + container.Cgroups.DeviceAccess = true + } + if c.Resources != nil { + container.Cgroups.CpuShares = c.Resources.CpuShares + container.Cgroups.Memory = c.Resources.Memory + container.Cgroups.MemorySwap = c.Resources.MemorySwap + } + return container +} + +// getDefaultTemplate returns the docker default for +// the libcontainer configuration file +func getDefaultTemplate() *libcontainer.Container { + return &libcontainer.Container{ + Capabilities: libcontainer.Capabilities{ + libcontainer.GetCapability("SETPCAP"), + libcontainer.GetCapability("SYS_MODULE"), + libcontainer.GetCapability("SYS_RAWIO"), + libcontainer.GetCapability("SYS_PACCT"), + libcontainer.GetCapability("SYS_ADMIN"), + libcontainer.GetCapability("SYS_NICE"), + libcontainer.GetCapability("SYS_RESOURCE"), + libcontainer.GetCapability("SYS_TIME"), + libcontainer.GetCapability("SYS_TTY_CONFIG"), + libcontainer.GetCapability("MKNOD"), + libcontainer.GetCapability("AUDIT_WRITE"), + libcontainer.GetCapability("AUDIT_CONTROL"), + libcontainer.GetCapability("MAC_OVERRIDE"), + libcontainer.GetCapability("MAC_ADMIN"), + libcontainer.GetCapability("NET_ADMIN"), + }, + Namespaces: libcontainer.Namespaces{ + libcontainer.GetNamespace("NEWNS"), + libcontainer.GetNamespace("NEWUTS"), + libcontainer.GetNamespace("NEWIPC"), + libcontainer.GetNamespace("NEWPID"), + libcontainer.GetNamespace("NEWNET"), + }, + Cgroups: &cgroups.Cgroup{ + Parent: "docker", + DeviceAccess: false, + }, + } +} diff --git a/execdriver/native/driver.go b/execdriver/native/driver.go new file mode 100644 index 0000000000..88253a5940 --- /dev/null +++ b/execdriver/native/driver.go @@ -0,0 +1,277 @@ +package native + +import ( + "encoding/json" + "fmt" + "github.com/dotcloud/docker/execdriver" + "github.com/dotcloud/docker/pkg/cgroups" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/nsinit" + "github.com/dotcloud/docker/pkg/system" + "io/ioutil" + "os" + "os/exec" + "path/filepath" + "strconv" + "strings" + "syscall" + "time" +) + +const ( + DriverName = "native" + Version = "0.1" +) + +func init() { + execdriver.RegisterInitFunc(DriverName, func(args *execdriver.InitArgs) error { + var ( + container *libcontainer.Container + ns = nsinit.NewNsInit(&nsinit.DefaultCommandFactory{}, &nsinit.DefaultStateWriter{args.Root}) + ) + f, err := os.Open(filepath.Join(args.Root, "container.json")) + if err != nil { + return err + } + if err := json.NewDecoder(f).Decode(&container); err != nil { + f.Close() + return err + } + f.Close() + + cwd, err := os.Getwd() + if err != nil { + return err + } + syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(args.Pipe)) + if err != nil { + return err + } + if err := ns.Init(container, cwd, args.Console, syncPipe, args.Args); err != nil { + return err + } + return nil + }) +} + +type driver struct { + root string +} + +func NewDriver(root string) (*driver, error) { + if err := os.MkdirAll(root, 0700); err != nil { + return nil, err + } + return &driver{ + root: root, + }, nil +} + +func (d *driver) Run(c *execdriver.Command, pipes *execdriver.Pipes, startCallback execdriver.StartCallback) (int, error) { + if err := d.validateCommand(c); err != nil { + return -1, err + } + var ( + term nsinit.Terminal + container = createContainer(c) + factory = &dockerCommandFactory{c: c, driver: d} + stateWriter = &dockerStateWriter{ + callback: startCallback, + c: c, + dsw: &nsinit.DefaultStateWriter{filepath.Join(d.root, c.ID)}, + } + ns = nsinit.NewNsInit(factory, stateWriter) + args = append([]string{c.Entrypoint}, c.Arguments...) + ) + if err := d.createContainerRoot(c.ID); err != nil { + return -1, err + } + defer d.removeContainerRoot(c.ID) + + if c.Tty { + term = &dockerTtyTerm{ + pipes: pipes, + } + } else { + term = &dockerStdTerm{ + pipes: pipes, + } + } + c.Terminal = term + if err := d.writeContainerFile(container, c.ID); err != nil { + return -1, err + } + return ns.Exec(container, term, args) +} + +func (d *driver) Kill(p *execdriver.Command, sig int) error { + return syscall.Kill(p.Process.Pid, syscall.Signal(sig)) +} + +func (d *driver) Restore(c *execdriver.Command) error { + var nspid int + f, err := os.Open(filepath.Join(d.root, c.ID, "pid")) + if err != nil { + return err + } + defer d.removeContainerRoot(c.ID) + + if _, err := fmt.Fscanf(f, "%d", &nspid); err != nil { + f.Close() + return err + } + f.Close() + + if _, err := os.FindProcess(nspid); err != nil { + return fmt.Errorf("finding existing pid %d %s", nspid, err) + } + c.Process = &os.Process{ + Pid: nspid, + } + ticker := time.NewTicker(500 * time.Millisecond) + defer ticker.Stop() + + for _ = range ticker.C { + if err := syscall.Kill(nspid, 0); err != nil { + if strings.Contains(err.Error(), "no such process") { + return nil + } + return fmt.Errorf("signal error %s", err) + } + } + return nil +} + +func (d *driver) Info(id string) execdriver.Info { + return &info{ + ID: id, + driver: d, + } +} + +func (d *driver) Name() string { + return fmt.Sprintf("%s-%s", DriverName, Version) +} + +// TODO: this can be improved with our driver +// there has to be a better way to do this +func (d *driver) GetPidsForContainer(id string) ([]int, error) { + pids := []int{} + + subsystem := "devices" + cgroupRoot, err := cgroups.FindCgroupMountpoint(subsystem) + if err != nil { + return pids, err + } + cgroupDir, err := cgroups.GetThisCgroupDir(subsystem) + if err != nil { + return pids, err + } + + filename := filepath.Join(cgroupRoot, cgroupDir, id, "tasks") + if _, err := os.Stat(filename); os.IsNotExist(err) { + filename = filepath.Join(cgroupRoot, cgroupDir, "docker", id, "tasks") + } + + output, err := ioutil.ReadFile(filename) + if err != nil { + return pids, err + } + for _, p := range strings.Split(string(output), "\n") { + if len(p) == 0 { + continue + } + pid, err := strconv.Atoi(p) + if err != nil { + return pids, fmt.Errorf("Invalid pid '%s': %s", p, err) + } + pids = append(pids, pid) + } + return pids, nil +} + +func (d *driver) writeContainerFile(container *libcontainer.Container, id string) error { + data, err := json.Marshal(container) + if err != nil { + return err + } + return ioutil.WriteFile(filepath.Join(d.root, id, "container.json"), data, 0655) +} + +func (d *driver) createContainerRoot(id string) error { + return os.MkdirAll(filepath.Join(d.root, id), 0655) +} + +func (d *driver) removeContainerRoot(id string) error { + return os.RemoveAll(filepath.Join(d.root, id)) +} + +func (d *driver) validateCommand(c *execdriver.Command) error { + // we need to check the Config of the command to make sure that we + // do not have any of the lxc-conf variables + for _, conf := range c.Config { + if strings.Contains(conf, "lxc") { + return fmt.Errorf("%s is not supported by the native driver", conf) + } + } + return nil +} + +func getEnv(key string, env []string) string { + for _, pair := range env { + parts := strings.Split(pair, "=") + if parts[0] == key { + return parts[1] + } + } + return "" +} + +type dockerCommandFactory struct { + c *execdriver.Command + driver *driver +} + +// createCommand will return an exec.Cmd with the Cloneflags set to the proper namespaces +// defined on the container's configuration and use the current binary as the init with the +// args provided +func (d *dockerCommandFactory) Create(container *libcontainer.Container, console string, syncFd uintptr, args []string) *exec.Cmd { + // we need to join the rootfs because nsinit will setup the rootfs and chroot + initPath := filepath.Join(d.c.Rootfs, d.c.InitPath) + + d.c.Path = initPath + d.c.Args = append([]string{ + initPath, + "-driver", DriverName, + "-console", console, + "-pipe", fmt.Sprint(syncFd), + "-root", filepath.Join(d.driver.root, d.c.ID), + }, args...) + + // set this to nil so that when we set the clone flags anything else is reset + d.c.SysProcAttr = nil + system.SetCloneFlags(&d.c.Cmd, uintptr(nsinit.GetNamespaceFlags(container.Namespaces))) + + d.c.Env = container.Env + d.c.Dir = d.c.Rootfs + + return &d.c.Cmd +} + +type dockerStateWriter struct { + dsw nsinit.StateWriter + c *execdriver.Command + callback execdriver.StartCallback +} + +func (d *dockerStateWriter) WritePid(pid int) error { + err := d.dsw.WritePid(pid) + if d.callback != nil { + d.callback(d.c) + } + return err +} + +func (d *dockerStateWriter) DeletePid() error { + return d.dsw.DeletePid() +} diff --git a/execdriver/native/info.go b/execdriver/native/info.go new file mode 100644 index 0000000000..aef2f85c6b --- /dev/null +++ b/execdriver/native/info.go @@ -0,0 +1,21 @@ +package native + +import ( + "os" + "path/filepath" +) + +type info struct { + ID string + driver *driver +} + +// IsRunning is determined by looking for the +// pid file for a container. If the file exists then the +// container is currently running +func (i *info) IsRunning() bool { + if _, err := os.Stat(filepath.Join(i.driver.root, i.ID, "pid")); err == nil { + return true + } + return false +} diff --git a/execdriver/native/term.go b/execdriver/native/term.go new file mode 100644 index 0000000000..ec69820f75 --- /dev/null +++ b/execdriver/native/term.go @@ -0,0 +1,42 @@ +/* + These types are wrappers around the libcontainer Terminal interface so that + we can resuse the docker implementations where possible. +*/ +package native + +import ( + "github.com/dotcloud/docker/execdriver" + "io" + "os" + "os/exec" +) + +type dockerStdTerm struct { + execdriver.StdConsole + pipes *execdriver.Pipes +} + +func (d *dockerStdTerm) Attach(cmd *exec.Cmd) error { + return d.AttachPipes(cmd, d.pipes) +} + +func (d *dockerStdTerm) SetMaster(master *os.File) { + // do nothing +} + +type dockerTtyTerm struct { + execdriver.TtyConsole + pipes *execdriver.Pipes +} + +func (t *dockerTtyTerm) Attach(cmd *exec.Cmd) error { + go io.Copy(t.pipes.Stdout, t.MasterPty) + if t.pipes.Stdin != nil { + go io.Copy(t.MasterPty, t.pipes.Stdin) + } + return nil +} + +func (t *dockerTtyTerm) SetMaster(master *os.File) { + t.MasterPty = master +} diff --git a/execdriver/termconsole.go b/execdriver/termconsole.go index 2da48d1864..af6b88d3d1 100644 --- a/execdriver/termconsole.go +++ b/execdriver/termconsole.go @@ -5,6 +5,7 @@ import ( "github.com/kr/pty" "io" "os" + "os/exec" ) func SetTerminal(command *Command, pipes *Pipes) error { @@ -25,8 +26,8 @@ func SetTerminal(command *Command, pipes *Pipes) error { } type TtyConsole struct { - master *os.File - slave *os.File + MasterPty *os.File + SlavePty *os.File } func NewTtyConsole(command *Command, pipes *Pipes) (*TtyConsole, error) { @@ -35,28 +36,28 @@ func NewTtyConsole(command *Command, pipes *Pipes) (*TtyConsole, error) { return nil, err } tty := &TtyConsole{ - master: ptyMaster, - slave: ptySlave, + MasterPty: ptyMaster, + SlavePty: ptySlave, } - if err := tty.attach(command, pipes); err != nil { + if err := tty.AttachPipes(&command.Cmd, pipes); err != nil { tty.Close() return nil, err } + command.Console = tty.SlavePty.Name() return tty, nil } func (t *TtyConsole) Master() *os.File { - return t.master + return t.MasterPty } func (t *TtyConsole) Resize(h, w int) error { - return term.SetWinsize(t.master.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)}) + return term.SetWinsize(t.MasterPty.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)}) } -func (t *TtyConsole) attach(command *Command, pipes *Pipes) error { - command.Stdout = t.slave - command.Stderr = t.slave - command.Console = t.slave.Name() +func (t *TtyConsole) AttachPipes(command *exec.Cmd, pipes *Pipes) error { + command.Stdout = t.SlavePty + command.Stderr = t.SlavePty go func() { if wb, ok := pipes.Stdout.(interface { @@ -64,24 +65,24 @@ func (t *TtyConsole) attach(command *Command, pipes *Pipes) error { }); ok { defer wb.CloseWriters() } - io.Copy(pipes.Stdout, t.master) + io.Copy(pipes.Stdout, t.MasterPty) }() if pipes.Stdin != nil { - command.Stdin = t.slave + command.Stdin = t.SlavePty command.SysProcAttr.Setctty = true go func() { defer pipes.Stdin.Close() - io.Copy(t.master, pipes.Stdin) + io.Copy(t.MasterPty, pipes.Stdin) }() } return nil } func (t *TtyConsole) Close() error { - t.slave.Close() - return t.master.Close() + t.SlavePty.Close() + return t.MasterPty.Close() } type StdConsole struct { @@ -90,13 +91,13 @@ type StdConsole struct { func NewStdConsole(command *Command, pipes *Pipes) (*StdConsole, error) { std := &StdConsole{} - if err := std.attach(command, pipes); err != nil { + if err := std.AttachPipes(&command.Cmd, pipes); err != nil { return nil, err } return std, nil } -func (s *StdConsole) attach(command *Command, pipes *Pipes) error { +func (s *StdConsole) AttachPipes(command *exec.Cmd, pipes *Pipes) error { command.Stdout = pipes.Stdout command.Stderr = pipes.Stderr diff --git a/integration/container_test.go b/integration/container_test.go index b961e1d147..4efb95a2a1 100644 --- a/integration/container_test.go +++ b/integration/container_test.go @@ -1044,7 +1044,6 @@ func TestEnv(t *testing.T) { goodEnv := []string{ "PATH=/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin", "HOME=/", - "container=lxc", "HOSTNAME=" + utils.TruncateID(container.ID), "FALSE=true", "TRUE=false", @@ -1581,8 +1580,8 @@ func TestPrivilegedCanMknod(t *testing.T) { eng := NewTestEngine(t) runtime := mkRuntimeFromEngine(eng, t) defer runtime.Nuke() - if output, _ := runContainer(eng, runtime, []string{"-privileged", "_", "sh", "-c", "mknod /tmp/sda b 8 0 && echo ok"}, t); output != "ok\n" { - t.Fatal("Could not mknod into privileged container") + if output, err := runContainer(eng, runtime, []string{"-privileged", "_", "sh", "-c", "mknod /tmp/sda b 8 0 && echo ok"}, t); output != "ok\n" { + t.Fatalf("Could not mknod into privileged container %s %v", output, err) } } diff --git a/integration/runtime_test.go b/integration/runtime_test.go index 9eaf778b96..6003c89b51 100644 --- a/integration/runtime_test.go +++ b/integration/runtime_test.go @@ -85,7 +85,7 @@ func init() { os.Setenv("TEST", "1") // Hack to run sys init during unit testing - if selfPath := utils.SelfPath(); selfPath == "/sbin/init" || selfPath == "/.dockerinit" { + if selfPath := utils.SelfPath(); strings.Contains(selfPath, ".dockerinit") { sysinit.SysInit() return } diff --git a/integration/utils_test.go b/integration/utils_test.go index 50a4c9bf25..05d73df52a 100644 --- a/integration/utils_test.go +++ b/integration/utils_test.go @@ -190,6 +190,7 @@ func newTestEngine(t utils.Fataler, autorestart bool, root string) *engine.Engin job := eng.Job("initserver") job.Setenv("Root", root) job.SetenvBool("AutoRestart", autorestart) + job.Setenv("ExecDriver", "native") // TestGetEnabledCors and TestOptionsRoute require EnableCors=true job.SetenvBool("EnableCors", true) if err := job.Run(); err != nil { diff --git a/pkg/cgroups/cgroups.go b/pkg/cgroups/cgroups.go index 91ac3842ac..b40e1a31fa 100644 --- a/pkg/cgroups/cgroups.go +++ b/pkg/cgroups/cgroups.go @@ -5,10 +5,23 @@ import ( "fmt" "github.com/dotcloud/docker/pkg/mount" "io" + "io/ioutil" "os" + "path/filepath" + "strconv" "strings" ) +type Cgroup struct { + Name string `json:"name,omitempty"` + Parent string `json:"parent,omitempty"` + + DeviceAccess bool `json:"device_access,omitempty"` // name of parent cgroup or slice + Memory int64 `json:"memory,omitempty"` // Memory limit (in bytes) + MemorySwap int64 `json:"memory_swap,omitempty"` // Total memory usage (memory + swap); set `-1' to disable swap + CpuShares int64 `json:"cpu_shares,omitempty"` // CPU shares (relative weight vs. other containers) +} + // https://www.kernel.org/doc/Documentation/cgroups/cgroups.txt func FindCgroupMountpoint(subsystem string) (string, error) { mounts, err := mount.GetMounts() @@ -25,7 +38,6 @@ func FindCgroupMountpoint(subsystem string) (string, error) { } } } - return "", fmt.Errorf("cgroup mountpoint not found for %s", subsystem) } @@ -40,18 +52,199 @@ func GetThisCgroupDir(subsystem string) (string, error) { return parseCgroupFile(subsystem, f) } +func GetInitCgroupDir(subsystem string) (string, error) { + f, err := os.Open("/proc/1/cgroup") + if err != nil { + return "", err + } + defer f.Close() + + return parseCgroupFile(subsystem, f) +} + +func (c *Cgroup) Path(root, subsystem string) (string, error) { + cgroup := c.Name + if c.Parent != "" { + cgroup = filepath.Join(c.Parent, cgroup) + } + initPath, err := GetInitCgroupDir(subsystem) + if err != nil { + return "", err + } + return filepath.Join(root, subsystem, initPath, cgroup), nil +} + +func (c *Cgroup) Join(root, subsystem string, pid int) (string, error) { + path, err := c.Path(root, subsystem) + if err != nil { + return "", err + } + if err := os.MkdirAll(path, 0755); err != nil && !os.IsExist(err) { + return "", err + } + if err := writeFile(path, "tasks", strconv.Itoa(pid)); err != nil { + return "", err + } + return path, nil +} + +func (c *Cgroup) Cleanup(root string) error { + get := func(subsystem string) string { + path, _ := c.Path(root, subsystem) + return path + } + + for _, path := range []string{ + get("memory"), + get("devices"), + get("cpu"), + } { + os.RemoveAll(path) + } + return nil +} + func parseCgroupFile(subsystem string, r io.Reader) (string, error) { s := bufio.NewScanner(r) - for s.Scan() { if err := s.Err(); err != nil { return "", err } text := s.Text() parts := strings.Split(text, ":") - if parts[1] == subsystem { - return parts[2], nil + for _, subs := range strings.Split(parts[1], ",") { + if subs == subsystem { + return parts[2], nil + } } } return "", fmt.Errorf("cgroup '%s' not found in /proc/self/cgroup", subsystem) } + +func writeFile(dir, file, data string) error { + return ioutil.WriteFile(filepath.Join(dir, file), []byte(data), 0700) +} + +func (c *Cgroup) Apply(pid int) error { + // We have two implementation of cgroups support, one is based on + // systemd and the dbus api, and one is based on raw cgroup fs operations + // following the pre-single-writer model docs at: + // http://www.freedesktop.org/wiki/Software/systemd/PaxControlGroups/ + // + // we can pick any subsystem to find the root + cgroupRoot, err := FindCgroupMountpoint("cpu") + if err != nil { + return err + } + cgroupRoot = filepath.Dir(cgroupRoot) + + if _, err := os.Stat(cgroupRoot); err != nil { + return fmt.Errorf("cgroups fs not found") + } + if err := c.setupDevices(cgroupRoot, pid); err != nil { + return err + } + if err := c.setupMemory(cgroupRoot, pid); err != nil { + return err + } + if err := c.setupCpu(cgroupRoot, pid); err != nil { + return err + } + return nil +} + +func (c *Cgroup) setupDevices(cgroupRoot string, pid int) (err error) { + if !c.DeviceAccess { + dir, err := c.Join(cgroupRoot, "devices", pid) + if err != nil { + return err + } + + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if err := writeFile(dir, "devices.deny", "a"); err != nil { + return err + } + + allow := []string{ + // /dev/null, zero, full + "c 1:3 rwm", + "c 1:5 rwm", + "c 1:7 rwm", + + // consoles + "c 5:1 rwm", + "c 5:0 rwm", + "c 4:0 rwm", + "c 4:1 rwm", + + // /dev/urandom,/dev/random + "c 1:9 rwm", + "c 1:8 rwm", + + // /dev/pts/ - pts namespaces are "coming soon" + "c 136:* rwm", + "c 5:2 rwm", + + // tuntap + "c 10:200 rwm", + } + + for _, val := range allow { + if err := writeFile(dir, "devices.allow", val); err != nil { + return err + } + } + } + return nil +} + +func (c *Cgroup) setupMemory(cgroupRoot string, pid int) (err error) { + if c.Memory != 0 || c.MemorySwap != 0 { + dir, err := c.Join(cgroupRoot, "memory", pid) + if err != nil { + return err + } + defer func() { + if err != nil { + os.RemoveAll(dir) + } + }() + + if c.Memory != 0 { + if err := writeFile(dir, "memory.limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + if err := writeFile(dir, "memory.soft_limit_in_bytes", strconv.FormatInt(c.Memory, 10)); err != nil { + return err + } + } + // By default, MemorySwap is set to twice the size of RAM. + // If you want to omit MemorySwap, set it to `-1'. + if c.MemorySwap != -1 { + if err := writeFile(dir, "memory.memsw.limit_in_bytes", strconv.FormatInt(c.Memory*2, 10)); err != nil { + return err + } + } + } + return nil +} + +func (c *Cgroup) setupCpu(cgroupRoot string, pid int) (err error) { + // We always want to join the cpu group, to allow fair cpu scheduling + // on a container basis + dir, err := c.Join(cgroupRoot, "cpu", pid) + if err != nil { + return err + } + if c.CpuShares != 0 { + if err := writeFile(dir, "cpu.shares", strconv.FormatInt(c.CpuShares, 10)); err != nil { + return err + } + } + return nil +} diff --git a/pkg/libcontainer/MAINTAINERS b/pkg/libcontainer/MAINTAINERS new file mode 100644 index 0000000000..e53d933d47 --- /dev/null +++ b/pkg/libcontainer/MAINTAINERS @@ -0,0 +1,2 @@ +Michael Crosby (@crosbymichael) +Guillaume Charmes (@creack) diff --git a/pkg/libcontainer/README.md b/pkg/libcontainer/README.md new file mode 100644 index 0000000000..d6e4dedd63 --- /dev/null +++ b/pkg/libcontainer/README.md @@ -0,0 +1,90 @@ +## libcontainer - reference implementation for containers + +#### background + +libcontainer specifies configuration options for what a container is. It provides a native Go implementation +for using linux namespaces with no external dependencies. libcontainer provides many convience functions for working with namespaces, networking, and management. + + +#### container +A container is a self contained directory that is able to run one or more processes without +affecting the host system. The directory is usually a full system tree. Inside the directory +a `container.json` file is placed with the runtime configuration for how the processes +should be contained and ran. Environment, networking, and different capabilities for the +process are specified in this file. The configuration is used for each process executed inside the container. + +Sample `container.json` file: +```json +{ + "hostname": "koye", + "tty": true, + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm-256color" + ], + "namespaces": [ + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS", + "NEWNET" + ], + "capabilities": [ + "SETPCAP", + "SYS_MODULE", + "SYS_RAWIO", + "SYS_PACCT", + "SYS_ADMIN", + "SYS_NICE", + "SYS_RESOURCE", + "SYS_TIME", + "SYS_TTY_CONFIG", + "MKNOD", + "AUDIT_WRITE", + "AUDIT_CONTROL", + "MAC_OVERRIDE", + "MAC_ADMIN", + "NET_ADMIN" + ], + "networks": [{ + "type": "veth", + "context": { + "bridge": "docker0", + "prefix": "dock" + }, + "address": "172.17.0.100/16", + "gateway": "172.17.42.1", + "mtu": 1500 + } + ], + "cgroups": { + "name": "docker-koye", + "parent": "docker", + "memory": 5248000 + } +} +``` + +Using this configuration and the current directory holding the rootfs for a process, one can use libcontainer to exec the container. Running the life of the namespace, a `pid` file +is written to the current directory with the pid of the namespaced process to the external world. A client can use this pid to wait, kill, or perform other operation with the container. If a user tries to run an new process inside an existing container with a live namespace the namespace will be joined by the new process. + + +You may also specify an alternate root place where the `container.json` file is read and where the `pid` file will be saved. + +#### nsinit + +`nsinit` is a cli application used as the reference implementation of libcontainer. It is able to +spawn or join new containers giving the current directory. To use `nsinit` cd into a linux +rootfs and copy a `container.json` file into the directory with your specified configuration. + +To execute `/bin/bash` in the current directory as a container just run: +```bash +nsinit exec /bin/bash +``` + +If you wish to spawn another process inside the container while your current bash session is +running just run the exact same command again to get another bash shell or change the command. If the original process dies, PID 1, all other processes spawned inside the container will also be killed and the namespace will be removed. + +You can identify if a process is running in a container by looking to see if `pid` is in the root of the directory. diff --git a/pkg/libcontainer/TODO.md b/pkg/libcontainer/TODO.md new file mode 100644 index 0000000000..f18c0b4c51 --- /dev/null +++ b/pkg/libcontainer/TODO.md @@ -0,0 +1,17 @@ +#### goals +* small and simple - line count is not everything but less code is better +* clean lines between what we do in the pkg +* provide primitives for working with namespaces not cater to every option +* extend via configuration not by features - host networking, no networking, veth network can be accomplished via adjusting the container.json, nothing to do with code + +#### tasks +* proper tty for a new process in an existing container +* use exec or raw syscalls for new process in existing container +* setup proper user in namespace if specified +* implement hook or clean interface for cgroups +* example configs for different setups (host networking, boot init) +* improve pkg documentation with comments +* testing - this is hard in a low level pkg but we could do some, maybe +* pivot root +* selinux +* apparmor diff --git a/pkg/libcontainer/capabilities/capabilities.go b/pkg/libcontainer/capabilities/capabilities.go new file mode 100644 index 0000000000..3c6d752496 --- /dev/null +++ b/pkg/libcontainer/capabilities/capabilities.go @@ -0,0 +1,33 @@ +package capabilities + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/syndtr/gocapability/capability" + "os" +) + +// DropCapabilities drops capabilities for the current process based +// on the container's configuration. +func DropCapabilities(container *libcontainer.Container) error { + if drop := getCapabilities(container); len(drop) > 0 { + c, err := capability.NewPid(os.Getpid()) + if err != nil { + return err + } + c.Unset(capability.CAPS|capability.BOUNDS, drop...) + + if err := c.Apply(capability.CAPS | capability.BOUNDS); err != nil { + return err + } + } + return nil +} + +// getCapabilities returns the specific cap values for the libcontainer types +func getCapabilities(container *libcontainer.Container) []capability.Cap { + drop := []capability.Cap{} + for _, c := range container.Capabilities { + drop = append(drop, c.Value) + } + return drop +} diff --git a/pkg/libcontainer/container.go b/pkg/libcontainer/container.go new file mode 100644 index 0000000000..12a3d7ba8e --- /dev/null +++ b/pkg/libcontainer/container.go @@ -0,0 +1,36 @@ +package libcontainer + +import ( + "github.com/dotcloud/docker/pkg/cgroups" +) + +// Context is a generic key value pair that allows +// arbatrary data to be sent +type Context map[string]string + +// Container defines configuration options for how a +// container is setup inside a directory and how a process should be executed +type Container struct { + Hostname string `json:"hostname,omitempty"` // hostname + ReadonlyFs bool `json:"readonly_fs,omitempty"` // set the containers rootfs as readonly + User string `json:"user,omitempty"` // user to execute the process as + WorkingDir string `json:"working_dir,omitempty"` // current working directory + Env []string `json:"environment,omitempty"` // environment to set + Tty bool `json:"tty,omitempty"` // setup a proper tty or not + Namespaces Namespaces `json:"namespaces,omitempty"` // namespaces to apply + Capabilities Capabilities `json:"capabilities,omitempty"` // capabilities to drop + Networks []*Network `json:"networks,omitempty"` // nil for host's network stack + Cgroups *cgroups.Cgroup `json:"cgroups,omitempty"` +} + +// Network defines configuration for a container's networking stack +// +// The network configuration can be omited from a container causing the +// container to be setup with the host's networking stack +type Network struct { + Type string `json:"type,omitempty"` // type of networking to setup i.e. veth, macvlan, etc + Context Context `json:"context,omitempty"` // generic context for type specific networking options + Address string `json:"address,omitempty"` + Gateway string `json:"gateway,omitempty"` + Mtu int `json:"mtu,omitempty"` +} diff --git a/pkg/libcontainer/container.json b/pkg/libcontainer/container.json new file mode 100644 index 0000000000..83e407467c --- /dev/null +++ b/pkg/libcontainer/container.json @@ -0,0 +1,50 @@ +{ + "hostname": "koye", + "tty": true, + "environment": [ + "HOME=/", + "PATH=PATH=$PATH:/bin:/usr/bin:/sbin:/usr/sbin", + "container=docker", + "TERM=xterm-256color" + ], + "namespaces": [ + "NEWIPC", + "NEWNS", + "NEWPID", + "NEWUTS", + "NEWNET" + ], + "capabilities": [ + "SETPCAP", + "SYS_MODULE", + "SYS_RAWIO", + "SYS_PACCT", + "SYS_ADMIN", + "SYS_NICE", + "SYS_RESOURCE", + "SYS_TIME", + "SYS_TTY_CONFIG", + "MKNOD", + "AUDIT_WRITE", + "AUDIT_CONTROL", + "MAC_OVERRIDE", + "MAC_ADMIN", + "NET_ADMIN" + ], + "networks": [{ + "type": "veth", + "context": { + "bridge": "docker0", + "prefix": "dock" + }, + "address": "172.17.0.100/16", + "gateway": "172.17.42.1", + "mtu": 1500 + } + ], + "cgroups": { + "name": "docker-koye", + "parent": "docker", + "memory": 5248000 + } +} diff --git a/pkg/libcontainer/network/network.go b/pkg/libcontainer/network/network.go new file mode 100644 index 0000000000..8c7a4b618e --- /dev/null +++ b/pkg/libcontainer/network/network.go @@ -0,0 +1,78 @@ +package network + +import ( + "github.com/dotcloud/docker/pkg/netlink" + "net" +) + +func InterfaceUp(name string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkLinkUp(iface) +} + +func InterfaceDown(name string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkLinkDown(iface) +} + +func ChangeInterfaceName(old, newName string) error { + iface, err := net.InterfaceByName(old) + if err != nil { + return err + } + return netlink.NetworkChangeName(iface, newName) +} + +func CreateVethPair(name1, name2 string) error { + return netlink.NetworkCreateVethPair(name1, name2) +} + +func SetInterfaceInNamespacePid(name string, nsPid int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetNsPid(iface, nsPid) +} + +func SetInterfaceMaster(name, master string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + masterIface, err := net.InterfaceByName(master) + if err != nil { + return err + } + return netlink.NetworkSetMaster(iface, masterIface) +} + +func SetDefaultGateway(ip string) error { + return netlink.AddDefaultGw(net.ParseIP(ip)) +} + +func SetInterfaceIp(name string, rawIp string) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + ip, ipNet, err := net.ParseCIDR(rawIp) + if err != nil { + return err + } + return netlink.NetworkLinkAddIp(iface, ip, ipNet) +} + +func SetMtu(name string, mtu int) error { + iface, err := net.InterfaceByName(name) + if err != nil { + return err + } + return netlink.NetworkSetMTU(iface, mtu) +} diff --git a/pkg/libcontainer/network/strategy.go b/pkg/libcontainer/network/strategy.go new file mode 100644 index 0000000000..234fcc0aa2 --- /dev/null +++ b/pkg/libcontainer/network/strategy.go @@ -0,0 +1,32 @@ +package network + +import ( + "errors" + "github.com/dotcloud/docker/pkg/libcontainer" +) + +var ( + ErrNotValidStrategyType = errors.New("not a valid network strategy type") +) + +var strategies = map[string]NetworkStrategy{ + "veth": &Veth{}, +} + +// NetworkStrategy represents a specific network configuration for +// a container's networking stack +type NetworkStrategy interface { + Create(*libcontainer.Network, int, libcontainer.Context) error + Initialize(*libcontainer.Network, libcontainer.Context) error +} + +// GetStrategy returns the specific network strategy for the +// provided type. If no strategy is registered for the type an +// ErrNotValidStrategyType is returned. +func GetStrategy(tpe string) (NetworkStrategy, error) { + s, exists := strategies[tpe] + if !exists { + return nil, ErrNotValidStrategyType + } + return s, nil +} diff --git a/pkg/libcontainer/network/veth.go b/pkg/libcontainer/network/veth.go new file mode 100644 index 0000000000..3ab1b2393b --- /dev/null +++ b/pkg/libcontainer/network/veth.go @@ -0,0 +1,100 @@ +package network + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/utils" +) + +// Veth is a network strategy that uses a bridge and creates +// a veth pair, one that stays outside on the host and the other +// is placed inside the container's namespace +type Veth struct { +} + +func (v *Veth) Create(n *libcontainer.Network, nspid int, context libcontainer.Context) error { + var ( + bridge string + prefix string + exists bool + ) + if bridge, exists = n.Context["bridge"]; !exists { + return fmt.Errorf("bridge does not exist in network context") + } + if prefix, exists = n.Context["prefix"]; !exists { + return fmt.Errorf("veth prefix does not exist in network context") + } + name1, name2, err := createVethPair(prefix) + if err != nil { + return err + } + context["veth-host"] = name1 + context["veth-child"] = name2 + if err := SetInterfaceMaster(name1, bridge); err != nil { + return err + } + if err := SetMtu(name1, n.Mtu); err != nil { + return err + } + if err := InterfaceUp(name1); err != nil { + return err + } + if err := SetInterfaceInNamespacePid(name2, nspid); err != nil { + return err + } + return nil +} + +func (v *Veth) Initialize(config *libcontainer.Network, context libcontainer.Context) error { + var ( + vethChild string + exists bool + ) + if vethChild, exists = context["veth-child"]; !exists { + return fmt.Errorf("vethChild does not exist in network context") + } + if err := InterfaceDown(vethChild); err != nil { + return fmt.Errorf("interface down %s %s", vethChild, err) + } + if err := ChangeInterfaceName(vethChild, "eth0"); err != nil { + return fmt.Errorf("change %s to eth0 %s", vethChild, err) + } + if err := SetInterfaceIp("eth0", config.Address); err != nil { + return fmt.Errorf("set eth0 ip %s", err) + } + if err := SetMtu("eth0", config.Mtu); err != nil { + return fmt.Errorf("set eth0 mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("eth0"); err != nil { + return fmt.Errorf("eth0 up %s", err) + } + if err := SetMtu("lo", config.Mtu); err != nil { + return fmt.Errorf("set lo mtu to %d %s", config.Mtu, err) + } + if err := InterfaceUp("lo"); err != nil { + return fmt.Errorf("lo up %s", err) + } + if config.Gateway != "" { + if err := SetDefaultGateway(config.Gateway); err != nil { + return fmt.Errorf("set gateway to %s %s", config.Gateway, err) + } + } + return nil +} + +// createVethPair will automatically generage two random names for +// the veth pair and ensure that they have been created +func createVethPair(prefix string) (name1 string, name2 string, err error) { + name1, err = utils.GenerateRandomName(prefix, 4) + if err != nil { + return + } + name2, err = utils.GenerateRandomName(prefix, 4) + if err != nil { + return + } + if err = CreateVethPair(name1, name2); err != nil { + return + } + return +} diff --git a/pkg/libcontainer/nsinit/command.go b/pkg/libcontainer/nsinit/command.go new file mode 100644 index 0000000000..8ddf1e7e71 --- /dev/null +++ b/pkg/libcontainer/nsinit/command.go @@ -0,0 +1,45 @@ +package nsinit + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/system" + "os" + "os/exec" +) + +// CommandFactory takes the container's configuration and options passed by the +// parent processes and creates an *exec.Cmd that will be used to fork/exec the +// namespaced init process +type CommandFactory interface { + Create(container *libcontainer.Container, console string, syncFd uintptr, args []string) *exec.Cmd +} + +type DefaultCommandFactory struct { + Root string +} + +// Create will return an exec.Cmd with the Cloneflags set to the proper namespaces +// defined on the container's configuration and use the current binary as the init with the +// args provided +func (c *DefaultCommandFactory) Create(container *libcontainer.Container, console string, pipe uintptr, args []string) *exec.Cmd { + // get our binary name from arg0 so we can always reexec ourself + command := exec.Command(os.Args[0], append([]string{ + "-console", console, + "-pipe", fmt.Sprint(pipe), + "-root", c.Root, + "init"}, args...)...) + + system.SetCloneFlags(command, uintptr(GetNamespaceFlags(container.Namespaces))) + command.Env = container.Env + return command +} + +// GetNamespaceFlags parses the container's Namespaces options to set the correct +// flags on clone, unshare, and setns +func GetNamespaceFlags(namespaces libcontainer.Namespaces) (flag int) { + for _, ns := range namespaces { + flag |= ns.Value + } + return flag +} diff --git a/pkg/libcontainer/nsinit/exec.go b/pkg/libcontainer/nsinit/exec.go new file mode 100644 index 0000000000..f1a4e2477a --- /dev/null +++ b/pkg/libcontainer/nsinit/exec.go @@ -0,0 +1,96 @@ +// +build linux + +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/network" + "github.com/dotcloud/docker/pkg/system" + "os" + "os/exec" + "syscall" +) + +// Exec performes setup outside of a namespace so that a container can be +// executed. Exec is a high level function for working with container namespaces. +func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) { + var ( + master *os.File + console string + err error + ) + + // create a pipe so that we can syncronize with the namespaced process and + // pass the veth name to the child + syncPipe, err := NewSyncPipe() + if err != nil { + return -1, err + } + + if container.Tty { + master, console, err = system.CreateMasterAndConsole() + if err != nil { + return -1, err + } + term.SetMaster(master) + } + + command := ns.commandFactory.Create(container, console, syncPipe.child.Fd(), args) + if err := term.Attach(command); err != nil { + return -1, err + } + defer term.Close() + + if err := command.Start(); err != nil { + return -1, err + } + if err := ns.stateWriter.WritePid(command.Process.Pid); err != nil { + command.Process.Kill() + return -1, err + } + defer ns.stateWriter.DeletePid() + + // Do this before syncing with child so that no children + // can escape the cgroup + if err := ns.SetupCgroups(container, command.Process.Pid); err != nil { + command.Process.Kill() + return -1, err + } + if err := ns.InitializeNetworking(container, command.Process.Pid, syncPipe); err != nil { + command.Process.Kill() + return -1, err + } + + // Sync with child + syncPipe.Close() + + if err := command.Wait(); err != nil { + if _, ok := err.(*exec.ExitError); !ok { + return -1, err + } + } + return command.ProcessState.Sys().(syscall.WaitStatus).ExitStatus(), nil +} + +func (ns *linuxNs) SetupCgroups(container *libcontainer.Container, nspid int) error { + if container.Cgroups != nil { + if err := container.Cgroups.Apply(nspid); err != nil { + return err + } + } + return nil +} + +func (ns *linuxNs) InitializeNetworking(container *libcontainer.Container, nspid int, pipe *SyncPipe) error { + context := libcontainer.Context{} + for _, config := range container.Networks { + strategy, err := network.GetStrategy(config.Type) + if err != nil { + return err + } + if err := strategy.Create(config, nspid, context); err != nil { + return err + } + } + return pipe.SendToChild(context) +} diff --git a/pkg/libcontainer/nsinit/execin.go b/pkg/libcontainer/nsinit/execin.go new file mode 100644 index 0000000000..488fe0e248 --- /dev/null +++ b/pkg/libcontainer/nsinit/execin.go @@ -0,0 +1,94 @@ +// +build linux + +package nsinit + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/system" + "os" + "path/filepath" + "strconv" + "syscall" +) + +// ExecIn uses an existing pid and joins the pid's namespaces with the new command. +func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { + for _, ns := range container.Namespaces { + if err := system.Unshare(ns.Value); err != nil { + return -1, err + } + } + fds, err := ns.getNsFds(nspid, container) + closeFds := func() { + for _, f := range fds { + system.Closefd(f) + } + } + if err != nil { + closeFds() + return -1, err + } + + // foreach namespace fd, use setns to join an existing container's namespaces + for _, fd := range fds { + if fd > 0 { + if err := system.Setns(fd, 0); err != nil { + closeFds() + return -1, fmt.Errorf("setns %s", err) + } + } + system.Closefd(fd) + } + + // if the container has a new pid and mount namespace we need to + // remount proc and sys to pick up the changes + if container.Namespaces.Contains("NEWNS") && container.Namespaces.Contains("NEWPID") { + pid, err := system.Fork() + if err != nil { + return -1, err + } + if pid == 0 { + // TODO: make all raw syscalls to be fork safe + if err := system.Unshare(syscall.CLONE_NEWNS); err != nil { + return -1, err + } + if err := remountProc(); err != nil { + return -1, fmt.Errorf("remount proc %s", err) + } + if err := remountSys(); err != nil { + return -1, fmt.Errorf("remount sys %s", err) + } + goto dropAndExec + } + proc, err := os.FindProcess(pid) + if err != nil { + return -1, err + } + state, err := proc.Wait() + if err != nil { + return -1, err + } + os.Exit(state.Sys().(syscall.WaitStatus).ExitStatus()) + } +dropAndExec: + if err := finalizeNamespace(container); err != nil { + return -1, err + } + if err := system.Execv(args[0], args[0:], container.Env); err != nil { + return -1, err + } + panic("unreachable") +} + +func (ns *linuxNs) getNsFds(pid int, container *libcontainer.Container) ([]uintptr, error) { + fds := make([]uintptr, len(container.Namespaces)) + for i, ns := range container.Namespaces { + f, err := os.OpenFile(filepath.Join("/proc/", strconv.Itoa(pid), "ns", ns.File), os.O_RDONLY, 0) + if err != nil { + return fds, err + } + fds[i] = f.Fd() + } + return fds, nil +} diff --git a/pkg/libcontainer/nsinit/init.go b/pkg/libcontainer/nsinit/init.go new file mode 100644 index 0000000000..565030f252 --- /dev/null +++ b/pkg/libcontainer/nsinit/init.go @@ -0,0 +1,153 @@ +// +build linux + +package nsinit + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/capabilities" + "github.com/dotcloud/docker/pkg/libcontainer/network" + "github.com/dotcloud/docker/pkg/libcontainer/utils" + "github.com/dotcloud/docker/pkg/system" + "github.com/dotcloud/docker/pkg/user" + "os" + "syscall" +) + +// Init is the init process that first runs inside a new namespace to setup mounts, users, networking, +// and other options required for the new container. +func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { + rootfs, err := utils.ResolveRootfs(uncleanRootfs) + if err != nil { + return err + } + + // We always read this as it is a way to sync with the parent as well + context, err := syncPipe.ReadFromParent() + if err != nil { + syncPipe.Close() + return err + } + syncPipe.Close() + + if console != "" { + // close pipes so that we can replace it with the pty + closeStdPipes() + slave, err := system.OpenTerminal(console, syscall.O_RDWR) + if err != nil { + return fmt.Errorf("open terminal %s", err) + } + if err := dupSlave(slave); err != nil { + return fmt.Errorf("dup2 slave %s", err) + } + } + if _, err := system.Setsid(); err != nil { + return fmt.Errorf("setsid %s", err) + } + if console != "" { + if err := system.Setctty(); err != nil { + return fmt.Errorf("setctty %s", err) + } + } + + /* + if err := system.ParentDeathSignal(); err != nil { + return fmt.Errorf("parent death signal %s", err) + } + */ + if err := setupNewMountNamespace(rootfs, console, container.ReadonlyFs); err != nil { + return fmt.Errorf("setup mount namespace %s", err) + } + if err := setupNetwork(container, context); err != nil { + return fmt.Errorf("setup networking %s", err) + } + if err := system.Sethostname(container.Hostname); err != nil { + return fmt.Errorf("sethostname %s", err) + } + if err := finalizeNamespace(container); err != nil { + return fmt.Errorf("finalize namespace %s", err) + } + return system.Execv(args[0], args[0:], container.Env) +} + +func closeStdPipes() { + os.Stdin.Close() + os.Stdout.Close() + os.Stderr.Close() +} + +func setupUser(container *libcontainer.Container) error { + switch container.User { + case "root", "": + if err := system.Setgroups(nil); err != nil { + return err + } + if err := system.Setresgid(0, 0, 0); err != nil { + return err + } + if err := system.Setresuid(0, 0, 0); err != nil { + return err + } + default: + uid, gid, suppGids, err := user.GetUserGroupSupplementary(container.User, syscall.Getuid(), syscall.Getgid()) + if err != nil { + return err + } + if err := system.Setgroups(suppGids); err != nil { + return err + } + if err := system.Setgid(gid); err != nil { + return err + } + if err := system.Setuid(uid); err != nil { + return err + } + } + return nil +} + +// dupSlave dup2 the pty slave's fd into stdout and stdin and ensures that +// the slave's fd is 0, or stdin +func dupSlave(slave *os.File) error { + if slave.Fd() != 0 { + return fmt.Errorf("slave fd not 0 %d", slave.Fd()) + } + if err := system.Dup2(slave.Fd(), 1); err != nil { + return err + } + if err := system.Dup2(slave.Fd(), 2); err != nil { + return err + } + return nil +} + +// setupVethNetwork uses the Network config if it is not nil to initialize +// the new veth interface inside the container for use by changing the name to eth0 +// setting the MTU and IP address along with the default gateway +func setupNetwork(container *libcontainer.Container, context libcontainer.Context) error { + for _, config := range container.Networks { + strategy, err := network.GetStrategy(config.Type) + if err != nil { + return err + } + return strategy.Initialize(config, context) + } + return nil +} + +// finalizeNamespace drops the caps and sets the correct user +// and working dir before execing the command inside the namespace +func finalizeNamespace(container *libcontainer.Container) error { + if err := capabilities.DropCapabilities(container); err != nil { + return fmt.Errorf("drop capabilities %s", err) + } + if err := setupUser(container); err != nil { + return fmt.Errorf("setup user %s", err) + } + if container.WorkingDir != "" { + if err := system.Chdir(container.WorkingDir); err != nil { + return fmt.Errorf("chdir to %s %s", container.WorkingDir, err) + } + } + return nil +} diff --git a/pkg/libcontainer/nsinit/mount.go b/pkg/libcontainer/nsinit/mount.go new file mode 100644 index 0000000000..55c2655ab0 --- /dev/null +++ b/pkg/libcontainer/nsinit/mount.go @@ -0,0 +1,254 @@ +// +build linux + +package nsinit + +import ( + "fmt" + "github.com/dotcloud/docker/pkg/system" + "os" + "path/filepath" + "syscall" +) + +// default mount point flags +const defaultMountFlags = syscall.MS_NOEXEC | syscall.MS_NOSUID | syscall.MS_NODEV + +// setupNewMountNamespace is used to initialize a new mount namespace for an new +// container in the rootfs that is specified. +// +// There is no need to unmount the new mounts because as soon as the mount namespace +// is no longer in use, the mounts will be removed automatically +func setupNewMountNamespace(rootfs, console string, readonly bool) error { + // mount as slave so that the new mounts do not propagate to the host + if err := system.Mount("", "/", "", syscall.MS_SLAVE|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mounting / as slave %s", err) + } + if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mouting %s as bind %s", rootfs, err) + } + if readonly { + if err := system.Mount(rootfs, rootfs, "bind", syscall.MS_BIND|syscall.MS_REMOUNT|syscall.MS_RDONLY|syscall.MS_REC, ""); err != nil { + return fmt.Errorf("mounting %s as readonly %s", rootfs, err) + } + } + if err := mountSystem(rootfs); err != nil { + return fmt.Errorf("mount system %s", err) + } + if err := copyDevNodes(rootfs); err != nil { + return fmt.Errorf("copy dev nodes %s", err) + } + if err := setupLoopbackDevices(rootfs); err != nil { + return fmt.Errorf("setup loopback devices %s", err) + } + if err := setupDev(rootfs); err != nil { + return err + } + if console != "" { + if err := setupPtmx(rootfs, console); err != nil { + return err + } + } + if err := system.Chdir(rootfs); err != nil { + return fmt.Errorf("chdir into %s %s", rootfs, err) + } + if err := system.Mount(rootfs, "/", "", syscall.MS_MOVE, ""); err != nil { + return fmt.Errorf("mount move %s into / %s", rootfs, err) + } + if err := system.Chroot("."); err != nil { + return fmt.Errorf("chroot . %s", err) + } + if err := system.Chdir("/"); err != nil { + return fmt.Errorf("chdir / %s", err) + } + + system.Umask(0022) + + return nil +} + +// copyDevNodes mknods the hosts devices so the new container has access to them +func copyDevNodes(rootfs string) error { + oldMask := system.Umask(0000) + defer system.Umask(oldMask) + + for _, node := range []string{ + "null", + "zero", + "full", + "random", + "urandom", + "tty", + } { + if err := copyDevNode(rootfs, node); err != nil { + return err + } + } + return nil +} + +func setupLoopbackDevices(rootfs string) error { + for i := 0; ; i++ { + var ( + device = fmt.Sprintf("loop%d", i) + source = filepath.Join("/dev", device) + dest = filepath.Join(rootfs, "dev", device) + ) + + if _, err := os.Stat(source); err != nil { + if !os.IsNotExist(err) { + return err + } + return nil + } + if _, err := os.Stat(dest); err == nil { + os.Remove(dest) + } + f, err := os.Create(dest) + if err != nil { + return err + } + f.Close() + if err := system.Mount(source, dest, "none", syscall.MS_BIND, ""); err != nil { + return err + } + } + return nil +} + +func copyDevNode(rootfs, node string) error { + stat, err := os.Stat(filepath.Join("/dev", node)) + if err != nil { + return err + } + var ( + dest = filepath.Join(rootfs, "dev", node) + st = stat.Sys().(*syscall.Stat_t) + ) + if err := system.Mknod(dest, st.Mode, int(st.Rdev)); err != nil && !os.IsExist(err) { + return fmt.Errorf("copy %s %s", node, err) + } + return nil +} + +// setupDev symlinks the current processes pipes into the +// appropriate destination on the containers rootfs +func setupDev(rootfs string) error { + for _, link := range []struct { + from string + to string + }{ + {"/proc/kcore", "/dev/core"}, + {"/proc/self/fd", "/dev/fd"}, + {"/proc/self/fd/0", "/dev/stdin"}, + {"/proc/self/fd/1", "/dev/stdout"}, + {"/proc/self/fd/2", "/dev/stderr"}, + } { + dest := filepath.Join(rootfs, link.to) + if err := os.Remove(dest); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("remove %s %s", dest, err) + } + if err := os.Symlink(link.from, dest); err != nil { + return fmt.Errorf("symlink %s %s", dest, err) + } + } + return nil +} + +// setupConsole ensures that the container has a proper /dev/console setup +func setupConsole(rootfs, console string) error { + oldMask := system.Umask(0000) + defer system.Umask(oldMask) + + stat, err := os.Stat(console) + if err != nil { + return fmt.Errorf("stat console %s %s", console, err) + } + var ( + st = stat.Sys().(*syscall.Stat_t) + dest = filepath.Join(rootfs, "dev/console") + ) + if err := os.Remove(dest); err != nil && !os.IsNotExist(err) { + return fmt.Errorf("remove %s %s", dest, err) + } + if err := os.Chmod(console, 0600); err != nil { + return err + } + if err := os.Chown(console, 0, 0); err != nil { + return err + } + if err := system.Mknod(dest, (st.Mode&^07777)|0600, int(st.Rdev)); err != nil { + return fmt.Errorf("mknod %s %s", dest, err) + } + if err := system.Mount(console, dest, "bind", syscall.MS_BIND, ""); err != nil { + return fmt.Errorf("bind %s to %s %s", console, dest, err) + } + return nil +} + +// mountSystem sets up linux specific system mounts like sys, proc, shm, and devpts +// inside the mount namespace +func mountSystem(rootfs string) error { + for _, m := range []struct { + source string + path string + device string + flags int + data string + }{ + {source: "proc", path: filepath.Join(rootfs, "proc"), device: "proc", flags: defaultMountFlags}, + {source: "sysfs", path: filepath.Join(rootfs, "sys"), device: "sysfs", flags: defaultMountFlags}, + {source: "tmpfs", path: filepath.Join(rootfs, "dev"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_STRICTATIME, data: "mode=755"}, + {source: "shm", path: filepath.Join(rootfs, "dev", "shm"), device: "tmpfs", flags: defaultMountFlags, data: "mode=1777"}, + {source: "devpts", path: filepath.Join(rootfs, "dev", "pts"), device: "devpts", flags: syscall.MS_NOSUID | syscall.MS_NOEXEC, data: "newinstance,ptmxmode=0666,mode=620,gid=5"}, + {source: "tmpfs", path: filepath.Join(rootfs, "run"), device: "tmpfs", flags: syscall.MS_NOSUID | syscall.MS_NODEV | syscall.MS_STRICTATIME, data: "mode=755"}, + } { + if err := os.MkdirAll(m.path, 0755); err != nil && !os.IsExist(err) { + return fmt.Errorf("mkdirall %s %s", m.path, err) + } + if err := system.Mount(m.source, m.path, m.device, uintptr(m.flags), m.data); err != nil { + return fmt.Errorf("mounting %s into %s %s", m.source, m.path, err) + } + } + return nil +} + +// setupPtmx adds a symlink to pts/ptmx for /dev/ptmx and +// finishes setting up /dev/console +func setupPtmx(rootfs, console string) error { + ptmx := filepath.Join(rootfs, "dev/ptmx") + if err := os.Remove(ptmx); err != nil && !os.IsNotExist(err) { + return err + } + if err := os.Symlink("pts/ptmx", ptmx); err != nil { + return fmt.Errorf("symlink dev ptmx %s", err) + } + if err := setupConsole(rootfs, console); err != nil { + return err + } + return nil +} + +// remountProc is used to detach and remount the proc filesystem +// commonly needed with running a new process inside an existing container +func remountProc() error { + if err := system.Unmount("/proc", syscall.MNT_DETACH); err != nil { + return err + } + if err := system.Mount("proc", "/proc", "proc", uintptr(defaultMountFlags), ""); err != nil { + return err + } + return nil +} + +func remountSys() error { + if err := system.Unmount("/sys", syscall.MNT_DETACH); err != nil { + if err != syscall.EINVAL { + return err + } + } else { + if err := system.Mount("sysfs", "/sys", "sysfs", uintptr(defaultMountFlags), ""); err != nil { + return err + } + } + return nil +} diff --git a/pkg/libcontainer/nsinit/nsinit.go b/pkg/libcontainer/nsinit/nsinit.go new file mode 100644 index 0000000000..f09a130aa2 --- /dev/null +++ b/pkg/libcontainer/nsinit/nsinit.go @@ -0,0 +1,26 @@ +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" +) + +// NsInit is an interface with the public facing methods to provide high level +// exec operations on a container +type NsInit interface { + Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) + ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) + Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error +} + +type linuxNs struct { + root string + commandFactory CommandFactory + stateWriter StateWriter +} + +func NewNsInit(command CommandFactory, state StateWriter) NsInit { + return &linuxNs{ + commandFactory: command, + stateWriter: state, + } +} diff --git a/pkg/libcontainer/nsinit/nsinit/main.go b/pkg/libcontainer/nsinit/nsinit/main.go new file mode 100644 index 0000000000..61921c59a3 --- /dev/null +++ b/pkg/libcontainer/nsinit/nsinit/main.go @@ -0,0 +1,110 @@ +package main + +import ( + "encoding/json" + "flag" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/libcontainer/nsinit" + "io/ioutil" + "log" + "os" + "path/filepath" + "strconv" +) + +var ( + root, console string + pipeFd int +) + +func registerFlags() { + flag.StringVar(&console, "console", "", "console (pty slave) path") + flag.IntVar(&pipeFd, "pipe", 0, "sync pipe fd") + flag.StringVar(&root, "root", ".", "root for storing configuration data") + + flag.Parse() +} + +func main() { + registerFlags() + + if flag.NArg() < 1 { + log.Fatalf("wrong number of argments %d", flag.NArg()) + } + container, err := loadContainer() + if err != nil { + log.Fatal(err) + } + ns, err := newNsInit() + if err != nil { + log.Fatal(err) + } + + switch flag.Arg(0) { + case "exec": // this is executed outside of the namespace in the cwd + var exitCode int + nspid, err := readPid() + if err != nil { + if !os.IsNotExist(err) { + log.Fatal(err) + } + } + if nspid > 0 { + exitCode, err = ns.ExecIn(container, nspid, flag.Args()[1:]) + } else { + term := nsinit.NewTerminal(os.Stdin, os.Stdout, os.Stderr, container.Tty) + exitCode, err = ns.Exec(container, term, flag.Args()[1:]) + } + if err != nil { + log.Fatal(err) + } + os.Exit(exitCode) + case "init": // this is executed inside of the namespace to setup the container + cwd, err := os.Getwd() + if err != nil { + log.Fatal(err) + } + if flag.NArg() < 2 { + log.Fatalf("wrong number of argments %d", flag.NArg()) + } + syncPipe, err := nsinit.NewSyncPipeFromFd(0, uintptr(pipeFd)) + if err != nil { + log.Fatal(err) + } + if err := ns.Init(container, cwd, console, syncPipe, flag.Args()[1:]); err != nil { + log.Fatal(err) + } + default: + log.Fatalf("command not supported for nsinit %s", flag.Arg(0)) + } +} + +func loadContainer() (*libcontainer.Container, error) { + f, err := os.Open(filepath.Join(root, "container.json")) + if err != nil { + return nil, err + } + defer f.Close() + + var container *libcontainer.Container + if err := json.NewDecoder(f).Decode(&container); err != nil { + return nil, err + } + return container, nil +} + +func readPid() (int, error) { + data, err := ioutil.ReadFile(filepath.Join(root, "pid")) + if err != nil { + return -1, err + } + pid, err := strconv.Atoi(string(data)) + if err != nil { + return -1, err + } + return pid, nil +} + +func newNsInit() (nsinit.NsInit, error) { + return nsinit.NewNsInit(&nsinit.DefaultCommandFactory{root}, &nsinit.DefaultStateWriter{root}), nil +} diff --git a/pkg/libcontainer/nsinit/state.go b/pkg/libcontainer/nsinit/state.go new file mode 100644 index 0000000000..af38008c03 --- /dev/null +++ b/pkg/libcontainer/nsinit/state.go @@ -0,0 +1,28 @@ +package nsinit + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" +) + +// StateWriter handles writing and deleting the pid file +// on disk +type StateWriter interface { + WritePid(pid int) error + DeletePid() error +} + +type DefaultStateWriter struct { + Root string +} + +// writePidFile writes the namespaced processes pid to pid in the rootfs for the container +func (d *DefaultStateWriter) WritePid(pid int) error { + return ioutil.WriteFile(filepath.Join(d.Root, "pid"), []byte(fmt.Sprint(pid)), 0655) +} + +func (d *DefaultStateWriter) DeletePid() error { + return os.Remove(filepath.Join(d.Root, "pid")) +} diff --git a/pkg/libcontainer/nsinit/sync_pipe.go b/pkg/libcontainer/nsinit/sync_pipe.go new file mode 100644 index 0000000000..7b29e98680 --- /dev/null +++ b/pkg/libcontainer/nsinit/sync_pipe.go @@ -0,0 +1,73 @@ +package nsinit + +import ( + "encoding/json" + "fmt" + "github.com/dotcloud/docker/pkg/libcontainer" + "github.com/dotcloud/docker/pkg/system" + "io/ioutil" + "os" +) + +// SyncPipe allows communication to and from the child processes +// to it's parent and allows the two independent processes to +// syncronize their state. +type SyncPipe struct { + parent, child *os.File +} + +func NewSyncPipe() (s *SyncPipe, err error) { + s = &SyncPipe{} + s.child, s.parent, err = os.Pipe() + if err != nil { + return nil, err + } + system.UsetCloseOnExec(s.child.Fd()) + return s, nil +} + +func NewSyncPipeFromFd(parendFd, childFd uintptr) (*SyncPipe, error) { + s := &SyncPipe{} + if parendFd > 0 { + s.parent = os.NewFile(parendFd, "parendPipe") + } else if childFd > 0 { + s.child = os.NewFile(childFd, "childPipe") + } else { + return nil, fmt.Errorf("no valid sync pipe fd specified") + } + return s, nil +} + +func (s *SyncPipe) SendToChild(context libcontainer.Context) error { + data, err := json.Marshal(context) + if err != nil { + return err + } + s.parent.Write(data) + return nil +} + +func (s *SyncPipe) ReadFromParent() (libcontainer.Context, error) { + data, err := ioutil.ReadAll(s.child) + if err != nil { + return nil, fmt.Errorf("error reading from sync pipe %s", err) + } + var context libcontainer.Context + if len(data) > 0 { + if err := json.Unmarshal(data, &context); err != nil { + return nil, err + } + } + return context, nil + +} + +func (s *SyncPipe) Close() error { + if s.parent != nil { + s.parent.Close() + } + if s.child != nil { + s.child.Close() + } + return nil +} diff --git a/pkg/libcontainer/nsinit/term.go b/pkg/libcontainer/nsinit/term.go new file mode 100644 index 0000000000..58dccab2b8 --- /dev/null +++ b/pkg/libcontainer/nsinit/term.go @@ -0,0 +1,118 @@ +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/term" + "io" + "os" + "os/exec" +) + +type Terminal interface { + io.Closer + SetMaster(*os.File) + Attach(*exec.Cmd) error + Resize(h, w int) error +} + +func NewTerminal(stdin io.Reader, stdout, stderr io.Writer, tty bool) Terminal { + if tty { + return &TtyTerminal{ + stdin: stdin, + stdout: stdout, + stderr: stderr, + } + } + return &StdTerminal{ + stdin: stdin, + stdout: stdout, + stderr: stderr, + } +} + +type TtyTerminal struct { + stdin io.Reader + stdout, stderr io.Writer + master *os.File + state *term.State +} + +func (t *TtyTerminal) Resize(h, w int) error { + return term.SetWinsize(t.master.Fd(), &term.Winsize{Height: uint16(h), Width: uint16(w)}) +} + +func (t *TtyTerminal) SetMaster(master *os.File) { + t.master = master +} + +func (t *TtyTerminal) Attach(command *exec.Cmd) error { + go io.Copy(t.stdout, t.master) + go io.Copy(t.master, t.stdin) + + state, err := t.setupWindow(t.master, os.Stdin) + if err != nil { + command.Process.Kill() + return err + } + t.state = state + return err +} + +// SetupWindow gets the parent window size and sets the master +// pty to the current size and set the parents mode to RAW +func (t *TtyTerminal) setupWindow(master, parent *os.File) (*term.State, error) { + ws, err := term.GetWinsize(parent.Fd()) + if err != nil { + return nil, err + } + if err := term.SetWinsize(master.Fd(), ws); err != nil { + return nil, err + } + return term.SetRawTerminal(parent.Fd()) +} + +func (t *TtyTerminal) Close() error { + term.RestoreTerminal(os.Stdin.Fd(), t.state) + return t.master.Close() +} + +type StdTerminal struct { + stdin io.Reader + stdout, stderr io.Writer +} + +func (s *StdTerminal) SetMaster(*os.File) { + // no need to set master on non tty +} + +func (s *StdTerminal) Close() error { + return nil +} + +func (s *StdTerminal) Resize(h, w int) error { + return nil +} + +func (s *StdTerminal) Attach(command *exec.Cmd) error { + inPipe, err := command.StdinPipe() + if err != nil { + return err + } + outPipe, err := command.StdoutPipe() + if err != nil { + return err + } + errPipe, err := command.StderrPipe() + if err != nil { + return err + } + + go func() { + defer inPipe.Close() + io.Copy(inPipe, s.stdin) + }() + + go io.Copy(s.stdout, outPipe) + go io.Copy(s.stderr, errPipe) + + return nil +} diff --git a/pkg/libcontainer/nsinit/unsupported.go b/pkg/libcontainer/nsinit/unsupported.go new file mode 100644 index 0000000000..2412223d28 --- /dev/null +++ b/pkg/libcontainer/nsinit/unsupported.go @@ -0,0 +1,19 @@ +// +build !linux + +package nsinit + +import ( + "github.com/dotcloud/docker/pkg/libcontainer" +) + +func (ns *linuxNs) Exec(container *libcontainer.Container, term Terminal, args []string) (int, error) { + return -1, libcontainer.ErrUnsupported +} + +func (ns *linuxNs) ExecIn(container *libcontainer.Container, nspid int, args []string) (int, error) { + return -1, libcontainer.ErrUnsupported +} + +func (ns *linuxNs) Init(container *libcontainer.Container, uncleanRootfs, console string, syncPipe *SyncPipe, args []string) error { + return libcontainer.ErrUnsupported +} diff --git a/pkg/libcontainer/types.go b/pkg/libcontainer/types.go new file mode 100644 index 0000000000..8c28530140 --- /dev/null +++ b/pkg/libcontainer/types.go @@ -0,0 +1,134 @@ +package libcontainer + +import ( + "encoding/json" + "errors" + "github.com/syndtr/gocapability/capability" + "os" +) + +var ( + ErrUnkownNamespace = errors.New("Unknown namespace") + ErrUnkownCapability = errors.New("Unknown capability") + ErrUnsupported = errors.New("Unsupported method") +) + +// namespaceList is used to convert the libcontainer types +// into the names of the files located in /proc//ns/* for +// each namespace +var ( + namespaceList = Namespaces{} + + capabilityList = Capabilities{ + {Key: "SETPCAP", Value: capability.CAP_SETPCAP}, + {Key: "SYS_MODULE", Value: capability.CAP_SYS_MODULE}, + {Key: "SYS_RAWIO", Value: capability.CAP_SYS_RAWIO}, + {Key: "SYS_PACCT", Value: capability.CAP_SYS_PACCT}, + {Key: "SYS_ADMIN", Value: capability.CAP_SYS_ADMIN}, + {Key: "SYS_NICE", Value: capability.CAP_SYS_NICE}, + {Key: "SYS_RESOURCE", Value: capability.CAP_SYS_RESOURCE}, + {Key: "SYS_TIME", Value: capability.CAP_SYS_TIME}, + {Key: "SYS_TTY_CONFIG", Value: capability.CAP_SYS_TTY_CONFIG}, + {Key: "MKNOD", Value: capability.CAP_MKNOD}, + {Key: "AUDIT_WRITE", Value: capability.CAP_AUDIT_WRITE}, + {Key: "AUDIT_CONTROL", Value: capability.CAP_AUDIT_CONTROL}, + {Key: "MAC_OVERRIDE", Value: capability.CAP_MAC_OVERRIDE}, + {Key: "MAC_ADMIN", Value: capability.CAP_MAC_ADMIN}, + {Key: "NET_ADMIN", Value: capability.CAP_NET_ADMIN}, + } +) + +type ( + Namespace struct { + Key string + Value int + File string + } + Namespaces []*Namespace +) + +func (ns *Namespace) String() string { + return ns.Key +} + +func (ns *Namespace) MarshalJSON() ([]byte, error) { + return json.Marshal(ns.Key) +} + +func (ns *Namespace) UnmarshalJSON(src []byte) error { + var nsName string + if err := json.Unmarshal(src, &nsName); err != nil { + return err + } + ret := GetNamespace(nsName) + if ret == nil { + return ErrUnkownNamespace + } + *ns = *ret + return nil +} + +func GetNamespace(key string) *Namespace { + for _, ns := range namespaceList { + if ns.Key == key { + return ns + } + } + if os.Getenv("DEBUG") != "" { + panic("Unreachable: Namespace not found") + } + return nil +} + +// Contains returns true if the specified Namespace is +// in the slice +func (n Namespaces) Contains(ns string) bool { + return GetNamespace(ns) != nil +} + +type ( + Capability struct { + Key string + Value capability.Cap + } + Capabilities []*Capability +) + +func (c *Capability) String() string { + return c.Key +} + +func (c *Capability) MarshalJSON() ([]byte, error) { + return json.Marshal(c.Key) +} + +func (c *Capability) UnmarshalJSON(src []byte) error { + var capName string + if err := json.Unmarshal(src, &capName); err != nil { + return err + } + ret := GetCapability(capName) + if ret == nil { + return ErrUnkownCapability + } + *c = *ret + return nil +} + +func GetCapability(key string) *Capability { + for _, capp := range capabilityList { + if capp.Key == key { + return capp + } + } + if os.Getenv("DEBUG") != "" { + panic("Unreachable: Capability not found") + } + return nil +} + +// Contains returns true if the specified Capability is +// in the slice +func (c Capabilities) Contains(capp string) bool { + return GetCapability(capp) != nil +} diff --git a/pkg/libcontainer/types_linux.go b/pkg/libcontainer/types_linux.go new file mode 100644 index 0000000000..c14531df20 --- /dev/null +++ b/pkg/libcontainer/types_linux.go @@ -0,0 +1,16 @@ +package libcontainer + +import ( + "syscall" +) + +func init() { + namespaceList = Namespaces{ + {Key: "NEWNS", Value: syscall.CLONE_NEWNS, File: "mnt"}, + {Key: "NEWUTS", Value: syscall.CLONE_NEWUTS, File: "uts"}, + {Key: "NEWIPC", Value: syscall.CLONE_NEWIPC, File: "ipc"}, + {Key: "NEWUSER", Value: syscall.CLONE_NEWUSER, File: "user"}, + {Key: "NEWPID", Value: syscall.CLONE_NEWPID, File: "pid"}, + {Key: "NEWNET", Value: syscall.CLONE_NEWNET, File: "net"}, + } +} diff --git a/pkg/libcontainer/utils/utils.go b/pkg/libcontainer/utils/utils.go new file mode 100644 index 0000000000..0d919bc43d --- /dev/null +++ b/pkg/libcontainer/utils/utils.go @@ -0,0 +1,28 @@ +package utils + +import ( + "crypto/rand" + "encoding/hex" + "io" + "path/filepath" +) + +// GenerateRandomName returns a new name joined with a prefix. This size +// specified is used to truncate the randomly generated value +func GenerateRandomName(prefix string, size int) (string, error) { + id := make([]byte, 32) + if _, err := io.ReadFull(rand.Reader, id); err != nil { + return "", err + } + return prefix + hex.EncodeToString(id)[:size], nil +} + +// ResolveRootfs ensures that the current working directory is +// not a symlink and returns the absolute path to the rootfs +func ResolveRootfs(uncleanRootfs string) (string, error) { + rootfs, err := filepath.Abs(uncleanRootfs) + if err != nil { + return "", err + } + return filepath.EvalSymlinks(rootfs) +} diff --git a/pkg/system/calls_linux.go b/pkg/system/calls_linux.go new file mode 100644 index 0000000000..bf667c535b --- /dev/null +++ b/pkg/system/calls_linux.go @@ -0,0 +1,145 @@ +package system + +import ( + "os/exec" + "syscall" +) + +func Chroot(dir string) error { + return syscall.Chroot(dir) +} + +func Chdir(dir string) error { + return syscall.Chdir(dir) +} + +func Exec(cmd string, args []string, env []string) error { + return syscall.Exec(cmd, args, env) +} + +func Execv(cmd string, args []string, env []string) error { + name, err := exec.LookPath(cmd) + if err != nil { + return err + } + return Exec(name, args, env) +} + +func Fork() (int, error) { + syscall.ForkLock.Lock() + pid, _, err := syscall.Syscall(syscall.SYS_FORK, 0, 0, 0) + syscall.ForkLock.Unlock() + if err != 0 { + return -1, err + } + return int(pid), nil +} + +func Mount(source, target, fstype string, flags uintptr, data string) error { + return syscall.Mount(source, target, fstype, flags, data) +} + +func Unmount(target string, flags int) error { + return syscall.Unmount(target, flags) +} + +func Pivotroot(newroot, putold string) error { + return syscall.PivotRoot(newroot, putold) +} + +func Unshare(flags int) error { + return syscall.Unshare(flags) +} + +func Clone(flags uintptr) (int, error) { + syscall.ForkLock.Lock() + pid, _, err := syscall.RawSyscall(syscall.SYS_CLONE, flags, 0, 0) + syscall.ForkLock.Unlock() + if err != 0 { + return -1, err + } + return int(pid), nil +} + +func UsetCloseOnExec(fd uintptr) error { + if _, _, err := syscall.Syscall(syscall.SYS_FCNTL, fd, syscall.F_SETFD, 0); err != 0 { + return err + } + return nil +} + +func Setgroups(gids []int) error { + return syscall.Setgroups(gids) +} + +func Setresgid(rgid, egid, sgid int) error { + return syscall.Setresgid(rgid, egid, sgid) +} + +func Setresuid(ruid, euid, suid int) error { + return syscall.Setresuid(ruid, euid, suid) +} + +func Setgid(gid int) error { + return syscall.Setgid(gid) +} + +func Setuid(uid int) error { + return syscall.Setuid(uid) +} + +func Sethostname(name string) error { + return syscall.Sethostname([]byte(name)) +} + +func Setsid() (int, error) { + return syscall.Setsid() +} + +func Ioctl(fd uintptr, flag, data uintptr) error { + if _, _, err := syscall.Syscall(syscall.SYS_IOCTL, fd, flag, data); err != 0 { + return err + } + return nil +} + +func Closefd(fd uintptr) error { + return syscall.Close(int(fd)) +} + +func Dup2(fd1, fd2 uintptr) error { + return syscall.Dup2(int(fd1), int(fd2)) +} + +func Mknod(path string, mode uint32, dev int) error { + return syscall.Mknod(path, mode, dev) +} + +func ParentDeathSignal() error { + if _, _, err := syscall.RawSyscall(syscall.SYS_PRCTL, syscall.PR_SET_PDEATHSIG, uintptr(syscall.SIGKILL), 0); err != 0 { + return err + } + return nil +} + +func Setctty() error { + if _, _, err := syscall.RawSyscall(syscall.SYS_IOCTL, 0, uintptr(syscall.TIOCSCTTY), 0); err != 0 { + return err + } + return nil +} + +func Mkfifo(name string, mode uint32) error { + return syscall.Mkfifo(name, mode) +} + +func Umask(mask int) int { + return syscall.Umask(mask) +} + +func SetCloneFlags(cmd *exec.Cmd, flag uintptr) { + if cmd.SysProcAttr == nil { + cmd.SysProcAttr = &syscall.SysProcAttr{} + } + cmd.SysProcAttr.Cloneflags = flag +} diff --git a/pkg/system/errors.go b/pkg/system/errors.go new file mode 100644 index 0000000000..63045186fe --- /dev/null +++ b/pkg/system/errors.go @@ -0,0 +1,9 @@ +package system + +import ( + "errors" +) + +var ( + ErrNotSupportedPlatform = errors.New("platform and architecture is not supported") +) diff --git a/pkg/system/pty_linux.go b/pkg/system/pty_linux.go new file mode 100644 index 0000000000..ca588d8ce9 --- /dev/null +++ b/pkg/system/pty_linux.go @@ -0,0 +1,58 @@ +package system + +import ( + "fmt" + "os" + "syscall" + "unsafe" +) + +// Unlockpt unlocks the slave pseudoterminal device corresponding to the master pseudoterminal referred to by f. +// Unlockpt should be called before opening the slave side of a pseudoterminal. +func Unlockpt(f *os.File) error { + var u int + return Ioctl(f.Fd(), syscall.TIOCSPTLCK, uintptr(unsafe.Pointer(&u))) +} + +// Ptsname retrieves the name of the first available pts for the given master. +func Ptsname(f *os.File) (string, error) { + var n int + + if err := Ioctl(f.Fd(), syscall.TIOCGPTN, uintptr(unsafe.Pointer(&n))); err != nil { + return "", err + } + return fmt.Sprintf("/dev/pts/%d", n), nil +} + +// CreateMasterAndConsole will open /dev/ptmx on the host and retreive the +// pts name for use as the pty slave inside the container +func CreateMasterAndConsole() (*os.File, string, error) { + master, err := os.OpenFile("/dev/ptmx", syscall.O_RDWR|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) + if err != nil { + return nil, "", err + } + console, err := Ptsname(master) + if err != nil { + return nil, "", err + } + if err := Unlockpt(master); err != nil { + return nil, "", err + } + return master, console, nil +} + +// OpenPtmx opens /dev/ptmx, i.e. the PTY master. +func OpenPtmx() (*os.File, error) { + // O_NOCTTY and O_CLOEXEC are not present in os package so we use the syscall's one for all. + return os.OpenFile("/dev/ptmx", syscall.O_RDONLY|syscall.O_NOCTTY|syscall.O_CLOEXEC, 0) +} + +// OpenTerminal is a clone of os.OpenFile without the O_CLOEXEC +// used to open the pty slave inside the container namespace +func OpenTerminal(name string, flag int) (*os.File, error) { + r, e := syscall.Open(name, flag, 0) + if e != nil { + return nil, &os.PathError{"open", name, e} + } + return os.NewFile(uintptr(r), name), nil +} diff --git a/pkg/system/setns_linux.go b/pkg/system/setns_linux.go new file mode 100644 index 0000000000..2b6f9e77ec --- /dev/null +++ b/pkg/system/setns_linux.go @@ -0,0 +1,27 @@ +package system + +import ( + "fmt" + "runtime" + "syscall" +) + +// Via http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=7b21fddd087678a70ad64afc0f632e0f1071b092 +// +// We need different setns values for the different platforms and arch +// We are declaring the macro here because the SETNS syscall does not exist in th stdlib +var setNsMap = map[string]uintptr{ + "linux/amd64": 308, +} + +func Setns(fd uintptr, flags uintptr) error { + ns, exists := setNsMap[fmt.Sprintf("%s/%s", runtime.GOOS, runtime.GOARCH)] + if !exists { + return ErrNotSupportedPlatform + } + _, _, err := syscall.RawSyscall(ns, fd, flags, 0) + if err != 0 { + return err + } + return nil +} diff --git a/pkg/system/unsupported.go b/pkg/system/unsupported.go new file mode 100644 index 0000000000..eb3ec7ee92 --- /dev/null +++ b/pkg/system/unsupported.go @@ -0,0 +1,15 @@ +// +build !linux + +package system + +import ( + "os/exec" +) + +func SetCloneFlags(cmd *exec.Cmd, flag uintptr) { + +} + +func UsetCloseOnExec(fd uintptr) error { + return ErrNotSupportedPlatform +} diff --git a/runtime.go b/runtime.go index 6cebab6e1c..846b47d04e 100644 --- a/runtime.go +++ b/runtime.go @@ -7,8 +7,8 @@ import ( "github.com/dotcloud/docker/dockerversion" "github.com/dotcloud/docker/engine" "github.com/dotcloud/docker/execdriver" - "github.com/dotcloud/docker/execdriver/chroot" "github.com/dotcloud/docker/execdriver/lxc" + "github.com/dotcloud/docker/execdriver/native" "github.com/dotcloud/docker/graphdriver" "github.com/dotcloud/docker/graphdriver/aufs" _ "github.com/dotcloud/docker/graphdriver/btrfs" @@ -702,17 +702,21 @@ func NewRuntimeFromDirectory(config *DaemonConfig, eng *engine.Engine) (*Runtime sysInitPath = localCopy } - sysInfo := sysinfo.New(false) + var ( + ed execdriver.Driver + sysInfo = sysinfo.New(false) + ) - var ed execdriver.Driver - utils.Debugf("execDriver: provided %s", config.ExecDriver) - if config.ExecDriver == "chroot" && false { - // chroot is presently a noop driver https://github.com/dotcloud/docker/pull/4189#issuecomment-35330655 - ed, err = chroot.NewDriver() - utils.Debugf("execDriver: using chroot") - } else { + switch config.ExecDriver { + case "lxc": + // we want to five the lxc driver the full docker root because it needs + // to access and write config and template files in /var/lib/docker/containers/* + // to be backwards compatible ed, err = lxc.NewDriver(config.Root, sysInfo.AppArmor) - utils.Debugf("execDriver: using lxc") + case "native": + ed, err = native.NewDriver(path.Join(config.Root, "execdriver", "native")) + default: + return nil, fmt.Errorf("unknown exec driver %s", config.ExecDriver) } if err != nil { return nil, err diff --git a/sysinit/sysinit.go b/sysinit/sysinit.go index b02cf027aa..c84c05982c 100644 --- a/sysinit/sysinit.go +++ b/sysinit/sysinit.go @@ -5,8 +5,8 @@ import ( "flag" "fmt" "github.com/dotcloud/docker/execdriver" - _ "github.com/dotcloud/docker/execdriver/chroot" _ "github.com/dotcloud/docker/execdriver/lxc" + _ "github.com/dotcloud/docker/execdriver/native" "io/ioutil" "log" "os" @@ -53,19 +53,21 @@ func SysInit() { privileged = flag.Bool("privileged", false, "privileged mode") mtu = flag.Int("mtu", 1500, "interface mtu") driver = flag.String("driver", "", "exec driver") + pipe = flag.Int("pipe", 0, "sync pipe fd") + console = flag.String("console", "", "console (pty slave) path") + root = flag.String("root", ".", "root path for configuration files") ) flag.Parse() // Get env var env []string - content, err := ioutil.ReadFile("/.dockerenv") + content, err := ioutil.ReadFile(".dockerenv") if err != nil { log.Fatalf("Unable to load environment variables: %v", err) } if err := json.Unmarshal(content, &env); err != nil { log.Fatalf("Unable to unmarshal environment variables: %v", err) } - // Propagate the plugin-specific container env variable env = append(env, "container="+os.Getenv("container")) @@ -79,6 +81,9 @@ func SysInit() { Args: flag.Args(), Mtu: *mtu, Driver: *driver, + Console: *console, + Pipe: *pipe, + Root: *root, } if err := executeProgram(args); err != nil {