From 497fc8876ede9924f61c0eee4dfadd71e5d9f537 Mon Sep 17 00:00:00 2001 From: Dan Walsh Date: Mon, 10 Nov 2014 16:14:17 -0500 Subject: [PATCH] Allow IPC namespace to be shared between containers or with the host Some workloads rely on IPC for communications with other processes. We would like to split workloads between two containers but still allow them to communicate through shared IPC. This patch mimics the --net code to allow --ipc=host to not split off the IPC Namespace. ipc=container:CONTAINERID to share ipc between containers If you share IPC between containers, then you need to make sure SELinux labels match. Docker-DCO-1.1-Signed-off-by: Dan Walsh (github: rhatdan) --- daemon/container.go | 28 +++++++++ daemon/create.go | 26 +++++++++ daemon/execdriver/driver.go | 7 +++ daemon/execdriver/native/create.go | 26 +++++++++ docs/man/docker-run.1.md | 81 ++++++++++++++++++++++++-- docs/sources/reference/run.md | 17 ++++++ integration-cli/docker_cli_run_test.go | 70 ++++++++++++++++++++++ runconfig/hostconfig.go | 40 +++++++++++++ runconfig/parse.go | 7 +++ 9 files changed, 298 insertions(+), 4 deletions(-) diff --git a/daemon/container.go b/daemon/container.go index 2ac8316137..bf93787ebf 100644 --- a/daemon/container.go +++ b/daemon/container.go @@ -233,6 +233,18 @@ func populateCommand(c *Container, env []string) error { return fmt.Errorf("invalid network mode: %s", c.hostConfig.NetworkMode) } + ipc := &execdriver.Ipc{} + + if c.hostConfig.IpcMode.IsContainer() { + ic, err := c.getIpcContainer() + if err != nil { + return err + } + ipc.ContainerID = ic.ID + } else { + ipc.HostIpc = c.hostConfig.IpcMode.IsHost() + } + // Build lists of devices allowed and created within the container. 
userSpecifiedDevices := make([]*devices.Device, len(c.hostConfig.Devices)) for i, deviceMapping := range c.hostConfig.Devices { @@ -274,6 +286,7 @@ func populateCommand(c *Container, env []string) error { InitPath: "/.dockerinit", WorkingDir: c.Config.WorkingDir, Network: en, + Ipc: ipc, Resources: resources, AllowedDevices: allowedDevices, AutoCreatedDevices: autoCreatedDevices, @@ -1250,10 +1263,25 @@ func (container *Container) GetMountLabel() string { return container.MountLabel } +func (container *Container) getIpcContainer() (*Container, error) { + containerID := container.hostConfig.IpcMode.Container() + c := container.daemon.Get(containerID) + if c == nil { + return nil, fmt.Errorf("no such container to join IPC: %s", containerID) + } + if !c.IsRunning() { + return nil, fmt.Errorf("cannot join IPC of a non running container: %s", containerID) + } + return c, nil +} + func (container *Container) getNetworkedContainer() (*Container, error) { parts := strings.SplitN(string(container.hostConfig.NetworkMode), ":", 2) switch parts[0] { case "container": + if len(parts) != 2 { + return nil, fmt.Errorf("no container specified to join network") + } nc := container.daemon.Get(parts[1]) if nc == nil { return nil, fmt.Errorf("no such container to join network: %s", parts[1]) diff --git a/daemon/create.go b/daemon/create.go index e72b0ef206..3a71a8ac7e 100644 --- a/daemon/create.go +++ b/daemon/create.go @@ -1,10 +1,13 @@ package daemon import ( + "fmt" + "github.com/docker/docker/engine" "github.com/docker/docker/graph" "github.com/docker/docker/pkg/parsers" "github.com/docker/docker/runconfig" + "github.com/docker/libcontainer/label" ) func (daemon *Daemon) ContainerCreate(job *engine.Job) engine.Status { @@ -80,6 +83,12 @@ func (daemon *Daemon) Create(config *runconfig.Config, hostConfig *runconfig.Hos if warnings, err = daemon.mergeAndVerifyConfig(config, img); err != nil { return nil, nil, err } + if hostConfig != nil && config.SecurityOpt == nil { + 
config.SecurityOpt, err = daemon.GenerateSecurityOpt(hostConfig.IpcMode) + if err != nil { + return nil, nil, err + } + } if container, err = daemon.newContainer(name, config, img); err != nil { return nil, nil, err } @@ -99,3 +108,20 @@ func (daemon *Daemon) Create(config *runconfig.Config, hostConfig *runconfig.Hos } return container, warnings, nil } +func (daemon *Daemon) GenerateSecurityOpt(ipcMode runconfig.IpcMode) ([]string, error) { + if ipcMode.IsHost() { + return label.DisableSecOpt(), nil + } + if ipcContainer := ipcMode.Container(); ipcContainer != "" { + c := daemon.Get(ipcContainer) + if c == nil { + return nil, fmt.Errorf("no such container to join IPC: %s", ipcContainer) + } + if !c.IsRunning() { + return nil, fmt.Errorf("cannot join IPC of a non running container: %s", ipcContainer) + } + + return label.DupSecOpt(c.ProcessLabel), nil + } + return nil, nil +} diff --git a/daemon/execdriver/driver.go b/daemon/execdriver/driver.go index bc2eb24eda..b2febe5761 100644 --- a/daemon/execdriver/driver.go +++ b/daemon/execdriver/driver.go @@ -62,6 +62,12 @@ type Network struct { HostNetworking bool `json:"host_networking"` } +// IPC settings of the container +type Ipc struct { + ContainerID string `json:"container_id"` // id of the container to join ipc. 
+ HostIpc bool `json:"host_ipc"` +} + type NetworkInterface struct { Gateway string `json:"gateway"` IPAddress string `json:"ip"` @@ -106,6 +112,7 @@ type Command struct { WorkingDir string `json:"working_dir"` ConfigPath string `json:"config_path"` // this should be able to be removed when the lxc template is moved into the driver Network *Network `json:"network"` + Ipc *Ipc `json:"ipc"` Resources *Resources `json:"resources"` Mounts []Mount `json:"mounts"` AllowedDevices []*devices.Device `json:"allowed_devices"` diff --git a/daemon/execdriver/native/create.go b/daemon/execdriver/native/create.go index 492247e492..de103eca8a 100644 --- a/daemon/execdriver/native/create.go +++ b/daemon/execdriver/native/create.go @@ -36,6 +36,10 @@ func (d *driver) createContainer(c *execdriver.Command) (*libcontainer.Config, e container.MountConfig.NoPivotRoot = os.Getenv("DOCKER_RAMDISK") != "" container.RestrictSys = true + if err := d.createIpc(container, c); err != nil { + return nil, err + } + if err := d.createNetwork(container, c); err != nil { return nil, err } @@ -124,6 +128,28 @@ func (d *driver) createNetwork(container *libcontainer.Config, c *execdriver.Com return nil } +func (d *driver) createIpc(container *libcontainer.Config, c *execdriver.Command) error { + if c.Ipc.HostIpc { + container.Namespaces["NEWIPC"] = false + return nil + } + + if c.Ipc.ContainerID != "" { + d.Lock() + active := d.activeContainers[c.Ipc.ContainerID] + d.Unlock() + + if active == nil || active.cmd.Process == nil { + return fmt.Errorf("%s is not a valid running container to join", c.Ipc.ContainerID) + } + cmd := active.cmd + + container.IpcNsPath = filepath.Join("/proc", fmt.Sprint(cmd.Process.Pid), "ns", "ipc") + } + + return nil +} + func (d *driver) setPrivileged(container *libcontainer.Config) (err error) { container.Capabilities = capabilities.GetAllCapabilities() container.Cgroups.AllowAllDevices = true diff --git a/docs/man/docker-run.1.md b/docs/man/docker-run.1.md index 
ff3dac17b0..0aa4cad3fe 100644 --- a/docs/man/docker-run.1.md +++ b/docs/man/docker-run.1.md @@ -23,6 +23,7 @@ docker-run - Run a command in a new container [**--expose**[=*[]*]] [**-h**|**--hostname**[=*HOSTNAME*]] [**-i**|**--interactive**[=*false*]] +[**--ipc**[=*[]*]] [**--security-opt**[=*[]*]] [**--link**[=*[]*]] [**--lxc-conf**[=*[]*]] @@ -142,6 +143,12 @@ ENTRYPOINT. **-i**, **--interactive**=*true*|*false* When set to true, keep stdin open even if not attached. The default is false. +**--ipc**=[] + Set the IPC mode for the container + **container**:<*name*|*id*>: reuses another container's IPC stack + **host**: use the host's IPC stack inside the container. + Note: the host mode gives the container full access to local IPC and is therefore considered insecure. + **--security-opt**=*secdriver*:*name*:*value* "label:user:USER" : Set the label user for the container "label:role:ROLE" : Set the label role for the container @@ -183,10 +190,11 @@ and foreground Docker containers. **--net**="bridge" Set the Network mode for the container - 'bridge': creates a new network stack for the container on the docker bridge - 'none': no networking for this container - 'container:': reuses another container network stack - 'host': use the host network stack inside the container. Note: the host mode gives the container full access to local system services such as D-bus and is therefore considered insecure. + **bridge**: creates a new network stack for the container on the docker bridge + **none**: no networking for this container + **container**:<*name*|*id*>: reuses another container's network stack + **host**: use the host network stack inside the container. + Note: the host mode gives the container full access to local system services such as D-bus and is therefore considered insecure. 
**--mac-address**=*macaddress* Set the MAC address for the container's Ethernet device: @@ -310,6 +318,71 @@ you’d like to connect instead, as in: # docker run -a stdin -a stdout -i -t fedora /bin/bash +## Sharing IPC between containers + +Using shm_server.c available here: http://www.cs.cf.ac.uk/Dave/C/node27.html + +Testing `--ipc=host` mode: + +Host shows a shared memory segment with 7 pids attached, happens to be from httpd: + +``` + $ sudo ipcs -m + + ------ Shared Memory Segments -------- + key shmid owner perms bytes nattch status + 0x01128e25 0 root 600 1000 7 +``` + +Now run a regular container, and it correctly does NOT see the shared memory segment from the host: + +``` + $ sudo docker run -it shm ipcs -m + + ------ Shared Memory Segments -------- + key shmid owner perms bytes nattch status +``` + +Run a container with the new `--ipc=host` option, and it now sees the shared memory segment from the host httpd: + + ``` + $ sudo docker run -it --ipc=host shm ipcs -m + + ------ Shared Memory Segments -------- + key shmid owner perms bytes nattch status + 0x01128e25 0 root 600 1000 7 +``` +Testing `--ipc=container:CONTAINERID` mode: + +Start a container with a program to create a shared memory segment: +``` + sudo docker run -it shm bash + $ sudo shm/shm_server & + $ sudo ipcs -m + + ------ Shared Memory Segments -------- + key shmid owner perms bytes nattch status + 0x0000162e 0 root 666 27 1 +``` +Create a 2nd container correctly shows no shared memory segment from 1st container: +``` + $ sudo docker run shm ipcs -m + + ------ Shared Memory Segments -------- + key shmid owner perms bytes nattch status +``` + +Create a 3rd container using the new --ipc=container:CONTAINERID option, now it shows the shared memory segment from the first: + +``` + $ sudo docker run -it --ipc=container:ed735b2264ac shm ipcs -m + $ sudo ipcs -m + + ------ Shared Memory Segments -------- + key shmid owner perms bytes nattch status + 0x0000162e 0 root 666 27 1 +``` + ## Linking 
Containers The link feature allows multiple containers to communicate with each other. For diff --git a/docs/sources/reference/run.md b/docs/sources/reference/run.md index 1abb7d0575..31029e2a11 100644 --- a/docs/sources/reference/run.md +++ b/docs/sources/reference/run.md @@ -50,6 +50,7 @@ following options. - [Container Identification](#container-identification) - [Name (--name)](#name-name) - [PID Equivalent](#pid-equivalent) + - [IPC Settings](#ipc-settings) - [Network Settings](#network-settings) - [Clean Up (--rm)](#clean-up-rm) - [Runtime Constraints on CPU and Memory](#runtime-constraints-on-cpu-and-memory) @@ -131,6 +132,22 @@ While not strictly a means of identifying a container, you can specify a version image you'd like to run the container with by adding `image[:tag]` to the command. For example, `docker run ubuntu:14.04`. +## IPC Settings + --ipc="" : Set the IPC mode for the container, + 'container:<name|id>': reuses another container's IPC namespace + 'host': use the host's IPC namespace inside the container +By default, all containers have the IPC namespace enabled. + +The IPC (POSIX/SysV IPC) namespace provides separation of named shared memory segments, semaphores and message queues. + +Shared memory segments are used to accelerate inter-process communication at +memory speed, rather than through pipes or through the network stack. Shared +memory is commonly used by databases and custom-built (typically C/OpenMPI, +C++/using boost libraries) high performance applications for scientific +computing and financial services industries. If these types of applications +are broken into multiple containers, you might need to share the IPC mechanisms +of the containers. 
+ ## Network settings --dns=[] : Set custom dns servers for the container diff --git a/integration-cli/docker_cli_run_test.go b/integration-cli/docker_cli_run_test.go index d536c626bb..54949730a1 100644 --- a/integration-cli/docker_cli_run_test.go +++ b/integration-cli/docker_cli_run_test.go @@ -2568,3 +2568,73 @@ func TestRunUnknownCommand(t *testing.T) { logDone("run - Unknown Command") } + +func TestRunModeIpcHost(t *testing.T) { + hostIpc, err := os.Readlink("/proc/1/ns/ipc") + if err != nil { + t.Fatal(err) + } + + cmd := exec.Command(dockerBinary, "run", "--ipc=host", "busybox", "readlink", "/proc/self/ns/ipc") + out2, _, err := runCommandWithOutput(cmd) + if err != nil { + t.Fatal(err, out2) + } + + out2 = strings.Trim(out2, "\n") + if hostIpc != out2 { + t.Fatalf("IPC different with --ipc=host %s != %s\n", hostIpc, out2) + } + + cmd = exec.Command(dockerBinary, "run", "busybox", "readlink", "/proc/self/ns/ipc") + out2, _, err = runCommandWithOutput(cmd) + if err != nil { + t.Fatal(err, out2) + } + + out2 = strings.Trim(out2, "\n") + if hostIpc == out2 { + t.Fatalf("IPC should be different without --ipc=host %s == %s\n", hostIpc, out2) + } + deleteAllContainers() + + logDone("run - ipc host mode") +} + +func TestRunModeIpcContainer(t *testing.T) { + cmd := exec.Command(dockerBinary, "run", "-d", "busybox", "top") + out, _, err := runCommandWithOutput(cmd) + if err != nil { + t.Fatal(err, out) + } + id := strings.TrimSpace(out) + state, err := inspectField(id, "State.Running") + if err != nil { + t.Fatal(err) + } + if state != "true" { + t.Fatal("Container state is 'not running'") + } + pid1, err := inspectField(id, "State.Pid") + if err != nil { + t.Fatal(err) + } + + parentContainerIpc, err := os.Readlink(fmt.Sprintf("/proc/%s/ns/ipc", pid1)) + if err != nil { + t.Fatal(err) + } + cmd = exec.Command(dockerBinary, "run", fmt.Sprintf("--ipc=container:%s", id), "busybox", "readlink", "/proc/self/ns/ipc") + out2, _, err := 
runCommandWithOutput(cmd) + if err != nil { + t.Fatal(err, out2) + } + + out2 = strings.Trim(out2, "\n") + if parentContainerIpc != out2 { + t.Fatalf("IPC different with --ipc=container:%s %s != %s\n", id, parentContainerIpc, out2) + } + deleteAllContainers() + + logDone("run - ipc container mode") +} diff --git a/runconfig/hostconfig.go b/runconfig/hostconfig.go index 5c49522038..01388ad727 100644 --- a/runconfig/hostconfig.go +++ b/runconfig/hostconfig.go @@ -28,6 +28,44 @@ func (n NetworkMode) IsNone() bool { return n == "none" } +type IpcMode string + +// IsPrivate indicates whether the container uses its own private ipc stack +func (n IpcMode) IsPrivate() bool { + return !(n.IsHost() || n.IsContainer()) +} + +func (n IpcMode) IsHost() bool { + return n == "host" +} + +func (n IpcMode) IsContainer() bool { + parts := strings.SplitN(string(n), ":", 2) + return len(parts) > 1 && parts[0] == "container" +} + +func (n IpcMode) Valid() bool { + parts := strings.Split(string(n), ":") + switch mode := parts[0]; mode { + case "", "host": + case "container": + if len(parts) != 2 || parts[1] == "" { + return false + } + default: + return false + } + return true +} + +func (n IpcMode) Container() string { + parts := strings.SplitN(string(n), ":", 2) + if len(parts) > 1 { + return parts[1] + } + return "" +} + type DeviceMapping struct { PathOnHost string PathInContainer string @@ -53,6 +91,7 @@ type HostConfig struct { VolumesFrom []string Devices []DeviceMapping NetworkMode NetworkMode + IpcMode IpcMode CapAdd []string CapDrop []string RestartPolicy RestartPolicy @@ -84,6 +123,7 @@ func ContainerHostConfigFromJob(job *engine.Job) *HostConfig { Privileged: job.GetenvBool("Privileged"), PublishAllPorts: job.GetenvBool("PublishAllPorts"), NetworkMode: NetworkMode(job.Getenv("NetworkMode")), + IpcMode: IpcMode(job.Getenv("IpcMode")), } job.GetenvJson("LxcConf", &hostConfig.LxcConf) diff --git a/runconfig/parse.go b/runconfig/parse.go index c62ab3fdd4..dfc84c1892 
100644 --- a/runconfig/parse.go +++ b/runconfig/parse.go @@ -60,6 +60,7 @@ func Parse(cmd *flag.FlagSet, args []string, sysInfo *sysinfo.SysInfo) (*Config, flCpuset = cmd.String([]string{"-cpuset"}, "", "CPUs in which to allow execution (0-3, 0,1)") flNetMode = cmd.String([]string{"-net"}, "bridge", "Set the Network mode for the container\n'bridge': creates a new network stack for the container on the docker bridge\n'none': no networking for this container\n'container:': reuses another container network stack\n'host': use the host network stack inside the container. Note: the host mode gives the container full access to local system services such as D-bus and is therefore considered insecure.") flMacAddress = cmd.String([]string{"-mac-address"}, "", "Container MAC address (e.g. 92:d0:c6:0a:29:33)") + flIpcMode = cmd.String([]string{"-ipc"}, "", "Default is to create a private IPC namespace (POSIX SysV IPC) for the container\n'container:': reuses another container shared memory, semaphores and message queues\n'host': use the host shared memory,semaphores and message queues inside the container. Note: the host mode gives the container full access to local shared memory and is therefore considered insecure.") flRestartPolicy = cmd.String([]string{"-restart"}, "", "Restart policy to apply when a container exits (no, on-failure[:max-retry], always)") ) @@ -241,6 +242,11 @@ func Parse(cmd *flag.FlagSet, args []string, sysInfo *sysinfo.SysInfo) (*Config, // parse the '-e' and '--env' after, to allow override envVariables = append(envVariables, flEnv.GetAll()...) 
+ ipcMode := IpcMode(*flIpcMode) + if !ipcMode.Valid() { + return nil, nil, cmd, fmt.Errorf("--ipc: invalid IPC mode: %q", *flIpcMode) + } + netMode, err := parseNetMode(*flNetMode) if err != nil { return nil, nil, cmd, fmt.Errorf("--net: invalid net mode: %v", err) @@ -289,6 +295,7 @@ func Parse(cmd *flag.FlagSet, args []string, sysInfo *sysinfo.SysInfo) (*Config, ExtraHosts: flExtraHosts.GetAll(), VolumesFrom: flVolumesFrom.GetAll(), NetworkMode: netMode, + IpcMode: ipcMode, Devices: deviceMappings, CapAdd: flCapAdd.GetAll(), CapDrop: flCapDrop.GetAll(),