From 9caf7aeefd23263a209c26c8439d26c147972d81 Mon Sep 17 00:00:00 2001 From: Dan Walsh Date: Tue, 29 Mar 2016 08:24:28 -0400 Subject: [PATCH] Add support for setting sysctls This patch will allow users to specify namespace specific "kernel parameters" for running inside of a container. Signed-off-by: Dan Walsh --- contrib/completion/bash/docker | 1 + contrib/completion/zsh/_docker | 1 + daemon/oci_linux.go | 1 + docs/reference/api/docker_remote_api.md | 1 + docs/reference/api/docker_remote_api_v1.21.md | 8 +++++ docs/reference/commandline/create.md | 1 + docs/reference/commandline/run.md | 28 ++++++++++++++++ integration-cli/docker_cli_run_unix_test.go | 32 ++++++++++++++++++ man/docker-create.1.md | 16 +++++++++ man/docker-run.1.md | 33 +++++++++++++++++++ opts/opts.go | 32 ++++++++++++++++++ runconfig/opts/parse.go | 3 ++ 12 files changed, 157 insertions(+) diff --git a/contrib/completion/bash/docker b/contrib/completion/bash/docker index b0f3ccd7ba..51c6a0207f 100644 --- a/contrib/completion/bash/docker +++ b/contrib/completion/bash/docker @@ -1671,6 +1671,7 @@ _docker_run() { --shm-size --stop-signal --tmpfs + --sysctl --ulimit --user -u --userns diff --git a/contrib/completion/zsh/_docker b/contrib/completion/zsh/_docker index 25f53c53e1..a66b3457ca 100644 --- a/contrib/completion/zsh/_docker +++ b/contrib/completion/zsh/_docker @@ -644,6 +644,7 @@ __docker_subcommand() { "($help)--privileged[Give extended privileges to this container]" "($help)--read-only[Mount the container's root filesystem as read only]" "($help)*--security-opt=[Security options]:security option: " + "($help)*--sysctl=-[sysctl options]:sysctl: " "($help -t --tty)"{-t,--tty}"[Allocate a pseudo-tty]" "($help -u --user)"{-u=,--user=}"[Username or UID]:user:_users" "($help)--tmpfs[mount tmpfs]" diff --git a/daemon/oci_linux.go b/daemon/oci_linux.go index ca9ed020f3..7ae90acb37 100644 --- a/daemon/oci_linux.go +++ b/daemon/oci_linux.go @@ -611,6 +611,7 @@ func (daemon *Daemon) createSpec(c 
*container.Container) (*libcontainerd.Spec, e return nil, fmt.Errorf("linux runtime spec resources: %v", err) } s.Linux.Resources.OOMScoreAdj = &c.HostConfig.OomScoreAdj + s.Linux.Sysctl = c.HostConfig.Sysctls if err := setDevices(&s, c); err != nil { return nil, fmt.Errorf("linux runtime spec devices: %v", err) } diff --git a/docs/reference/api/docker_remote_api.md b/docs/reference/api/docker_remote_api.md index 59eccf1e66..26c094118b 100644 --- a/docs/reference/api/docker_remote_api.md +++ b/docs/reference/api/docker_remote_api.md @@ -176,6 +176,7 @@ This section lists each version from latest to oldest. Each listing includes a [Docker Remote API v1.21](docker_remote_api_v1.21.md) documentation +* `POST /containers/create` and `POST /containers/(id)/start` allow you to configure kernel parameters (sysctls) for use in the container. * `GET /volumes` lists volumes from all volume drivers. * `POST /volumes/create` to create a volume. * `GET /volumes/(name)` get low-level information about a volume. 
diff --git a/docs/reference/api/docker_remote_api_v1.21.md b/docs/reference/api/docker_remote_api_v1.21.md index 7cdfd0f3f3..c96274be9a 100644 --- a/docs/reference/api/docker_remote_api_v1.21.md +++ b/docs/reference/api/docker_remote_api_v1.21.md @@ -199,6 +199,7 @@ Create a container "RestartPolicy": { "Name": "", "MaximumRetryCount": 0 }, "NetworkMode": "bridge", "Devices": [], + "Sysctls": { "net.ipv4.ip_forward": "1" }, "Ulimits": [{}], "LogConfig": { "Type": "json-file", "Config": {} }, "SecurityOpt": [], @@ -306,6 +307,10 @@ Json Parameters: - **Devices** - A list of devices to add to the container specified as a JSON object in the form `{ "PathOnHost": "/dev/deviceName", "PathInContainer": "/dev/deviceName", "CgroupPermissions": "mrw"}` + - **Sysctls** - A map of kernel parameters (sysctls) to set in the container, specified as + `{ <name>: <value> }`, for example: + `{ "net.ipv4.ip_forward": "1" }` + - **Ulimits** - A list of ulimits to set in the container, specified as `{ "Name": , "Soft": , "Hard": }`, for example: `Ulimits: { "Name": "nofile", "Soft": 1024, "Hard": 2048 }` @@ -426,6 +431,9 @@ Return low-level information on the container `id` "Type": "json-file" }, "SecurityOpt": null, + "Sysctls": { + "net.ipv4.ip_forward": "1" + }, "VolumesFrom": null, "Ulimits": [{}], "VolumeDriver": "" diff --git a/docs/reference/commandline/create.md b/docs/reference/commandline/create.md index 0540b3f34a..3e4024374c 100644 --- a/docs/reference/commandline/create.md +++ b/docs/reference/commandline/create.md @@ -82,6 +82,7 @@ Creates a new container. --stop-signal="SIGTERM" Signal to stop a container --shm-size=[] Size of `/dev/shm`. The format is ``. `number` must be greater than `0`. Unit is optional and can be `b` (bytes), `k` (kilobytes), `m` (megabytes), or `g` (gigabytes). If you omit the unit, the system uses bytes. If you omit the size entirely, the system uses `64m`. 
--storage-opt=[] Set storage driver options per container + --sysctl=[] Configure namespaced kernel parameters at runtime -t, --tty Allocate a pseudo-TTY -u, --user="" Username or UID --userns="" Container user namespace diff --git a/docs/reference/commandline/run.md b/docs/reference/commandline/run.md index 00438bebad..67e0010232 100644 --- a/docs/reference/commandline/run.md +++ b/docs/reference/commandline/run.md @@ -84,6 +84,7 @@ parent = "smn_cli" --sig-proxy=true Proxy received signals to the process --stop-signal="SIGTERM" Signal to stop a container --storage-opt=[] Set storage driver options per container + --sysctl=[] Configure namespaced kernel parameters at runtime -t, --tty Allocate a pseudo-TTY -u, --user="" Username or UID (format: [:]) --userns="" Container user namespace @@ -620,3 +621,30 @@ If you have set the `--exec-opt isolation=hyperv` option on the Docker `daemon`, $ docker run -d --isolation default busybox top $ docker run -d --isolation hyperv busybox top ``` + +### Configure namespaced kernel parameters (sysctls) at runtime + +The `--sysctl` option sets namespaced kernel parameters (sysctls) in the +container. For example, to turn on IP forwarding in the container's +network namespace, run this command: + + $ docker run --sysctl net.ipv4.ip_forward=1 someimage + + +> **Note**: Not all sysctls are namespaced. Docker does not support changing sysctls +> inside of a container that also modify the host system. As the kernel +> evolves we expect to see more sysctls become namespaced. + +#### Currently supported sysctls + + `IPC Namespace`: + + kernel.msgmax, kernel.msgmnb, kernel.msgmni, kernel.sem, kernel.shmall, kernel.shmmax, kernel.shmmni, kernel.shm_rmid_forced + Sysctls beginning with fs.mqueue.* + + If you use the `--ipc=host` option these sysctls will not be allowed. + + `Network Namespace`: + Sysctls beginning with net.* + + If you use the `--net=host` option these sysctls will not be allowed. 
diff --git a/integration-cli/docker_cli_run_unix_test.go b/integration-cli/docker_cli_run_unix_test.go index 5aefa11cb7..61d40e1492 100644 --- a/integration-cli/docker_cli_run_unix_test.go +++ b/integration-cli/docker_cli_run_unix_test.go @@ -4,6 +4,7 @@ package main import ( "bufio" + "encoding/json" "fmt" "io/ioutil" "os" @@ -747,6 +748,37 @@ func (s *DockerSuite) TestRunTmpfsMounts(c *check.C) { } }
+// TestRunSysctls checks that --sysctl sets a namespaced kernel parameter
+// inside the container, that the setting is recorded in HostConfig.Sysctls,
+// and that a sysctl outside the whitelist makes `docker run` fail.
+func (s *DockerSuite) TestRunSysctls(c *check.C) {
+	testRequires(c, DaemonIsLinux)
+
+	out, _ := dockerCmd(c, "run", "--sysctl", "net.ipv4.ip_forward=1", "--name", "test", "busybox", "cat", "/proc/sys/net/ipv4/ip_forward")
+	c.Assert(strings.TrimSpace(out), check.Equals, "1")
+	out = inspectFieldJSON(c, "test", "HostConfig.Sysctls")
+	sysctls := make(map[string]string)
+	err := json.Unmarshal([]byte(out), &sysctls)
+	c.Assert(err, check.IsNil)
+	c.Assert(sysctls["net.ipv4.ip_forward"], check.Equals, "1")
+
+	out, _ = dockerCmd(c, "run", "--sysctl", "net.ipv4.ip_forward=0", "--name", "test1", "busybox", "cat", "/proc/sys/net/ipv4/ip_forward")
+	c.Assert(strings.TrimSpace(out), check.Equals, "0")
+	out = inspectFieldJSON(c, "test1", "HostConfig.Sysctls")
+	sysctls = make(map[string]string) // fresh map: entries decoded for "test" must not leak into this check
+	err = json.Unmarshal([]byte(out), &sysctls)
+	c.Assert(err, check.IsNil)
+	c.Assert(sysctls["net.ipv4.ip_forward"], check.Equals, "0")
+
+	// kernel.foobar is not whitelisted: the command itself must fail, not merely print a message.
+	runCmd := exec.Command(dockerBinary, "run", "--sysctl", "kernel.foobar=1", "--name", "test2", "busybox", "cat", "/proc/sys/kernel/foobar")
+	out, _, err = runCommandWithOutput(runCmd)
+	c.Assert(err, check.NotNil, check.Commentf("expected --sysctl to fail, got %s", out))
+	if !strings.Contains(out, "invalid value") {
+		c.Fatalf("expected --sysctl to fail, got %s", out)
+	}
+}
+
 // TestRunSeccompProfileDenyUnshare checks that 'docker run --security-opt seccomp=/tmp/profile.json debian:jessie unshare' exits with operation not permitted.
func (s *DockerSuite) TestRunSeccompProfileDenyUnshare(c *check.C) { testRequires(c, SameHostDaemon, seccompEnabled, NotArm, Apparmor) diff --git a/man/docker-create.1.md b/man/docker-create.1.md index d3cb85c78c..3f90a3a1d8 100644 --- a/man/docker-create.1.md +++ b/man/docker-create.1.md @@ -67,6 +67,7 @@ docker-create - Create a new container [**--storage-opt**[=*[]*]] [**--stop-signal**[=*SIGNAL*]] [**--shm-size**[=*[]*]] +[**--sysctl**[=*[]*]] [**-t**|**--tty**] [**--tmpfs**[=*[CONTAINER-DIR[:]*]] [**-u**|**--user**[=*USER*]] @@ -336,6 +337,21 @@ unit, `b` is used. Set LIMIT to `-1` to enable unlimited swap. **--stop-signal**=*SIGTERM* Signal to stop a container. Default is SIGTERM. +**--sysctl**=SYSCTL + Configure namespaced kernel parameters at runtime + + IPC Namespace - current sysctls allowed: + + kernel.msgmax, kernel.msgmnb, kernel.msgmni, kernel.sem, kernel.shmall, kernel.shmmax, kernel.shmmni, kernel.shm_rmid_forced + Sysctls beginning with fs.mqueue.* + + Note: if you use --ipc=host, these sysctls will not be allowed. + + Network Namespace - current sysctls allowed: + Sysctls beginning with net.* + + Note: if you use --net=host, these sysctls will not be allowed. + **-t**, **--tty**=*true*|*false* Allocate a pseudo-TTY. The default is *false*. diff --git a/man/docker-run.1.md b/man/docker-run.1.md index e6757fc512..921ff9a07b 100644 --- a/man/docker-run.1.md +++ b/man/docker-run.1.md @@ -71,6 +71,7 @@ docker-run - Run a command in a new container [**--stop-signal**[=*SIGNAL*]] [**--shm-size**[=*[]*]] [**--sig-proxy**[=*true*]] +[**--sysctl**[=*[]*]] [**-t**|**--tty**] [**--tmpfs**[=*[CONTAINER-DIR[:]*]] [**-u**|**--user**[=*USER*]] @@ -492,6 +493,21 @@ its root filesystem mounted as read only prohibiting any writes. `number` must be greater than `0`. Unit is optional and can be `b` (bytes), `k` (kilobytes), `m`(megabytes), or `g` (gigabytes). If you omit the unit, the system uses bytes. If you omit the size entirely, the system uses `64m`. 
+**--sysctl**=SYSCTL + Configure namespaced kernel parameters at runtime + + IPC Namespace - current sysctls allowed: + + kernel.msgmax, kernel.msgmnb, kernel.msgmni, kernel.sem, kernel.shmall, kernel.shmmax, kernel.shmmni, kernel.shm_rmid_forced + Sysctls beginning with fs.mqueue.* + + If you use the `--ipc=host` option these sysctls will not be allowed. + + Network Namespace - current sysctls allowed: + Sysctls beginning with net.* + + If you use the `--net=host` option these sysctls will not be allowed. + **--sig-proxy**=*true*|*false* Proxy received signals to the process (non-TTY mode only). SIGCHLD, SIGSTOP, and SIGKILL are not proxied. The default is *true*. @@ -955,6 +971,23 @@ $ docker run -d --isolation default busybox top $ docker run -d --isolation hyperv busybox top ``` +## Setting Namespaced Kernel Parameters (Sysctls) + +The `--sysctl` option sets namespaced kernel parameters (sysctls) in the +container. For example, to turn on IP forwarding in the container's +network namespace, run this command: + + $ docker run --sysctl net.ipv4.ip_forward=1 someimage + +Note: + +Not all sysctls are namespaced. Docker does not support changing sysctls +inside of a container that also modify the host system. As the kernel +evolves we expect to see more sysctls become namespaced. + +See the definition of the `--sysctl` option above for the current list of +supported sysctls. + # HISTORY April 2014, Originally compiled by William Henry (whenry at redhat dot com) based on docker.com source material and internal work. diff --git a/opts/opts.go b/opts/opts.go index a56c0cc42e..0b09981778 100644 --- a/opts/opts.go +++ b/opts/opts.go @@ -240,3 +240,35 @@ func ValidateLabel(val string) (string, error) { } return val, nil } + +// ValidateSysctl validates a sysctl and returns it. 
+// The argument must have the form "name=value". The name must be one of the
+// IPC sysctls listed below or begin with "net." or "fs.mqueue.".
+func ValidateSysctl(val string) (string, error) {
+	validSysctlMap := map[string]bool{
+		"kernel.msgmax":          true,
+		"kernel.msgmnb":          true,
+		"kernel.msgmni":          true,
+		"kernel.sem":             true,
+		"kernel.shmall":          true,
+		"kernel.shmmax":          true,
+		"kernel.shmmni":          true,
+		"kernel.shm_rmid_forced": true,
+	}
+	validSysctlPrefixes := []string{"net.", "fs.mqueue."}
+	// Split on the first "=" only. A missing "=" or an empty name is a syntax
+	// error rather than a whitelist failure, so it gets its own message.
+	arr := strings.SplitN(val, "=", 2)
+	if len(arr) < 2 || arr[0] == "" {
+		return "", fmt.Errorf("invalid sysctl '%s': expected name=value", val)
+	}
+	if validSysctlMap[arr[0]] {
+		return val, nil
+	}
+	for _, vp := range validSysctlPrefixes {
+		if strings.HasPrefix(arr[0], vp) {
+			return val, nil
+		}
+	}
+	return "", fmt.Errorf("sysctl '%s' is not whitelisted", val)
+}
diff --git a/runconfig/opts/parse.go b/runconfig/opts/parse.go index 44a896d77c..7c71dfcb74 100644 --- a/runconfig/opts/parse.go +++ b/runconfig/opts/parse.go @@ -42,6 +42,7 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host flDevices = opts.NewListOpts(ValidateDevice) flUlimits = NewUlimitOpt(nil) + flSysctls = opts.NewMapOpts(nil, opts.ValidateSysctl) flPublish = opts.NewListOpts(nil) flExpose = opts.NewListOpts(nil) @@ -127,6 +128,7 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host cmd.Var(&flSecurityOpt, []string{"-security-opt"}, "Security Options") cmd.Var(&flStorageOpt, []string{"-storage-opt"}, "Set storage driver options per container") cmd.Var(flUlimits, []string{"-ulimit"}, "Ulimit options") + cmd.Var(flSysctls, []string{"-sysctl"}, "Sysctl options") cmd.Var(&flLoggingOpts, []string{"-log-opt"}, "Log driver options") cmd.Require(flag.Min, 1) @@ -430,6 +432,7 @@ func Parse(cmd *flag.FlagSet, args []string) (*container.Config, *container.Host ShmSize: shmSize, Resources: resources, Tmpfs: tmpfs, + Sysctls: flSysctls.GetAll(), } // When allocating stdin in attached mode, close stdin at client disconnect