diff --git a/Dockerfile b/Dockerfile index 891d84b57c..5bb00c173f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -163,7 +163,7 @@ RUN set -x \ && rm -rf "$GOPATH" # Get the "docker-py" source so we can run their integration tests -ENV DOCKER_PY_COMMIT 47ab89ec2bd3bddf1221b856ffbaff333edeabb4 +ENV DOCKER_PY_COMMIT 57512760c83fbe41302891aa51e34a86f4db74de RUN git clone https://github.com/docker/docker-py.git /docker-py \ && cd /docker-py \ && git checkout -q $DOCKER_PY_COMMIT \ @@ -197,6 +197,7 @@ RUN ln -sv $PWD/contrib/completion/bash/docker /etc/bash_completion.d/docker COPY contrib/download-frozen-image-v2.sh /go/src/github.com/docker/docker/contrib/ RUN ./contrib/download-frozen-image-v2.sh /docker-frozen-images \ busybox:latest@sha256:eb3c0d4680f9213ee5f348ea6d39489a1f85a318a2ae09e012c426f78252a6d2 \ + debian:jessie@sha256:24a900d1671b269d6640b4224e7b63801880d8e3cb2bcbfaa10a5dddcf4469ed \ hello-world:latest@sha256:8be990ef2aeb16dbcb9271ddfe2610fa6658d13f6dfb8bc72074cc1ca36966a7 \ jess/unshare:latest@sha256:2e3a8c0591c4690b82d4eba7e5ef8f49f2ddfe9f867f3e865198db9bd1436c5b # see also "hack/make/.ensure-frozen-images" (which needs to be updated any time this list is) diff --git a/contrib/userns-test/Dockerfile b/contrib/userns-test/Dockerfile new file mode 100644 index 0000000000..90eb5092ce --- /dev/null +++ b/contrib/userns-test/Dockerfile @@ -0,0 +1,3 @@ +FROM debian:jessie +COPY userns-test . 
+ENTRYPOINT ["./userns-test"] diff --git a/contrib/userns-test/main.c b/contrib/userns-test/main.c new file mode 100644 index 0000000000..9f4d93aaab --- /dev/null +++ b/contrib/userns-test/main.c @@ -0,0 +1,54 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define STACKSIZE (1024*1024) +static char child_stack[STACKSIZE]; + +struct clone_args { + char **argv; +}; + +// child_exec is the func that will be executed as the result of clone +static int child_exec(void *stuff) +{ + struct clone_args *args = (struct clone_args *)stuff; + if (execvp(args->argv[0], args->argv) != 0) { + fprintf(stderr, "failed to execvp argments %s\n", + strerror(errno)); + exit(-1); + } + // we should never reach here! + exit(EXIT_FAILURE); +} + +int main(int argc, char **argv) +{ + struct clone_args args; + args.argv = &argv[1]; + + int clone_flags = CLONE_NEWUSER | SIGCHLD; + + // the result of this call is that our child_exec will be run in another + // process returning it's pid + pid_t pid = + clone(child_exec, child_stack + STACKSIZE, clone_flags, &args); + if (pid < 0) { + fprintf(stderr, "clone failed: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + // lets wait on our child process here before we, the parent, exits + if (waitpid(pid, NULL, 0) == -1) { + fprintf(stderr, "failed to wait pid %d\n", pid); + exit(EXIT_FAILURE); + } + exit(EXIT_SUCCESS); +} diff --git a/daemon/execdriver/native/create.go b/daemon/execdriver/native/create.go index 0154801382..16ad4508d7 100644 --- a/daemon/execdriver/native/create.go +++ b/daemon/execdriver/native/create.go @@ -69,6 +69,10 @@ func (d *Driver) createContainer(c *execdriver.Command, hooks execdriver.Hooks) if err := d.setCapabilities(container, c); err != nil { return nil, err } + + if c.SeccompProfile == "" { + container.Seccomp = getDefaultSeccompProfile() + } } // add CAP_ prefix to all caps for new libcontainer update to match // the spec format. 
@@ -83,12 +87,13 @@ func (d *Driver) createContainer(c *execdriver.Command, hooks execdriver.Hooks) container.AppArmorProfile = c.AppArmorProfile } - if c.SeccompProfile != "" { + if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" { container.Seccomp, err = loadSeccompProfile(c.SeccompProfile) if err != nil { return nil, err } } + if err := execdriver.SetupCgroups(container, c); err != nil { return nil, err } diff --git a/daemon/execdriver/native/seccomp.go b/daemon/execdriver/native/seccomp.go index b6c5ae0d7d..c34651a1ae 100644 --- a/daemon/execdriver/native/seccomp.go +++ b/daemon/execdriver/native/seccomp.go @@ -12,6 +12,10 @@ import ( "github.com/opencontainers/specs" ) +func getDefaultSeccompProfile() *configs.Seccomp { + return defaultSeccompProfile +} + func loadSeccompProfile(path string) (*configs.Seccomp, error) { f, err := ioutil.ReadFile(path) if err != nil { diff --git a/daemon/execdriver/native/seccomp_default.go b/daemon/execdriver/native/seccomp_default.go new file mode 100644 index 0000000000..dbd00312a3 --- /dev/null +++ b/daemon/execdriver/native/seccomp_default.go @@ -0,0 +1,319 @@ +// +build linux + +package native + +import "github.com/opencontainers/runc/libcontainer/configs" + +var defaultSeccompProfile = &configs.Seccomp{ + DefaultAction: configs.Allow, + Syscalls: []*configs.Syscall{ + { + // Quota and Accounting syscalls which could let containers + // disable their own resource limits or process accounting + Name: "acct", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Prevent containers from using the kernel keyring, + // which is not namespaced + Name: "add_key", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Similar to clock_settime and settimeofday + // Time/Date is not namespaced + Name: "adjtimex", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Time/Date is not namespaced + Name: "clock_settime", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny 
cloning new namespaces + Name: "clone", + Action: configs.Errno, + Args: []*configs.Arg{ + { + // flags from sched.h + // CLONE_NEWUTS 0x04000000 + // CLONE_NEWIPC 0x08000000 + // CLONE_NEWUSER 0x10000000 + // CLONE_NEWPID 0x20000000 + // CLONE_NEWNET 0x40000000 -- NOTE(review): the >= comparison below is on the whole flags word, so it presumably also matches any value with higher bits set (e.g. 0x80000000); confirm that is intended + Index: 0, + Value: uint64(0x04000000), + Op: configs.GreaterThanOrEqualTo, + }, + { + // flags from sched.h + // CLONE_NEWNS 0x00020000 -- NOTE(review): EqualTo presumably compares the whole flags word, so CLONE_NEWNS OR'ed with any other bit (e.g. SIGCHLD) would not match; confirm against libcontainer/libseccomp and consider a masked comparison + Index: 0, + Value: uint64(0x00020000), + Op: configs.EqualTo, + }, + }, + }, + { + // Deny manipulation and functions on kernel modules. + Name: "create_module", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny manipulation and functions on kernel modules. + Name: "delete_module", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny retrieval of exported kernel and module symbols + Name: "get_kernel_syms", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Terrifying syscalls that modify kernel memory and NUMA settings. + // They're gated by CAP_SYS_NICE, + // which we do not retain by default in containers. + Name: "get_mempolicy", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny getting the list of robust futexes + Name: "get_robust_list", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny manipulation and functions on kernel modules. + Name: "init_module", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Prevent containers from modifying kernel I/O privilege levels. + // Already restricted as containers drop CAP_SYS_RAWIO by default. 
+ Name: "iopl", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Sister syscall of kexec_load that does the same thing, + // slightly different arguments + Name: "kexec_file_load", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny loading a new kernel for later execution + Name: "kexec_load", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Prevent containers from using the kernel keyring, + // which is not namespaced + Name: "keyctl", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Tracing/profiling syscalls, + // which could leak a lot of information on the host + Name: "lookup_dcookie", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Terrifying syscalls that modify kernel memory and NUMA settings. + // They're gated by CAP_SYS_NICE, + // which we do not retain by default in containers. + Name: "mbind", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Terrifying syscalls that modify kernel memory and NUMA settings. + // They're gated by CAP_SYS_NICE, + // which we do not retain by default in containers. + Name: "migrate_pages", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Old syscall only used in 16-bit code, + // and a potential information leak + Name: "modify_ldt", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny mount + Name: "mount", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Terrifying syscalls that modify kernel memory and NUMA settings. + // They're gated by CAP_SYS_NICE, + // which we do not retain by default in containers. 
+ Name: "move_pages", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny interaction with the kernel nfs daemon + Name: "nfsservctl", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Cause of an old container breakout, + // might as well restrict it to be on the safe side + Name: "open_by_handle_at", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Tracing/profiling syscalls, + // which could leak a lot of information on the host + Name: "perf_event_open", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Prevent container from enabling BSD emulation. + // Not inherently dangerous, but poorly tested, + // potential for a lot of kernel vulns in this. + Name: "personality", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny pivot_root + Name: "pivot_root", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Already blocked by dropping CAP_SYS_PTRACE + Name: "ptrace", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny manipulation and functions on kernel modules. + Name: "query_module", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Quota and Accounting syscalls which could let containers + // disable their own resource limits or process accounting + Name: "quotactl", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Probably a bad idea to let containers reboot the host + Name: "reboot", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Probably a bad idea to let containers restart -- NOTE(review): restart_syscall is presumably the kernel's mechanism for resuming interrupted syscalls rather than a container restart; confirm this deny is intended + Name: "restart_syscall", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Prevent containers from using the kernel keyring, + // which is not namespaced + Name: "request_key", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // meta, deny seccomp + Name: "seccomp", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Terrifying syscalls that modify kernel memory and NUMA settings. 
+ // They're gated by CAP_SYS_NICE, + // which we do not retain by default in containers. + Name: "set_mempolicy", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // deny associating a thread with a namespace + Name: "setns", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny setting the list of robust futexes + Name: "set_robust_list", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Time/Date is not namespaced + Name: "settimeofday", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny start/stop swapping to file/device + Name: "swapon", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny start/stop swapping to file/device + Name: "swapoff", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny read/write system parameters + Name: "_sysctl", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Deny umount + Name: "umount2", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Same as clone + Name: "unshare", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + { + // Older syscall related to shared libraries, unused for a long time + Name: "uselib", + Action: configs.Errno, + Args: []*configs.Arg{}, + }, + }, +} diff --git a/docs/security/seccomp.md b/docs/security/seccomp.md index 25c03e0839..baf52ef30a 100644 --- a/docs/security/seccomp.md +++ b/docs/security/seccomp.md @@ -62,3 +62,22 @@ Then you can run with: ``` $ docker run --rm -it --security-opt seccomp:/path/to/seccomp/profile.json hello-world ``` + +Default Profile +--------------- + +The default seccomp profile provides a sane default for running +containers with seccomp. It is moderately protective while +providing wide application compatibility. + + +Overriding the default profile for a container +---------------------------------------------- + +You can pass `unconfined` to run a container without the default seccomp +profile. 
+ +``` +$ docker run --rm -it --security-opt seccomp:unconfined debian:jessie \ + unshare --map-root-user --user sh -c whoami +``` diff --git a/hack/make/.ensure-frozen-images b/hack/make/.ensure-frozen-images index a73c12f06f..eef951a0c2 100644 --- a/hack/make/.ensure-frozen-images +++ b/hack/make/.ensure-frozen-images @@ -27,6 +27,7 @@ case "$DOCKER_ENGINE_OSARCH" in *) images=( busybox:latest + debian:jessie hello-world:latest jess/unshare:latest ) diff --git a/hack/make/.ensure-userns-test b/hack/make/.ensure-userns-test new file mode 100644 index 0000000000..a43a76e6f8 --- /dev/null +++ b/hack/make/.ensure-userns-test @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +# Build a C binary for cloning a userns for seccomp tests +# and compile it for target daemon + +dir="$DEST/userns-test" +mkdir -p "$dir" +( + if [ "$(go env GOOS)" = "linux" ]; then + cd "$dir" + gcc -g -Wall -static ../../../../contrib/userns-test/main.c -o ./userns-test + cp ../../../../contrib/userns-test/Dockerfile . + docker build -qt userns-test . 
> /dev/null + fi +) +rm -rf "$dir" diff --git a/hack/make/.integration-daemon-setup b/hack/make/.integration-daemon-setup index ab9d45c32c..318489bf3f 100644 --- a/hack/make/.integration-daemon-setup +++ b/hack/make/.integration-daemon-setup @@ -3,3 +3,4 @@ bundle .ensure-emptyfs bundle .ensure-frozen-images bundle .ensure-httpserver +bundle .ensure-userns-test diff --git a/integration-cli/docker_cli_run_test.go b/integration-cli/docker_cli_run_test.go index e696078950..f2eabf23c0 100644 --- a/integration-cli/docker_cli_run_test.go +++ b/integration-cli/docker_cli_run_test.go @@ -2858,18 +2858,25 @@ func (s *DockerSuite) TestRunUnshareProc(c *check.C) { testRequires(c, Apparmor, DaemonIsLinux, NotUserNamespace) name := "acidburn" - if out, _, err := dockerCmdWithError("run", "--name", name, "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "--mount-proc=/proc", "mount"); err == nil || !strings.Contains(out, "Permission denied") { + out, _, err := dockerCmdWithError("run", "--name", name, "--security-opt", "seccomp:unconfined", "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "--mount-proc=/proc", "mount") + if err == nil || + !(strings.Contains(strings.ToLower(out), "permission denied") || + strings.Contains(strings.ToLower(out), "operation not permitted")) { c.Fatalf("unshare with --mount-proc should have failed with permission denied, got: %s, %v", out, err) } name = "cereal" - if out, _, err := dockerCmdWithError("run", "--name", name, "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "mount", "-t", "proc", "none", "/proc"); err == nil || !strings.Contains(out, "Permission denied") { + out, _, err = dockerCmdWithError("run", "--name", name, "--security-opt", "seccomp:unconfined", "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "mount", "-t", "proc", "none", "/proc") + if err == nil || + !(strings.Contains(strings.ToLower(out), "permission denied") || + strings.Contains(strings.ToLower(out), "operation not permitted")) { c.Fatalf("unshare and mount of 
/proc should have failed with permission denied, got: %s, %v", out, err) } /* Ensure still fails if running privileged with the default policy */ name = "crashoverride" - if out, _, err := dockerCmdWithError("run", "--privileged", "--security-opt", "apparmor:docker-default", "--name", name, "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "mount", "-t", "proc", "none", "/proc"); err == nil || !(strings.Contains(strings.ToLower(out), "permission denied") || strings.Contains(strings.ToLower(out), "operation not permitted")) { + out, _, err = dockerCmdWithError("run", "--privileged", "--security-opt", "seccomp:unconfined", "--security-opt", "apparmor:docker-default", "--name", name, "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "mount", "-t", "proc", "none", "/proc") + if err == nil || !(strings.Contains(strings.ToLower(out), "permission denied") || strings.Contains(strings.ToLower(out), "operation not permitted")) { c.Fatalf("privileged unshare with apparmor should have failed with permission denied, got: %s, %v", out, err) } } diff --git a/integration-cli/docker_cli_run_unix_test.go b/integration-cli/docker_cli_run_unix_test.go index 16607bd8ce..053da76f80 100644 --- a/integration-cli/docker_cli_run_unix_test.go +++ b/integration-cli/docker_cli_run_unix_test.go @@ -514,7 +514,7 @@ func (s *DockerSuite) TestRunSeccompProfileDenyUnshare(c *check.C) { if _, err := tmpFile.Write([]byte(jsonData)); err != nil { c.Fatal(err) } - runCmd := exec.Command(dockerBinary, "run", "--security-opt", "seccomp:"+tmpFile.Name(), "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "mount", "-t", "proc", "none", "/proc") + runCmd := exec.Command(dockerBinary, "run", "--security-opt", "apparmor:unconfined", "--security-opt", "seccomp:"+tmpFile.Name(), "debian:jessie", "unshare", "-p", "-m", "-f", "-r", "mount", "-t", "proc", "none", "/proc") out, _, _ := runCommandWithOutput(runCmd) if !strings.Contains(out, "Operation not permitted") { c.Fatalf("expected unshare with seccomp 
profile denied to fail, got %s", out) @@ -548,3 +548,87 @@ func (s *DockerSuite) TestRunSeccompProfileDenyChmod(c *check.C) { c.Fatalf("expected chmod with seccomp profile denied to fail, got %s", out) } }
+
+// TestRunSeccompProfileDenyUnshareUserns checks that 'docker run debian:jessie unshare --map-root-user --user sh -c whoami' with a specific profile to
+// deny unshare of a userns exits with operation not permitted.
+func (s *DockerSuite) TestRunSeccompProfileDenyUnshareUserns(c *check.C) {
+	testRequires(c, SameHostDaemon, seccompEnabled)
+	// 0x10000000 is CLONE_NEWUSER from sched.h
+	jsonData := fmt.Sprintf(`{
+	"defaultAction": "SCMP_ACT_ALLOW",
+	"syscalls": [
+		{
+			"name": "unshare",
+			"action": "SCMP_ACT_ERRNO",
+			"args": [
+				{
+					"index": 0,
+					"value": %d,
+					"op": "SCMP_CMP_EQ"
+				}
+			]
+		}
+	]
+}`, uint64(0x10000000))
+	tmpFile, err := ioutil.TempFile("", "profile.json")
+	if err != nil {
+		c.Fatal(err)
+	}
+	defer tmpFile.Close() // deferred only once TempFile is known to have succeeded
+
+	if _, err := tmpFile.Write([]byte(jsonData)); err != nil {
+		c.Fatal(err)
+	}
+	runCmd := exec.Command(dockerBinary, "run", "--security-opt", "apparmor:unconfined", "--security-opt", "seccomp:"+tmpFile.Name(), "debian:jessie", "unshare", "--map-root-user", "--user", "sh", "-c", "whoami")
+	out, _, _ := runCommandWithOutput(runCmd)
+	if !strings.Contains(out, "Operation not permitted") {
+		c.Fatalf("expected unshare userns with seccomp profile denied to fail, got %s", out)
+	}
+}
+
+// TestRunSeccompProfileDenyCloneUserns checks that 'docker run userns-test'
+// with the default seccomp profile exits with operation not permitted.
+func (s *DockerSuite) TestRunSeccompProfileDenyCloneUserns(c *check.C) { + testRequires(c, SameHostDaemon, seccompEnabled) + + runCmd := exec.Command(dockerBinary, "run", "userns-test", "id") + out, _, err := runCommandWithOutput(runCmd) + if err == nil || !strings.Contains(out, "clone failed: Operation not permitted") { + c.Fatalf("expected clone userns with default seccomp profile denied to fail, got %s: %v", out, err) + } +} + +// TestRunSeccompUnconfinedCloneUserns checks that +// 'docker run --security-opt seccomp:unconfined userns-test' allows creating a userns. +func (s *DockerSuite) TestRunSeccompUnconfinedCloneUserns(c *check.C) { + testRequires(c, SameHostDaemon, seccompEnabled, NotUserNamespace) + + // make sure running with --security-opt seccomp:unconfined is ok + runCmd := exec.Command(dockerBinary, "run", "--security-opt", "seccomp:unconfined", "userns-test", "id") + if out, _, err := runCommandWithOutput(runCmd); err != nil || !strings.Contains(out, "nobody") { + c.Fatalf("expected clone userns with --security-opt seccomp:unconfined to succeed, got %s: %v", out, err) + } +} + +// TestRunSeccompAllowPrivCloneUserns checks that 'docker run --privileged userns-test' +// allows creating a userns. +func (s *DockerSuite) TestRunSeccompAllowPrivCloneUserns(c *check.C) { + testRequires(c, SameHostDaemon, seccompEnabled, NotUserNamespace) + + // make sure running with --privileged is ok + runCmd := exec.Command(dockerBinary, "run", "--privileged", "userns-test", "id") + if out, _, err := runCommandWithOutput(runCmd); err != nil || !strings.Contains(out, "nobody") { + c.Fatalf("expected clone userns with --privileged to succeed, got %s: %v", out, err) + } +} + +// TestRunSeccompAllowAptKey checks that 'docker run debian:jessie apt-key' succeeds. 
+func (s *DockerSuite) TestRunSeccompAllowAptKey(c *check.C) {
+	testRequires(c, SameHostDaemon, seccompEnabled)
+	// apt-key uses setrlimit & getrlimit; this guards against seccomp breaking it
+	aptKeyCmd := exec.Command(dockerBinary, "run", "debian:jessie", "apt-key", "adv", "--keyserver", "hkp://p80.pool.sks-keyservers.net:80", "--recv-keys", "E871F18B51E0147C77796AC81196BA81F6B0FC61")
+	output, _, runErr := runCommandWithOutput(aptKeyCmd)
+	if runErr != nil {
+		c.Fatalf("expected apt-key with seccomp to succeed, got %s: %v", output, runErr)
+	}
+}