Merge pull request #18780 from jfrazelle/seccomp-default

set default seccomp profile
This commit is contained in:
David Calavera 2015-12-28 16:46:30 -08:00
commit 78ce43bad8
12 changed files with 521 additions and 6 deletions

View File

@ -163,7 +163,7 @@ RUN set -x \
&& rm -rf "$GOPATH"
# Get the "docker-py" source so we can run their integration tests
ENV DOCKER_PY_COMMIT 47ab89ec2bd3bddf1221b856ffbaff333edeabb4
ENV DOCKER_PY_COMMIT 57512760c83fbe41302891aa51e34a86f4db74de
RUN git clone https://github.com/docker/docker-py.git /docker-py \
&& cd /docker-py \
&& git checkout -q $DOCKER_PY_COMMIT \
@ -197,6 +197,7 @@ RUN ln -sv $PWD/contrib/completion/bash/docker /etc/bash_completion.d/docker
COPY contrib/download-frozen-image-v2.sh /go/src/github.com/docker/docker/contrib/
RUN ./contrib/download-frozen-image-v2.sh /docker-frozen-images \
busybox:latest@sha256:eb3c0d4680f9213ee5f348ea6d39489a1f85a318a2ae09e012c426f78252a6d2 \
debian:jessie@sha256:24a900d1671b269d6640b4224e7b63801880d8e3cb2bcbfaa10a5dddcf4469ed \
hello-world:latest@sha256:8be990ef2aeb16dbcb9271ddfe2610fa6658d13f6dfb8bc72074cc1ca36966a7 \
jess/unshare:latest@sha256:2e3a8c0591c4690b82d4eba7e5ef8f49f2ddfe9f867f3e865198db9bd1436c5b
# see also "hack/make/.ensure-frozen-images" (which needs to be updated any time this list is)

View File

@ -0,0 +1,3 @@
FROM debian:jessie
COPY userns-test .
ENTRYPOINT ["./userns-test"]

View File

@ -0,0 +1,54 @@
#define _GNU_SOURCE
#include <sched.h>
#include <unistd.h>
#include <stdlib.h>
#include <sys/wait.h>
#include <signal.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <limits.h>
#include <errno.h>
// STACKSIZE is the size in bytes (1 MiB) of the stack given to the cloned child.
#define STACKSIZE (1024*1024)
// child_stack is the statically allocated memory used as the child's stack;
// main() passes its top (child_stack + STACKSIZE) to clone().
static char child_stack[STACKSIZE];
// clone_args is the payload handed to child_exec through clone()'s
// void * argument.
struct clone_args {
// argument vector forwarded to execvp; argv[0] is the program to run
char **argv;
};
// child_exec is the function executed in the child created by clone().
//
// stuff must point to a struct clone_args; its argv is passed to execvp, so
// argv[0] names the program (resolved through PATH) and the vector must be
// NULL-terminated.
//
// The function never returns to its caller: on success the process image is
// replaced, and on failure the error is reported on stderr and the child
// exits with a non-zero status.
static int child_exec(void *stuff)
{
	struct clone_args *args = (struct clone_args *)stuff;

	// execvp only returns when it has failed, so reaching the next line
	// always means an error and errno describes it.
	execvp(args->argv[0], args->argv);
	fprintf(stderr, "failed to execvp arguments %s\n", strerror(errno));
	exit(-1);
}
// main runs the command given on the command line in a child created with
// clone(CLONE_NEWUSER), i.e. inside a new user namespace, and waits for it.
//
// Usage: userns-test <command> [args...]
//
// Exit status:
//   - EXIT_FAILURE if no command was given, if clone() or waitpid() fails,
//     or if the child did not terminate normally;
//   - otherwise the exit status of the child itself, so a command that
//     fails inside the namespace is not reported as a success.
int main(int argc, char **argv)
{
	struct clone_args args;
	int clone_flags = CLONE_NEWUSER | SIGCHLD;
	int status = 0;
	pid_t pid;

	// Without a command argv[1] is NULL and execvp would be handed a NULL
	// program name, so reject that case up front.
	if (argc < 2) {
		fprintf(stderr, "usage: %s <command> [args...]\n",
			argc > 0 ? argv[0] : "userns-test");
		exit(EXIT_FAILURE);
	}
	args.argv = &argv[1];

	// clone() runs child_exec in a new process and returns its pid. The
	// stack pointer passed is the top of child_stack.
	pid = clone(child_exec, child_stack + STACKSIZE, clone_flags, &args);
	if (pid < 0) {
		fprintf(stderr, "clone failed: %s\n", strerror(errno));
		exit(EXIT_FAILURE);
	}

	// Wait for the child before the parent exits, and keep its status.
	if (waitpid(pid, &status, 0) == -1) {
		fprintf(stderr, "failed to wait pid %d\n", pid);
		exit(EXIT_FAILURE);
	}

	// Propagate the child's result instead of unconditionally succeeding.
	if (WIFEXITED(status)) {
		exit(WEXITSTATUS(status));
	}
	exit(EXIT_FAILURE);
}

View File

@ -69,6 +69,10 @@ func (d *Driver) createContainer(c *execdriver.Command, hooks execdriver.Hooks)
if err := d.setCapabilities(container, c); err != nil {
return nil, err
}
if c.SeccompProfile == "" {
container.Seccomp = getDefaultSeccompProfile()
}
}
// add CAP_ prefix to all caps for new libcontainer update to match
// the spec format.
@ -83,12 +87,13 @@ func (d *Driver) createContainer(c *execdriver.Command, hooks execdriver.Hooks)
container.AppArmorProfile = c.AppArmorProfile
}
if c.SeccompProfile != "" {
if c.SeccompProfile != "" && c.SeccompProfile != "unconfined" {
container.Seccomp, err = loadSeccompProfile(c.SeccompProfile)
if err != nil {
return nil, err
}
}
if err := execdriver.SetupCgroups(container, c); err != nil {
return nil, err
}

View File

@ -12,6 +12,10 @@ import (
"github.com/opencontainers/specs"
)
// getDefaultSeccompProfile returns the package-level default seccomp profile.
// The result is the shared defaultSeccompProfile pointer itself, not a copy.
func getDefaultSeccompProfile() *configs.Seccomp {
	profile := defaultSeccompProfile
	return profile
}
func loadSeccompProfile(path string) (*configs.Seccomp, error) {
f, err := ioutil.ReadFile(path)
if err != nil {

View File

@ -0,0 +1,319 @@
// +build linux
package native
import "github.com/opencontainers/runc/libcontainer/configs"
// defaultSeccompProfile is the seccomp profile returned by
// getDefaultSeccompProfile. It is a blacklist: DefaultAction is
// configs.Allow, and every entry in Syscalls names one syscall whose Action
// is configs.Errno. Entries with an empty Args list apply to the syscall
// unconditionally; only "clone" carries argument conditions.
var defaultSeccompProfile = &configs.Seccomp{
DefaultAction: configs.Allow,
Syscalls: []*configs.Syscall{
{
// Quota and Accounting syscalls which could let containers
// disable their own resource limits or process accounting
Name: "acct",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Prevent containers from using the kernel keyring,
// which is not namespaced
Name: "add_key",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Similar to clock_settime and settimeofday
// Time/Date is not namespaced
Name: "adjtimex",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Time/Date is not namespaced
Name: "clock_settime",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny cloning new namespaces
//
// NOTE(review): both conditions below test argument 0. How multiple
// conditions of one rule are combined (all must match vs. any) is
// decided by the seccomp backend and is not visible here - confirm
// this rule actually denies each namespace flag as intended.
Name: "clone",
Action: configs.Errno,
Args: []*configs.Arg{
{
// flags from sched.h
// CLONE_NEWUTS 0x04000000
// CLONE_NEWIPC 0x08000000
// CLONE_NEWUSER 0x10000000
// CLONE_NEWPID 0x20000000
// CLONE_NEWNET 0x40000000
//
// NOTE(review): this is a numeric >= comparison on a bitmask, so it
// also matches any flags value with a higher bit set (for example
// 0x80000000), not only the namespace flags listed above.
Index: 0,
Value: uint64(0x04000000),
Op: configs.GreaterThanOrEqualTo,
},
{
// flags from sched.h
// CLONE_NEWNS 0x00020000
//
// NOTE(review): an exact-equality test only matches when no other
// flag bit is set; CLONE_NEWNS combined with e.g. an exit signal
// in the low bits would not equal this value.
Index: 0,
Value: uint64(0x00020000),
Op: configs.EqualTo,
},
},
},
{
// Deny manipulation and functions on kernel modules.
Name: "create_module",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny manipulation and functions on kernel modules.
Name: "delete_module",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny retrieval of exported kernel and module symbols
Name: "get_kernel_syms",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Terrifying syscalls that modify kernel memory and NUMA settings.
// They're gated by CAP_SYS_NICE,
// which we do not retain by default in containers.
Name: "get_mempolicy",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny getting the list of robust futexes
Name: "get_robust_list",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny manipulation and functions on kernel modules.
Name: "init_module",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Prevent containers from modifying kernel I/O privilege levels.
// Already restricted as containers drop CAP_SYS_RAWIO by default.
Name: "ioperm",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Prevent containers from modifying kernel I/O privilege levels.
// Already restricted as containers drop CAP_SYS_RAWIO by default.
Name: "iopl",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Sister syscall of kexec_load that does the same thing,
// slightly different arguments
Name: "kexec_file_load",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny loading a new kernel for later execution
Name: "kexec_load",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Prevent containers from using the kernel keyring,
// which is not namespaced
Name: "keyctl",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Tracing/profiling syscalls,
// which could leak a lot of information on the host
Name: "lookup_dcookie",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Terrifying syscalls that modify kernel memory and NUMA settings.
// They're gated by CAP_SYS_NICE,
// which we do not retain by default in containers.
Name: "mbind",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Terrifying syscalls that modify kernel memory and NUMA settings.
// They're gated by CAP_SYS_NICE,
// which we do not retain by default in containers.
Name: "migrate_pages",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Old syscall only used in 16-bit code,
// and a potential information leak
Name: "modify_ldt",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny mount
Name: "mount",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Terrifying syscalls that modify kernel memory and NUMA settings.
// They're gated by CAP_SYS_NICE,
// which we do not retain by default in containers.
Name: "move_pages",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny interaction with the kernel nfs daemon
Name: "nfsservctl",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Cause of an old container breakout,
// might as well restrict it to be on the safe side
Name: "open_by_handle_at",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Tracing/profiling syscalls,
// which could leak a lot of information on the host
Name: "perf_event_open",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Prevent container from enabling BSD emulation.
// Not inherently dangerous, but poorly tested,
// potential for a lot of kernel vulns in this.
Name: "personality",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny pivot_root
Name: "pivot_root",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Already blocked by dropping CAP_PTRACE
// NOTE(review): the Linux capability is named CAP_SYS_PTRACE - confirm
// which capability is meant.
Name: "ptrace",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny manipulation and functions on kernel modules.
Name: "query_module",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Quota and Accounting syscalls which could let containers
// disable their own resource limits or process accounting
Name: "quotactl",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Probably a bad idea to let containers reboot the host
Name: "reboot",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Probably a bad idea to let containers restart
// NOTE(review): restart_syscall(2) is used by the kernel to resume an
// interrupted system call after a signal handler returns; it does not
// restart the machine. Confirm that denying it is intended.
Name: "restart_syscall",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Prevent containers from using the kernel keyring,
// which is not namespaced
Name: "request_key",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// meta, deny seccomp
Name: "seccomp",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Terrifying syscalls that modify kernel memory and NUMA settings.
// They're gated by CAP_SYS_NICE,
// which we do not retain by default in containers.
Name: "set_mempolicy",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// deny associating a thread with a namespace
Name: "setns",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny setting the list of robust futexes
// NOTE(review): threading libraries commonly call set_robust_list when a
// thread starts - verify that denying it does not break threaded
// programs.
Name: "set_robust_list",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Time/Date is not namespaced
Name: "settimeofday",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny start/stop swapping to file/device
Name: "swapon",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny start/stop swapping to file/device
Name: "swapoff",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny read/write system parameters
Name: "_sysctl",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Deny umount
Name: "umount2",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Same as clone
// (unlike the clone entry, this rule has no argument conditions, so
// every unshare call is denied regardless of its flags)
Name: "unshare",
Action: configs.Errno,
Args: []*configs.Arg{},
},
{
// Older syscall related to shared libraries, unused for a long time
Name: "uselib",
Action: configs.Errno,
Args: []*configs.Arg{},
},
},
}

View File

@ -62,3 +62,22 @@ Then you can run with:
```
$ docker run --rm -it --security-opt seccomp:/path/to/seccomp/profile.json hello-world
```
Default Profile
---------------
The default seccomp profile provides a sane default for running
containers with seccomp. It is moderately protective while
providing wide application compatibility.
Overriding the default profile for a container
----------------------------------------------
You can pass `unconfined` to run a container without the default seccomp
profile.
```
$ docker run --rm -it --security-opt seccomp:unconfined debian:jessie \
unshare --map-root-user --user sh -c whoami
```

View File

@ -27,6 +27,7 @@ case "$DOCKER_ENGINE_OSARCH" in
*)
images=(
busybox:latest
debian:jessie
hello-world:latest
jess/unshare:latest
)

View File

@ -0,0 +1,17 @@
#!/bin/bash
set -e

# Compile the userns-test C helper (used by the seccomp tests to clone a new
# user namespace) and build the "userns-test" image from it.
# The work happens in a scratch directory under $DEST that is removed at the
# end; nothing is compiled or built unless `go env GOOS` reports linux.

dir="$DEST/userns-test"
mkdir -p "$dir"

(
	# leave the subshell early when GOOS is not linux
	[ "$(go env GOOS)" = "linux" ] || exit 0

	cd "$dir"
	src=../../../../contrib/userns-test
	gcc -g -Wall -static "$src/main.c" -o ./userns-test
	cp "$src/Dockerfile" .
	docker build -qt userns-test . > /dev/null
)

rm -rf "$dir"

View File

@ -3,3 +3,4 @@
bundle .ensure-emptyfs
bundle .ensure-frozen-images
bundle .ensure-httpserver
bundle .ensure-userns-test

View File

@ -2858,18 +2858,25 @@ func (s *DockerSuite) TestRunUnshareProc(c *check.C) {
testRequires(c, Apparmor, DaemonIsLinux, NotUserNamespace)
name := "acidburn"
if out, _, err := dockerCmdWithError("run", "--name", name, "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "--mount-proc=/proc", "mount"); err == nil || !strings.Contains(out, "Permission denied") {
out, _, err := dockerCmdWithError("run", "--name", name, "--security-opt", "seccomp:unconfined", "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "--mount-proc=/proc", "mount")
if err == nil ||
!(strings.Contains(strings.ToLower(out), "permission denied") ||
strings.Contains(strings.ToLower(out), "operation not permitted")) {
c.Fatalf("unshare with --mount-proc should have failed with permission denied, got: %s, %v", out, err)
}
name = "cereal"
if out, _, err := dockerCmdWithError("run", "--name", name, "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "mount", "-t", "proc", "none", "/proc"); err == nil || !strings.Contains(out, "Permission denied") {
out, _, err = dockerCmdWithError("run", "--name", name, "--security-opt", "seccomp:unconfined", "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "mount", "-t", "proc", "none", "/proc")
if err == nil ||
!(strings.Contains(strings.ToLower(out), "permission denied") ||
strings.Contains(strings.ToLower(out), "operation not permitted")) {
c.Fatalf("unshare and mount of /proc should have failed with permission denied, got: %s, %v", out, err)
}
/* Ensure still fails if running privileged with the default policy */
name = "crashoverride"
if out, _, err := dockerCmdWithError("run", "--privileged", "--security-opt", "apparmor:docker-default", "--name", name, "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "mount", "-t", "proc", "none", "/proc"); err == nil || !(strings.Contains(strings.ToLower(out), "permission denied") || strings.Contains(strings.ToLower(out), "operation not permitted")) {
out, _, err = dockerCmdWithError("run", "--privileged", "--security-opt", "seccomp:unconfined", "--security-opt", "apparmor:docker-default", "--name", name, "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "mount", "-t", "proc", "none", "/proc")
if err == nil || !(strings.Contains(strings.ToLower(out), "permission denied") || strings.Contains(strings.ToLower(out), "operation not permitted")) {
c.Fatalf("privileged unshare with apparmor should have failed with permission denied, got: %s, %v", out, err)
}
}

View File

@ -514,7 +514,7 @@ func (s *DockerSuite) TestRunSeccompProfileDenyUnshare(c *check.C) {
if _, err := tmpFile.Write([]byte(jsonData)); err != nil {
c.Fatal(err)
}
runCmd := exec.Command(dockerBinary, "run", "--security-opt", "seccomp:"+tmpFile.Name(), "jess/unshare", "unshare", "-p", "-m", "-f", "-r", "mount", "-t", "proc", "none", "/proc")
runCmd := exec.Command(dockerBinary, "run", "--security-opt", "apparmor:unconfined", "--security-opt", "seccomp:"+tmpFile.Name(), "debian:jessie", "unshare", "-p", "-m", "-f", "-r", "mount", "-t", "proc", "none", "/proc")
out, _, _ := runCommandWithOutput(runCmd)
if !strings.Contains(out, "Operation not permitted") {
c.Fatalf("expected unshare with seccomp profile denied to fail, got %s", out)
@ -548,3 +548,87 @@ func (s *DockerSuite) TestRunSeccompProfileDenyChmod(c *check.C) {
c.Fatalf("expected chmod with seccomp profile denied to fail, got %s", out)
}
}
// TestRunSeccompProfileDenyUnshareUserns checks that
// 'docker run debian:jessie unshare --map-root-user --user sh -c whoami',
// run with a seccomp profile that denies unshare of a user namespace, fails
// with "Operation not permitted".
//
// The profile is written to a temporary file and passed with
// --security-opt seccomp:<path>; apparmor is set to unconfined for the run.
func (s *DockerSuite) TestRunSeccompProfileDenyUnshareUserns(c *check.C) {
	testRequires(c, SameHostDaemon, seccompEnabled)

	// The denied argument value 0x10000000 is CLONE_NEWUSER from sched.h.
	jsonData := fmt.Sprintf(`{
"defaultAction": "SCMP_ACT_ALLOW",
"syscalls": [
{
"name": "unshare",
"action": "SCMP_ACT_ERRNO",
"args": [
{
"index": 0,
"value": %d,
"op": "SCMP_CMP_EQ"
}
]
}
]
}`, uint64(0x10000000))

	tmpFile, err := ioutil.TempFile("", "profile.json")
	if err != nil {
		c.Fatal(err)
	}
	// Register Close only after TempFile is known to have succeeded.
	defer tmpFile.Close()

	if _, err := tmpFile.Write([]byte(jsonData)); err != nil {
		c.Fatal(err)
	}

	runCmd := exec.Command(dockerBinary, "run", "--security-opt", "apparmor:unconfined", "--security-opt", "seccomp:"+tmpFile.Name(), "debian:jessie", "unshare", "--map-root-user", "--user", "sh", "-c", "whoami")
	// Only the output is inspected; the exit status is deliberately ignored.
	out, _, _ := runCommandWithOutput(runCmd)
	if !strings.Contains(out, "Operation not permitted") {
		c.Fatalf("expected unshare userns with seccomp profile denied to fail, got %s", out)
	}
}
// TestRunSeccompProfileDenyCloneUserns checks that 'docker run userns-test id',
// run without any seccomp option (i.e. with the default profile), fails and
// prints "clone failed: Operation not permitted".
func (s *DockerSuite) TestRunSeccompProfileDenyCloneUserns(c *check.C) {
	testRequires(c, SameHostDaemon, seccompEnabled)

	out, _, err := runCommandWithOutput(exec.Command(dockerBinary, "run", "userns-test", "id"))

	// the run must fail AND report the denied clone
	denied := strings.Contains(out, "clone failed: Operation not permitted")
	if err != nil && denied {
		return
	}
	c.Fatalf("expected clone userns with default seccomp profile denied to fail, got %s: %v", out, err)
}
// TestRunSeccompUnconfinedCloneUserns checks that
// 'docker run --security-opt seccomp:unconfined userns-test id' succeeds and
// that its output contains "nobody", i.e. creating a userns is allowed.
func (s *DockerSuite) TestRunSeccompUnconfinedCloneUserns(c *check.C) {
	testRequires(c, SameHostDaemon, seccompEnabled, NotUserNamespace)

	// with seccomp unconfined, cloning a user namespace must be permitted
	args := []string{"run", "--security-opt", "seccomp:unconfined", "userns-test", "id"}
	out, _, err := runCommandWithOutput(exec.Command(dockerBinary, args...))
	if err != nil || !strings.Contains(out, "nobody") {
		c.Fatalf("expected clone userns with --security-opt seccomp:unconfined to succeed, got %s: %v", out, err)
	}
}
// TestRunSeccompAllowPrivCloneUserns checks that
// 'docker run --privileged userns-test id' succeeds and that its output
// contains "nobody", i.e. creating a userns is allowed when privileged.
func (s *DockerSuite) TestRunSeccompAllowPrivCloneUserns(c *check.C) {
	testRequires(c, SameHostDaemon, seccompEnabled, NotUserNamespace)

	// a privileged container must be able to clone a user namespace
	args := []string{"run", "--privileged", "userns-test", "id"}
	out, _, err := runCommandWithOutput(exec.Command(dockerBinary, args...))
	if err != nil || !strings.Contains(out, "nobody") {
		c.Fatalf("expected clone userns with --privileged to succeed, got %s: %v", out, err)
	}
}
// TestRunSeccompAllowAptKey checks that running apt-key in debian:jessie
// succeeds when no seccomp option is passed to 'docker run'.
//
// NOTE(review): the command fetches a key from a public keyserver, so this
// test needs outbound network access to pass.
func (s *DockerSuite) TestRunSeccompAllowAptKey(c *check.C) {
	testRequires(c, SameHostDaemon, seccompEnabled)

	// apt-key uses setrlimit & getrlimit, so make sure the profile does not
	// break it
	args := []string{
		"run", "debian:jessie",
		"apt-key", "adv",
		"--keyserver", "hkp://p80.pool.sks-keyservers.net:80",
		"--recv-keys", "E871F18B51E0147C77796AC81196BA81F6B0FC61",
	}
	out, _, err := runCommandWithOutput(exec.Command(dockerBinary, args...))
	if err != nil {
		c.Fatalf("expected apt-key with seccomp to succeed, got %s: %v", out, err)
	}
}