From dae652e2e5e47d99c8febd5bc81df0a3269beb74 Mon Sep 17 00:00:00 2001 From: Justin Cormack Date: Tue, 26 May 2020 15:58:24 +0100 Subject: [PATCH] Add default sysctls to allow ping sockets and privileged ports with no capabilities Currently default capability CAP_NET_RAW allows users to open ICMP echo sockets, and CAP_NET_BIND_SERVICE allows binding to ports under 1024. Both of these are safe operations, and Linux now provides ways that these can be set, per container, to be allowed without any capabilities for non-root users. Enable these by default. Users can revert to the previous behaviour by overriding the sysctl values explicitly. Signed-off-by: Justin Cormack --- daemon/oci_linux.go | 25 +++++++++++++ daemon/oci_linux_test.go | 40 ++++++++++++++++++++- integration-cli/docker_cli_run_unix_test.go | 5 +-- 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/daemon/oci_linux.go b/daemon/oci_linux.go index fe72c24f01..be9a296428 100644 --- a/daemon/oci_linux.go +++ b/daemon/oci_linux.go @@ -716,6 +716,14 @@ func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts { } } +// sysctlExists checks if a sysctl exists; runc will error if we add any that do not actually +// exist, so do not add the default ones if running on an old kernel. +func sysctlExists(s string) bool { + f := filepath.Join("/proc", "sys", strings.Replace(s, ".", "/", -1)) + _, err := os.Stat(f) + return err == nil +} + // WithCommonOptions sets common docker options func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts { return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error { @@ -768,6 +776,23 @@ func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts { s.Hostname = c.Config.Hostname setLinuxDomainname(c, s) + // Add default sysctls that are generally safe and useful; currently we + // grant the capabilities to allow these anyway. 
You can override if + // you want to restore the original behaviour. + // We do not set network sysctls if network namespace is host, or if we are + // joining an existing namespace, only if we create a new net namespace. + if c.HostConfig.NetworkMode.IsPrivate() { + // We cannot set up ping socket support in a user namespace + if !c.HostConfig.UsernsMode.IsPrivate() && sysctlExists("net.ipv4.ping_group_range") { + // allow unprivileged ICMP echo sockets without CAP_NET_RAW + s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647" + } + // allow opening any port less than 1024 without CAP_NET_BIND_SERVICE + if sysctlExists("net.ipv4.ip_unprivileged_port_start") { + s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0" + } + } + return nil } } diff --git a/daemon/oci_linux_test.go b/daemon/oci_linux_test.go index 31b5a68872..796842c7ce 100644 --- a/daemon/oci_linux_test.go +++ b/daemon/oci_linux_test.go @@ -114,7 +114,9 @@ func TestSysctlOverride(t *testing.T) { Domainname: "baz.cyphar.com", }, HostConfig: &containertypes.HostConfig{ - Sysctls: map[string]string{}, + NetworkMode: "bridge", + Sysctls: map[string]string{}, + UsernsMode: "host", }, } d := setupFakeDaemon(t, c) @@ -125,15 +127,51 @@ func TestSysctlOverride(t *testing.T) { assert.NilError(t, err) assert.Equal(t, s.Hostname, "foobar") assert.Equal(t, s.Linux.Sysctl["kernel.domainname"], c.Config.Domainname) + if sysctlExists("net.ipv4.ip_unprivileged_port_start") { + assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "0") + } + if sysctlExists("net.ipv4.ping_group_range") { + assert.Equal(t, s.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647") + } // Set an explicit sysctl. 
c.HostConfig.Sysctls["kernel.domainname"] = "foobar.net" assert.Assert(t, c.HostConfig.Sysctls["kernel.domainname"] != c.Config.Domainname) + c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"] = "1024" s, err = d.createSpec(c) assert.NilError(t, err) assert.Equal(t, s.Hostname, "foobar") assert.Equal(t, s.Linux.Sysctl["kernel.domainname"], c.HostConfig.Sysctls["kernel.domainname"]) + assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"]) +} + +// TestSysctlOverrideHost ensures that any implicit network sysctls are not set +// with host networking +func TestSysctlOverrideHost(t *testing.T) { + c := &container.Container{ + Config: &containertypes.Config{}, + HostConfig: &containertypes.HostConfig{ + NetworkMode: "host", + Sysctls: map[string]string{}, + UsernsMode: "host", + }, + } + d := setupFakeDaemon(t, c) + defer cleanupFakeContainer(c) + + // Ensure that the implicit sysctl is not set + s, err := d.createSpec(c) + assert.NilError(t, err) + assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "") + assert.Equal(t, s.Linux.Sysctl["net.ipv4.ping_group_range"], "") + + // Set an explicit sysctl. 
+ c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"] = "1024" + + s, err = d.createSpec(c) + assert.NilError(t, err) + assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"]) } func TestGetSourceMount(t *testing.T) { diff --git a/integration-cli/docker_cli_run_unix_test.go b/integration-cli/docker_cli_run_unix_test.go index e2a8b89de1..f9187cb27b 100644 --- a/integration-cli/docker_cli_run_unix_test.go +++ b/integration-cli/docker_cli_run_unix_test.go @@ -1252,12 +1252,13 @@ func (s *DockerSuite) TestUserNoEffectiveCapabilitiesNetBindService(c *testing.T // test that a root user has default capability CAP_NET_BIND_SERVICE dockerCmd(c, "run", "syscall-test", "socket-test") // test that non root user does not have default capability CAP_NET_BIND_SERVICE - icmd.RunCommand(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "socket-test").Assert(c, icmd.Expected{ + // as we allow this via sysctl, also tweak the sysctl back to default + icmd.RunCommand(dockerBinary, "run", "--user", "1000:1000", "--sysctl", "net.ipv4.ip_unprivileged_port_start=1024", "syscall-test", "socket-test").Assert(c, icmd.Expected{ ExitCode: 1, Err: "Permission denied", }) // test that root user can drop default capability CAP_NET_BIND_SERVICE - icmd.RunCommand(dockerBinary, "run", "--cap-drop", "net_bind_service", "syscall-test", "socket-test").Assert(c, icmd.Expected{ + icmd.RunCommand(dockerBinary, "run", "--cap-drop", "net_bind_service", "--sysctl", "net.ipv4.ip_unprivileged_port_start=1024", "syscall-test", "socket-test").Assert(c, icmd.Expected{ ExitCode: 1, Err: "Permission denied", })