diff --git a/daemon/oci_linux.go b/daemon/oci_linux.go
index fe72c24f01..be9a296428 100644
--- a/daemon/oci_linux.go
+++ b/daemon/oci_linux.go
@@ -716,6 +716,14 @@ func WithMounts(daemon *Daemon, c *container.Container) coci.SpecOpts {
 	}
 }
 
+// sysctlExists reports whether the named sysctl is present under /proc/sys. runc fails on
+// sysctls that do not exist, so the defaults must be skipped on kernels that lack them.
+func sysctlExists(s string) bool {
+	f := filepath.Join("/proc", "sys", strings.Replace(s, ".", "/", -1))
+	_, err := os.Stat(f)
+	return err == nil
+}
+
 // WithCommonOptions sets common docker options
 func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
 	return func(ctx context.Context, _ coci.Client, _ *containers.Container, s *coci.Spec) error {
@@ -768,6 +776,23 @@ func WithCommonOptions(daemon *Daemon, c *container.Container) coci.SpecOpts {
 		s.Hostname = c.Config.Hostname
 		setLinuxDomainname(c, s)
 
+		// Add default sysctls that are generally safe and useful; currently we
+		// grant the capabilities to allow these anyway. Set the sysctl
+		// explicitly on the container to restore the original behaviour.
+		// These network sysctls are only set when a new network namespace is
+		// created, not when using the host's or joining an existing namespace.
+		if c.HostConfig.NetworkMode.IsPrivate() {
+			// ping socket support cannot be set up in a user namespace, so skip it there
+			if !c.HostConfig.UsernsMode.IsPrivate() && sysctlExists("net.ipv4.ping_group_range") {
+				// allow unprivileged ICMP echo sockets without CAP_NET_RAW
+				s.Linux.Sysctl["net.ipv4.ping_group_range"] = "0 2147483647"
+			}
+			// allow binding to any port below 1024 without CAP_NET_BIND_SERVICE
+			if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
+				s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"] = "0"
+			}
+		}
+
 		return nil
 	}
 }
diff --git a/daemon/oci_linux_test.go b/daemon/oci_linux_test.go
index 31b5a68872..796842c7ce 100644
--- a/daemon/oci_linux_test.go
+++ b/daemon/oci_linux_test.go
@@ -114,7 +114,9 @@ func TestSysctlOverride(t *testing.T) {
 			Domainname: "baz.cyphar.com",
 		},
 		HostConfig: &containertypes.HostConfig{
-			Sysctls: map[string]string{},
+			NetworkMode: "bridge",
+			Sysctls:     map[string]string{},
+			UsernsMode:  "host",
 		},
 	}
 	d := setupFakeDaemon(t, c)
@@ -125,15 +127,51 @@ func TestSysctlOverride(t *testing.T) {
 	assert.NilError(t, err)
 	assert.Equal(t, s.Hostname, "foobar")
 	assert.Equal(t, s.Linux.Sysctl["kernel.domainname"], c.Config.Domainname)
+	if sysctlExists("net.ipv4.ip_unprivileged_port_start") {
+		assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "0")
+	}
+	if sysctlExists("net.ipv4.ping_group_range") {
+		assert.Equal(t, s.Linux.Sysctl["net.ipv4.ping_group_range"], "0 2147483647")
+	}
 
 	// Set an explicit sysctl.
 	c.HostConfig.Sysctls["kernel.domainname"] = "foobar.net"
 	assert.Assert(t, c.HostConfig.Sysctls["kernel.domainname"] != c.Config.Domainname)
+	c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"] = "1024"
 
 	s, err = d.createSpec(c)
 	assert.NilError(t, err)
 	assert.Equal(t, s.Hostname, "foobar")
 	assert.Equal(t, s.Linux.Sysctl["kernel.domainname"], c.HostConfig.Sysctls["kernel.domainname"])
+	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"])
+}
+
+// TestSysctlOverrideHost ensures that the implicit network sysctls are not set
+// with host networking, while an explicitly requested sysctl is still applied.
+func TestSysctlOverrideHost(t *testing.T) {
+	c := &container.Container{
+		Config: &containertypes.Config{},
+		HostConfig: &containertypes.HostConfig{
+			NetworkMode: "host",
+			Sysctls:     map[string]string{},
+			UsernsMode:  "host",
+		},
+	}
+	d := setupFakeDaemon(t, c)
+	defer cleanupFakeContainer(c)
+
+	// Ensure that neither implicit sysctl is set
+	s, err := d.createSpec(c)
+	assert.NilError(t, err)
+	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], "")
+	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ping_group_range"], "")
+
+	// Set an explicit sysctl; it must be passed through to the spec unchanged.
+	c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"] = "1024"
+
+	s, err = d.createSpec(c)
+	assert.NilError(t, err)
+	assert.Equal(t, s.Linux.Sysctl["net.ipv4.ip_unprivileged_port_start"], c.HostConfig.Sysctls["net.ipv4.ip_unprivileged_port_start"])
 }
 
 func TestGetSourceMount(t *testing.T) {
diff --git a/integration-cli/docker_cli_run_unix_test.go b/integration-cli/docker_cli_run_unix_test.go
index e2a8b89de1..f9187cb27b 100644
--- a/integration-cli/docker_cli_run_unix_test.go
+++ b/integration-cli/docker_cli_run_unix_test.go
@@ -1252,12 +1252,13 @@ func (s *DockerSuite) TestUserNoEffectiveCapabilitiesNetBindService(c *testing.T
 	// test that a root user has default capability CAP_NET_BIND_SERVICE
 	dockerCmd(c, "run", "syscall-test", "socket-test")
 	// test that non root user does not have default capability CAP_NET_BIND_SERVICE
-	icmd.RunCommand(dockerBinary, "run", "--user", "1000:1000", "syscall-test", "socket-test").Assert(c, icmd.Expected{
+	// as we now allow this via sysctl, also set the sysctl back to its default (1024)
+	icmd.RunCommand(dockerBinary, "run", "--user", "1000:1000", "--sysctl", "net.ipv4.ip_unprivileged_port_start=1024", "syscall-test", "socket-test").Assert(c, icmd.Expected{
 		ExitCode: 1,
 		Err:      "Permission denied",
 	})
 	// test that root user can drop default capability CAP_NET_BIND_SERVICE
-	icmd.RunCommand(dockerBinary, "run", "--cap-drop", "net_bind_service", "syscall-test", "socket-test").Assert(c, icmd.Expected{
+	icmd.RunCommand(dockerBinary, "run", "--cap-drop", "net_bind_service", "--sysctl", "net.ipv4.ip_unprivileged_port_start=1024", "syscall-test", "socket-test").Assert(c, icmd.Expected{
 		ExitCode: 1,
 		Err:      "Permission denied",
 	})