// Origin: moby/moby, profiles/seccomp/default_linux.go
// (web-view header as captured: 814 lines, 15 KiB, Go)

package seccomp // import "github.com/docker/docker/profiles/seccomp"
import (
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
)
// arches returns the architecture map used by the default profile. Every
// element pairs one primary architecture with the sub-architectures that
// are listed alongside it.
func arches() []Architecture {
	// entry assembles a single map element. Calling it without any
	// sub-architectures leaves SubArches nil.
	entry := func(primary specs.Arch, subs ...specs.Arch) Architecture {
		return Architecture{Arch: primary, SubArches: subs}
	}

	return []Architecture{
		entry(specs.ArchX86_64, specs.ArchX86, specs.ArchX32),
		entry(specs.ArchAARCH64, specs.ArchARM),
		entry(specs.ArchMIPS64, specs.ArchMIPS, specs.ArchMIPS64N32),
		entry(specs.ArchMIPS64N32, specs.ArchMIPS, specs.ArchMIPS64),
		entry(specs.ArchMIPSEL64, specs.ArchMIPSEL, specs.ArchMIPSEL64N32),
		entry(specs.ArchMIPSEL64N32, specs.ArchMIPSEL, specs.ArchMIPSEL64),
		entry(specs.ArchS390X, specs.ArchS390),
		entry(specs.ArchRISCV64),
	}
}
// DefaultProfile defines the allowed syscalls for the default seccomp profile.
func DefaultProfile() *Seccomp {
seccomp: add support for "clone3" syscall in default policy If no seccomp policy is requested, then the built-in default policy in dockerd applies. This has no rule for "clone3" defined, nor any default errno defined. So when runc receives the config it attempts to determine a default errno, using logic defined in its commit: https://github.com/opencontainers/runc/commit/7a8d7162f9d72f20d83eaa36aeb5426deecd58f2 As explained in the above commit message, runc uses a heuristic to decide which errno to return by default: [quote] The solution applied here is to prepend a "stub" filter which returns -ENOSYS if the requested syscall has a larger syscall number than any syscall mentioned in the filter. The reason for this specific rule is that syscall numbers are (roughly) allocated sequentially and thus newer syscalls will (usually) have a larger syscall number -- thus causing our filters to produce -ENOSYS if the filter was written before the syscall existed. [/quote] Unfortunately clone3 appears to one of the edge cases that does not result in use of ENOSYS, instead ending up with the historical EPERM errno. Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use clone3 by default. If it sees ENOSYS then it will automatically fallback to using clone. Any other errno is treated as a fatal error. Thus when docker seccomp policy triggers EPERM from clone3, no fallback occurs and programs are thus unable to spawn threads. The clone3 syscall is much more complicated than clone, most notably its flags are not exposed as a directly argument any more. Instead they are hidden inside a struct. This means that seccomp filters are unable to apply policy based on values seen in flags. Thus we can't directly replicate the current "clone" filtering for "clone3". We can at least ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone" at which point we can filter on flags. Fixes: https://github.com/moby/moby/issues/42680 Signed-off-by: Daniel P. 
Berrangé <berrange@redhat.com>
2021-07-26 18:10:01 +00:00
nosys := uint(unix.ENOSYS)
syscalls := []*Syscall{
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"accept",
"accept4",
"access",
"adjtimex",
"alarm",
"bind",
"brk",
"capget",
"capset",
"chdir",
"chmod",
"chown",
"chown32",
"clock_adjtime",
"clock_adjtime64",
"clock_getres",
"clock_getres_time64",
"clock_gettime",
"clock_gettime64",
"clock_nanosleep",
"clock_nanosleep_time64",
"close",
"close_range",
"connect",
"copy_file_range",
"creat",
"dup",
"dup2",
"dup3",
"epoll_create",
"epoll_create1",
"epoll_ctl",
"epoll_ctl_old",
"epoll_pwait",
"epoll_pwait2",
"epoll_wait",
"epoll_wait_old",
"eventfd",
"eventfd2",
"execve",
"execveat",
"exit",
"exit_group",
"faccessat",
"faccessat2",
"fadvise64",
"fadvise64_64",
"fallocate",
"fanotify_mark",
"fchdir",
"fchmod",
"fchmodat",
"fchown",
"fchown32",
"fchownat",
"fcntl",
"fcntl64",
"fdatasync",
"fgetxattr",
"flistxattr",
"flock",
"fork",
"fremovexattr",
"fsetxattr",
"fstat",
"fstat64",
"fstatat64",
"fstatfs",
"fstatfs64",
"fsync",
"ftruncate",
"ftruncate64",
"futex",
"futex_time64",
"futex_waitv",
"futimesat",
"getcpu",
"getcwd",
"getdents",
"getdents64",
"getegid",
"getegid32",
"geteuid",
"geteuid32",
"getgid",
"getgid32",
"getgroups",
"getgroups32",
"getitimer",
"getpeername",
"getpgid",
"getpgrp",
"getpid",
"getppid",
"getpriority",
"getrandom",
"getresgid",
"getresgid32",
"getresuid",
"getresuid32",
"getrlimit",
"get_robust_list",
"getrusage",
"getsid",
"getsockname",
"getsockopt",
"get_thread_area",
"gettid",
"gettimeofday",
"getuid",
"getuid32",
"getxattr",
"inotify_add_watch",
"inotify_init",
"inotify_init1",
"inotify_rm_watch",
"io_cancel",
"ioctl",
"io_destroy",
"io_getevents",
"io_pgetevents",
"io_pgetevents_time64",
"ioprio_get",
"ioprio_set",
"io_setup",
"io_submit",
"io_uring_enter",
"io_uring_register",
"io_uring_setup",
"ipc",
"kill",
"landlock_add_rule",
"landlock_create_ruleset",
"landlock_restrict_self",
"lchown",
"lchown32",
"lgetxattr",
"link",
"linkat",
"listen",
"listxattr",
"llistxattr",
"_llseek",
"lremovexattr",
"lseek",
"lsetxattr",
"lstat",
"lstat64",
"madvise",
"membarrier",
"memfd_create",
"memfd_secret",
"mincore",
"mkdir",
"mkdirat",
"mknod",
"mknodat",
"mlock",
"mlock2",
"mlockall",
"mmap",
"mmap2",
"mprotect",
"mq_getsetattr",
"mq_notify",
"mq_open",
"mq_timedreceive",
"mq_timedreceive_time64",
"mq_timedsend",
"mq_timedsend_time64",
"mq_unlink",
"mremap",
"msgctl",
"msgget",
"msgrcv",
"msgsnd",
"msync",
"munlock",
"munlockall",
"munmap",
"nanosleep",
"newfstatat",
"_newselect",
"open",
"openat",
"openat2",
"pause",
"pidfd_open",
"pidfd_send_signal",
"pipe",
"pipe2",
"pkey_alloc",
"pkey_free",
"pkey_mprotect",
"poll",
"ppoll",
"ppoll_time64",
"prctl",
"pread64",
"preadv",
"preadv2",
"prlimit64",
"process_mrelease",
"pselect6",
"pselect6_time64",
"pwrite64",
"pwritev",
"pwritev2",
"read",
"readahead",
"readlink",
"readlinkat",
"readv",
"recv",
"recvfrom",
"recvmmsg",
"recvmmsg_time64",
"recvmsg",
"remap_file_pages",
"removexattr",
"rename",
"renameat",
"renameat2",
"restart_syscall",
"rmdir",
"rseq",
"rt_sigaction",
"rt_sigpending",
"rt_sigprocmask",
"rt_sigqueueinfo",
"rt_sigreturn",
"rt_sigsuspend",
"rt_sigtimedwait",
"rt_sigtimedwait_time64",
"rt_tgsigqueueinfo",
"sched_getaffinity",
"sched_getattr",
"sched_getparam",
"sched_get_priority_max",
"sched_get_priority_min",
"sched_getscheduler",
"sched_rr_get_interval",
"sched_rr_get_interval_time64",
"sched_setaffinity",
"sched_setattr",
"sched_setparam",
"sched_setscheduler",
"sched_yield",
"seccomp",
"select",
"semctl",
"semget",
"semop",
"semtimedop",
"semtimedop_time64",
"send",
"sendfile",
"sendfile64",
"sendmmsg",
"sendmsg",
"sendto",
"setfsgid",
"setfsgid32",
"setfsuid",
"setfsuid32",
"setgid",
"setgid32",
"setgroups",
"setgroups32",
"setitimer",
"setpgid",
"setpriority",
"setregid",
"setregid32",
"setresgid",
"setresgid32",
"setresuid",
"setresuid32",
"setreuid",
"setreuid32",
"setrlimit",
"set_robust_list",
"setsid",
"setsockopt",
"set_thread_area",
"set_tid_address",
"setuid",
"setuid32",
"setxattr",
"shmat",
"shmctl",
"shmdt",
"shmget",
"shutdown",
"sigaltstack",
"signalfd",
"signalfd4",
"sigprocmask",
"sigreturn",
"socket",
"socketcall",
"socketpair",
"splice",
"stat",
"stat64",
"statfs",
"statfs64",
"statx",
"symlink",
"symlinkat",
"sync",
"sync_file_range",
"syncfs",
"sysinfo",
"tee",
"tgkill",
"time",
"timer_create",
"timer_delete",
"timer_getoverrun",
"timer_gettime",
"timer_gettime64",
"timer_settime",
"timer_settime64",
"timerfd_create",
"timerfd_gettime",
"timerfd_gettime64",
"timerfd_settime",
"timerfd_settime64",
"times",
"tkill",
"truncate",
"truncate64",
"ugetrlimit",
"umask",
"uname",
"unlink",
"unlinkat",
"utime",
"utimensat",
"utimensat_time64",
"utimes",
"vfork",
"vmsplice",
"wait4",
"waitid",
"waitpid",
"write",
"writev",
},
Action: specs.ActAllow,
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"process_vm_readv",
"process_vm_writev",
"ptrace",
},
Action: specs.ActAllow,
},
Includes: &Filter{
MinKernel: &KernelVersion{4, 8},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x0,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x0008,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x20000,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x20008,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0xffffffff,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"sync_file_range2",
"swapcontext",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"ppc64le"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"arm_fadvise64_64",
"arm_sync_file_range",
"sync_file_range2",
"breakpoint",
"cacheflush",
"set_tls",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"arm", "arm64"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"arch_prctl",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"amd64", "x32"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"modify_ldt",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"amd64", "x32", "x86"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"s390_pci_mmio_read",
"s390_pci_mmio_write",
"s390_runtime_instr",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"s390", "s390x"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"riscv_flush_icache",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"riscv64"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"open_by_handle_at",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_DAC_READ_SEARCH"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"bpf",
"clone",
seccomp: add support for "clone3" syscall in default policy If no seccomp policy is requested, then the built-in default policy in dockerd applies. This has no rule for "clone3" defined, nor any default errno defined. So when runc receives the config it attempts to determine a default errno, using logic defined in its commit: https://github.com/opencontainers/runc/commit/7a8d7162f9d72f20d83eaa36aeb5426deecd58f2 As explained in the above commit message, runc uses a heuristic to decide which errno to return by default: [quote] The solution applied here is to prepend a "stub" filter which returns -ENOSYS if the requested syscall has a larger syscall number than any syscall mentioned in the filter. The reason for this specific rule is that syscall numbers are (roughly) allocated sequentially and thus newer syscalls will (usually) have a larger syscall number -- thus causing our filters to produce -ENOSYS if the filter was written before the syscall existed. [/quote] Unfortunately clone3 appears to one of the edge cases that does not result in use of ENOSYS, instead ending up with the historical EPERM errno. Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use clone3 by default. If it sees ENOSYS then it will automatically fallback to using clone. Any other errno is treated as a fatal error. Thus when docker seccomp policy triggers EPERM from clone3, no fallback occurs and programs are thus unable to spawn threads. The clone3 syscall is much more complicated than clone, most notably its flags are not exposed as a directly argument any more. Instead they are hidden inside a struct. This means that seccomp filters are unable to apply policy based on values seen in flags. Thus we can't directly replicate the current "clone" filtering for "clone3". We can at least ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone" at which point we can filter on flags. Fixes: https://github.com/moby/moby/issues/42680 Signed-off-by: Daniel P. 
Berrangé <berrange@redhat.com>
2021-07-26 18:10:01 +00:00
"clone3",
"fanotify_init",
"fsconfig",
"fsmount",
"fsopen",
"fspick",
"lookup_dcookie",
"mount",
"mount_setattr",
"move_mount",
"name_to_handle_at",
"open_tree",
"perf_event_open",
"quotactl",
"quotactl_fd",
"setdomainname",
"sethostname",
"setns",
"syslog",
"umount",
"umount2",
"unshare",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_ADMIN"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"clone",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
ValueTwo: 0,
Op: specs.OpMaskedEqual,
},
},
},
Excludes: &Filter{
Caps: []string{"CAP_SYS_ADMIN"},
Arches: []string{"s390", "s390x"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"clone",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 1,
Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
ValueTwo: 0,
Op: specs.OpMaskedEqual,
},
},
},
Comment: "s390 parameter ordering for clone is different",
Includes: &Filter{
Arches: []string{"s390", "s390x"},
},
Excludes: &Filter{
Caps: []string{"CAP_SYS_ADMIN"},
},
},
seccomp: add support for "clone3" syscall in default policy If no seccomp policy is requested, then the built-in default policy in dockerd applies. This has no rule for "clone3" defined, nor any default errno defined. So when runc receives the config it attempts to determine a default errno, using logic defined in its commit: https://github.com/opencontainers/runc/commit/7a8d7162f9d72f20d83eaa36aeb5426deecd58f2 As explained in the above commit message, runc uses a heuristic to decide which errno to return by default: [quote] The solution applied here is to prepend a "stub" filter which returns -ENOSYS if the requested syscall has a larger syscall number than any syscall mentioned in the filter. The reason for this specific rule is that syscall numbers are (roughly) allocated sequentially and thus newer syscalls will (usually) have a larger syscall number -- thus causing our filters to produce -ENOSYS if the filter was written before the syscall existed. [/quote] Unfortunately clone3 appears to one of the edge cases that does not result in use of ENOSYS, instead ending up with the historical EPERM errno. Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use clone3 by default. If it sees ENOSYS then it will automatically fallback to using clone. Any other errno is treated as a fatal error. Thus when docker seccomp policy triggers EPERM from clone3, no fallback occurs and programs are thus unable to spawn threads. The clone3 syscall is much more complicated than clone, most notably its flags are not exposed as a directly argument any more. Instead they are hidden inside a struct. This means that seccomp filters are unable to apply policy based on values seen in flags. Thus we can't directly replicate the current "clone" filtering for "clone3". We can at least ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone" at which point we can filter on flags. Fixes: https://github.com/moby/moby/issues/42680 Signed-off-by: Daniel P. 
Berrangé <berrange@redhat.com>
2021-07-26 18:10:01 +00:00
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"clone3",
},
Action: specs.ActErrno,
ErrnoRet: &nosys,
},
Excludes: &Filter{
Caps: []string{"CAP_SYS_ADMIN"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"reboot",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_BOOT"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"chroot",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_CHROOT"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"delete_module",
"init_module",
"finit_module",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_MODULE"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"acct",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_PACCT"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"kcmp",
"pidfd_getfd",
"process_madvise",
"process_vm_readv",
"process_vm_writev",
"ptrace",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_PTRACE"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"iopl",
"ioperm",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_RAWIO"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"settimeofday",
"stime",
"clock_settime",
"clock_settime64",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_TIME"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"vhangup",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_TTY_CONFIG"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"get_mempolicy",
"mbind",
"set_mempolicy",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_NICE"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"syslog",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYSLOG"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"bpf",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_BPF"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"perf_event_open",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_PERFMON"},
},
},
}
errnoRet := uint(unix.EPERM)
return &Seccomp{
LinuxSeccomp: specs.LinuxSeccomp{
DefaultAction: specs.ActErrno,
DefaultErrnoRet: &errnoRet,
},
ArchMap: arches(),
Syscalls: syscalls,
}
}