1
0
Fork 0
mirror of https://github.com/moby/moby.git synced 2022-11-09 12:21:53 -05:00
moby--moby/profiles/seccomp/default_linux.go
Sebastiaan van Stijn 7b7d1132e8
seccomp: allow "bpf", "perf_event_open", gated by CAP_BPF, CAP_PERFMON
Update the profile to make use of CAP_BPF and CAP_PERFMON capabilities. Prior to
kernel 5.8, bpf and perf_event_open required CAP_SYS_ADMIN. This change enables
finer control of the privilege setting, thus allowing us to run certain system
tracing tools with minimal privileges.

Based on the original patch from Henry Wang in the containerd repository.

Signed-off-by: Sebastiaan van Stijn <github@gone.nl>
2022-08-18 18:34:09 +02:00

813 lines
15 KiB
Go

package seccomp // import "github.com/docker/docker/profiles/seccomp"
import (
"github.com/opencontainers/runtime-spec/specs-go"
"golang.org/x/sys/unix"
)
func arches() []Architecture {
return []Architecture{
{
Arch: specs.ArchX86_64,
SubArches: []specs.Arch{specs.ArchX86, specs.ArchX32},
},
{
Arch: specs.ArchAARCH64,
SubArches: []specs.Arch{specs.ArchARM},
},
{
Arch: specs.ArchMIPS64,
SubArches: []specs.Arch{specs.ArchMIPS, specs.ArchMIPS64N32},
},
{
Arch: specs.ArchMIPS64N32,
SubArches: []specs.Arch{specs.ArchMIPS, specs.ArchMIPS64},
},
{
Arch: specs.ArchMIPSEL64,
SubArches: []specs.Arch{specs.ArchMIPSEL, specs.ArchMIPSEL64N32},
},
{
Arch: specs.ArchMIPSEL64N32,
SubArches: []specs.Arch{specs.ArchMIPSEL, specs.ArchMIPSEL64},
},
{
Arch: specs.ArchS390X,
SubArches: []specs.Arch{specs.ArchS390},
},
{
Arch: specs.ArchRISCV64,
SubArches: nil,
},
}
}
// DefaultProfile defines the allowed syscalls for the default seccomp profile.
func DefaultProfile() *Seccomp {
nosys := uint(unix.ENOSYS)
syscalls := []*Syscall{
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"accept",
"accept4",
"access",
"adjtimex",
"alarm",
"bind",
"brk",
"capget",
"capset",
"chdir",
"chmod",
"chown",
"chown32",
"clock_adjtime",
"clock_adjtime64",
"clock_getres",
"clock_getres_time64",
"clock_gettime",
"clock_gettime64",
"clock_nanosleep",
"clock_nanosleep_time64",
"close",
"close_range",
"connect",
"copy_file_range",
"creat",
"dup",
"dup2",
"dup3",
"epoll_create",
"epoll_create1",
"epoll_ctl",
"epoll_ctl_old",
"epoll_pwait",
"epoll_pwait2",
"epoll_wait",
"epoll_wait_old",
"eventfd",
"eventfd2",
"execve",
"execveat",
"exit",
"exit_group",
"faccessat",
"faccessat2",
"fadvise64",
"fadvise64_64",
"fallocate",
"fanotify_mark",
"fchdir",
"fchmod",
"fchmodat",
"fchown",
"fchown32",
"fchownat",
"fcntl",
"fcntl64",
"fdatasync",
"fgetxattr",
"flistxattr",
"flock",
"fork",
"fremovexattr",
"fsetxattr",
"fstat",
"fstat64",
"fstatat64",
"fstatfs",
"fstatfs64",
"fsync",
"ftruncate",
"ftruncate64",
"futex",
"futex_time64",
"futex_waitv",
"futimesat",
"getcpu",
"getcwd",
"getdents",
"getdents64",
"getegid",
"getegid32",
"geteuid",
"geteuid32",
"getgid",
"getgid32",
"getgroups",
"getgroups32",
"getitimer",
"getpeername",
"getpgid",
"getpgrp",
"getpid",
"getppid",
"getpriority",
"getrandom",
"getresgid",
"getresgid32",
"getresuid",
"getresuid32",
"getrlimit",
"get_robust_list",
"getrusage",
"getsid",
"getsockname",
"getsockopt",
"get_thread_area",
"gettid",
"gettimeofday",
"getuid",
"getuid32",
"getxattr",
"inotify_add_watch",
"inotify_init",
"inotify_init1",
"inotify_rm_watch",
"io_cancel",
"ioctl",
"io_destroy",
"io_getevents",
"io_pgetevents",
"io_pgetevents_time64",
"ioprio_get",
"ioprio_set",
"io_setup",
"io_submit",
"io_uring_enter",
"io_uring_register",
"io_uring_setup",
"ipc",
"kill",
"landlock_add_rule",
"landlock_create_ruleset",
"landlock_restrict_self",
"lchown",
"lchown32",
"lgetxattr",
"link",
"linkat",
"listen",
"listxattr",
"llistxattr",
"_llseek",
"lremovexattr",
"lseek",
"lsetxattr",
"lstat",
"lstat64",
"madvise",
"membarrier",
"memfd_create",
"memfd_secret",
"mincore",
"mkdir",
"mkdirat",
"mknod",
"mknodat",
"mlock",
"mlock2",
"mlockall",
"mmap",
"mmap2",
"mprotect",
"mq_getsetattr",
"mq_notify",
"mq_open",
"mq_timedreceive",
"mq_timedreceive_time64",
"mq_timedsend",
"mq_timedsend_time64",
"mq_unlink",
"mremap",
"msgctl",
"msgget",
"msgrcv",
"msgsnd",
"msync",
"munlock",
"munlockall",
"munmap",
"nanosleep",
"newfstatat",
"_newselect",
"open",
"openat",
"openat2",
"pause",
"pidfd_open",
"pidfd_send_signal",
"pipe",
"pipe2",
"pkey_alloc",
"pkey_free",
"pkey_mprotect",
"poll",
"ppoll",
"ppoll_time64",
"prctl",
"pread64",
"preadv",
"preadv2",
"prlimit64",
"process_mrelease",
"pselect6",
"pselect6_time64",
"pwrite64",
"pwritev",
"pwritev2",
"read",
"readahead",
"readlink",
"readlinkat",
"readv",
"recv",
"recvfrom",
"recvmmsg",
"recvmmsg_time64",
"recvmsg",
"remap_file_pages",
"removexattr",
"rename",
"renameat",
"renameat2",
"restart_syscall",
"rmdir",
"rseq",
"rt_sigaction",
"rt_sigpending",
"rt_sigprocmask",
"rt_sigqueueinfo",
"rt_sigreturn",
"rt_sigsuspend",
"rt_sigtimedwait",
"rt_sigtimedwait_time64",
"rt_tgsigqueueinfo",
"sched_getaffinity",
"sched_getattr",
"sched_getparam",
"sched_get_priority_max",
"sched_get_priority_min",
"sched_getscheduler",
"sched_rr_get_interval",
"sched_rr_get_interval_time64",
"sched_setaffinity",
"sched_setattr",
"sched_setparam",
"sched_setscheduler",
"sched_yield",
"seccomp",
"select",
"semctl",
"semget",
"semop",
"semtimedop",
"semtimedop_time64",
"send",
"sendfile",
"sendfile64",
"sendmmsg",
"sendmsg",
"sendto",
"setfsgid",
"setfsgid32",
"setfsuid",
"setfsuid32",
"setgid",
"setgid32",
"setgroups",
"setgroups32",
"setitimer",
"setpgid",
"setpriority",
"setregid",
"setregid32",
"setresgid",
"setresgid32",
"setresuid",
"setresuid32",
"setreuid",
"setreuid32",
"setrlimit",
"set_robust_list",
"setsid",
"setsockopt",
"set_thread_area",
"set_tid_address",
"setuid",
"setuid32",
"setxattr",
"shmat",
"shmctl",
"shmdt",
"shmget",
"shutdown",
"sigaltstack",
"signalfd",
"signalfd4",
"sigprocmask",
"sigreturn",
"socket",
"socketcall",
"socketpair",
"splice",
"stat",
"stat64",
"statfs",
"statfs64",
"statx",
"symlink",
"symlinkat",
"sync",
"sync_file_range",
"syncfs",
"sysinfo",
"tee",
"tgkill",
"time",
"timer_create",
"timer_delete",
"timer_getoverrun",
"timer_gettime",
"timer_gettime64",
"timer_settime",
"timer_settime64",
"timerfd_create",
"timerfd_gettime",
"timerfd_gettime64",
"timerfd_settime",
"timerfd_settime64",
"times",
"tkill",
"truncate",
"truncate64",
"ugetrlimit",
"umask",
"uname",
"unlink",
"unlinkat",
"utime",
"utimensat",
"utimensat_time64",
"utimes",
"vfork",
"vmsplice",
"wait4",
"waitid",
"waitpid",
"write",
"writev",
},
Action: specs.ActAllow,
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"process_vm_readv",
"process_vm_writev",
"ptrace",
},
Action: specs.ActAllow,
},
Includes: &Filter{
MinKernel: &KernelVersion{4, 8},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x0,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x0008,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x20000,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0x20008,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{"personality"},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: 0xffffffff,
Op: specs.OpEqualTo,
},
},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"sync_file_range2",
"swapcontext",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"ppc64le"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"arm_fadvise64_64",
"arm_sync_file_range",
"sync_file_range2",
"breakpoint",
"cacheflush",
"set_tls",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"arm", "arm64"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"arch_prctl",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"amd64", "x32"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"modify_ldt",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"amd64", "x32", "x86"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"s390_pci_mmio_read",
"s390_pci_mmio_write",
"s390_runtime_instr",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"s390", "s390x"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"riscv_flush_icache",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Arches: []string{"riscv64"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"open_by_handle_at",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_DAC_READ_SEARCH"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"bpf",
"clone",
"clone3",
"fanotify_init",
"fsconfig",
"fsmount",
"fsopen",
"fspick",
"lookup_dcookie",
"mount",
"mount_setattr",
"move_mount",
"name_to_handle_at",
"open_tree",
"perf_event_open",
"quotactl",
"quotactl_fd",
"setdomainname",
"sethostname",
"setns",
"syslog",
"umount",
"umount2",
"unshare",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_ADMIN"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"clone",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 0,
Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
ValueTwo: 0,
Op: specs.OpMaskedEqual,
},
},
},
Excludes: &Filter{
Caps: []string{"CAP_SYS_ADMIN"},
Arches: []string{"s390", "s390x"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"clone",
},
Action: specs.ActAllow,
Args: []specs.LinuxSeccompArg{
{
Index: 1,
Value: unix.CLONE_NEWNS | unix.CLONE_NEWUTS | unix.CLONE_NEWIPC | unix.CLONE_NEWUSER | unix.CLONE_NEWPID | unix.CLONE_NEWNET | unix.CLONE_NEWCGROUP,
ValueTwo: 0,
Op: specs.OpMaskedEqual,
},
},
},
Comment: "s390 parameter ordering for clone is different",
Includes: &Filter{
Arches: []string{"s390", "s390x"},
},
Excludes: &Filter{
Caps: []string{"CAP_SYS_ADMIN"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"clone3",
},
Action: specs.ActErrno,
ErrnoRet: &nosys,
},
Excludes: &Filter{
Caps: []string{"CAP_SYS_ADMIN"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"reboot",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_BOOT"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"chroot",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_CHROOT"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"delete_module",
"init_module",
"finit_module",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_MODULE"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"acct",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_PACCT"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"kcmp",
"pidfd_getfd",
"process_madvise",
"process_vm_readv",
"process_vm_writev",
"ptrace",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_PTRACE"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"iopl",
"ioperm",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_RAWIO"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"settimeofday",
"stime",
"clock_settime",
"clock_settime64",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_TIME"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"vhangup",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_TTY_CONFIG"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"get_mempolicy",
"mbind",
"set_mempolicy",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYS_NICE"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"syslog",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_SYSLOG"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"bpf",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_BPF"},
},
},
{
LinuxSyscall: specs.LinuxSyscall{
Names: []string{
"perf_event_open",
},
Action: specs.ActAllow,
},
Includes: &Filter{
Caps: []string{"CAP_PERFMON"},
},
},
}
errnoRet := uint(unix.EPERM)
return &Seccomp{
LinuxSeccomp: specs.LinuxSeccomp{
DefaultAction: specs.ActErrno,
DefaultErrnoRet: &errnoRet,
},
ArchMap: arches(),
Syscalls: syscalls,
}
}