seccomp: add support for "clone3" syscall in default policy

This is a backport of 9f6b562dd1, adapted to avoid the refactoring that happened in d92739713c.

Original commit message is as follows:

> If no seccomp policy is requested, then the built-in default policy in
> dockerd applies. This has no rule for "clone3" defined, nor any default
> errno defined. So when runc receives the config it attempts to determine
> a default errno, using logic defined in its commit:
>
>   opencontainers/runc@7a8d716
>
> As explained in the above commit message, runc uses a heuristic to
> decide which errno to return by default:
>
> [quote]
>   The solution applied here is to prepend a "stub" filter which returns
>   -ENOSYS if the requested syscall has a larger syscall number than any
>   syscall mentioned in the filter. The reason for this specific rule is
>   that syscall numbers are (roughly) allocated sequentially and thus newer
>   syscalls will (usually) have a larger syscall number -- thus causing our
>   filters to produce -ENOSYS if the filter was written before the syscall
>   existed.
> [/quote]
>
> Unfortunately clone3 appears to one of the edge cases that does not
> result in use of ENOSYS, instead ending up with the historical EPERM
> errno.
>
> Latest glibc (2.33.9000, in Fedora 35 rawhide) will attempt to use
> clone3 by default. If it sees ENOSYS then it will automatically
> fallback to using clone. Any other errno is treated as a fatal
> error. Thus when docker seccomp policy triggers EPERM from clone3,
> no fallback occurs and programs are thus unable to spawn threads.
>
> The clone3 syscall is much more complicated than clone, most notably its
> flags are not exposed as a directly argument any more. Instead they are
> hidden inside a struct. This means that seccomp filters are unable to
> apply policy based on values seen in flags. Thus we can't directly
> replicate the current "clone" filtering for "clone3". We can at least
> ensure "clone3" returns ENOSYS errno, to trigger fallback to "clone"
> at which point we can filter on flags.

Signed-off-by: Tianon Gravi <admwiggin@gmail.com>
Co-authored-by: Daniel P. Berrangé <berrange@redhat.com>
This commit is contained in:
Tianon Gravi 2021-09-09 11:31:30 -07:00
parent d24c6dc5cf
commit 567c01f6d1
4 changed files with 42 additions and 16 deletions

View File

@ -591,6 +591,7 @@
"names": [
"bpf",
"clone",
"clone3",
"fanotify_init",
"fsconfig",
"fsmount",
@ -670,6 +671,21 @@
]
}
},
{
"names": [
"clone3"
],
"action": "SCMP_ACT_ERRNO",
"errnoRet": 38,
"args": [],
"comment": "",
"includes": {},
"excludes": {
"caps": [
"CAP_SYS_ADMIN"
]
}
},
{
"names": [
"reboot"

View File

@ -42,6 +42,7 @@ func arches() []Architecture {
// DefaultProfile defines the allowed syscalls for the default seccomp profile.
func DefaultProfile() *Seccomp {
nosys := uint(unix.ENOSYS)
syscalls := []*Syscall{
{
Names: []string{
@ -522,6 +523,7 @@ func DefaultProfile() *Seccomp {
Names: []string{
"bpf",
"clone",
"clone3",
"fanotify_init",
"fsconfig",
"fsmount",
@ -587,6 +589,17 @@ func DefaultProfile() *Seccomp {
Caps: []string{"CAP_SYS_ADMIN"},
},
},
{
Names: []string{
"clone3",
},
Action: specs.ActErrno,
ErrnoRet: &nosys,
Args: []*specs.LinuxSeccompArg{},
Excludes: Filter{
Caps: []string{"CAP_SYS_ADMIN"},
},
},
{
Names: []string{
"reboot",

View File

@ -45,6 +45,7 @@ type Syscall struct {
Name string `json:"name,omitempty"`
Names []string `json:"names,omitempty"`
Action specs.LinuxSeccompAction `json:"action"`
ErrnoRet *uint `json:"errnoRet,omitempty"`
Args []*specs.LinuxSeccompArg `json:"args"`
Comment string `json:"comment"`
Includes Filter `json:"includes"`

View File

@ -150,29 +150,25 @@ Loop:
}
}
newCall := specs.LinuxSyscall{
Action: call.Action,
ErrnoRet: call.ErrnoRet,
}
if call.Name != "" && len(call.Names) != 0 {
return nil, errors.New("'name' and 'names' were specified in the seccomp profile, use either 'name' or 'names'")
}
if call.Name != "" {
newConfig.Syscalls = append(newConfig.Syscalls, createSpecsSyscall([]string{call.Name}, call.Action, call.Args))
newCall.Names = []string{call.Name}
} else {
newConfig.Syscalls = append(newConfig.Syscalls, createSpecsSyscall(call.Names, call.Action, call.Args))
newCall.Names = call.Names
}
// Loop through all the arguments of the syscall and convert them
for _, arg := range call.Args {
newCall.Args = append(newCall.Args, *arg)
}
newConfig.Syscalls = append(newConfig.Syscalls, newCall)
}
return newConfig, nil
}
func createSpecsSyscall(names []string, action specs.LinuxSeccompAction, args []*specs.LinuxSeccompArg) specs.LinuxSyscall {
newCall := specs.LinuxSyscall{
Names: names,
Action: action,
}
// Loop through all the arguments of the syscall and convert them
for _, arg := range args {
newCall.Args = append(newCall.Args, *arg)
}
return newCall
}