Merge changes I45d3895f,I541be3ea,Ibe51416d into main

* changes:
  libstore/build: block io_uring
  libstore/build: use an allowlist approach to syscall filtering
  libstore/build: always treat seccomp setup failures as fatal
This commit is contained in:
alois31 2024-07-26 07:08:35 +00:00 committed by Gerrit Code Review
commit d945e89e19
11 changed files with 591 additions and 66 deletions

View file

@ -0,0 +1,12 @@
---
synopsis: "Block io_uring in the Linux sandbox"
cls: 1611
credits: alois31
category: Breaking Changes
---
The io\_uring API has the unfortunate property that it is not possible to selectively decide which operations should be allowed.
This, together with the fact that new operations are routinely added, makes it a hazard to the proper function of the sandbox.
Therefore, any access to io\_uring has been made unavailable inside the sandbox.
As such, attempts to execute any system calls forming part of this API will fail with the error `ENOSYS`, as if io\_uring support had not been configured into the kernel.

View file

@ -164,6 +164,7 @@
nixUnstable = prev.nixUnstable;
check-headers = final.buildPackages.callPackage ./maintainers/check-headers.nix { };
check-syscalls = final.buildPackages.callPackage ./maintainers/check-syscalls.nix { };
default-busybox-sandbox-shell = final.busybox.override {
useMusl = true;

View file

@ -0,0 +1,16 @@
{
runCommandNoCC,
lib,
libseccomp,
writeShellScriptBin,
}:
let
syscalls-csv = runCommandNoCC "syscalls.csv" { } ''
echo ${lib.escapeShellArg libseccomp.src}
tar -xf ${lib.escapeShellArg libseccomp.src} --strip-components=2 ${libseccomp.name}/src/syscalls.csv
mv syscalls.csv "$out"
'';
in
writeShellScriptBin "check-syscalls" ''
${./check-syscalls.sh} ${syscalls-csv}
''

7
maintainers/check-syscalls.sh Executable file
View file

@ -0,0 +1,7 @@
#!/usr/bin/env bash
set -e
diff -u <(awk < src/libstore/build/local-derivation-goal.cc '/BEGIN extract-syscalls/ { extracting = 1; next }
match($0, /allowSyscall\(ctx, SCMP_SYS\(([^)]*)\)\);|\/\/ skip ([^ ]*)/, result) { print result[1] result[2] }
/END extract-syscalls/ { extracting = 0; next }') <(tail -n+2 "$1" | cut -d, -f 1)

View file

@ -391,6 +391,7 @@ stdenv.mkDerivation (finalAttrs: {
# Lix specific packages
pre-commit-checks,
contribNotice,
check-syscalls,
}:
let
glibcFix = lib.optionalAttrs (buildPlatform.isLinux && glibcLocales != null) {
@ -441,6 +442,7 @@ stdenv.mkDerivation (finalAttrs: {
# `bash` from inside `nix develop`, say, because you are using it
# via direnv, you will by default get bash (unusable edition).
bashInteractive
check-syscalls
pythonEnv
# docker image tool
skopeo

View file

@ -45,7 +45,6 @@
#include <sys/prctl.h>
#include <sys/syscall.h>
#if HAVE_SECCOMP
#include "linux/fchmodat2-compat.hh"
#include <seccomp.h>
#endif
#define pivot_root(new_root, put_old) (syscall(SYS_pivot_root, new_root, put_old))
@ -1363,6 +1362,20 @@ void LocalDerivationGoal::chownToBuilder(const Path & path)
throw SysError("cannot change ownership of '%1%'", path);
}
#if HAVE_SECCOMP
static void allowSyscall(scmp_filter_ctx ctx, int syscall) {
if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 0) != 0)
throw SysError("unable to add seccomp rule");
}
#define ALLOW_CHMOD_IF_SAFE(ctx, syscall, modePos) \
if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 1, SCMP_A##modePos(SCMP_CMP_MASKED_EQ, S_ISUID | S_ISGID, 0)) != 0 || \
seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), syscall, 1, SCMP_A##modePos(SCMP_CMP_MASKED_EQ, S_ISUID, S_ISUID)) != 0 || \
seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), syscall, 1, SCMP_A##modePos(SCMP_CMP_MASKED_EQ, S_ISGID, S_ISGID)) != 0) \
throw SysError("unable to add seccomp rule");
#endif
void setupSeccomp()
{
@ -1370,7 +1383,9 @@ void setupSeccomp()
#if HAVE_SECCOMP
scmp_filter_ctx ctx;
if (!(ctx = seccomp_init(SCMP_ACT_ALLOW)))
// Pretend that syscalls we don't yet know about don't exist.
// This is the best option for compatibility: after all, they did in fact not exist not too long ago.
if (!(ctx = seccomp_init(SCMP_ACT_ERRNO(ENOSYS))))
throw SysError("unable to initialize seccomp mode 2");
Finally cleanup([&]() {
@ -1405,28 +1420,514 @@ void setupSeccomp()
seccomp_arch_add(ctx, SCMP_ARCH_MIPSEL64N32) != 0)
printError("unable to add mips64el-*abin32 seccomp architecture");
/* Prevent builders from creating setuid/setgid binaries. */
for (int perm : { S_ISUID, S_ISGID }) {
if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(chmod), 1,
SCMP_A1(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0)
throw SysError("unable to add seccomp rule");
// This list is intended for machine consumption.
// Please keep its format, order and BEGIN/END markers.
//
// Currently, it is up to date with libseccomp 2.5.5 and glibc 2.39.
// Run check-syscalls to determine which new syscalls should be added.
// New syscalls must be audited and handled in a way that blocks the following dangerous operations:
// * Creation of non-empty setuid/setgid files
// * Creation of extended attributes (including ACLs)
//
// BEGIN extract-syscalls
allowSyscall(ctx, SCMP_SYS(accept));
allowSyscall(ctx, SCMP_SYS(accept4));
allowSyscall(ctx, SCMP_SYS(access));
allowSyscall(ctx, SCMP_SYS(acct));
allowSyscall(ctx, SCMP_SYS(add_key));
allowSyscall(ctx, SCMP_SYS(adjtimex));
allowSyscall(ctx, SCMP_SYS(afs_syscall));
allowSyscall(ctx, SCMP_SYS(alarm));
allowSyscall(ctx, SCMP_SYS(arch_prctl));
allowSyscall(ctx, SCMP_SYS(arm_fadvise64_64));
allowSyscall(ctx, SCMP_SYS(arm_sync_file_range));
allowSyscall(ctx, SCMP_SYS(bdflush));
allowSyscall(ctx, SCMP_SYS(bind));
allowSyscall(ctx, SCMP_SYS(bpf));
allowSyscall(ctx, SCMP_SYS(break));
allowSyscall(ctx, SCMP_SYS(breakpoint));
allowSyscall(ctx, SCMP_SYS(brk));
allowSyscall(ctx, SCMP_SYS(cachectl));
allowSyscall(ctx, SCMP_SYS(cacheflush));
allowSyscall(ctx, SCMP_SYS(cachestat));
allowSyscall(ctx, SCMP_SYS(capget));
allowSyscall(ctx, SCMP_SYS(capset));
allowSyscall(ctx, SCMP_SYS(chdir));
// skip chmod (dangerous)
allowSyscall(ctx, SCMP_SYS(chown));
allowSyscall(ctx, SCMP_SYS(chown32));
allowSyscall(ctx, SCMP_SYS(chroot));
allowSyscall(ctx, SCMP_SYS(clock_adjtime));
allowSyscall(ctx, SCMP_SYS(clock_adjtime64));
allowSyscall(ctx, SCMP_SYS(clock_getres));
allowSyscall(ctx, SCMP_SYS(clock_getres_time64));
allowSyscall(ctx, SCMP_SYS(clock_gettime));
allowSyscall(ctx, SCMP_SYS(clock_gettime64));
allowSyscall(ctx, SCMP_SYS(clock_nanosleep));
allowSyscall(ctx, SCMP_SYS(clock_nanosleep_time64));
allowSyscall(ctx, SCMP_SYS(clock_settime));
allowSyscall(ctx, SCMP_SYS(clock_settime64));
allowSyscall(ctx, SCMP_SYS(clone));
allowSyscall(ctx, SCMP_SYS(clone3));
allowSyscall(ctx, SCMP_SYS(close));
allowSyscall(ctx, SCMP_SYS(close_range));
allowSyscall(ctx, SCMP_SYS(connect));
allowSyscall(ctx, SCMP_SYS(copy_file_range));
allowSyscall(ctx, SCMP_SYS(creat));
allowSyscall(ctx, SCMP_SYS(create_module));
allowSyscall(ctx, SCMP_SYS(delete_module));
allowSyscall(ctx, SCMP_SYS(dup));
allowSyscall(ctx, SCMP_SYS(dup2));
allowSyscall(ctx, SCMP_SYS(dup3));
allowSyscall(ctx, SCMP_SYS(epoll_create));
allowSyscall(ctx, SCMP_SYS(epoll_create1));
allowSyscall(ctx, SCMP_SYS(epoll_ctl));
allowSyscall(ctx, SCMP_SYS(epoll_ctl_old));
allowSyscall(ctx, SCMP_SYS(epoll_pwait));
allowSyscall(ctx, SCMP_SYS(epoll_pwait2));
allowSyscall(ctx, SCMP_SYS(epoll_wait));
allowSyscall(ctx, SCMP_SYS(epoll_wait_old));
allowSyscall(ctx, SCMP_SYS(eventfd));
allowSyscall(ctx, SCMP_SYS(eventfd2));
allowSyscall(ctx, SCMP_SYS(execve));
allowSyscall(ctx, SCMP_SYS(execveat));
allowSyscall(ctx, SCMP_SYS(exit));
allowSyscall(ctx, SCMP_SYS(exit_group));
allowSyscall(ctx, SCMP_SYS(faccessat));
allowSyscall(ctx, SCMP_SYS(faccessat2));
allowSyscall(ctx, SCMP_SYS(fadvise64));
allowSyscall(ctx, SCMP_SYS(fadvise64_64));
allowSyscall(ctx, SCMP_SYS(fallocate));
allowSyscall(ctx, SCMP_SYS(fanotify_init));
allowSyscall(ctx, SCMP_SYS(fanotify_mark));
allowSyscall(ctx, SCMP_SYS(fchdir));
// skip fchmod (dangerous)
// skip fchmodat (dangerous)
// skip fchmodat2 (dangerous)
allowSyscall(ctx, SCMP_SYS(fchown));
allowSyscall(ctx, SCMP_SYS(fchown32));
allowSyscall(ctx, SCMP_SYS(fchownat));
allowSyscall(ctx, SCMP_SYS(fcntl));
allowSyscall(ctx, SCMP_SYS(fcntl64));
allowSyscall(ctx, SCMP_SYS(fdatasync));
allowSyscall(ctx, SCMP_SYS(fgetxattr));
allowSyscall(ctx, SCMP_SYS(finit_module));
allowSyscall(ctx, SCMP_SYS(flistxattr));
allowSyscall(ctx, SCMP_SYS(flock));
allowSyscall(ctx, SCMP_SYS(fork));
allowSyscall(ctx, SCMP_SYS(fremovexattr));
allowSyscall(ctx, SCMP_SYS(fsconfig));
// skip fsetxattr (dangerous)
allowSyscall(ctx, SCMP_SYS(fsmount));
allowSyscall(ctx, SCMP_SYS(fsopen));
allowSyscall(ctx, SCMP_SYS(fspick));
allowSyscall(ctx, SCMP_SYS(fstat));
allowSyscall(ctx, SCMP_SYS(fstat64));
allowSyscall(ctx, SCMP_SYS(fstatat64));
allowSyscall(ctx, SCMP_SYS(fstatfs));
allowSyscall(ctx, SCMP_SYS(fstatfs64));
allowSyscall(ctx, SCMP_SYS(fsync));
allowSyscall(ctx, SCMP_SYS(ftime));
allowSyscall(ctx, SCMP_SYS(ftruncate));
allowSyscall(ctx, SCMP_SYS(ftruncate64));
allowSyscall(ctx, SCMP_SYS(futex));
allowSyscall(ctx, SCMP_SYS(futex_requeue));
allowSyscall(ctx, SCMP_SYS(futex_time64));
allowSyscall(ctx, SCMP_SYS(futex_wait));
allowSyscall(ctx, SCMP_SYS(futex_waitv));
allowSyscall(ctx, SCMP_SYS(futex_wake));
allowSyscall(ctx, SCMP_SYS(futimesat));
allowSyscall(ctx, SCMP_SYS(getcpu));
allowSyscall(ctx, SCMP_SYS(getcwd));
allowSyscall(ctx, SCMP_SYS(getdents));
allowSyscall(ctx, SCMP_SYS(getdents64));
allowSyscall(ctx, SCMP_SYS(getegid));
allowSyscall(ctx, SCMP_SYS(getegid32));
allowSyscall(ctx, SCMP_SYS(geteuid));
allowSyscall(ctx, SCMP_SYS(geteuid32));
allowSyscall(ctx, SCMP_SYS(getgid));
allowSyscall(ctx, SCMP_SYS(getgid32));
allowSyscall(ctx, SCMP_SYS(getgroups));
allowSyscall(ctx, SCMP_SYS(getgroups32));
allowSyscall(ctx, SCMP_SYS(getitimer));
allowSyscall(ctx, SCMP_SYS(get_kernel_syms));
allowSyscall(ctx, SCMP_SYS(get_mempolicy));
allowSyscall(ctx, SCMP_SYS(getpeername));
allowSyscall(ctx, SCMP_SYS(getpgid));
allowSyscall(ctx, SCMP_SYS(getpgrp));
allowSyscall(ctx, SCMP_SYS(getpid));
allowSyscall(ctx, SCMP_SYS(getpmsg));
allowSyscall(ctx, SCMP_SYS(getppid));
allowSyscall(ctx, SCMP_SYS(getpriority));
allowSyscall(ctx, SCMP_SYS(getrandom));
allowSyscall(ctx, SCMP_SYS(getresgid));
allowSyscall(ctx, SCMP_SYS(getresgid32));
allowSyscall(ctx, SCMP_SYS(getresuid));
allowSyscall(ctx, SCMP_SYS(getresuid32));
allowSyscall(ctx, SCMP_SYS(getrlimit));
allowSyscall(ctx, SCMP_SYS(get_robust_list));
allowSyscall(ctx, SCMP_SYS(getrusage));
allowSyscall(ctx, SCMP_SYS(getsid));
allowSyscall(ctx, SCMP_SYS(getsockname));
allowSyscall(ctx, SCMP_SYS(getsockopt));
allowSyscall(ctx, SCMP_SYS(get_thread_area));
allowSyscall(ctx, SCMP_SYS(gettid));
allowSyscall(ctx, SCMP_SYS(gettimeofday));
allowSyscall(ctx, SCMP_SYS(get_tls));
allowSyscall(ctx, SCMP_SYS(getuid));
allowSyscall(ctx, SCMP_SYS(getuid32));
allowSyscall(ctx, SCMP_SYS(getxattr));
allowSyscall(ctx, SCMP_SYS(gtty));
allowSyscall(ctx, SCMP_SYS(idle));
allowSyscall(ctx, SCMP_SYS(init_module));
allowSyscall(ctx, SCMP_SYS(inotify_add_watch));
allowSyscall(ctx, SCMP_SYS(inotify_init));
allowSyscall(ctx, SCMP_SYS(inotify_init1));
allowSyscall(ctx, SCMP_SYS(inotify_rm_watch));
allowSyscall(ctx, SCMP_SYS(io_cancel));
allowSyscall(ctx, SCMP_SYS(ioctl));
allowSyscall(ctx, SCMP_SYS(io_destroy));
allowSyscall(ctx, SCMP_SYS(io_getevents));
allowSyscall(ctx, SCMP_SYS(ioperm));
allowSyscall(ctx, SCMP_SYS(io_pgetevents));
allowSyscall(ctx, SCMP_SYS(io_pgetevents_time64));
allowSyscall(ctx, SCMP_SYS(iopl));
allowSyscall(ctx, SCMP_SYS(ioprio_get));
allowSyscall(ctx, SCMP_SYS(ioprio_set));
allowSyscall(ctx, SCMP_SYS(io_setup));
allowSyscall(ctx, SCMP_SYS(io_submit));
// skip io_uring_enter (may become dangerous)
// skip io_uring_register (may become dangerous)
// skip io_uring_setup (may become dangerous)
allowSyscall(ctx, SCMP_SYS(ipc));
allowSyscall(ctx, SCMP_SYS(kcmp));
allowSyscall(ctx, SCMP_SYS(kexec_file_load));
allowSyscall(ctx, SCMP_SYS(kexec_load));
allowSyscall(ctx, SCMP_SYS(keyctl));
allowSyscall(ctx, SCMP_SYS(kill));
allowSyscall(ctx, SCMP_SYS(landlock_add_rule));
allowSyscall(ctx, SCMP_SYS(landlock_create_ruleset));
allowSyscall(ctx, SCMP_SYS(landlock_restrict_self));
allowSyscall(ctx, SCMP_SYS(lchown));
allowSyscall(ctx, SCMP_SYS(lchown32));
allowSyscall(ctx, SCMP_SYS(lgetxattr));
allowSyscall(ctx, SCMP_SYS(link));
allowSyscall(ctx, SCMP_SYS(linkat));
allowSyscall(ctx, SCMP_SYS(listen));
allowSyscall(ctx, SCMP_SYS(listxattr));
allowSyscall(ctx, SCMP_SYS(llistxattr));
allowSyscall(ctx, SCMP_SYS(_llseek));
allowSyscall(ctx, SCMP_SYS(lock));
allowSyscall(ctx, SCMP_SYS(lookup_dcookie));
allowSyscall(ctx, SCMP_SYS(lremovexattr));
allowSyscall(ctx, SCMP_SYS(lseek));
// skip lsetxattr (dangerous)
allowSyscall(ctx, SCMP_SYS(lstat));
allowSyscall(ctx, SCMP_SYS(lstat64));
allowSyscall(ctx, SCMP_SYS(madvise));
allowSyscall(ctx, SCMP_SYS(map_shadow_stack));
allowSyscall(ctx, SCMP_SYS(mbind));
allowSyscall(ctx, SCMP_SYS(membarrier));
allowSyscall(ctx, SCMP_SYS(memfd_create));
allowSyscall(ctx, SCMP_SYS(memfd_secret));
allowSyscall(ctx, SCMP_SYS(migrate_pages));
allowSyscall(ctx, SCMP_SYS(mincore));
allowSyscall(ctx, SCMP_SYS(mkdir));
allowSyscall(ctx, SCMP_SYS(mkdirat));
allowSyscall(ctx, SCMP_SYS(mknod));
allowSyscall(ctx, SCMP_SYS(mknodat));
allowSyscall(ctx, SCMP_SYS(mlock));
allowSyscall(ctx, SCMP_SYS(mlock2));
allowSyscall(ctx, SCMP_SYS(mlockall));
allowSyscall(ctx, SCMP_SYS(mmap));
allowSyscall(ctx, SCMP_SYS(mmap2));
allowSyscall(ctx, SCMP_SYS(modify_ldt));
allowSyscall(ctx, SCMP_SYS(mount));
allowSyscall(ctx, SCMP_SYS(mount_setattr));
allowSyscall(ctx, SCMP_SYS(move_mount));
allowSyscall(ctx, SCMP_SYS(move_pages));
allowSyscall(ctx, SCMP_SYS(mprotect));
allowSyscall(ctx, SCMP_SYS(mpx));
allowSyscall(ctx, SCMP_SYS(mq_getsetattr));
allowSyscall(ctx, SCMP_SYS(mq_notify));
allowSyscall(ctx, SCMP_SYS(mq_open));
allowSyscall(ctx, SCMP_SYS(mq_timedreceive));
allowSyscall(ctx, SCMP_SYS(mq_timedreceive_time64));
allowSyscall(ctx, SCMP_SYS(mq_timedsend));
allowSyscall(ctx, SCMP_SYS(mq_timedsend_time64));
allowSyscall(ctx, SCMP_SYS(mq_unlink));
allowSyscall(ctx, SCMP_SYS(mremap));
allowSyscall(ctx, SCMP_SYS(msgctl));
allowSyscall(ctx, SCMP_SYS(msgget));
allowSyscall(ctx, SCMP_SYS(msgrcv));
allowSyscall(ctx, SCMP_SYS(msgsnd));
allowSyscall(ctx, SCMP_SYS(msync));
allowSyscall(ctx, SCMP_SYS(multiplexer));
allowSyscall(ctx, SCMP_SYS(munlock));
allowSyscall(ctx, SCMP_SYS(munlockall));
allowSyscall(ctx, SCMP_SYS(munmap));
allowSyscall(ctx, SCMP_SYS(name_to_handle_at));
allowSyscall(ctx, SCMP_SYS(nanosleep));
allowSyscall(ctx, SCMP_SYS(newfstatat));
allowSyscall(ctx, SCMP_SYS(_newselect));
allowSyscall(ctx, SCMP_SYS(nfsservctl));
allowSyscall(ctx, SCMP_SYS(nice));
allowSyscall(ctx, SCMP_SYS(oldfstat));
allowSyscall(ctx, SCMP_SYS(oldlstat));
allowSyscall(ctx, SCMP_SYS(oldolduname));
allowSyscall(ctx, SCMP_SYS(oldstat));
allowSyscall(ctx, SCMP_SYS(olduname));
allowSyscall(ctx, SCMP_SYS(open));
allowSyscall(ctx, SCMP_SYS(openat));
allowSyscall(ctx, SCMP_SYS(openat2));
allowSyscall(ctx, SCMP_SYS(open_by_handle_at));
allowSyscall(ctx, SCMP_SYS(open_tree));
allowSyscall(ctx, SCMP_SYS(pause));
allowSyscall(ctx, SCMP_SYS(pciconfig_iobase));
allowSyscall(ctx, SCMP_SYS(pciconfig_read));
allowSyscall(ctx, SCMP_SYS(pciconfig_write));
allowSyscall(ctx, SCMP_SYS(perf_event_open));
allowSyscall(ctx, SCMP_SYS(personality));
allowSyscall(ctx, SCMP_SYS(pidfd_getfd));
allowSyscall(ctx, SCMP_SYS(pidfd_open));
allowSyscall(ctx, SCMP_SYS(pidfd_send_signal));
allowSyscall(ctx, SCMP_SYS(pipe));
allowSyscall(ctx, SCMP_SYS(pipe2));
allowSyscall(ctx, SCMP_SYS(pivot_root));
allowSyscall(ctx, SCMP_SYS(pkey_alloc));
allowSyscall(ctx, SCMP_SYS(pkey_free));
allowSyscall(ctx, SCMP_SYS(pkey_mprotect));
allowSyscall(ctx, SCMP_SYS(poll));
allowSyscall(ctx, SCMP_SYS(ppoll));
allowSyscall(ctx, SCMP_SYS(ppoll_time64));
allowSyscall(ctx, SCMP_SYS(prctl));
allowSyscall(ctx, SCMP_SYS(pread64));
allowSyscall(ctx, SCMP_SYS(preadv));
allowSyscall(ctx, SCMP_SYS(preadv2));
allowSyscall(ctx, SCMP_SYS(prlimit64));
allowSyscall(ctx, SCMP_SYS(process_madvise));
allowSyscall(ctx, SCMP_SYS(process_mrelease));
allowSyscall(ctx, SCMP_SYS(process_vm_readv));
allowSyscall(ctx, SCMP_SYS(process_vm_writev));
allowSyscall(ctx, SCMP_SYS(prof));
allowSyscall(ctx, SCMP_SYS(profil));
allowSyscall(ctx, SCMP_SYS(pselect6));
allowSyscall(ctx, SCMP_SYS(pselect6_time64));
allowSyscall(ctx, SCMP_SYS(ptrace));
allowSyscall(ctx, SCMP_SYS(putpmsg));
allowSyscall(ctx, SCMP_SYS(pwrite64));
allowSyscall(ctx, SCMP_SYS(pwritev));
allowSyscall(ctx, SCMP_SYS(pwritev2));
allowSyscall(ctx, SCMP_SYS(query_module));
allowSyscall(ctx, SCMP_SYS(quotactl));
allowSyscall(ctx, SCMP_SYS(quotactl_fd));
allowSyscall(ctx, SCMP_SYS(read));
allowSyscall(ctx, SCMP_SYS(readahead));
allowSyscall(ctx, SCMP_SYS(readdir));
allowSyscall(ctx, SCMP_SYS(readlink));
allowSyscall(ctx, SCMP_SYS(readlinkat));
allowSyscall(ctx, SCMP_SYS(readv));
allowSyscall(ctx, SCMP_SYS(reboot));
allowSyscall(ctx, SCMP_SYS(recv));
allowSyscall(ctx, SCMP_SYS(recvfrom));
allowSyscall(ctx, SCMP_SYS(recvmmsg));
allowSyscall(ctx, SCMP_SYS(recvmmsg_time64));
allowSyscall(ctx, SCMP_SYS(recvmsg));
allowSyscall(ctx, SCMP_SYS(remap_file_pages));
allowSyscall(ctx, SCMP_SYS(removexattr));
allowSyscall(ctx, SCMP_SYS(rename));
allowSyscall(ctx, SCMP_SYS(renameat));
allowSyscall(ctx, SCMP_SYS(renameat2));
allowSyscall(ctx, SCMP_SYS(request_key));
allowSyscall(ctx, SCMP_SYS(restart_syscall));
allowSyscall(ctx, SCMP_SYS(riscv_flush_icache));
allowSyscall(ctx, SCMP_SYS(rmdir));
allowSyscall(ctx, SCMP_SYS(rseq));
allowSyscall(ctx, SCMP_SYS(rtas));
allowSyscall(ctx, SCMP_SYS(rt_sigaction));
allowSyscall(ctx, SCMP_SYS(rt_sigpending));
allowSyscall(ctx, SCMP_SYS(rt_sigprocmask));
allowSyscall(ctx, SCMP_SYS(rt_sigqueueinfo));
allowSyscall(ctx, SCMP_SYS(rt_sigreturn));
allowSyscall(ctx, SCMP_SYS(rt_sigsuspend));
allowSyscall(ctx, SCMP_SYS(rt_sigtimedwait));
allowSyscall(ctx, SCMP_SYS(rt_sigtimedwait_time64));
allowSyscall(ctx, SCMP_SYS(rt_tgsigqueueinfo));
allowSyscall(ctx, SCMP_SYS(s390_guarded_storage));
allowSyscall(ctx, SCMP_SYS(s390_pci_mmio_read));
allowSyscall(ctx, SCMP_SYS(s390_pci_mmio_write));
allowSyscall(ctx, SCMP_SYS(s390_runtime_instr));
allowSyscall(ctx, SCMP_SYS(s390_sthyi));
allowSyscall(ctx, SCMP_SYS(sched_getaffinity));
allowSyscall(ctx, SCMP_SYS(sched_getattr));
allowSyscall(ctx, SCMP_SYS(sched_getparam));
allowSyscall(ctx, SCMP_SYS(sched_get_priority_max));
allowSyscall(ctx, SCMP_SYS(sched_get_priority_min));
allowSyscall(ctx, SCMP_SYS(sched_getscheduler));
allowSyscall(ctx, SCMP_SYS(sched_rr_get_interval));
allowSyscall(ctx, SCMP_SYS(sched_rr_get_interval_time64));
allowSyscall(ctx, SCMP_SYS(sched_setaffinity));
allowSyscall(ctx, SCMP_SYS(sched_setattr));
allowSyscall(ctx, SCMP_SYS(sched_setparam));
allowSyscall(ctx, SCMP_SYS(sched_setscheduler));
allowSyscall(ctx, SCMP_SYS(sched_yield));
allowSyscall(ctx, SCMP_SYS(seccomp));
allowSyscall(ctx, SCMP_SYS(security));
allowSyscall(ctx, SCMP_SYS(select));
allowSyscall(ctx, SCMP_SYS(semctl));
allowSyscall(ctx, SCMP_SYS(semget));
allowSyscall(ctx, SCMP_SYS(semop));
allowSyscall(ctx, SCMP_SYS(semtimedop));
allowSyscall(ctx, SCMP_SYS(semtimedop_time64));
allowSyscall(ctx, SCMP_SYS(send));
allowSyscall(ctx, SCMP_SYS(sendfile));
allowSyscall(ctx, SCMP_SYS(sendfile64));
allowSyscall(ctx, SCMP_SYS(sendmmsg));
allowSyscall(ctx, SCMP_SYS(sendmsg));
allowSyscall(ctx, SCMP_SYS(sendto));
allowSyscall(ctx, SCMP_SYS(setdomainname));
allowSyscall(ctx, SCMP_SYS(setfsgid));
allowSyscall(ctx, SCMP_SYS(setfsgid32));
allowSyscall(ctx, SCMP_SYS(setfsuid));
allowSyscall(ctx, SCMP_SYS(setfsuid32));
allowSyscall(ctx, SCMP_SYS(setgid));
allowSyscall(ctx, SCMP_SYS(setgid32));
allowSyscall(ctx, SCMP_SYS(setgroups));
allowSyscall(ctx, SCMP_SYS(setgroups32));
allowSyscall(ctx, SCMP_SYS(sethostname));
allowSyscall(ctx, SCMP_SYS(setitimer));
allowSyscall(ctx, SCMP_SYS(set_mempolicy));
allowSyscall(ctx, SCMP_SYS(set_mempolicy_home_node));
allowSyscall(ctx, SCMP_SYS(setns));
allowSyscall(ctx, SCMP_SYS(setpgid));
allowSyscall(ctx, SCMP_SYS(setpriority));
allowSyscall(ctx, SCMP_SYS(setregid));
allowSyscall(ctx, SCMP_SYS(setregid32));
allowSyscall(ctx, SCMP_SYS(setresgid));
allowSyscall(ctx, SCMP_SYS(setresgid32));
allowSyscall(ctx, SCMP_SYS(setresuid));
allowSyscall(ctx, SCMP_SYS(setresuid32));
allowSyscall(ctx, SCMP_SYS(setreuid));
allowSyscall(ctx, SCMP_SYS(setreuid32));
allowSyscall(ctx, SCMP_SYS(setrlimit));
allowSyscall(ctx, SCMP_SYS(set_robust_list));
allowSyscall(ctx, SCMP_SYS(setsid));
allowSyscall(ctx, SCMP_SYS(setsockopt));
allowSyscall(ctx, SCMP_SYS(set_thread_area));
allowSyscall(ctx, SCMP_SYS(set_tid_address));
allowSyscall(ctx, SCMP_SYS(settimeofday));
allowSyscall(ctx, SCMP_SYS(set_tls));
allowSyscall(ctx, SCMP_SYS(setuid));
allowSyscall(ctx, SCMP_SYS(setuid32));
// skip setxattr (dangerous)
allowSyscall(ctx, SCMP_SYS(sgetmask));
allowSyscall(ctx, SCMP_SYS(shmat));
allowSyscall(ctx, SCMP_SYS(shmctl));
allowSyscall(ctx, SCMP_SYS(shmdt));
allowSyscall(ctx, SCMP_SYS(shmget));
allowSyscall(ctx, SCMP_SYS(shutdown));
allowSyscall(ctx, SCMP_SYS(sigaction));
allowSyscall(ctx, SCMP_SYS(sigaltstack));
allowSyscall(ctx, SCMP_SYS(signal));
allowSyscall(ctx, SCMP_SYS(signalfd));
allowSyscall(ctx, SCMP_SYS(signalfd4));
allowSyscall(ctx, SCMP_SYS(sigpending));
allowSyscall(ctx, SCMP_SYS(sigprocmask));
allowSyscall(ctx, SCMP_SYS(sigreturn));
allowSyscall(ctx, SCMP_SYS(sigsuspend));
allowSyscall(ctx, SCMP_SYS(socket));
allowSyscall(ctx, SCMP_SYS(socketcall));
allowSyscall(ctx, SCMP_SYS(socketpair));
allowSyscall(ctx, SCMP_SYS(splice));
allowSyscall(ctx, SCMP_SYS(spu_create));
allowSyscall(ctx, SCMP_SYS(spu_run));
allowSyscall(ctx, SCMP_SYS(ssetmask));
allowSyscall(ctx, SCMP_SYS(stat));
allowSyscall(ctx, SCMP_SYS(stat64));
allowSyscall(ctx, SCMP_SYS(statfs));
allowSyscall(ctx, SCMP_SYS(statfs64));
allowSyscall(ctx, SCMP_SYS(statx));
allowSyscall(ctx, SCMP_SYS(stime));
allowSyscall(ctx, SCMP_SYS(stty));
allowSyscall(ctx, SCMP_SYS(subpage_prot));
allowSyscall(ctx, SCMP_SYS(swapcontext));
allowSyscall(ctx, SCMP_SYS(swapoff));
allowSyscall(ctx, SCMP_SYS(swapon));
allowSyscall(ctx, SCMP_SYS(switch_endian));
allowSyscall(ctx, SCMP_SYS(symlink));
allowSyscall(ctx, SCMP_SYS(symlinkat));
allowSyscall(ctx, SCMP_SYS(sync));
allowSyscall(ctx, SCMP_SYS(sync_file_range));
allowSyscall(ctx, SCMP_SYS(sync_file_range2));
allowSyscall(ctx, SCMP_SYS(syncfs));
allowSyscall(ctx, SCMP_SYS(syscall));
allowSyscall(ctx, SCMP_SYS(_sysctl));
allowSyscall(ctx, SCMP_SYS(sys_debug_setcontext));
allowSyscall(ctx, SCMP_SYS(sysfs));
allowSyscall(ctx, SCMP_SYS(sysinfo));
allowSyscall(ctx, SCMP_SYS(syslog));
allowSyscall(ctx, SCMP_SYS(sysmips));
allowSyscall(ctx, SCMP_SYS(tee));
allowSyscall(ctx, SCMP_SYS(tgkill));
allowSyscall(ctx, SCMP_SYS(time));
allowSyscall(ctx, SCMP_SYS(timer_create));
allowSyscall(ctx, SCMP_SYS(timer_delete));
allowSyscall(ctx, SCMP_SYS(timerfd));
allowSyscall(ctx, SCMP_SYS(timerfd_create));
allowSyscall(ctx, SCMP_SYS(timerfd_gettime));
allowSyscall(ctx, SCMP_SYS(timerfd_gettime64));
allowSyscall(ctx, SCMP_SYS(timerfd_settime));
allowSyscall(ctx, SCMP_SYS(timerfd_settime64));
allowSyscall(ctx, SCMP_SYS(timer_getoverrun));
allowSyscall(ctx, SCMP_SYS(timer_gettime));
allowSyscall(ctx, SCMP_SYS(timer_gettime64));
allowSyscall(ctx, SCMP_SYS(timer_settime));
allowSyscall(ctx, SCMP_SYS(timer_settime64));
allowSyscall(ctx, SCMP_SYS(times));
allowSyscall(ctx, SCMP_SYS(tkill));
allowSyscall(ctx, SCMP_SYS(truncate));
allowSyscall(ctx, SCMP_SYS(truncate64));
allowSyscall(ctx, SCMP_SYS(tuxcall));
allowSyscall(ctx, SCMP_SYS(ugetrlimit));
allowSyscall(ctx, SCMP_SYS(ulimit));
allowSyscall(ctx, SCMP_SYS(umask));
allowSyscall(ctx, SCMP_SYS(umount));
allowSyscall(ctx, SCMP_SYS(umount2));
allowSyscall(ctx, SCMP_SYS(uname));
allowSyscall(ctx, SCMP_SYS(unlink));
allowSyscall(ctx, SCMP_SYS(unlinkat));
allowSyscall(ctx, SCMP_SYS(unshare));
allowSyscall(ctx, SCMP_SYS(uselib));
allowSyscall(ctx, SCMP_SYS(userfaultfd));
allowSyscall(ctx, SCMP_SYS(usr26));
allowSyscall(ctx, SCMP_SYS(usr32));
allowSyscall(ctx, SCMP_SYS(ustat));
allowSyscall(ctx, SCMP_SYS(utime));
allowSyscall(ctx, SCMP_SYS(utimensat));
allowSyscall(ctx, SCMP_SYS(utimensat_time64));
allowSyscall(ctx, SCMP_SYS(utimes));
allowSyscall(ctx, SCMP_SYS(vfork));
allowSyscall(ctx, SCMP_SYS(vhangup));
allowSyscall(ctx, SCMP_SYS(vm86));
allowSyscall(ctx, SCMP_SYS(vm86old));
allowSyscall(ctx, SCMP_SYS(vmsplice));
allowSyscall(ctx, SCMP_SYS(vserver));
allowSyscall(ctx, SCMP_SYS(wait4));
allowSyscall(ctx, SCMP_SYS(waitid));
allowSyscall(ctx, SCMP_SYS(waitpid));
allowSyscall(ctx, SCMP_SYS(write));
allowSyscall(ctx, SCMP_SYS(writev));
// END extract-syscalls
if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(fchmod), 1,
SCMP_A1(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0)
throw SysError("unable to add seccomp rule");
// chmod family: prevent adding setuid/setgid bits to existing files.
// The Nix store does not support setuid/setgid, and even their temporary creation can weaken the security of the sandbox.
ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(chmod), 1);
ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(fchmod), 1);
ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(fchmodat), 2);
ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(fchmodat2), 2);
if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(fchmodat), 1,
SCMP_A2(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0)
throw SysError("unable to add seccomp rule");
if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), NIX_SYSCALL_FCHMODAT2, 1,
SCMP_A2(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0)
throw SysError("unable to add seccomp rule");
}
/* Prevent builders from creating EAs or ACLs. Not all filesystems
support these, and they're not allowed in the Nix store because
they're not representable in the NAR serialisation. */
// setxattr family: prevent creation of extended attributes or ACLs.
// Not all filesystems support them, and they're incompatible with the NAR format.
if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOTSUP), SCMP_SYS(setxattr), 0) != 0 ||
seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOTSUP), SCMP_SYS(lsetxattr), 0) != 0 ||
seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOTSUP), SCMP_SYS(fsetxattr), 0) != 0)
@ -1460,11 +1961,7 @@ void LocalDerivationGoal::runChild()
commonChildInit();
try {
setupSeccomp();
} catch (...) {
if (buildUser) throw;
}
setupSeccomp();
bool setUser = true;

View file

@ -1,35 +0,0 @@
/*
* Determine the syscall number for `fchmodat2`.
*
* On most platforms this is 452. Exceptions can be found on
* a glibc git checkout via `rg --pcre2 'define __NR_fchmodat2 (?!452)'`.
*
* The problem is that glibc 2.39 and libseccomp 2.5.5 are needed to
* get the syscall number. However, a Lix built against nixpkgs 23.11
* (glibc 2.38) should still have the issue fixed without depending
* on the build environment.
*
* To achieve that, the macros below try to determine the platform and
* set the syscall number which is platform-specific, but
* in most cases 452.
*
* TODO: remove this when 23.11 is EOL and the entire (supported) ecosystem
* is on glibc 2.39.
*/
#pragma once
///@file
#if defined(__alpha__)
# define NIX_SYSCALL_FCHMODAT2 562
#elif defined(__x86_64__) && SIZE_MAX == 0xFFFFFFFF // x32
# define NIX_SYSCALL_FCHMODAT2 1073742276
#elif defined(__mips__) && defined(__mips64) && defined(_ABIN64) // mips64/n64
# define NIX_SYSCALL_FCHMODAT2 5452
#elif defined(__mips__) && defined(__mips64) && defined(_ABIN32) // mips64/n32
# define NIX_SYSCALL_FCHMODAT2 6452
#elif defined(__mips__) && defined(_ABIO32) // mips32
# define NIX_SYSCALL_FCHMODAT2 4452
#else
# define NIX_SYSCALL_FCHMODAT2 452
#endif

View file

@ -155,4 +155,6 @@ in
broken-userns = runNixOSTestFor "x86_64-linux" ./broken-userns.nix;
coredumps = runNixOSTestFor "x86_64-linux" ./coredumps;
io_uring = runNixOSTestFor "x86_64-linux" ./io_uring;
}

View file

@ -0,0 +1,7 @@
let
inherit (import ../util.nix) mkNixBuildTest;
in
mkNixBuildTest {
name = "io_uring";
expressionFile = ./package.nix;
}

View file

@ -0,0 +1,19 @@
{ runCommandCC }:
runCommandCC "io_uring-is-blocked" { } ''
cat > test.c <<EOF
#include <errno.h>
#include <sys/syscall.h>
#include <unistd.h>
int main() {
int res = syscall(SYS_io_uring_setup, 0, NULL);
return res == -1 && errno == ENOSYS ? 0 : 1;
}
EOF
"$CC" -o test test.c
if ! ./test; then
echo "Oh no! io_uring is available!"
exit 1
fi
touch "$out"
''

View file

@ -12,10 +12,7 @@ int main(void) {
fprintf(fd, "henlo :3");
fclose(fd);
// FIXME use something nicer here that's less
// platform-dependent as soon as we go to 24.05
// and the glibc is new enough to support fchmodat2
long rs = syscall(452, NULL, name, S_ISUID, 0);
long rs = syscall(SYS_fchmodat2, NULL, name, S_ISUID, 0);
assert(rs == -1);
assert(errno == EPERM);
}