From 233408f677f8cd310d9760ff6c1bd9c14282cc57 Mon Sep 17 00:00:00 2001 From: Alois Wohlschlager Date: Mon, 27 May 2024 17:05:44 +0200 Subject: [PATCH 1/3] libstore/build: always treat seccomp setup failures as fatal In f047e4357b4f7ad66c2e476506bf35cab82e441e, I missed the behavior that if building without a dedicated build user (i.e. in single-user setups), seccomp setup failures are silently ignored. This was introduced without explanation 7 years ago (ff6becafa8efc2f7e6f2b9b889ba4adf20b8d524). Hopefully the only use-case nowadays is causing spurious test suite successes when messing up the seccomp filter during development. Let's try removing it. Change-Id: Ibe51416d9c7a6dd635c2282990224861adf1ceab --- src/libstore/build/local-derivation-goal.cc | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/src/libstore/build/local-derivation-goal.cc b/src/libstore/build/local-derivation-goal.cc index 7d1d339e8..8e1f4e849 100644 --- a/src/libstore/build/local-derivation-goal.cc +++ b/src/libstore/build/local-derivation-goal.cc @@ -1460,11 +1460,7 @@ void LocalDerivationGoal::runChild() commonChildInit(); - try { - setupSeccomp(); - } catch (...) { - if (buildUser) throw; - } + setupSeccomp(); bool setUser = true; From 127ee1a101e3f5ebab39ad98cbe58fefcd52eca5 Mon Sep 17 00:00:00 2001 From: Alois Wohlschlager Date: Sun, 26 May 2024 10:35:03 +0200 Subject: [PATCH 2/3] libstore/build: use an allowlist approach to syscall filtering Previously, system call filtering (to prevent builders from storing files with setuid/setgid permission bits or extended attributes) was performed using a blocklist. While this looks simple at first, it actually carries significant security and maintainability risks: after all, the kernel may add new syscalls to achieve the same functionality one is trying to block, and it can even be hard to actually add the syscall to the blocklist when building against a C library that doesn't know about it yet. For a recent demonstration of this happening in practice to Nix, see the introduction of fchmodat2 [0] [1]. The allowlist approach does not share the same drawback. While it does require a rather large list of harmless syscalls to be maintained in the codebase, failing to update this list (and roll out the update to all users) in time has rather benign effects; at worst, very recent programs that already rely on new syscalls will fail with an error the same way they would on a slightly older kernel that doesn't support them yet. Most importantly, no unintended new ways of performing dangerous operations will be silently allowed. Another possible drawback is reduced system call performance due to the larger filter created by the allowlist requiring more computation [2]. However, this issue has not convincingly been demonstrated yet in practice, for example in systemd or various browsers. To the contrary, it has been measured that the the actual filter constructed here has approximately the same overhead as a very simple filter blocking only one system call. This commit tries to keep the behavior as close to unchanged as possible. The system call list is in line with libseccomp 2.5.5 and glibc 2.39, which are the latest versions at the point of writing. Since libseccomp 2.5.5 is already a requirement and the distributions shipping this together with older versions of glibc are mostly not a thing any more, this should not lead to more build failures any more. [0] https://github.com/NixOS/nixpkgs/issues/300635 [1] https://github.com/NixOS/nix/issues/10424 [2] https://github.com/flatpak/flatpak/pull/4462#issuecomment-1061690607 Change-Id: I541be3ea9b249bcceddfed6a5a13ac10b11e16ad --- flake.nix | 1 + maintainers/check-syscalls.nix | 16 + maintainers/check-syscalls.sh | 7 + package.nix | 2 + src/libstore/build/local-derivation-goal.cc | 545 +++++++++++++++++++- src/libstore/linux/fchmodat2-compat.hh | 35 -- tests/nixos/setuid/fchmodat2-suid.c | 5 +- 7 files changed, 550 insertions(+), 61 deletions(-) create mode 100644 maintainers/check-syscalls.nix create mode 100755 maintainers/check-syscalls.sh delete mode 100644 src/libstore/linux/fchmodat2-compat.hh diff --git a/flake.nix b/flake.nix index 0609a8681..1fc02a590 100644 --- a/flake.nix +++ b/flake.nix @@ -164,6 +164,7 @@ nixUnstable = prev.nixUnstable; check-headers = final.buildPackages.callPackage ./maintainers/check-headers.nix { }; + check-syscalls = final.buildPackages.callPackage ./maintainers/check-syscalls.nix { }; default-busybox-sandbox-shell = final.busybox.override { useMusl = true; diff --git a/maintainers/check-syscalls.nix b/maintainers/check-syscalls.nix new file mode 100644 index 000000000..1a3de5c6d --- /dev/null +++ b/maintainers/check-syscalls.nix @@ -0,0 +1,16 @@ +{ + runCommandNoCC, + lib, + libseccomp, + writeShellScriptBin, +}: +let + syscalls-csv = runCommandNoCC "syscalls.csv" { } '' + echo ${lib.escapeShellArg libseccomp.src} + tar -xf ${lib.escapeShellArg libseccomp.src} --strip-components=2 ${libseccomp.name}/src/syscalls.csv + mv syscalls.csv "$out" + ''; +in +writeShellScriptBin "check-syscalls" '' + ${./check-syscalls.sh} ${syscalls-csv} +'' diff --git a/maintainers/check-syscalls.sh b/maintainers/check-syscalls.sh new file mode 100755 index 000000000..cd72ac23b --- /dev/null +++ b/maintainers/check-syscalls.sh @@ -0,0 +1,7 @@ +#!/usr/bin/env bash + +set -e + +diff -u <(awk < src/libstore/build/local-derivation-goal.cc '/BEGIN extract-syscalls/ { extracting = 1; next } +match($0, /allowSyscall\(ctx, SCMP_SYS\(([^)]*)\)\);|\/\/ skip ([^ ]*)/, result) { print result[1] result[2] } +/END extract-syscalls/ { extracting = 0; next }') <(tail -n+2 "$1" | cut -d, -f 1) diff --git a/package.nix b/package.nix index 16477a6a2..c6b2e4451 100644 --- a/package.nix +++ b/package.nix @@ -403,6 +403,7 @@ stdenv.mkDerivation (finalAttrs: { # Lix specific packages pre-commit-checks, contribNotice, + check-syscalls, }: let glibcFix = lib.optionalAttrs (buildPlatform.isLinux && glibcLocales != null) { @@ -453,6 +454,7 @@ stdenv.mkDerivation (finalAttrs: { # `bash` from inside `nix develop`, say, because you are using it # via direnv, you will by default get bash (unusable edition). bashInteractive + check-syscalls pythonEnv # docker image tool skopeo diff --git a/src/libstore/build/local-derivation-goal.cc b/src/libstore/build/local-derivation-goal.cc index 8e1f4e849..33cbfb29d 100644 --- a/src/libstore/build/local-derivation-goal.cc +++ b/src/libstore/build/local-derivation-goal.cc @@ -45,7 +45,6 @@ #include #include #if HAVE_SECCOMP -#include "linux/fchmodat2-compat.hh" #include #endif #define pivot_root(new_root, put_old) (syscall(SYS_pivot_root, new_root, put_old)) @@ -1363,6 +1362,20 @@ void LocalDerivationGoal::chownToBuilder(const Path & path) throw SysError("cannot change ownership of '%1%'", path); } +#if HAVE_SECCOMP + +static void allowSyscall(scmp_filter_ctx ctx, int syscall) { + if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 0) != 0) + throw SysError("unable to add seccomp rule"); +} + +#define ALLOW_CHMOD_IF_SAFE(ctx, syscall, modePos) \ + if (seccomp_rule_add(ctx, SCMP_ACT_ALLOW, syscall, 1, SCMP_A##modePos(SCMP_CMP_MASKED_EQ, S_ISUID | S_ISGID, 0)) != 0 || \ + seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), syscall, 1, SCMP_A##modePos(SCMP_CMP_MASKED_EQ, S_ISUID, S_ISUID)) != 0 || \ + seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), syscall, 1, SCMP_A##modePos(SCMP_CMP_MASKED_EQ, S_ISGID, S_ISGID)) != 0) \ + throw SysError("unable to add seccomp rule"); + +#endif void setupSeccomp() { @@ -1370,7 +1383,9 @@ void setupSeccomp() #if HAVE_SECCOMP scmp_filter_ctx ctx; - if (!(ctx = seccomp_init(SCMP_ACT_ALLOW))) + // Pretend that syscalls we don't yet know about don't exist. + // This is the best option for compatibility: after all, they did in fact not exist not too long ago. + if (!(ctx = seccomp_init(SCMP_ACT_ERRNO(ENOSYS)))) throw SysError("unable to initialize seccomp mode 2"); Finally cleanup([&]() { @@ -1405,28 +1420,514 @@ void setupSeccomp() seccomp_arch_add(ctx, SCMP_ARCH_MIPSEL64N32) != 0) printError("unable to add mips64el-*abin32 seccomp architecture"); - /* Prevent builders from creating setuid/setgid binaries. */ - for (int perm : { S_ISUID, S_ISGID }) { - if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(chmod), 1, - SCMP_A1(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0) - throw SysError("unable to add seccomp rule"); + // This list is intended for machine consumption. + // Please keep its format, order and BEGIN/END markers. + // + // Currently, it is up to date with libseccomp 2.5.5 and glibc 2.39. + // Run check-syscalls to determine which new syscalls should be added. + // New syscalls must be audited and handled in a way that blocks the following dangerous operations: + // * Creation of non-empty setuid/setgid files + // * Creation of extended attributes (including ACLs) + // + // BEGIN extract-syscalls + allowSyscall(ctx, SCMP_SYS(accept)); + allowSyscall(ctx, SCMP_SYS(accept4)); + allowSyscall(ctx, SCMP_SYS(access)); + allowSyscall(ctx, SCMP_SYS(acct)); + allowSyscall(ctx, SCMP_SYS(add_key)); + allowSyscall(ctx, SCMP_SYS(adjtimex)); + allowSyscall(ctx, SCMP_SYS(afs_syscall)); + allowSyscall(ctx, SCMP_SYS(alarm)); + allowSyscall(ctx, SCMP_SYS(arch_prctl)); + allowSyscall(ctx, SCMP_SYS(arm_fadvise64_64)); + allowSyscall(ctx, SCMP_SYS(arm_sync_file_range)); + allowSyscall(ctx, SCMP_SYS(bdflush)); + allowSyscall(ctx, SCMP_SYS(bind)); + allowSyscall(ctx, SCMP_SYS(bpf)); + allowSyscall(ctx, SCMP_SYS(break)); + allowSyscall(ctx, SCMP_SYS(breakpoint)); + allowSyscall(ctx, SCMP_SYS(brk)); + allowSyscall(ctx, SCMP_SYS(cachectl)); + allowSyscall(ctx, SCMP_SYS(cacheflush)); + allowSyscall(ctx, SCMP_SYS(cachestat)); + allowSyscall(ctx, SCMP_SYS(capget)); + allowSyscall(ctx, SCMP_SYS(capset)); + allowSyscall(ctx, SCMP_SYS(chdir)); + // skip chmod (dangerous) + allowSyscall(ctx, SCMP_SYS(chown)); + allowSyscall(ctx, SCMP_SYS(chown32)); + allowSyscall(ctx, SCMP_SYS(chroot)); + allowSyscall(ctx, SCMP_SYS(clock_adjtime)); + allowSyscall(ctx, SCMP_SYS(clock_adjtime64)); + allowSyscall(ctx, SCMP_SYS(clock_getres)); + allowSyscall(ctx, SCMP_SYS(clock_getres_time64)); + allowSyscall(ctx, SCMP_SYS(clock_gettime)); + allowSyscall(ctx, SCMP_SYS(clock_gettime64)); + allowSyscall(ctx, SCMP_SYS(clock_nanosleep)); + allowSyscall(ctx, SCMP_SYS(clock_nanosleep_time64)); + allowSyscall(ctx, SCMP_SYS(clock_settime)); + allowSyscall(ctx, SCMP_SYS(clock_settime64)); + allowSyscall(ctx, SCMP_SYS(clone)); + allowSyscall(ctx, SCMP_SYS(clone3)); + allowSyscall(ctx, SCMP_SYS(close)); + allowSyscall(ctx, SCMP_SYS(close_range)); + allowSyscall(ctx, SCMP_SYS(connect)); + allowSyscall(ctx, SCMP_SYS(copy_file_range)); + allowSyscall(ctx, SCMP_SYS(creat)); + allowSyscall(ctx, SCMP_SYS(create_module)); + allowSyscall(ctx, SCMP_SYS(delete_module)); + allowSyscall(ctx, SCMP_SYS(dup)); + allowSyscall(ctx, SCMP_SYS(dup2)); + allowSyscall(ctx, SCMP_SYS(dup3)); + allowSyscall(ctx, SCMP_SYS(epoll_create)); + allowSyscall(ctx, SCMP_SYS(epoll_create1)); + allowSyscall(ctx, SCMP_SYS(epoll_ctl)); + allowSyscall(ctx, SCMP_SYS(epoll_ctl_old)); + allowSyscall(ctx, SCMP_SYS(epoll_pwait)); + allowSyscall(ctx, SCMP_SYS(epoll_pwait2)); + allowSyscall(ctx, SCMP_SYS(epoll_wait)); + allowSyscall(ctx, SCMP_SYS(epoll_wait_old)); + allowSyscall(ctx, SCMP_SYS(eventfd)); + allowSyscall(ctx, SCMP_SYS(eventfd2)); + allowSyscall(ctx, SCMP_SYS(execve)); + allowSyscall(ctx, SCMP_SYS(execveat)); + allowSyscall(ctx, SCMP_SYS(exit)); + allowSyscall(ctx, SCMP_SYS(exit_group)); + allowSyscall(ctx, SCMP_SYS(faccessat)); + allowSyscall(ctx, SCMP_SYS(faccessat2)); + allowSyscall(ctx, SCMP_SYS(fadvise64)); + allowSyscall(ctx, SCMP_SYS(fadvise64_64)); + allowSyscall(ctx, SCMP_SYS(fallocate)); + allowSyscall(ctx, SCMP_SYS(fanotify_init)); + allowSyscall(ctx, SCMP_SYS(fanotify_mark)); + allowSyscall(ctx, SCMP_SYS(fchdir)); + // skip fchmod (dangerous) + // skip fchmodat (dangerous) + // skip fchmodat2 (dangerous) + allowSyscall(ctx, SCMP_SYS(fchown)); + allowSyscall(ctx, SCMP_SYS(fchown32)); + allowSyscall(ctx, SCMP_SYS(fchownat)); + allowSyscall(ctx, SCMP_SYS(fcntl)); + allowSyscall(ctx, SCMP_SYS(fcntl64)); + allowSyscall(ctx, SCMP_SYS(fdatasync)); + allowSyscall(ctx, SCMP_SYS(fgetxattr)); + allowSyscall(ctx, SCMP_SYS(finit_module)); + allowSyscall(ctx, SCMP_SYS(flistxattr)); + allowSyscall(ctx, SCMP_SYS(flock)); + allowSyscall(ctx, SCMP_SYS(fork)); + allowSyscall(ctx, SCMP_SYS(fremovexattr)); + allowSyscall(ctx, SCMP_SYS(fsconfig)); + // skip fsetxattr (dangerous) + allowSyscall(ctx, SCMP_SYS(fsmount)); + allowSyscall(ctx, SCMP_SYS(fsopen)); + allowSyscall(ctx, SCMP_SYS(fspick)); + allowSyscall(ctx, SCMP_SYS(fstat)); + allowSyscall(ctx, SCMP_SYS(fstat64)); + allowSyscall(ctx, SCMP_SYS(fstatat64)); + allowSyscall(ctx, SCMP_SYS(fstatfs)); + allowSyscall(ctx, SCMP_SYS(fstatfs64)); + allowSyscall(ctx, SCMP_SYS(fsync)); + allowSyscall(ctx, SCMP_SYS(ftime)); + allowSyscall(ctx, SCMP_SYS(ftruncate)); + allowSyscall(ctx, SCMP_SYS(ftruncate64)); + allowSyscall(ctx, SCMP_SYS(futex)); + allowSyscall(ctx, SCMP_SYS(futex_requeue)); + allowSyscall(ctx, SCMP_SYS(futex_time64)); + allowSyscall(ctx, SCMP_SYS(futex_wait)); + allowSyscall(ctx, SCMP_SYS(futex_waitv)); + allowSyscall(ctx, SCMP_SYS(futex_wake)); + allowSyscall(ctx, SCMP_SYS(futimesat)); + allowSyscall(ctx, SCMP_SYS(getcpu)); + allowSyscall(ctx, SCMP_SYS(getcwd)); + allowSyscall(ctx, SCMP_SYS(getdents)); + allowSyscall(ctx, SCMP_SYS(getdents64)); + allowSyscall(ctx, SCMP_SYS(getegid)); + allowSyscall(ctx, SCMP_SYS(getegid32)); + allowSyscall(ctx, SCMP_SYS(geteuid)); + allowSyscall(ctx, SCMP_SYS(geteuid32)); + allowSyscall(ctx, SCMP_SYS(getgid)); + allowSyscall(ctx, SCMP_SYS(getgid32)); + allowSyscall(ctx, SCMP_SYS(getgroups)); + allowSyscall(ctx, SCMP_SYS(getgroups32)); + allowSyscall(ctx, SCMP_SYS(getitimer)); + allowSyscall(ctx, SCMP_SYS(get_kernel_syms)); + allowSyscall(ctx, SCMP_SYS(get_mempolicy)); + allowSyscall(ctx, SCMP_SYS(getpeername)); + allowSyscall(ctx, SCMP_SYS(getpgid)); + allowSyscall(ctx, SCMP_SYS(getpgrp)); + allowSyscall(ctx, SCMP_SYS(getpid)); + allowSyscall(ctx, SCMP_SYS(getpmsg)); + allowSyscall(ctx, SCMP_SYS(getppid)); + allowSyscall(ctx, SCMP_SYS(getpriority)); + allowSyscall(ctx, SCMP_SYS(getrandom)); + allowSyscall(ctx, SCMP_SYS(getresgid)); + allowSyscall(ctx, SCMP_SYS(getresgid32)); + allowSyscall(ctx, SCMP_SYS(getresuid)); + allowSyscall(ctx, SCMP_SYS(getresuid32)); + allowSyscall(ctx, SCMP_SYS(getrlimit)); + allowSyscall(ctx, SCMP_SYS(get_robust_list)); + allowSyscall(ctx, SCMP_SYS(getrusage)); + allowSyscall(ctx, SCMP_SYS(getsid)); + allowSyscall(ctx, SCMP_SYS(getsockname)); + allowSyscall(ctx, SCMP_SYS(getsockopt)); + allowSyscall(ctx, SCMP_SYS(get_thread_area)); + allowSyscall(ctx, SCMP_SYS(gettid)); + allowSyscall(ctx, SCMP_SYS(gettimeofday)); + allowSyscall(ctx, SCMP_SYS(get_tls)); + allowSyscall(ctx, SCMP_SYS(getuid)); + allowSyscall(ctx, SCMP_SYS(getuid32)); + allowSyscall(ctx, SCMP_SYS(getxattr)); + allowSyscall(ctx, SCMP_SYS(gtty)); + allowSyscall(ctx, SCMP_SYS(idle)); + allowSyscall(ctx, SCMP_SYS(init_module)); + allowSyscall(ctx, SCMP_SYS(inotify_add_watch)); + allowSyscall(ctx, SCMP_SYS(inotify_init)); + allowSyscall(ctx, SCMP_SYS(inotify_init1)); + allowSyscall(ctx, SCMP_SYS(inotify_rm_watch)); + allowSyscall(ctx, SCMP_SYS(io_cancel)); + allowSyscall(ctx, SCMP_SYS(ioctl)); + allowSyscall(ctx, SCMP_SYS(io_destroy)); + allowSyscall(ctx, SCMP_SYS(io_getevents)); + allowSyscall(ctx, SCMP_SYS(ioperm)); + allowSyscall(ctx, SCMP_SYS(io_pgetevents)); + allowSyscall(ctx, SCMP_SYS(io_pgetevents_time64)); + allowSyscall(ctx, SCMP_SYS(iopl)); + allowSyscall(ctx, SCMP_SYS(ioprio_get)); + allowSyscall(ctx, SCMP_SYS(ioprio_set)); + allowSyscall(ctx, SCMP_SYS(io_setup)); + allowSyscall(ctx, SCMP_SYS(io_submit)); + allowSyscall(ctx, SCMP_SYS(io_uring_enter)); + allowSyscall(ctx, SCMP_SYS(io_uring_register)); + allowSyscall(ctx, SCMP_SYS(io_uring_setup)); + allowSyscall(ctx, SCMP_SYS(ipc)); + allowSyscall(ctx, SCMP_SYS(kcmp)); + allowSyscall(ctx, SCMP_SYS(kexec_file_load)); + allowSyscall(ctx, SCMP_SYS(kexec_load)); + allowSyscall(ctx, SCMP_SYS(keyctl)); + allowSyscall(ctx, SCMP_SYS(kill)); + allowSyscall(ctx, SCMP_SYS(landlock_add_rule)); + allowSyscall(ctx, SCMP_SYS(landlock_create_ruleset)); + allowSyscall(ctx, SCMP_SYS(landlock_restrict_self)); + allowSyscall(ctx, SCMP_SYS(lchown)); + allowSyscall(ctx, SCMP_SYS(lchown32)); + allowSyscall(ctx, SCMP_SYS(lgetxattr)); + allowSyscall(ctx, SCMP_SYS(link)); + allowSyscall(ctx, SCMP_SYS(linkat)); + allowSyscall(ctx, SCMP_SYS(listen)); + allowSyscall(ctx, SCMP_SYS(listxattr)); + allowSyscall(ctx, SCMP_SYS(llistxattr)); + allowSyscall(ctx, SCMP_SYS(_llseek)); + allowSyscall(ctx, SCMP_SYS(lock)); + allowSyscall(ctx, SCMP_SYS(lookup_dcookie)); + allowSyscall(ctx, SCMP_SYS(lremovexattr)); + allowSyscall(ctx, SCMP_SYS(lseek)); + // skip lsetxattr (dangerous) + allowSyscall(ctx, SCMP_SYS(lstat)); + allowSyscall(ctx, SCMP_SYS(lstat64)); + allowSyscall(ctx, SCMP_SYS(madvise)); + allowSyscall(ctx, SCMP_SYS(map_shadow_stack)); + allowSyscall(ctx, SCMP_SYS(mbind)); + allowSyscall(ctx, SCMP_SYS(membarrier)); + allowSyscall(ctx, SCMP_SYS(memfd_create)); + allowSyscall(ctx, SCMP_SYS(memfd_secret)); + allowSyscall(ctx, SCMP_SYS(migrate_pages)); + allowSyscall(ctx, SCMP_SYS(mincore)); + allowSyscall(ctx, SCMP_SYS(mkdir)); + allowSyscall(ctx, SCMP_SYS(mkdirat)); + allowSyscall(ctx, SCMP_SYS(mknod)); + allowSyscall(ctx, SCMP_SYS(mknodat)); + allowSyscall(ctx, SCMP_SYS(mlock)); + allowSyscall(ctx, SCMP_SYS(mlock2)); + allowSyscall(ctx, SCMP_SYS(mlockall)); + allowSyscall(ctx, SCMP_SYS(mmap)); + allowSyscall(ctx, SCMP_SYS(mmap2)); + allowSyscall(ctx, SCMP_SYS(modify_ldt)); + allowSyscall(ctx, SCMP_SYS(mount)); + allowSyscall(ctx, SCMP_SYS(mount_setattr)); + allowSyscall(ctx, SCMP_SYS(move_mount)); + allowSyscall(ctx, SCMP_SYS(move_pages)); + allowSyscall(ctx, SCMP_SYS(mprotect)); + allowSyscall(ctx, SCMP_SYS(mpx)); + allowSyscall(ctx, SCMP_SYS(mq_getsetattr)); + allowSyscall(ctx, SCMP_SYS(mq_notify)); + allowSyscall(ctx, SCMP_SYS(mq_open)); + allowSyscall(ctx, SCMP_SYS(mq_timedreceive)); + allowSyscall(ctx, SCMP_SYS(mq_timedreceive_time64)); + allowSyscall(ctx, SCMP_SYS(mq_timedsend)); + allowSyscall(ctx, SCMP_SYS(mq_timedsend_time64)); + allowSyscall(ctx, SCMP_SYS(mq_unlink)); + allowSyscall(ctx, SCMP_SYS(mremap)); + allowSyscall(ctx, SCMP_SYS(msgctl)); + allowSyscall(ctx, SCMP_SYS(msgget)); + allowSyscall(ctx, SCMP_SYS(msgrcv)); + allowSyscall(ctx, SCMP_SYS(msgsnd)); + allowSyscall(ctx, SCMP_SYS(msync)); + allowSyscall(ctx, SCMP_SYS(multiplexer)); + allowSyscall(ctx, SCMP_SYS(munlock)); + allowSyscall(ctx, SCMP_SYS(munlockall)); + allowSyscall(ctx, SCMP_SYS(munmap)); + allowSyscall(ctx, SCMP_SYS(name_to_handle_at)); + allowSyscall(ctx, SCMP_SYS(nanosleep)); + allowSyscall(ctx, SCMP_SYS(newfstatat)); + allowSyscall(ctx, SCMP_SYS(_newselect)); + allowSyscall(ctx, SCMP_SYS(nfsservctl)); + allowSyscall(ctx, SCMP_SYS(nice)); + allowSyscall(ctx, SCMP_SYS(oldfstat)); + allowSyscall(ctx, SCMP_SYS(oldlstat)); + allowSyscall(ctx, SCMP_SYS(oldolduname)); + allowSyscall(ctx, SCMP_SYS(oldstat)); + allowSyscall(ctx, SCMP_SYS(olduname)); + allowSyscall(ctx, SCMP_SYS(open)); + allowSyscall(ctx, SCMP_SYS(openat)); + allowSyscall(ctx, SCMP_SYS(openat2)); + allowSyscall(ctx, SCMP_SYS(open_by_handle_at)); + allowSyscall(ctx, SCMP_SYS(open_tree)); + allowSyscall(ctx, SCMP_SYS(pause)); + allowSyscall(ctx, SCMP_SYS(pciconfig_iobase)); + allowSyscall(ctx, SCMP_SYS(pciconfig_read)); + allowSyscall(ctx, SCMP_SYS(pciconfig_write)); + allowSyscall(ctx, SCMP_SYS(perf_event_open)); + allowSyscall(ctx, SCMP_SYS(personality)); + allowSyscall(ctx, SCMP_SYS(pidfd_getfd)); + allowSyscall(ctx, SCMP_SYS(pidfd_open)); + allowSyscall(ctx, SCMP_SYS(pidfd_send_signal)); + allowSyscall(ctx, SCMP_SYS(pipe)); + allowSyscall(ctx, SCMP_SYS(pipe2)); + allowSyscall(ctx, SCMP_SYS(pivot_root)); + allowSyscall(ctx, SCMP_SYS(pkey_alloc)); + allowSyscall(ctx, SCMP_SYS(pkey_free)); + allowSyscall(ctx, SCMP_SYS(pkey_mprotect)); + allowSyscall(ctx, SCMP_SYS(poll)); + allowSyscall(ctx, SCMP_SYS(ppoll)); + allowSyscall(ctx, SCMP_SYS(ppoll_time64)); + allowSyscall(ctx, SCMP_SYS(prctl)); + allowSyscall(ctx, SCMP_SYS(pread64)); + allowSyscall(ctx, SCMP_SYS(preadv)); + allowSyscall(ctx, SCMP_SYS(preadv2)); + allowSyscall(ctx, SCMP_SYS(prlimit64)); + allowSyscall(ctx, SCMP_SYS(process_madvise)); + allowSyscall(ctx, SCMP_SYS(process_mrelease)); + allowSyscall(ctx, SCMP_SYS(process_vm_readv)); + allowSyscall(ctx, SCMP_SYS(process_vm_writev)); + allowSyscall(ctx, SCMP_SYS(prof)); + allowSyscall(ctx, SCMP_SYS(profil)); + allowSyscall(ctx, SCMP_SYS(pselect6)); + allowSyscall(ctx, SCMP_SYS(pselect6_time64)); + allowSyscall(ctx, SCMP_SYS(ptrace)); + allowSyscall(ctx, SCMP_SYS(putpmsg)); + allowSyscall(ctx, SCMP_SYS(pwrite64)); + allowSyscall(ctx, SCMP_SYS(pwritev)); + allowSyscall(ctx, SCMP_SYS(pwritev2)); + allowSyscall(ctx, SCMP_SYS(query_module)); + allowSyscall(ctx, SCMP_SYS(quotactl)); + allowSyscall(ctx, SCMP_SYS(quotactl_fd)); + allowSyscall(ctx, SCMP_SYS(read)); + allowSyscall(ctx, SCMP_SYS(readahead)); + allowSyscall(ctx, SCMP_SYS(readdir)); + allowSyscall(ctx, SCMP_SYS(readlink)); + allowSyscall(ctx, SCMP_SYS(readlinkat)); + allowSyscall(ctx, SCMP_SYS(readv)); + allowSyscall(ctx, SCMP_SYS(reboot)); + allowSyscall(ctx, SCMP_SYS(recv)); + allowSyscall(ctx, SCMP_SYS(recvfrom)); + allowSyscall(ctx, SCMP_SYS(recvmmsg)); + allowSyscall(ctx, SCMP_SYS(recvmmsg_time64)); + allowSyscall(ctx, SCMP_SYS(recvmsg)); + allowSyscall(ctx, SCMP_SYS(remap_file_pages)); + allowSyscall(ctx, SCMP_SYS(removexattr)); + allowSyscall(ctx, SCMP_SYS(rename)); + allowSyscall(ctx, SCMP_SYS(renameat)); + allowSyscall(ctx, SCMP_SYS(renameat2)); + allowSyscall(ctx, SCMP_SYS(request_key)); + allowSyscall(ctx, SCMP_SYS(restart_syscall)); + allowSyscall(ctx, SCMP_SYS(riscv_flush_icache)); + allowSyscall(ctx, SCMP_SYS(rmdir)); + allowSyscall(ctx, SCMP_SYS(rseq)); + allowSyscall(ctx, SCMP_SYS(rtas)); + allowSyscall(ctx, SCMP_SYS(rt_sigaction)); + allowSyscall(ctx, SCMP_SYS(rt_sigpending)); + allowSyscall(ctx, SCMP_SYS(rt_sigprocmask)); + allowSyscall(ctx, SCMP_SYS(rt_sigqueueinfo)); + allowSyscall(ctx, SCMP_SYS(rt_sigreturn)); + allowSyscall(ctx, SCMP_SYS(rt_sigsuspend)); + allowSyscall(ctx, SCMP_SYS(rt_sigtimedwait)); + allowSyscall(ctx, SCMP_SYS(rt_sigtimedwait_time64)); + allowSyscall(ctx, SCMP_SYS(rt_tgsigqueueinfo)); + allowSyscall(ctx, SCMP_SYS(s390_guarded_storage)); + allowSyscall(ctx, SCMP_SYS(s390_pci_mmio_read)); + allowSyscall(ctx, SCMP_SYS(s390_pci_mmio_write)); + allowSyscall(ctx, SCMP_SYS(s390_runtime_instr)); + allowSyscall(ctx, SCMP_SYS(s390_sthyi)); + allowSyscall(ctx, SCMP_SYS(sched_getaffinity)); + allowSyscall(ctx, SCMP_SYS(sched_getattr)); + allowSyscall(ctx, SCMP_SYS(sched_getparam)); + allowSyscall(ctx, SCMP_SYS(sched_get_priority_max)); + allowSyscall(ctx, SCMP_SYS(sched_get_priority_min)); + allowSyscall(ctx, SCMP_SYS(sched_getscheduler)); + allowSyscall(ctx, SCMP_SYS(sched_rr_get_interval)); + allowSyscall(ctx, SCMP_SYS(sched_rr_get_interval_time64)); + allowSyscall(ctx, SCMP_SYS(sched_setaffinity)); + allowSyscall(ctx, SCMP_SYS(sched_setattr)); + allowSyscall(ctx, SCMP_SYS(sched_setparam)); + allowSyscall(ctx, SCMP_SYS(sched_setscheduler)); + allowSyscall(ctx, SCMP_SYS(sched_yield)); + allowSyscall(ctx, SCMP_SYS(seccomp)); + allowSyscall(ctx, SCMP_SYS(security)); + allowSyscall(ctx, SCMP_SYS(select)); + allowSyscall(ctx, SCMP_SYS(semctl)); + allowSyscall(ctx, SCMP_SYS(semget)); + allowSyscall(ctx, SCMP_SYS(semop)); + allowSyscall(ctx, SCMP_SYS(semtimedop)); + allowSyscall(ctx, SCMP_SYS(semtimedop_time64)); + allowSyscall(ctx, SCMP_SYS(send)); + allowSyscall(ctx, SCMP_SYS(sendfile)); + allowSyscall(ctx, SCMP_SYS(sendfile64)); + allowSyscall(ctx, SCMP_SYS(sendmmsg)); + allowSyscall(ctx, SCMP_SYS(sendmsg)); + allowSyscall(ctx, SCMP_SYS(sendto)); + allowSyscall(ctx, SCMP_SYS(setdomainname)); + allowSyscall(ctx, SCMP_SYS(setfsgid)); + allowSyscall(ctx, SCMP_SYS(setfsgid32)); + allowSyscall(ctx, SCMP_SYS(setfsuid)); + allowSyscall(ctx, SCMP_SYS(setfsuid32)); + allowSyscall(ctx, SCMP_SYS(setgid)); + allowSyscall(ctx, SCMP_SYS(setgid32)); + allowSyscall(ctx, SCMP_SYS(setgroups)); + allowSyscall(ctx, SCMP_SYS(setgroups32)); + allowSyscall(ctx, SCMP_SYS(sethostname)); + allowSyscall(ctx, SCMP_SYS(setitimer)); + allowSyscall(ctx, SCMP_SYS(set_mempolicy)); + allowSyscall(ctx, SCMP_SYS(set_mempolicy_home_node)); + allowSyscall(ctx, SCMP_SYS(setns)); + allowSyscall(ctx, SCMP_SYS(setpgid)); + allowSyscall(ctx, SCMP_SYS(setpriority)); + allowSyscall(ctx, SCMP_SYS(setregid)); + allowSyscall(ctx, SCMP_SYS(setregid32)); + allowSyscall(ctx, SCMP_SYS(setresgid)); + allowSyscall(ctx, SCMP_SYS(setresgid32)); + allowSyscall(ctx, SCMP_SYS(setresuid)); + allowSyscall(ctx, SCMP_SYS(setresuid32)); + allowSyscall(ctx, SCMP_SYS(setreuid)); + allowSyscall(ctx, SCMP_SYS(setreuid32)); + allowSyscall(ctx, SCMP_SYS(setrlimit)); + allowSyscall(ctx, SCMP_SYS(set_robust_list)); + allowSyscall(ctx, SCMP_SYS(setsid)); + allowSyscall(ctx, SCMP_SYS(setsockopt)); + allowSyscall(ctx, SCMP_SYS(set_thread_area)); + allowSyscall(ctx, SCMP_SYS(set_tid_address)); + allowSyscall(ctx, SCMP_SYS(settimeofday)); + allowSyscall(ctx, SCMP_SYS(set_tls)); + allowSyscall(ctx, SCMP_SYS(setuid)); + allowSyscall(ctx, SCMP_SYS(setuid32)); + // skip setxattr (dangerous) + allowSyscall(ctx, SCMP_SYS(sgetmask)); + allowSyscall(ctx, SCMP_SYS(shmat)); + allowSyscall(ctx, SCMP_SYS(shmctl)); + allowSyscall(ctx, SCMP_SYS(shmdt)); + allowSyscall(ctx, SCMP_SYS(shmget)); + allowSyscall(ctx, SCMP_SYS(shutdown)); + allowSyscall(ctx, SCMP_SYS(sigaction)); + allowSyscall(ctx, SCMP_SYS(sigaltstack)); + allowSyscall(ctx, SCMP_SYS(signal)); + allowSyscall(ctx, SCMP_SYS(signalfd)); + allowSyscall(ctx, SCMP_SYS(signalfd4)); + allowSyscall(ctx, SCMP_SYS(sigpending)); + allowSyscall(ctx, SCMP_SYS(sigprocmask)); + allowSyscall(ctx, SCMP_SYS(sigreturn)); + allowSyscall(ctx, SCMP_SYS(sigsuspend)); + allowSyscall(ctx, SCMP_SYS(socket)); + allowSyscall(ctx, SCMP_SYS(socketcall)); + allowSyscall(ctx, SCMP_SYS(socketpair)); + allowSyscall(ctx, SCMP_SYS(splice)); + allowSyscall(ctx, SCMP_SYS(spu_create)); + allowSyscall(ctx, SCMP_SYS(spu_run)); + allowSyscall(ctx, SCMP_SYS(ssetmask)); + allowSyscall(ctx, SCMP_SYS(stat)); + allowSyscall(ctx, SCMP_SYS(stat64)); + allowSyscall(ctx, SCMP_SYS(statfs)); + allowSyscall(ctx, SCMP_SYS(statfs64)); + allowSyscall(ctx, SCMP_SYS(statx)); + allowSyscall(ctx, SCMP_SYS(stime)); + allowSyscall(ctx, SCMP_SYS(stty)); + allowSyscall(ctx, SCMP_SYS(subpage_prot)); + allowSyscall(ctx, SCMP_SYS(swapcontext)); + allowSyscall(ctx, SCMP_SYS(swapoff)); + allowSyscall(ctx, SCMP_SYS(swapon)); + allowSyscall(ctx, SCMP_SYS(switch_endian)); + allowSyscall(ctx, SCMP_SYS(symlink)); + allowSyscall(ctx, SCMP_SYS(symlinkat)); + allowSyscall(ctx, SCMP_SYS(sync)); + allowSyscall(ctx, SCMP_SYS(sync_file_range)); + allowSyscall(ctx, SCMP_SYS(sync_file_range2)); + allowSyscall(ctx, SCMP_SYS(syncfs)); + allowSyscall(ctx, SCMP_SYS(syscall)); + allowSyscall(ctx, SCMP_SYS(_sysctl)); + allowSyscall(ctx, SCMP_SYS(sys_debug_setcontext)); + allowSyscall(ctx, SCMP_SYS(sysfs)); + allowSyscall(ctx, SCMP_SYS(sysinfo)); + allowSyscall(ctx, SCMP_SYS(syslog)); + allowSyscall(ctx, SCMP_SYS(sysmips)); + allowSyscall(ctx, SCMP_SYS(tee)); + allowSyscall(ctx, SCMP_SYS(tgkill)); + allowSyscall(ctx, SCMP_SYS(time)); + allowSyscall(ctx, SCMP_SYS(timer_create)); + allowSyscall(ctx, SCMP_SYS(timer_delete)); + allowSyscall(ctx, SCMP_SYS(timerfd)); + allowSyscall(ctx, SCMP_SYS(timerfd_create)); + allowSyscall(ctx, SCMP_SYS(timerfd_gettime)); + allowSyscall(ctx, SCMP_SYS(timerfd_gettime64)); + allowSyscall(ctx, SCMP_SYS(timerfd_settime)); + allowSyscall(ctx, SCMP_SYS(timerfd_settime64)); + allowSyscall(ctx, SCMP_SYS(timer_getoverrun)); + allowSyscall(ctx, SCMP_SYS(timer_gettime)); + allowSyscall(ctx, SCMP_SYS(timer_gettime64)); + allowSyscall(ctx, SCMP_SYS(timer_settime)); + allowSyscall(ctx, SCMP_SYS(timer_settime64)); + allowSyscall(ctx, SCMP_SYS(times)); + allowSyscall(ctx, SCMP_SYS(tkill)); + allowSyscall(ctx, SCMP_SYS(truncate)); + allowSyscall(ctx, SCMP_SYS(truncate64)); + allowSyscall(ctx, SCMP_SYS(tuxcall)); + allowSyscall(ctx, SCMP_SYS(ugetrlimit)); + allowSyscall(ctx, SCMP_SYS(ulimit)); + allowSyscall(ctx, SCMP_SYS(umask)); + allowSyscall(ctx, SCMP_SYS(umount)); + allowSyscall(ctx, SCMP_SYS(umount2)); + allowSyscall(ctx, SCMP_SYS(uname)); + allowSyscall(ctx, SCMP_SYS(unlink)); + allowSyscall(ctx, SCMP_SYS(unlinkat)); + allowSyscall(ctx, SCMP_SYS(unshare)); + allowSyscall(ctx, SCMP_SYS(uselib)); + allowSyscall(ctx, SCMP_SYS(userfaultfd)); + allowSyscall(ctx, SCMP_SYS(usr26)); + allowSyscall(ctx, SCMP_SYS(usr32)); + allowSyscall(ctx, SCMP_SYS(ustat)); + allowSyscall(ctx, SCMP_SYS(utime)); + allowSyscall(ctx, SCMP_SYS(utimensat)); + allowSyscall(ctx, SCMP_SYS(utimensat_time64)); + allowSyscall(ctx, SCMP_SYS(utimes)); + allowSyscall(ctx, SCMP_SYS(vfork)); + allowSyscall(ctx, SCMP_SYS(vhangup)); + allowSyscall(ctx, SCMP_SYS(vm86)); + allowSyscall(ctx, SCMP_SYS(vm86old)); + allowSyscall(ctx, SCMP_SYS(vmsplice)); + allowSyscall(ctx, SCMP_SYS(vserver)); + allowSyscall(ctx, SCMP_SYS(wait4)); + allowSyscall(ctx, SCMP_SYS(waitid)); + allowSyscall(ctx, SCMP_SYS(waitpid)); + allowSyscall(ctx, SCMP_SYS(write)); + allowSyscall(ctx, SCMP_SYS(writev)); + // END extract-syscalls - if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(fchmod), 1, - SCMP_A1(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0) - throw SysError("unable to add seccomp rule"); + // chmod family: prevent adding setuid/setgid bits to existing files. + // The Nix store does not support setuid/setgid, and even their temporary creation can weaken the security of the sandbox. + ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(chmod), 1); + ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(fchmod), 1); + ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(fchmodat), 2); + ALLOW_CHMOD_IF_SAFE(ctx, SCMP_SYS(fchmodat2), 2); - if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), SCMP_SYS(fchmodat), 1, - SCMP_A2(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0) - throw SysError("unable to add seccomp rule"); - - if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(EPERM), NIX_SYSCALL_FCHMODAT2, 1, - SCMP_A2(SCMP_CMP_MASKED_EQ, (scmp_datum_t) perm, (scmp_datum_t) perm)) != 0) - throw SysError("unable to add seccomp rule"); - } - - /* Prevent builders from creating EAs or ACLs. Not all filesystems - support these, and they're not allowed in the Nix store because - they're not representable in the NAR serialisation. */ + // setxattr family: prevent creation of extended attributes or ACLs. + // Not all filesystems support them, and they're incompatible with the NAR format. if (seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOTSUP), SCMP_SYS(setxattr), 0) != 0 || seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOTSUP), SCMP_SYS(lsetxattr), 0) != 0 || seccomp_rule_add(ctx, SCMP_ACT_ERRNO(ENOTSUP), SCMP_SYS(fsetxattr), 0) != 0) diff --git a/src/libstore/linux/fchmodat2-compat.hh b/src/libstore/linux/fchmodat2-compat.hh deleted file mode 100644 index d5ef81e22..000000000 --- a/src/libstore/linux/fchmodat2-compat.hh +++ /dev/null @@ -1,35 +0,0 @@ -/* - * Determine the syscall number for `fchmodat2`. - * - * On most platforms this is 452. Exceptions can be found on - * a glibc git checkout via `rg --pcre2 'define __NR_fchmodat2 (?!452)'`. - * - * The problem is that glibc 2.39 and libseccomp 2.5.5 are needed to - * get the syscall number. However, a Lix built against nixpkgs 23.11 - * (glibc 2.38) should still have the issue fixed without depending - * on the build environment. - * - * To achieve that, the macros below try to determine the platform and - * set the syscall number which is platform-specific, but - * in most cases 452. - * - * TODO: remove this when 23.11 is EOL and the entire (supported) ecosystem - * is on glibc 2.39. - */ - -#pragma once -///@file - -#if defined(__alpha__) -# define NIX_SYSCALL_FCHMODAT2 562 -#elif defined(__x86_64__) && SIZE_MAX == 0xFFFFFFFF // x32 -# define NIX_SYSCALL_FCHMODAT2 1073742276 -#elif defined(__mips__) && defined(__mips64) && defined(_ABIN64) // mips64/n64 -# define NIX_SYSCALL_FCHMODAT2 5452 -#elif defined(__mips__) && defined(__mips64) && defined(_ABIN32) // mips64/n32 -# define NIX_SYSCALL_FCHMODAT2 6452 -#elif defined(__mips__) && defined(_ABIO32) // mips32 -# define NIX_SYSCALL_FCHMODAT2 4452 -#else -# define NIX_SYSCALL_FCHMODAT2 452 -#endif diff --git a/tests/nixos/setuid/fchmodat2-suid.c b/tests/nixos/setuid/fchmodat2-suid.c index 931489ad7..7280331d5 100644 --- a/tests/nixos/setuid/fchmodat2-suid.c +++ b/tests/nixos/setuid/fchmodat2-suid.c @@ -12,10 +12,7 @@ int main(void) { fprintf(fd, "henlo :3"); fclose(fd); - // FIXME use something nicer here that's less - // platform-dependent as soon as we go to 24.05 - // and the glibc is new enough to support fchmodat2 - long rs = syscall(452, NULL, name, S_ISUID, 0); + long rs = syscall(SYS_fchmodat2, NULL, name, S_ISUID, 0); assert(rs == -1); assert(errno == EPERM); } From e7188e211a5a2ac0ba34635a846569560bb5f000 Mon Sep 17 00:00:00 2001 From: Alois Wohlschlager Date: Mon, 1 Jul 2024 09:18:01 +0200 Subject: [PATCH 3/3] libstore/build: block io_uring Unfortunately, io_uring is totally opaque to seccomp, and while currently there are no dangerous operations implemented, there is no guarantee that it remains this way. This means that io_uring should be blocked entirely to ensure that the sandbox is future-proof. This has not been observed to cause issues in practice. Change-Id: I45d3895f95abe1bc103a63969f444c334dbbf50d --- doc/manual/rl-next/block-io-uring.md | 12 ++++++++++++ src/libstore/build/local-derivation-goal.cc | 6 +++--- tests/nixos/default.nix | 2 ++ tests/nixos/io_uring/default.nix | 7 +++++++ tests/nixos/io_uring/package.nix | 19 +++++++++++++++++++ 5 files changed, 43 insertions(+), 3 deletions(-) create mode 100644 doc/manual/rl-next/block-io-uring.md create mode 100644 tests/nixos/io_uring/default.nix create mode 100644 tests/nixos/io_uring/package.nix diff --git a/doc/manual/rl-next/block-io-uring.md b/doc/manual/rl-next/block-io-uring.md new file mode 100644 index 000000000..6ebba9a20 --- /dev/null +++ b/doc/manual/rl-next/block-io-uring.md @@ -0,0 +1,12 @@ +--- +synopsis: "Block io_uring in the Linux sandbox" +cls: 1611 +credits: alois31 +category: Breaking Changes +--- + +The io\_uring API has the unfortunate property that it is not possible to selectively decide which operations should be allowed. +This, together with the fact that new operations are routinely added, makes it a hazard to the proper function of the sandbox. + +Therefore, any access to io\_uring has been made unavailable inside the sandbox. +As such, attempts to execute any system calls forming part of this API will fail with the error `ENOSYS`, as if io\_uring support had not been configured into the kernel. diff --git a/src/libstore/build/local-derivation-goal.cc b/src/libstore/build/local-derivation-goal.cc index 33cbfb29d..1dbe66f72 100644 --- a/src/libstore/build/local-derivation-goal.cc +++ b/src/libstore/build/local-derivation-goal.cc @@ -1596,9 +1596,9 @@ void setupSeccomp() allowSyscall(ctx, SCMP_SYS(ioprio_set)); allowSyscall(ctx, SCMP_SYS(io_setup)); allowSyscall(ctx, SCMP_SYS(io_submit)); - allowSyscall(ctx, SCMP_SYS(io_uring_enter)); - allowSyscall(ctx, SCMP_SYS(io_uring_register)); - allowSyscall(ctx, SCMP_SYS(io_uring_setup)); + // skip io_uring_enter (may become dangerous) + // skip io_uring_register (may become dangerous) + // skip io_uring_setup (may become dangerous) allowSyscall(ctx, SCMP_SYS(ipc)); allowSyscall(ctx, SCMP_SYS(kcmp)); allowSyscall(ctx, SCMP_SYS(kexec_file_load)); diff --git a/tests/nixos/default.nix b/tests/nixos/default.nix index 301eede46..20e66f6c1 100644 --- a/tests/nixos/default.nix +++ b/tests/nixos/default.nix @@ -155,4 +155,6 @@ in broken-userns = runNixOSTestFor "x86_64-linux" ./broken-userns.nix; coredumps = runNixOSTestFor "x86_64-linux" ./coredumps; + + io_uring = runNixOSTestFor "x86_64-linux" ./io_uring; } diff --git a/tests/nixos/io_uring/default.nix b/tests/nixos/io_uring/default.nix new file mode 100644 index 000000000..9cd445d6a --- /dev/null +++ b/tests/nixos/io_uring/default.nix @@ -0,0 +1,7 @@ +let + inherit (import ../util.nix) mkNixBuildTest; +in +mkNixBuildTest { + name = "io_uring"; + expressionFile = ./package.nix; +} diff --git a/tests/nixos/io_uring/package.nix b/tests/nixos/io_uring/package.nix new file mode 100644 index 000000000..8f980183a --- /dev/null +++ b/tests/nixos/io_uring/package.nix @@ -0,0 +1,19 @@ +{ runCommandCC }: +runCommandCC "io_uring-is-blocked" { } '' + cat > test.c < + #include + #include + + int main() { + int res = syscall(SYS_io_uring_setup, 0, NULL); + return res == -1 && errno == ENOSYS ? 0 : 1; + } + EOF + "$CC" -o test test.c + if ! ./test; then + echo "Oh no! io_uring is available!" + exit 1 + fi + touch "$out" +''