From f094ba7386fc5fbb3df5fd84008ca07d2289ff26 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 10 Feb 2023 14:38:14 +0100 Subject: [PATCH 1/4] Simplify the PID namespace check: just try to mount /proc Fixes #7783. --- src/libstore/build/local-derivation-goal.cc | 2 +- src/libutil/namespaces.cc | 81 ++++++++++----------- src/libutil/namespaces.hh | 4 +- src/libutil/util.cc | 29 +++++++- src/libutil/util.hh | 1 + 5 files changed, 69 insertions(+), 48 deletions(-) diff --git a/src/libstore/build/local-derivation-goal.cc b/src/libstore/build/local-derivation-goal.cc index e1cc504f8..e5ba3ac0d 100644 --- a/src/libstore/build/local-derivation-goal.cc +++ b/src/libstore/build/local-derivation-goal.cc @@ -209,7 +209,7 @@ void LocalDerivationGoal::tryLocalBuild() #if __linux__ if (useChroot) { - if (!mountNamespacesSupported() || !pidNamespacesSupported()) { + if (!mountAndPidNamespacesSupported()) { if (!settings.sandboxFallback) throw Error("this system does not support the kernel namespaces that are required for sandboxing; use '--no-sandbox' to disable sandboxing"); debug("auto-disabling sandboxing because the prerequisite namespaces are not available"); diff --git a/src/libutil/namespaces.cc b/src/libutil/namespaces.cc index fdd52d92b..f66accb10 100644 --- a/src/libutil/namespaces.cc +++ b/src/libutil/namespaces.cc @@ -4,7 +4,7 @@ #include "util.hh" #include "finally.hh" -#include +#include namespace nix { @@ -33,63 +33,60 @@ bool userNamespacesSupported() return false; } - Pid pid = startProcess([&]() - { - auto res = unshare(CLONE_NEWUSER); - _exit(res ? 1 : 0); - }); + try { + Pid pid = startProcess([&]() + { + _exit(0); + }, { + .cloneFlags = CLONE_NEWUSER + }); - bool supported = pid.wait() == 0; + auto r = pid.wait(); + assert(!r); + } catch (SysError & e) { + debug("user namespaces do not work on this system: %s", e.msg()); + return false; + } - if (!supported) - debug("user namespaces do not work on this system"); - - return supported; + return true; }(); return res; } -bool mountNamespacesSupported() +bool mountAndPidNamespacesSupported() { static auto res = [&]() -> bool { - bool useUserNamespace = userNamespacesSupported(); + try { - Pid pid = startProcess([&]() - { - auto res = unshare(CLONE_NEWNS | (useUserNamespace ? CLONE_NEWUSER : 0)); - _exit(res ? 1 : 0); - }); + Pid pid = startProcess([&]() + { + /* Make sure we don't remount the parent's /proc. */ + if (mount(0, "/", 0, MS_PRIVATE | MS_REC, 0) == -1) + _exit(1); - bool supported = pid.wait() == 0; + /* Test whether we can remount /proc. The kernel disallows + this if /proc is not fully visible, i.e. if there are + filesystems mounted on top of files inside /proc. See + https://lore.kernel.org/lkml/87tvsrjai0.fsf@xmission.com/T/. */ + if (mount("none", "/proc", "proc", 0, 0) == -1) + _exit(2); - if (!supported) - debug("mount namespaces do not work on this system"); + _exit(0); + }, { + .cloneFlags = CLONE_NEWNS | CLONE_NEWPID | (userNamespacesSupported() ? CLONE_NEWUSER : 0) + }); - return supported; - }(); - return res; -} - -bool pidNamespacesSupported() -{ - static auto res = [&]() -> bool - { - /* Check whether /proc is fully visible, i.e. there are no - filesystems mounted on top of files inside /proc. If this - is not the case, then we cannot mount a new /proc inside - the sandbox that matches the sandbox's PID namespace. - See https://lore.kernel.org/lkml/87tvsrjai0.fsf@xmission.com/T/. */ - auto fp = fopen("/proc/mounts", "r"); - if (!fp) return false; - Finally delFP = [&]() { fclose(fp); }; - - while (auto ent = getmntent(fp)) - if (hasPrefix(std::string_view(ent->mnt_dir), "/proc/")) { - debug("PID namespaces do not work because /proc is not fully visible; disabling sandboxing"); + if (pid.wait()) { + debug("PID namespaces do not work on this system: cannot remount /proc"); return false; } + } catch (SysError & e) { + debug("mount namespaces do not work on this system: %s", e.msg()); + return false; + } + return true; }(); return res; diff --git a/src/libutil/namespaces.hh b/src/libutil/namespaces.hh index 34e54d5ad..e82379b9c 100644 --- a/src/libutil/namespaces.hh +++ b/src/libutil/namespaces.hh @@ -6,9 +6,7 @@ namespace nix { bool userNamespacesSupported(); -bool mountNamespacesSupported(); - -bool pidNamespacesSupported(); +bool mountAndPidNamespacesSupported(); #endif diff --git a/src/libutil/util.cc b/src/libutil/util.cc index 40faa4bf2..94da37561 100644 --- a/src/libutil/util.cc +++ b/src/libutil/util.cc @@ -36,6 +36,7 @@ #ifdef __linux__ #include #include +#include #include #endif @@ -1051,9 +1052,17 @@ static pid_t doFork(bool allowVfork, std::function fun) } +static int childEntry(void * arg) +{ + auto main = (std::function *) arg; + (*main)(); + return 1; +} + + pid_t startProcess(std::function fun, const ProcessOptions & options) { - auto wrapper = [&]() { + std::function wrapper = [&]() { if (!options.allowVfork) logger = makeSimpleLogger(); try { @@ -1073,7 +1082,23 @@ pid_t startProcess(std::function fun, const ProcessOptions & options) _exit(1); }; - pid_t pid = doFork(options.allowVfork, wrapper); + pid_t pid = -1; + + if (options.cloneFlags) { + // Not supported, since then we don't know when to free the stack. + assert(!(options.cloneFlags & CLONE_VM)); + + size_t stackSize = 1 * 1024 * 1024; + auto stack = (char *) mmap(0, stackSize, + PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); + if (stack == MAP_FAILED) throw SysError("allocating stack"); + + Finally freeStack([&]() { munmap(stack, stackSize); }); + + pid = clone(childEntry, stack + stackSize, options.cloneFlags | SIGCHLD, &wrapper); + } else + pid = doFork(options.allowVfork, wrapper); + if (pid == -1) throw SysError("unable to fork"); return pid; diff --git a/src/libutil/util.hh b/src/libutil/util.hh index 266da0ae3..95562280e 100644 --- a/src/libutil/util.hh +++ b/src/libutil/util.hh @@ -301,6 +301,7 @@ struct ProcessOptions bool dieWithParent = true; bool runExitHandlers = false; bool allowVfork = false; + int cloneFlags = 0; // use clone() with the specified flags (Linux only) }; pid_t startProcess(std::function fun, const ProcessOptions & options = ProcessOptions()); From 3e6e34cdf589d67c767bffe19483a6c9e53233a7 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 10 Feb 2023 14:44:25 +0100 Subject: [PATCH 2/4] LocalDerivationGoal::startBuilder(): Use startProcess() to clone --- src/libstore/build/local-derivation-goal.cc | 22 +++++---------------- 1 file changed, 5 insertions(+), 17 deletions(-) diff --git a/src/libstore/build/local-derivation-goal.cc b/src/libstore/build/local-derivation-goal.cc index e5ba3ac0d..7c4892c96 100644 --- a/src/libstore/build/local-derivation-goal.cc +++ b/src/libstore/build/local-derivation-goal.cc @@ -385,12 +385,6 @@ void LocalDerivationGoal::cleanupPostOutputsRegisteredModeNonCheck() } -int childEntry(void * arg) -{ - ((LocalDerivationGoal *) arg)->runChild(); - return 1; -} - #if __linux__ static void linkOrCopy(const Path & from, const Path & to) { @@ -916,21 +910,15 @@ void LocalDerivationGoal::startBuilder() if (getuid() == 0 && setgroups(0, 0) == -1) throw SysError("setgroups failed"); - size_t stackSize = 1 * 1024 * 1024; - char * stack = (char *) mmap(0, stackSize, - PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0); - if (stack == MAP_FAILED) throw SysError("allocating stack"); - - int flags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_PARENT | SIGCHLD; + ProcessOptions options; + options.cloneFlags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_PARENT | SIGCHLD; if (privateNetwork) - flags |= CLONE_NEWNET; + options.cloneFlags |= CLONE_NEWNET; if (usingUserNamespace) - flags |= CLONE_NEWUSER; + options.cloneFlags |= CLONE_NEWUSER; - pid_t child = clone(childEntry, stack + stackSize, flags, this); + pid_t child = startProcess([&]() { runChild(); }, options); - if (child == -1) - throw SysError("creating sandboxed builder process using clone()"); writeFull(builderOut.writeSide.get(), fmt("%d %d\n", usingUserNamespace, child)); _exit(0); From c49b7472eaa7a6501aa937b40f80702ecf0f43e6 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 10 Feb 2023 16:32:30 +0100 Subject: [PATCH 3/4] Fix macOS build --- src/libutil/util.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/libutil/util.cc b/src/libutil/util.cc index 94da37561..795b66dc3 100644 --- a/src/libutil/util.cc +++ b/src/libutil/util.cc @@ -1085,6 +1085,7 @@ pid_t startProcess(std::function fun, const ProcessOptions & options) pid_t pid = -1; if (options.cloneFlags) { + #ifdef __linux__ // Not supported, since then we don't know when to free the stack. assert(!(options.cloneFlags & CLONE_VM)); @@ -1096,6 +1097,9 @@ pid_t startProcess(std::function fun, const ProcessOptions & options) Finally freeStack([&]() { munmap(stack, stackSize); }); pid = clone(childEntry, stack + stackSize, options.cloneFlags | SIGCHLD, &wrapper); + #else + throw Error("clone flags are only supported on Linux"); + #endif } else pid = doFork(options.allowVfork, wrapper); From a21405a4e8a5ca4bfbe8df8de2f76d69c4608a9f Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 10 Feb 2023 17:51:44 +0100 Subject: [PATCH 4/4] Add regression test --- tests/nixos/remote-builds.nix | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/tests/nixos/remote-builds.nix b/tests/nixos/remote-builds.nix index 696cd2652..1c96cc787 100644 --- a/tests/nixos/remote-builds.nix +++ b/tests/nixos/remote-builds.nix @@ -11,6 +11,11 @@ let { services.openssh.enable = true; virtualisation.writableStore = true; nix.settings.sandbox = true; + + # Regression test for use of PID namespaces when /proc has + # filesystems mounted on top of it + # (i.e. /proc/sys/fs/binfmt_misc). + boot.binfmt.emulatedSystems = [ "aarch64-linux" ]; }; # Trivial Nix expression to build remotely.