From e040b762a48f022b1ea4080020083f7367cf3ee5 Mon Sep 17 00:00:00 2001 From: Artemis Tosini Date: Sun, 19 May 2024 23:01:51 +0000 Subject: [PATCH] libstore: add LocalDerivationGoal startChild hook Add a platform-specific function for starting sandboxed child. Generally this just means startProcess, but on Linux we use flags for clone to start a new namespace Change-Id: I41c8aba62676a162388bbe5ab8a7518904c7b058 --- src/libstore/build/local-derivation-goal.cc | 164 ++------------------ src/libstore/build/local-derivation-goal.hh | 6 + src/libstore/platform/linux.cc | 154 ++++++++++++++++++ src/libstore/platform/linux.hh | 6 + 4 files changed, 176 insertions(+), 154 deletions(-) diff --git a/src/libstore/build/local-derivation-goal.cc b/src/libstore/build/local-derivation-goal.cc index bc8879428..fb86a9d18 100644 --- a/src/libstore/build/local-derivation-goal.cc +++ b/src/libstore/build/local-derivation-goal.cc @@ -1,6 +1,7 @@ #include "local-derivation-goal.hh" #include "indirect-root-store.hh" #include "hook-instance.hh" +#include "store-api.hh" #include "worker.hh" #include "builtins.hh" #include "builtins/buildenv.hh" @@ -771,160 +772,7 @@ void LocalDerivationGoal::startBuilder() buildResult.startTime = time(0); /* Fork a child to build the package. */ - -#if __linux__ - if (useChroot) { - /* Set up private namespaces for the build: - - - The PID namespace causes the build to start as PID 1. - Processes outside of the chroot are not visible to those - on the inside, but processes inside the chroot are - visible from the outside (though with different PIDs). - - - The private mount namespace ensures that all the bind - mounts we do will only show up in this process and its - children, and will disappear automatically when we're - done. - - - The private network namespace ensures that the builder - cannot talk to the outside world (or vice versa). It - only has a private loopback interface. (Fixed-output - derivations are not run in a private network namespace - to allow functions like fetchurl to work.) - - - The IPC namespace prevents the builder from communicating - with outside processes using SysV IPC mechanisms (shared - memory, message queues, semaphores). It also ensures - that all IPC objects are destroyed when the builder - exits. - - - The UTS namespace ensures that builders see a hostname of - localhost rather than the actual hostname. - - We use a helper process to do the clone() to work around - clone() being broken in multi-threaded programs due to - at-fork handlers not being run. Note that we use - CLONE_PARENT to ensure that the real builder is parented to - us. - */ - - if (derivationType->isSandboxed()) - privateNetwork = true; - - userNamespaceSync.create(); - - Pipe sendPid; - sendPid.create(); - - Pid helper = startProcess([&]() { - sendPid.readSide.close(); - - /* We need to open the slave early, before - CLONE_NEWUSER. Otherwise we get EPERM when running as - root. */ - openSlave(); - - /* Drop additional groups here because we can't do it - after we've created the new user namespace. */ - if (setgroups(0, 0) == -1) { - if (errno != EPERM) - throw SysError("setgroups failed"); - if (settings.requireDropSupplementaryGroups) - throw Error("setgroups failed. Set the require-drop-supplementary-groups option to false to skip this step."); - } - - ProcessOptions options; - options.cloneFlags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_PARENT | SIGCHLD; - if (privateNetwork) - options.cloneFlags |= CLONE_NEWNET; - if (usingUserNamespace) - options.cloneFlags |= CLONE_NEWUSER; - - pid_t child = startProcess([&]() { runChild(); }, options).release(); - - writeFull(sendPid.writeSide.get(), fmt("%d\n", child)); - _exit(0); - }); - - sendPid.writeSide.close(); - - if (helper.wait() != 0) - throw Error("unable to start build process"); - - userNamespaceSync.readSide.reset(); - - /* Close the write side to prevent runChild() from hanging - reading from this. */ - Finally cleanup([&]() { - userNamespaceSync.writeSide.reset(); - }); - - auto ss = tokenizeString>(readLine(sendPid.readSide.get())); - assert(ss.size() == 1); - pid = Pid{string2Int(ss[0]).value()}; - - if (usingUserNamespace) { - /* Set the UID/GID mapping of the builder's user namespace - such that the sandbox user maps to the build user, or to - the calling user (if build users are disabled). */ - uid_t hostUid = buildUser ? buildUser->getUID() : getuid(); - uid_t hostGid = buildUser ? buildUser->getGID() : getgid(); - uid_t nrIds = buildUser ? buildUser->getUIDCount() : 1; - - writeFile("/proc/" + std::to_string(pid.get()) + "/uid_map", - fmt("%d %d %d", sandboxUid(), hostUid, nrIds)); - - if (!buildUser || buildUser->getUIDCount() == 1) - writeFile("/proc/" + std::to_string(pid.get()) + "/setgroups", "deny"); - - writeFile("/proc/" + std::to_string(pid.get()) + "/gid_map", - fmt("%d %d %d", sandboxGid(), hostGid, nrIds)); - } else { - debug("note: not using a user namespace"); - } - - /* Now that we now the sandbox uid, we can write - /etc/passwd. */ - writeFile(chrootRootDir + "/etc/passwd", fmt( - "root:x:0:0:Nix build user:%3%:/noshell\n" - "nixbld:x:%1%:%2%:Nix build user:%3%:/noshell\n" - "nobody:x:65534:65534:Nobody:/:/noshell\n", - sandboxUid(), sandboxGid(), settings.sandboxBuildDir)); - - /* Declare the build user's group so that programs get a consistent - view of the system (e.g., "id -gn"). */ - writeFile(chrootRootDir + "/etc/group", - fmt("root:x:0:\n" - "nixbld:!:%1%:\n" - "nogroup:x:65534:\n", sandboxGid())); - - /* Save the mount- and user namespace of the child. We have to do this - *before* the child does a chroot. */ - sandboxMountNamespace = AutoCloseFD{open(fmt("/proc/%d/ns/mnt", pid.get()).c_str(), O_RDONLY)}; - if (sandboxMountNamespace.get() == -1) - throw SysError("getting sandbox mount namespace"); - - if (usingUserNamespace) { - sandboxUserNamespace = AutoCloseFD{open(fmt("/proc/%d/ns/user", pid.get()).c_str(), O_RDONLY)}; - if (sandboxUserNamespace.get() == -1) - throw SysError("getting sandbox user namespace"); - } - - /* Move the child into its own cgroup. */ - if (cgroup) - writeFile(*cgroup + "/cgroup.procs", fmt("%d", pid.get())); - - /* Signal the builder that we've updated its user namespace. */ - writeFull(userNamespaceSync.writeSide.get(), "1"); - - } else -#endif - { - pid = startProcess([&]() { - openSlave(); - runChild(); - }); - } + pid = startChild(openSlave); /* parent */ pid.setSeparatePG(true); @@ -958,6 +806,14 @@ void LocalDerivationGoal::startBuilder() } +Pid LocalDerivationGoal::startChild(std::function openSlave) { + return startProcess([&]() { + openSlave(); + runChild(); + }); +} + + void LocalDerivationGoal::initTmpDir() { /* In a sandbox, for determinism, always use the same temporary directory. */ diff --git a/src/libstore/build/local-derivation-goal.hh b/src/libstore/build/local-derivation-goal.hh index 857339b5d..727a7f406 100644 --- a/src/libstore/build/local-derivation-goal.hh +++ b/src/libstore/build/local-derivation-goal.hh @@ -333,6 +333,12 @@ protected: throw Error("sandboxing builds is not supported on this platform"); }; + /** + * Create a new process that runs `openSlave` and `runChild` + * On some platforms this process is created with sandboxing flags. + */ + virtual Pid startChild(std::function openSlave); + /** * Execute the builder, replacing the current process. * Generally this means an `execve` call. diff --git a/src/libstore/platform/linux.cc b/src/libstore/platform/linux.cc index ac0440e5a..204f62b71 100644 --- a/src/libstore/platform/linux.cc +++ b/src/libstore/platform/linux.cc @@ -1,10 +1,12 @@ #include "build/worker.hh" #include "cgroup.hh" +#include "finally.hh" #include "gc-store.hh" #include "signals.hh" #include "platform/linux.hh" #include "regex.hh" +#include #include namespace nix { @@ -204,6 +206,158 @@ void LinuxLocalDerivationGoal::prepareSandbox() } } +Pid LinuxLocalDerivationGoal::startChild(std::function openSlave) +{ + // If we're not sandboxing no need to faff about, use the fallback + if (!useChroot) { + return LocalDerivationGoal::startChild(openSlave); + } + /* Set up private namespaces for the build: + + - The PID namespace causes the build to start as PID 1. + Processes outside of the chroot are not visible to those + on the inside, but processes inside the chroot are + visible from the outside (though with different PIDs). + + - The private mount namespace ensures that all the bind + mounts we do will only show up in this process and its + children, and will disappear automatically when we're + done. + + - The private network namespace ensures that the builder + cannot talk to the outside world (or vice versa). It + only has a private loopback interface. (Fixed-output + derivations are not run in a private network namespace + to allow functions like fetchurl to work.) + + - The IPC namespace prevents the builder from communicating + with outside processes using SysV IPC mechanisms (shared + memory, message queues, semaphores). It also ensures + that all IPC objects are destroyed when the builder + exits. + + - The UTS namespace ensures that builders see a hostname of + localhost rather than the actual hostname. + + We use a helper process to do the clone() to work around + clone() being broken in multi-threaded programs due to + at-fork handlers not being run. Note that we use + CLONE_PARENT to ensure that the real builder is parented to + us. + */ + + if (derivationType->isSandboxed()) + privateNetwork = true; + + userNamespaceSync.create(); + + Pipe sendPid; + sendPid.create(); + + Pid helper = startProcess([&]() { + sendPid.readSide.close(); + + /* We need to open the slave early, before + CLONE_NEWUSER. Otherwise we get EPERM when running as + root. */ + openSlave(); + + /* Drop additional groups here because we can't do it + after we've created the new user namespace. */ + if (setgroups(0, 0) == -1) { + if (errno != EPERM) + throw SysError("setgroups failed"); + if (settings.requireDropSupplementaryGroups) + throw Error("setgroups failed. Set the require-drop-supplementary-groups option to false to skip this step."); + } + + ProcessOptions options; + options.cloneFlags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_PARENT | SIGCHLD; + if (privateNetwork) + options.cloneFlags |= CLONE_NEWNET; + if (usingUserNamespace) + options.cloneFlags |= CLONE_NEWUSER; + + pid_t child = startProcess([&]() { runChild(); }, options).release(); + + writeFull(sendPid.writeSide.get(), fmt("%d\n", child)); + _exit(0); + }); + + sendPid.writeSide.close(); + + if (helper.wait() != 0) + throw Error("unable to start build process"); + + userNamespaceSync.readSide.reset(); + + /* Close the write side to prevent runChild() from hanging + reading from this. */ + Finally cleanup([&]() { + userNamespaceSync.writeSide.reset(); + }); + + auto ss = tokenizeString>(readLine(sendPid.readSide.get())); + assert(ss.size() == 1); + Pid pid = Pid{string2Int(ss[0]).value()}; + + if (usingUserNamespace) { + /* Set the UID/GID mapping of the builder's user namespace + such that the sandbox user maps to the build user, or to + the calling user (if build users are disabled). */ + uid_t hostUid = buildUser ? buildUser->getUID() : getuid(); + uid_t hostGid = buildUser ? buildUser->getGID() : getgid(); + uid_t nrIds = buildUser ? buildUser->getUIDCount() : 1; + + writeFile("/proc/" + std::to_string(pid.get()) + "/uid_map", + fmt("%d %d %d", sandboxUid(), hostUid, nrIds)); + + if (!buildUser || buildUser->getUIDCount() == 1) + writeFile("/proc/" + std::to_string(pid.get()) + "/setgroups", "deny"); + + writeFile("/proc/" + std::to_string(pid.get()) + "/gid_map", + fmt("%d %d %d", sandboxGid(), hostGid, nrIds)); + } else { + debug("note: not using a user namespace"); + } + + /* Now that we now the sandbox uid, we can write + /etc/passwd. */ + writeFile(chrootRootDir + "/etc/passwd", fmt( + "root:x:0:0:Nix build user:%3%:/noshell\n" + "nixbld:x:%1%:%2%:Nix build user:%3%:/noshell\n" + "nobody:x:65534:65534:Nobody:/:/noshell\n", + sandboxUid(), sandboxGid(), settings.sandboxBuildDir)); + + /* Declare the build user's group so that programs get a consistent + view of the system (e.g., "id -gn"). */ + writeFile(chrootRootDir + "/etc/group", + fmt("root:x:0:\n" + "nixbld:!:%1%:\n" + "nogroup:x:65534:\n", sandboxGid())); + + /* Save the mount- and user namespace of the child. We have to do this + *before* the child does a chroot. */ + sandboxMountNamespace = AutoCloseFD{open(fmt("/proc/%d/ns/mnt", pid.get()).c_str(), O_RDONLY)}; + if (sandboxMountNamespace.get() == -1) + throw SysError("getting sandbox mount namespace"); + + if (usingUserNamespace) { + sandboxUserNamespace = AutoCloseFD{open(fmt("/proc/%d/ns/user", pid.get()).c_str(), O_RDONLY)}; + if (sandboxUserNamespace.get() == -1) + throw SysError("getting sandbox user namespace"); + } + + /* Move the child into its own cgroup. */ + if (cgroup) + writeFile(*cgroup + "/cgroup.procs", fmt("%d", pid.get())); + + /* Signal the builder that we've updated its user namespace. */ + writeFull(userNamespaceSync.writeSide.get(), "1"); + + return pid; +} + void LinuxLocalDerivationGoal::killSandbox(bool getStats) { if (cgroup) { diff --git a/src/libstore/platform/linux.hh b/src/libstore/platform/linux.hh index 7ef578d2c..2173205bc 100644 --- a/src/libstore/platform/linux.hh +++ b/src/libstore/platform/linux.hh @@ -47,6 +47,12 @@ private: */ void prepareSandbox() override; + /** + * Start child process in new namespaces and cgroup, + * create /etc/passwd and /etc/group based on discovered uid/gid + */ + Pid startChild(std::function openSlave) override; + /** * Kill all processes by build user, possibly using a reused * cgroup if we have one