libstore: add LocalDerivationGoal startChild hook

Add a platform-specific function for starting sandboxed child.
Generally this just means startProcess, but on Linux we use flags
for clone to start a new namespace

Change-Id: I41c8aba62676a162388bbe5ab8a7518904c7b058
This commit is contained in:
Artemis Tosini 2024-05-19 23:01:51 +00:00 committed by Artemis Tosini
parent af1dcc2d5e
commit e040b762a4
Signed by untrusted user: artemist
GPG key ID: EE5227935FE3FF18
4 changed files with 176 additions and 154 deletions

View file

@ -1,6 +1,7 @@
#include "local-derivation-goal.hh"
#include "indirect-root-store.hh"
#include "hook-instance.hh"
#include "store-api.hh"
#include "worker.hh"
#include "builtins.hh"
#include "builtins/buildenv.hh"
@ -771,160 +772,7 @@ void LocalDerivationGoal::startBuilder()
buildResult.startTime = time(0);
/* Fork a child to build the package. */
#if __linux__
if (useChroot) {
/* Set up private namespaces for the build:
- The PID namespace causes the build to start as PID 1.
Processes outside of the chroot are not visible to those
on the inside, but processes inside the chroot are
visible from the outside (though with different PIDs).
- The private mount namespace ensures that all the bind
mounts we do will only show up in this process and its
children, and will disappear automatically when we're
done.
- The private network namespace ensures that the builder
cannot talk to the outside world (or vice versa). It
only has a private loopback interface. (Fixed-output
derivations are not run in a private network namespace
to allow functions like fetchurl to work.)
- The IPC namespace prevents the builder from communicating
with outside processes using SysV IPC mechanisms (shared
memory, message queues, semaphores). It also ensures
that all IPC objects are destroyed when the builder
exits.
- The UTS namespace ensures that builders see a hostname of
localhost rather than the actual hostname.
We use a helper process to do the clone() to work around
clone() being broken in multi-threaded programs due to
at-fork handlers not being run. Note that we use
CLONE_PARENT to ensure that the real builder is parented to
us.
*/
if (derivationType->isSandboxed())
privateNetwork = true;
userNamespaceSync.create();
Pipe sendPid;
sendPid.create();
Pid helper = startProcess([&]() {
sendPid.readSide.close();
/* We need to open the slave early, before
CLONE_NEWUSER. Otherwise we get EPERM when running as
root. */
openSlave();
/* Drop additional groups here because we can't do it
after we've created the new user namespace. */
if (setgroups(0, 0) == -1) {
if (errno != EPERM)
throw SysError("setgroups failed");
if (settings.requireDropSupplementaryGroups)
throw Error("setgroups failed. Set the require-drop-supplementary-groups option to false to skip this step.");
}
ProcessOptions options;
options.cloneFlags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_PARENT | SIGCHLD;
if (privateNetwork)
options.cloneFlags |= CLONE_NEWNET;
if (usingUserNamespace)
options.cloneFlags |= CLONE_NEWUSER;
pid_t child = startProcess([&]() { runChild(); }, options).release();
writeFull(sendPid.writeSide.get(), fmt("%d\n", child));
_exit(0);
});
sendPid.writeSide.close();
if (helper.wait() != 0)
throw Error("unable to start build process");
userNamespaceSync.readSide.reset();
/* Close the write side to prevent runChild() from hanging
reading from this. */
Finally cleanup([&]() {
userNamespaceSync.writeSide.reset();
});
auto ss = tokenizeString<std::vector<std::string>>(readLine(sendPid.readSide.get()));
assert(ss.size() == 1);
pid = Pid{string2Int<pid_t>(ss[0]).value()};
if (usingUserNamespace) {
/* Set the UID/GID mapping of the builder's user namespace
such that the sandbox user maps to the build user, or to
the calling user (if build users are disabled). */
uid_t hostUid = buildUser ? buildUser->getUID() : getuid();
uid_t hostGid = buildUser ? buildUser->getGID() : getgid();
uid_t nrIds = buildUser ? buildUser->getUIDCount() : 1;
writeFile("/proc/" + std::to_string(pid.get()) + "/uid_map",
fmt("%d %d %d", sandboxUid(), hostUid, nrIds));
if (!buildUser || buildUser->getUIDCount() == 1)
writeFile("/proc/" + std::to_string(pid.get()) + "/setgroups", "deny");
writeFile("/proc/" + std::to_string(pid.get()) + "/gid_map",
fmt("%d %d %d", sandboxGid(), hostGid, nrIds));
} else {
debug("note: not using a user namespace");
}
/* Now that we now the sandbox uid, we can write
/etc/passwd. */
writeFile(chrootRootDir + "/etc/passwd", fmt(
"root:x:0:0:Nix build user:%3%:/noshell\n"
"nixbld:x:%1%:%2%:Nix build user:%3%:/noshell\n"
"nobody:x:65534:65534:Nobody:/:/noshell\n",
sandboxUid(), sandboxGid(), settings.sandboxBuildDir));
/* Declare the build user's group so that programs get a consistent
view of the system (e.g., "id -gn"). */
writeFile(chrootRootDir + "/etc/group",
fmt("root:x:0:\n"
"nixbld:!:%1%:\n"
"nogroup:x:65534:\n", sandboxGid()));
/* Save the mount- and user namespace of the child. We have to do this
*before* the child does a chroot. */
sandboxMountNamespace = AutoCloseFD{open(fmt("/proc/%d/ns/mnt", pid.get()).c_str(), O_RDONLY)};
if (sandboxMountNamespace.get() == -1)
throw SysError("getting sandbox mount namespace");
if (usingUserNamespace) {
sandboxUserNamespace = AutoCloseFD{open(fmt("/proc/%d/ns/user", pid.get()).c_str(), O_RDONLY)};
if (sandboxUserNamespace.get() == -1)
throw SysError("getting sandbox user namespace");
}
/* Move the child into its own cgroup. */
if (cgroup)
writeFile(*cgroup + "/cgroup.procs", fmt("%d", pid.get()));
/* Signal the builder that we've updated its user namespace. */
writeFull(userNamespaceSync.writeSide.get(), "1");
} else
#endif
{
pid = startProcess([&]() {
openSlave();
runChild();
});
}
pid = startChild(openSlave);
/* parent */
pid.setSeparatePG(true);
@ -958,6 +806,14 @@ void LocalDerivationGoal::startBuilder()
}
Pid LocalDerivationGoal::startChild(std::function<void()> openSlave) {
return startProcess([&]() {
openSlave();
runChild();
});
}
void LocalDerivationGoal::initTmpDir() {
/* In a sandbox, for determinism, always use the same temporary
directory. */

View file

@ -333,6 +333,12 @@ protected:
throw Error("sandboxing builds is not supported on this platform");
};
/**
* Create a new process that runs `openSlave` and `runChild`
* On some platforms this process is created with sandboxing flags.
*/
virtual Pid startChild(std::function<void()> openSlave);
/**
* Execute the builder, replacing the current process.
* Generally this means an `execve` call.

View file

@ -1,10 +1,12 @@
#include "build/worker.hh"
#include "cgroup.hh"
#include "finally.hh"
#include "gc-store.hh"
#include "signals.hh"
#include "platform/linux.hh"
#include "regex.hh"
#include <grp.h>
#include <regex>
namespace nix {
@ -204,6 +206,158 @@ void LinuxLocalDerivationGoal::prepareSandbox()
}
}
Pid LinuxLocalDerivationGoal::startChild(std::function<void()> openSlave)
{
// If we're not sandboxing no need to faff about, use the fallback
if (!useChroot) {
return LocalDerivationGoal::startChild(openSlave);
}
/* Set up private namespaces for the build:
- The PID namespace causes the build to start as PID 1.
Processes outside of the chroot are not visible to those
on the inside, but processes inside the chroot are
visible from the outside (though with different PIDs).
- The private mount namespace ensures that all the bind
mounts we do will only show up in this process and its
children, and will disappear automatically when we're
done.
- The private network namespace ensures that the builder
cannot talk to the outside world (or vice versa). It
only has a private loopback interface. (Fixed-output
derivations are not run in a private network namespace
to allow functions like fetchurl to work.)
- The IPC namespace prevents the builder from communicating
with outside processes using SysV IPC mechanisms (shared
memory, message queues, semaphores). It also ensures
that all IPC objects are destroyed when the builder
exits.
- The UTS namespace ensures that builders see a hostname of
localhost rather than the actual hostname.
We use a helper process to do the clone() to work around
clone() being broken in multi-threaded programs due to
at-fork handlers not being run. Note that we use
CLONE_PARENT to ensure that the real builder is parented to
us.
*/
if (derivationType->isSandboxed())
privateNetwork = true;
userNamespaceSync.create();
Pipe sendPid;
sendPid.create();
Pid helper = startProcess([&]() {
sendPid.readSide.close();
/* We need to open the slave early, before
CLONE_NEWUSER. Otherwise we get EPERM when running as
root. */
openSlave();
/* Drop additional groups here because we can't do it
after we've created the new user namespace. */
if (setgroups(0, 0) == -1) {
if (errno != EPERM)
throw SysError("setgroups failed");
if (settings.requireDropSupplementaryGroups)
throw Error("setgroups failed. Set the require-drop-supplementary-groups option to false to skip this step.");
}
ProcessOptions options;
options.cloneFlags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_PARENT | SIGCHLD;
if (privateNetwork)
options.cloneFlags |= CLONE_NEWNET;
if (usingUserNamespace)
options.cloneFlags |= CLONE_NEWUSER;
pid_t child = startProcess([&]() { runChild(); }, options).release();
writeFull(sendPid.writeSide.get(), fmt("%d\n", child));
_exit(0);
});
sendPid.writeSide.close();
if (helper.wait() != 0)
throw Error("unable to start build process");
userNamespaceSync.readSide.reset();
/* Close the write side to prevent runChild() from hanging
reading from this. */
Finally cleanup([&]() {
userNamespaceSync.writeSide.reset();
});
auto ss = tokenizeString<std::vector<std::string>>(readLine(sendPid.readSide.get()));
assert(ss.size() == 1);
Pid pid = Pid{string2Int<pid_t>(ss[0]).value()};
if (usingUserNamespace) {
/* Set the UID/GID mapping of the builder's user namespace
such that the sandbox user maps to the build user, or to
the calling user (if build users are disabled). */
uid_t hostUid = buildUser ? buildUser->getUID() : getuid();
uid_t hostGid = buildUser ? buildUser->getGID() : getgid();
uid_t nrIds = buildUser ? buildUser->getUIDCount() : 1;
writeFile("/proc/" + std::to_string(pid.get()) + "/uid_map",
fmt("%d %d %d", sandboxUid(), hostUid, nrIds));
if (!buildUser || buildUser->getUIDCount() == 1)
writeFile("/proc/" + std::to_string(pid.get()) + "/setgroups", "deny");
writeFile("/proc/" + std::to_string(pid.get()) + "/gid_map",
fmt("%d %d %d", sandboxGid(), hostGid, nrIds));
} else {
debug("note: not using a user namespace");
}
/* Now that we now the sandbox uid, we can write
/etc/passwd. */
writeFile(chrootRootDir + "/etc/passwd", fmt(
"root:x:0:0:Nix build user:%3%:/noshell\n"
"nixbld:x:%1%:%2%:Nix build user:%3%:/noshell\n"
"nobody:x:65534:65534:Nobody:/:/noshell\n",
sandboxUid(), sandboxGid(), settings.sandboxBuildDir));
/* Declare the build user's group so that programs get a consistent
view of the system (e.g., "id -gn"). */
writeFile(chrootRootDir + "/etc/group",
fmt("root:x:0:\n"
"nixbld:!:%1%:\n"
"nogroup:x:65534:\n", sandboxGid()));
/* Save the mount- and user namespace of the child. We have to do this
*before* the child does a chroot. */
sandboxMountNamespace = AutoCloseFD{open(fmt("/proc/%d/ns/mnt", pid.get()).c_str(), O_RDONLY)};
if (sandboxMountNamespace.get() == -1)
throw SysError("getting sandbox mount namespace");
if (usingUserNamespace) {
sandboxUserNamespace = AutoCloseFD{open(fmt("/proc/%d/ns/user", pid.get()).c_str(), O_RDONLY)};
if (sandboxUserNamespace.get() == -1)
throw SysError("getting sandbox user namespace");
}
/* Move the child into its own cgroup. */
if (cgroup)
writeFile(*cgroup + "/cgroup.procs", fmt("%d", pid.get()));
/* Signal the builder that we've updated its user namespace. */
writeFull(userNamespaceSync.writeSide.get(), "1");
return pid;
}
void LinuxLocalDerivationGoal::killSandbox(bool getStats)
{
if (cgroup) {

View file

@ -47,6 +47,12 @@ private:
*/
void prepareSandbox() override;
/**
* Start child process in new namespaces and cgroup,
* create /etc/passwd and /etc/group based on discovered uid/gid
*/
Pid startChild(std::function<void()> openSlave) override;
/**
* Kill all processes by build user, possibly using a reused
* cgroup if we have one