From 128910ba23f586ba1765a137ecff23cfd22cff89 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 18 Nov 2022 10:39:28 +0100 Subject: [PATCH] Separate cgroup support from auto-uid-allocation The new experimental feature 'cgroups' enables the use of cgroups for all builds. This allows better containment and enables setting resource limits and getting some build stats. --- src/libstore/build/local-derivation-goal.cc | 114 ++++++++++++++------ src/libstore/build/local-derivation-goal.hh | 7 ++ src/libstore/lock.cc | 71 ------------ src/libstore/lock.hh | 7 -- src/libutil/experimental-features.cc | 1 + src/libutil/experimental-features.hh | 1 + 6 files changed, 93 insertions(+), 108 deletions(-) diff --git a/src/libstore/build/local-derivation-goal.cc b/src/libstore/build/local-derivation-goal.cc index e652c425c..2d1e093ca 100644 --- a/src/libstore/build/local-derivation-goal.cc +++ b/src/libstore/build/local-derivation-goal.cc @@ -15,6 +15,7 @@ #include "topo-sort.hh" #include "callback.hh" #include "json-utils.hh" +#include "cgroup.hh" #include #include @@ -129,26 +130,36 @@ void LocalDerivationGoal::killChild() if (pid != -1) { worker.childTerminated(this); - if (buildUser) { - /* If we're using a build user, then there is a tricky - race condition: if we kill the build user before the - child has done its setuid() to the build user uid, then - it won't be killed, and we'll potentially lock up in - pid.wait(). So also send a conventional kill to the - child. */ - ::kill(-pid, SIGKILL); /* ignore the result */ - buildUser->kill(); - pid.wait(); - } else - pid.kill(); + /* If we're using a build user, then there is a tricky race + condition: if we kill the build user before the child has + done its setuid() to the build user uid, then it won't be + killed, and we'll potentially lock up in pid.wait(). So + also send a conventional kill to the child. */ + ::kill(-pid, SIGKILL); /* ignore the result */ - assert(pid == -1); + killSandbox(); + + pid.wait(); } DerivationGoal::killChild(); } +void LocalDerivationGoal::killSandbox() +{ + if (cgroup) { + destroyCgroup(*cgroup); + } + + else if (buildUser) { + auto uid = buildUser->getUID(); + assert(uid != 0); + killUser(uid); + } +} + + void LocalDerivationGoal::tryLocalBuild() { unsigned int curBuilds = worker.getNrLocalBuilds(); if (curBuilds >= settings.maxBuildJobs) { @@ -169,10 +180,6 @@ void LocalDerivationGoal::tryLocalBuild() { worker.waitForAWhile(shared_from_this()); return; } - - /* Make sure that no other processes are executing under this - uid. */ - buildUser->kill(); } actLock.reset(); @@ -263,7 +270,7 @@ void LocalDerivationGoal::cleanupPostChildKill() malicious user from leaving behind a process that keeps files open and modifies them after they have been chown'ed to root. */ - if (buildUser) buildUser->kill(); + killSandbox(); /* Terminate the recursive Nix daemon. */ stopDaemon(); @@ -356,6 +363,55 @@ static void linkOrCopy(const Path & from, const Path & to) void LocalDerivationGoal::startBuilder() { + if ((buildUser && buildUser->getUIDCount() != 1) + || settings.isExperimentalFeatureEnabled(Xp::Cgroups)) + { + #if __linux__ + auto ourCgroups = getCgroups("/proc/self/cgroup"); + auto ourCgroup = ourCgroups[""]; + if (ourCgroup == "") + throw Error("cannot determine cgroup name from /proc/self/cgroup"); + + auto ourCgroupPath = canonPath("/sys/fs/cgroup/" + ourCgroup); + + if (!pathExists(ourCgroupPath)) + throw Error("expected cgroup directory '%s'", ourCgroupPath); + + static std::atomic counter{0}; + + cgroup = buildUser + ? fmt("%s/nix-build-uid-%d", ourCgroupPath, buildUser->getUID()) + : fmt("%s/nix-build-pid-%d-%d", ourCgroupPath, getpid(), counter++); + + debug("using cgroup '%s'", *cgroup); + + /* When using a build user, record the cgroup we used for that + user so that if we got interrupted previously, we can kill + any left-over cgroup first. */ + if (buildUser) { + auto cgroupsDir = settings.nixStateDir + "/cgroups"; + createDirs(cgroupsDir); + + auto cgroupFile = fmt("%s/%d", cgroupsDir, buildUser->getUID()); + + if (pathExists(cgroupFile)) { + auto prevCgroup = readFile(cgroupFile); + destroyCgroup(prevCgroup); + } + + writeFile(cgroupFile, *cgroup); + } + + #else + throw Error("cgroups are not supported on this platform"); + #endif + } + + /* Make sure that no other processes are executing under the + sandbox uids. This must be done before any chownToBuilder() + calls. */ + killSandbox(); + /* Right platform? */ if (!parsedDrv->canBuildLocally(worker.store)) throw Error("a '%s' with features {%s} is required to build '%s', but I am a '%s' with features {%s}", @@ -646,13 +702,13 @@ void LocalDerivationGoal::startBuilder() dirsInChroot.erase(worker.store.printStorePath(*i.second.second)); } - if (buildUser) { - if (auto cgroup = buildUser->getCgroup()) { - chownToBuilder(*cgroup); - chownToBuilder(*cgroup + "/cgroup.procs"); - chownToBuilder(*cgroup + "/cgroup.threads"); - //chownToBuilder(*cgroup + "/cgroup.subtree_control"); - } + if (cgroup) { + if (mkdir(cgroup->c_str(), 0755) != 0) + throw SysError("creating cgroup '%s'", *cgroup); + chownToBuilder(*cgroup); + chownToBuilder(*cgroup + "/cgroup.procs"); + chownToBuilder(*cgroup + "/cgroup.threads"); + //chownToBuilder(*cgroup + "/cgroup.subtree_control"); } #else @@ -965,10 +1021,8 @@ void LocalDerivationGoal::startBuilder() } /* Move the child into its own cgroup. */ - if (buildUser) { - if (auto cgroup = buildUser->getCgroup()) - writeFile(*cgroup + "/cgroup.procs", fmt("%d", (pid_t) pid)); - } + if (cgroup) + writeFile(*cgroup + "/cgroup.procs", fmt("%d", (pid_t) pid)); /* Signal the builder that we've updated its user namespace. */ writeFull(userNamespaceSync.writeSide.get(), "1"); @@ -1838,7 +1892,7 @@ void LocalDerivationGoal::runChild() /* Unshare the cgroup namespace. This means /proc/self/cgroup will show the child's cgroup as '/' rather than whatever it is in the parent. */ - if (buildUser && buildUser->getUIDCount() != 1 && unshare(CLONE_NEWCGROUP) == -1) + if (cgroup && unshare(CLONE_NEWCGROUP) == -1) throw SysError("unsharing cgroup namespace"); /* Do the chroot(). */ diff --git a/src/libstore/build/local-derivation-goal.hh b/src/libstore/build/local-derivation-goal.hh index f92280aa1..1ec6b3649 100644 --- a/src/libstore/build/local-derivation-goal.hh +++ b/src/libstore/build/local-derivation-goal.hh @@ -15,6 +15,9 @@ struct LocalDerivationGoal : public DerivationGoal /* The process ID of the builder. */ Pid pid; + /* The cgroup of the builder, if any. */ + std::optional cgroup; + /* The temporary directory. */ Path tmpDir; @@ -197,6 +200,10 @@ struct LocalDerivationGoal : public DerivationGoal /* Forcibly kill the child process, if any. */ void killChild() override; + /* Kill any processes running under the build user UID or in the + cgroup of the build. */ + void killSandbox(); + /* Create alternative path calculated from but distinct from the input, so we can avoid overwriting outputs (or other store paths) that already exist. */ diff --git a/src/libstore/lock.cc b/src/libstore/lock.cc index f9892bb91..4fad3bfd2 100644 --- a/src/libstore/lock.cc +++ b/src/libstore/lock.cc @@ -1,7 +1,6 @@ #include "lock.hh" #include "globals.hh" #include "pathlocks.hh" -#include "cgroup.hh" #include #include @@ -15,11 +14,6 @@ struct SimpleUserLock : UserLock gid_t gid; std::vector supplementaryGIDs; - void kill() override - { - killUser(uid); - } - uid_t getUID() override { assert(uid); return uid; } uid_t getUIDCount() override { return 1; } gid_t getGID() override { assert(gid); return gid; } @@ -116,32 +110,6 @@ struct AutoUserLock : UserLock AutoCloseFD fdUserLock; uid_t firstUid = 0; uid_t nrIds = 1; - #if __linux__ - std::optional cgroup; - #endif - - ~AutoUserLock() - { - #if __linux__ - // Get rid of our cgroup, ignoring errors. - if (cgroup) rmdir(cgroup->c_str()); - #endif - } - - void kill() override - { - #if __linux__ - if (cgroup) { - destroyCgroup(*cgroup); - if (mkdir(cgroup->c_str(), 0755) == -1) - throw SysError("creating cgroup '%s'", *cgroup); - } else - #endif - { - assert(firstUid); - killUser(firstUid); - } - } uid_t getUID() override { assert(firstUid); return firstUid; } @@ -183,55 +151,16 @@ struct AutoUserLock : UserLock throw SysError("opening user lock '%s'", fnUserLock); if (lockFile(fd.get(), ltWrite, false)) { - auto s = drainFD(fd.get()); - - #if __linux__ - if (s != "") { - /* Kill the old cgroup, to ensure there are no - processes left over from an interrupted build. */ - destroyCgroup(s); - } - #endif - - if (ftruncate(fd.get(), 0) == -1) - throw Error("truncating user lock"); - auto lock = std::make_unique(); lock->fdUserLock = std::move(fd); lock->firstUid = settings.startId + i * maxIdsPerBuild; lock->nrIds = nrIds; - - #if __linux__ - if (nrIds > 1) { - auto ourCgroups = getCgroups("/proc/self/cgroup"); - auto ourCgroup = ourCgroups[""]; - if (ourCgroup == "") - throw Error("cannot determine cgroup name from /proc/self/cgroup"); - - auto ourCgroupPath = canonPath("/sys/fs/cgroup/" + ourCgroup); - - if (!pathExists(ourCgroupPath)) - throw Error("expected cgroup directory '%s'", ourCgroupPath); - - lock->cgroup = fmt("%s/nix-build-%d", ourCgroupPath, lock->firstUid); - - /* Record the cgroup in the lock file. This ensures that - if we subsequently get executed under a different parent - cgroup, we kill the previous cgroup first. */ - writeFull(lock->fdUserLock.get(), *lock->cgroup); - } - #endif - return lock; } } return nullptr; } - - #if __linux__ - std::optional getCgroup() override { return cgroup; } - #endif }; std::unique_ptr acquireUserLock(uid_t nrIds) diff --git a/src/libstore/lock.hh b/src/libstore/lock.hh index b5536408c..e7ceefab8 100644 --- a/src/libstore/lock.hh +++ b/src/libstore/lock.hh @@ -27,13 +27,6 @@ struct UserLock virtual gid_t getGID() = 0; virtual std::vector getSupplementaryGIDs() = 0; - - /* Kill any processes currently executing as this user. */ - virtual void kill() = 0; - - #if __linux__ - virtual std::optional getCgroup() { return {}; }; - #endif }; /* Acquire a user lock for a UID range of size `nrIds`. Note that this diff --git a/src/libutil/experimental-features.cc b/src/libutil/experimental-features.cc index 0f05f3752..e0902971e 100644 --- a/src/libutil/experimental-features.cc +++ b/src/libutil/experimental-features.cc @@ -15,6 +15,7 @@ std::map stringifiedXpFeatures = { { Xp::FetchClosure, "fetch-closure" }, { Xp::ReplFlake, "repl-flake" }, { Xp::AutoAllocateUids, "auto-allocate-uids" }, + { Xp::Cgroups, "cgroups" }, }; const std::optional parseExperimentalFeature(const std::string_view & name) diff --git a/src/libutil/experimental-features.hh b/src/libutil/experimental-features.hh index cf0c06eac..af775feb0 100644 --- a/src/libutil/experimental-features.hh +++ b/src/libutil/experimental-features.hh @@ -24,6 +24,7 @@ enum struct ExperimentalFeature FetchClosure, ReplFlake, AutoAllocateUids, + Cgroups, }; /**