Add "uid-range" and "systemd-cgroup" system features

"uid-range" provides 65536 UIDs to a build and runs the build as root
in its user namespace. "systemd-cgroup" allows the build to mount the
systemd cgroup controller (needed for running systemd-nspawn and NixOS
containers).

Also add a configuration option "auto-allocate-uids", which is needed
to enable these features, together with some experimental feature gates.

So to enable support for containers you need the following in
nix.conf:

  experimental-features = auto-allocate-uids systemd-cgroup
  auto-allocate-uids = true
  system-features = uid-range systemd-cgroup
This commit is contained in:
Eelco Dolstra 2020-05-19 23:25:44 +02:00
parent 570c443f56
commit ba50c3efa3
5 changed files with 340 additions and 205 deletions

View file

@ -16,7 +16,7 @@
#include "machines.hh"
#include "daemon.hh"
#include "worker-protocol.hh"
#include "cgroup.hh"
#include "user-lock.hh"
#include <algorithm>
#include <iostream>
@ -504,154 +504,6 @@ void handleDiffHook(
}
}
//////////////////////////////////////////////////////////////////////
/* Holds the lock on the UID/GID slot that has been picked for a build,
   together with the IDs that belong to that slot. */
class UserLock
{
private:
    /* Path of the slot's lock file, and the descriptor on which the
       lock is held. */
    Path fnUserLock;
    AutoCloseFD fdUserLock;

    bool isEnabled = false;

    /* First UID/GID of the slot; 0 means "not assigned yet". */
    uid_t uid = 0;
    gid_t gid = 0;

    std::vector<gid_t> supplementaryGIDs;

public:
    UserLock();

    void kill();

    /* Accessors for the assigned IDs. Both assert that an ID has
       actually been assigned (i.e. is non-zero). */
    uid_t getUID()
    {
        assert(uid);
        return uid;
    }

    gid_t getGID()
    {
        assert(gid);
        return gid;
    }

    /* Number of consecutive IDs handed to one build. */
    uint32_t getIDCount()
    {
        return settings.idsPerBuild;
    }

    std::vector<gid_t> getSupplementaryGIDs()
    {
        return supplementaryGIDs;
    }

    /* Try to lock a free slot; returns false if none is available. */
    bool findFreeUser();

    bool enabled()
    {
        return isEnabled;
    }
};
/* Construct an empty lock. The former set-up (checking that
   'build-users-group' is set and creating the 'userpool' directory)
   is compiled out with '#if 0', so this constructor currently does
   nothing. */
UserLock::UserLock()
{
#if 0
assert(settings.buildUsersGroup != "");
createDirs(settings.nixStateDir + "/userpool");
#endif
}
bool UserLock::findFreeUser() {
if (enabled()) return true;
#if 0
/* Get the members of the build-users-group. */
struct group * gr = getgrnam(settings.buildUsersGroup.get().c_str());
if (!gr)
throw Error("the group '%1%' specified in 'build-users-group' does not exist",
settings.buildUsersGroup);
gid = gr->gr_gid;
/* Copy the result of getgrnam. */
Strings users;
for (char * * p = gr->gr_mem; *p; ++p) {
debug("found build user '%1%'", *p);
users.push_back(*p);
}
if (users.empty())
throw Error("the build users group '%1%' has no members",
settings.buildUsersGroup);
/* Find a user account that isn't currently in use for another
build. */
for (auto & i : users) {
debug("trying user '%1%'", i);
struct passwd * pw = getpwnam(i.c_str());
if (!pw)
throw Error("the user '%1%' in the group '%2%' does not exist",
i, settings.buildUsersGroup);
fnUserLock = (format("%1%/userpool/%2%") % settings.nixStateDir % pw->pw_uid).str();
AutoCloseFD fd = open(fnUserLock.c_str(), O_RDWR | O_CREAT | O_CLOEXEC, 0600);
if (!fd)
throw SysError("opening user lock '%1%'", fnUserLock);
if (lockFile(fd.get(), ltWrite, false)) {
fdUserLock = std::move(fd);
user = i;
uid = pw->pw_uid;
/* Sanity check... */
if (uid == getuid() || uid == geteuid())
throw Error("the Nix user should not be a member of '%1%'",
settings.buildUsersGroup);
#if __linux__
/* Get the list of supplementary groups of this build user. This
is usually either empty or contains a group such as "kvm". */
supplementaryGIDs.resize(10);
int ngroups = supplementaryGIDs.size();
int err = getgrouplist(pw->pw_name, pw->pw_gid,
supplementaryGIDs.data(), &ngroups);
if (err == -1)
throw Error("failed to get list of supplementary groups for '%1%'", pw->pw_name);
supplementaryGIDs.resize(ngroups);
#endif
isEnabled = true;
return true;
}
}
return false;
#endif
assert(settings.startId > 0);
assert(settings.startId % settings.idsPerBuild == 0);
assert(settings.uidCount % settings.idsPerBuild == 0);
assert((uint64_t) settings.startId + (uint64_t) settings.uidCount <= std::numeric_limits<uid_t>::max());
// FIXME: check whether the id range overlaps any known users
size_t nrSlots = settings.uidCount / settings.idsPerBuild;
for (size_t i = 0; i < nrSlots; i++) {
debug("trying user slot '%d'", i);
createDirs(settings.nixStateDir + "/userpool");
fnUserLock = fmt("%s/userpool/slot-%d", settings.nixStateDir, i);
AutoCloseFD fd = open(fnUserLock.c_str(), O_RDWR | O_CREAT | O_CLOEXEC, 0600);
if (!fd)
throw SysError("opening user lock '%1%'", fnUserLock);
if (lockFile(fd.get(), ltWrite, false)) {
fdUserLock = std::move(fd);
uid = settings.startId + i * settings.idsPerBuild;
gid = settings.startId + i * settings.idsPerBuild;
return true;
}
}
return false;
}
/* Meant to kill the processes still running under this lock's UID.
   The only statement, killUser(uid), is compiled out with '#if 0', so
   this is currently a no-op. */
void UserLock::kill()
{
// FIXME: use a cgroup to kill all processes in the build?
#if 0
killUser(uid);
#endif
}
//////////////////////////////////////////////////////////////////////
@ -840,6 +692,13 @@ private:
Path chrootRootDir;
/* Whether to give the build more than 1 UID. */
bool useUidRange = false;
/* Whether to make the 'systemd' cgroup controller available to
the build. */
bool useSystemdCgroup = false;
/* RAII object to delete the chroot directory. */
std::shared_ptr<AutoDelete> autoDelChroot;
@ -896,8 +755,8 @@ private:
result. */
std::map<Path, ValidPathInfo> prevInfos;
const uid_t sandboxUid = 1000;
const gid_t sandboxGid = 100;
uid_t sandboxUid = -1;
gid_t sandboxGid = -1;
const static Path homeDir;
@ -1445,6 +1304,7 @@ void DerivationGoal::inputsRealised()
result = BuildResult();
}
void DerivationGoal::started() {
auto msg = fmt(
buildMode == bmRepair ? "repairing outputs of '%s'" :
@ -1459,6 +1319,7 @@ void DerivationGoal::started() {
worker.updateProgress();
}
void DerivationGoal::tryToBuild()
{
trace("trying to build");
@ -1556,25 +1417,28 @@ void DerivationGoal::tryToBuild()
worker.wakeUp(shared_from_this());
}
void DerivationGoal::tryLocalBuild() {
/* If `build-users-group' is not empty, then we have to build as
one of the members of that group. */
if ((settings.buildUsersGroup != "" || settings.startId.get() != 0) && getuid() == 0) {
static bool useBuildUsers = (settings.buildUsersGroup != "" || settings.startId.get() != 0) && getuid() == 0;
if (useBuildUsers) {
#if defined(__linux__) || defined(__APPLE__)
if (!buildUser) buildUser = std::make_unique<UserLock>();
if (!buildUser)
buildUser = acquireUserLock();
if (buildUser->findFreeUser()) {
/* Make sure that no other processes are executing under this
uid. */
buildUser->kill();
} else {
if (!buildUser) {
if (!actLock)
actLock = std::make_unique<Activity>(*logger, lvlWarn, actBuildWaiting,
fmt("waiting for UID to build '%s'", yellowtxt(worker.store.printStorePath(drvPath))));
worker.waitForAWhile(shared_from_this());
return;
}
/* Make sure that no other processes are executing under this
uid. */
buildUser->kill();
#else
/* Don't know how to block the creation of setuid/setgid
binaries on this platform. */
@ -2087,6 +1951,9 @@ void DerivationGoal::startBuilder()
}
}
useUidRange = parsedDrv->getRequiredSystemFeatures().count("uid-range");
useSystemdCgroup = parsedDrv->getRequiredSystemFeatures().count("systemd-cgroup");
if (useChroot) {
/* Allow a user-configurable set of directories from the
@ -2166,7 +2033,7 @@ void DerivationGoal::startBuilder()
printMsg(lvlChatty, format("setting up chroot environment in '%1%'") % chrootRootDir);
if (mkdir(chrootRootDir.c_str(), 0755) == -1)
if (mkdir(chrootRootDir.c_str(), useUidRange ? 0755 : 0750) == -1)
throw SysError("cannot create '%1%'", chrootRootDir);
// FIXME: only make root writable for user namespace builds.
@ -2186,6 +2053,12 @@ void DerivationGoal::startBuilder()
createDirs(chrootRootDir + "/etc");
chownToBuilder(chrootRootDir + "/etc");
if (useUidRange && (!buildUser || buildUser->getUIDCount() < 65536))
throw Error("feature 'uid-range' requires '%s' to be enabled", settings.autoAllocateUids.name);
sandboxUid = useUidRange ? 0 : 1000;
sandboxGid = useUidRange ? 0 : 100;
writeFile(chrootRootDir + "/etc/passwd", fmt(
"root:x:0:0:Nix build user:%3%:/noshell\n"
"nixbld:x:%1%:%2%:Nix build user:%3%:/noshell\n"
@ -2238,12 +2111,32 @@ void DerivationGoal::startBuilder()
for (auto & i : drv->outputs)
dirsInChroot.erase(worker.store.printStorePath(i.second.path));
#elif __APPLE__
if (useSystemdCgroup) {
settings.requireExperimentalFeature("systemd-cgroup");
std::optional<Path> cgroup;
if (!buildUser || !(cgroup = buildUser->getCgroup()))
throw Error("feature 'systemd-cgroup' requires 'auto-allocate-uids = true' in nix.conf");
chownToBuilder(*cgroup);
chownToBuilder(*cgroup + "/cgroup.procs");
}
#else
if (useUidRange)
throw Error("feature 'uid-range' is not supported on this platform");
if (useSystemdCgroup)
throw Error("feature 'systemd-cgroup' is not supported on this platform");
#if __APPLE__
/* We don't really have any parent prep work to do (yet?)
All work happens in the child, instead. */
#else
throw Error("sandboxing builds is not supported on this platform");
#endif
#endif
} else {
if (useUidRange)
throw Error("feature 'uid-range' is only supported in sandboxed builds");
if (useSystemdCgroup)
throw Error("feature 'systemd-cgroup' is only supported in sandboxed builds");
}
if (needsHashRewrite()) {
@ -2375,31 +2268,6 @@ void DerivationGoal::startBuilder()
#if __linux__
if (useChroot) {
/* Create a systemd cgroup since that's the minimum required
by systemd-nspawn. */
// FIXME: do we want to use the parent cgroup? We should
// always use the same cgroup regardless of whether we're the
// daemon or run from a user session via sudo.
auto ourCgroups = getCgroups("/proc/self/cgroup");
auto systemdCgroup = ourCgroups["systemd"];
if (systemdCgroup == "")
throw Error("'systemd' cgroup does not exist");
auto hostCgroup = canonPath("/sys/fs/cgroup/systemd/" + systemdCgroup);
if (!pathExists(hostCgroup))
throw Error("expected cgroup directory '%s'", hostCgroup);
auto childCgroup = fmt("%s/nix-%d", hostCgroup, buildUser->getUID());
destroyCgroup(childCgroup);
if (mkdir(childCgroup.c_str(), 0755) == -1)
throw SysError("creating cgroup '%s'", childCgroup);
chownToBuilder(childCgroup);
chownToBuilder(childCgroup + "/cgroup.procs");
/* Set up private namespaces for the build:
- The PID namespace causes the build to start as PID 1.
@ -2508,15 +2376,16 @@ void DerivationGoal::startBuilder()
the calling user (if build users are disabled). */
uid_t hostUid = buildUser ? buildUser->getUID() : getuid();
uid_t hostGid = buildUser ? buildUser->getGID() : getgid();
uint32_t nrIds = settings.idsPerBuild; // FIXME
uint32_t nrIds = buildUser && useUidRange ? buildUser->getUIDCount() : 1;
writeFile("/proc/" + std::to_string(pid) + "/uid_map",
fmt("%d %d %d", /* sandboxUid */ 0, hostUid, nrIds));
fmt("%d %d %d", sandboxUid, hostUid, nrIds));
//writeFile("/proc/" + std::to_string(pid) + "/setgroups", "deny");
if (!useUidRange)
writeFile("/proc/" + std::to_string(pid) + "/setgroups", "deny");
writeFile("/proc/" + std::to_string(pid) + "/gid_map",
fmt("%d %d %d", /* sandboxGid */ 0, hostGid, nrIds));
fmt("%d %d %d", sandboxGid, hostGid, nrIds));
/* Save the mount namespace of the child. We have to do this
*before* the child does a chroot. */
@ -2525,7 +2394,10 @@ void DerivationGoal::startBuilder()
throw SysError("getting sandbox mount namespace");
/* Move the child into its own cgroup. */
writeFile(childCgroup + "/cgroup.procs", fmt("%d", (pid_t) pid));
if (buildUser) {
if (auto cgroup = buildUser->getCgroup())
writeFile(*cgroup + "/cgroup.procs", fmt("%d", (pid_t) pid));
}
/* Signal the builder that we've updated its user namespace. */
writeFull(userNamespaceSync.writeSide.get(), "1");
@ -3361,7 +3233,7 @@ void DerivationGoal::runChild()
/* Unshare the cgroup namespace. This means
/proc/self/cgroup will show the child's cgroup as '/'
rather than whatever it is in the parent. */
if (unshare(CLONE_NEWCGROUP) == -1)
if (useSystemdCgroup && unshare(CLONE_NEWCGROUP) == -1)
throw SysError("unsharing cgroup namespace");
/* Do the chroot(). */
@ -3386,16 +3258,10 @@ void DerivationGoal::runChild()
/* Switch to the sandbox uid/gid in the user namespace,
which corresponds to the build user or calling user in
the parent namespace. */
#if 0
if (setgid(sandboxGid) == -1)
throw SysError("setgid failed");
if (setuid(sandboxUid) == -1)
throw SysError("setuid failed");
#endif
if (setgid(0) == -1)
throw SysError("setgid failed");
if (setuid(0) == -1)
throw SysError("setuid failed");
setUser = false;
}
@ -3789,7 +3655,7 @@ void DerivationGoal::registerOutputs()
something like that. */
canonicalisePathMetaData(
actualPath,
buildUser ? std::optional(std::make_pair(buildUser->getUID(), buildUser->getUID() + buildUser->getIDCount() - 1)) : std::nullopt,
buildUser ? std::optional(buildUser->getUIDRange()) : std::nullopt,
inodesSeen);
/* FIXME: this is in-memory. */
@ -3866,7 +3732,7 @@ void DerivationGoal::registerOutputs()
all files are owned by the build user, if applicable. */
canonicalisePathMetaData(actualPath,
buildUser && !rewritten
? std::optional(std::make_pair(buildUser->getUID(), buildUser->getUID() + buildUser->getIDCount() - 1))
? std::optional(buildUser->getUIDRange())
: std::nullopt,
inodesSeen);

View file

@ -4,6 +4,7 @@
#include "util.hh"
#include <chrono>
#include <unordered_set>
#include <dirent.h>
@ -19,7 +20,7 @@ std::map<std::string, std::string> getCgroups(const Path & cgroupFile)
if (!std::regex_match(line, match, regex))
throw Error("invalid line '%s' in '%s'", line, cgroupFile);
std::string name = hasPrefix(match[2], "name=") ? std::string(match[2], 5) : match[2];
std::string name = hasPrefix(std::string(match[2]), "name=") ? std::string(match[2], 5) : match[2];
cgroups.insert_or_assign(name, match[3]);
}
@ -28,6 +29,8 @@ std::map<std::string, std::string> getCgroups(const Path & cgroupFile)
void destroyCgroup(const Path & cgroup)
{
if (!pathExists(cgroup)) return;
for (auto & entry : readDirectory(cgroup)) {
if (entry.type != DT_DIR) continue;
destroyCgroup(cgroup + "/" + entry.name);
@ -35,6 +38,8 @@ void destroyCgroup(const Path & cgroup)
int round = 1;
std::unordered_set<pid_t> pidsShown;
while (true) {
auto pids = tokenizeString<std::vector<std::string>>(readFile(cgroup + "/cgroup.procs"));
@ -46,12 +51,22 @@ void destroyCgroup(const Path & cgroup)
for (auto & pid_s : pids) {
pid_t pid;
if (!string2Int(pid_s, pid)) throw Error("invalid pid '%s'", pid);
if (pidsShown.insert(pid).second) {
try {
auto cmdline = readFile(fmt("/proc/%d/cmdline", pid));
using namespace std::string_literals;
warn("killing stray builder process %d (%s)...",
pid, trim(replaceStrings(cmdline, "\0"s, " ")));
} catch (SysError &) {
}
}
// FIXME: pid wraparound
if (kill(pid, SIGKILL) == -1 && errno != ESRCH)
throw SysError("killing member %d of cgroup '%s'", pid, cgroup);
}
auto sleep = std::chrono::milliseconds((int) std::pow(2.0, std::min(round, 10)));
if (sleep.count() > 100)
printError("waiting for %d ms for cgroup '%s' to become empty", sleep.count(), cgroup);
std::this_thread::sleep_for(sleep);
round++;

View file

@ -149,10 +149,13 @@ public:
"The Unix group that contains the build users."};
#if __linux__
Setting<bool> autoAllocateUids{this, false, "auto-allocate-uids",
"Whether to allocate UIDs for builders automatically."};
const uint32_t idsPerBuild = 1 << 16;
Setting<uint32_t> startId{this, 872415232, "start-id",
"The first UID and GID to use for dynamic ID allocation. (0 means disable.)"};
"The first UID and GID to use for dynamic ID allocation."};
Setting<uint32_t> uidCount{this, idsPerBuild * 128, "id-count",
"The number of UIDs/GIDs to use for dynamic ID allocation."};

212
src/libstore/user-lock.cc Normal file
View file

@ -0,0 +1,212 @@
#include "user-lock.hh"
#include "globals.hh"
#include "pathlocks.hh"
#include "cgroup.hh"
namespace nix {
/* A lock on a single, pre-existing account that is a member of the
   group named by 'build-users-group'. The lock is a write lock on
   '<nixStateDir>/userpool/<uid>' and is held for as long as this object
   (and hence 'fdUserLock') lives. */
struct SimpleUserLock : UserLock
{
    /* Descriptor on which the lock on the user's lock file is held. */
    AutoCloseFD fdUserLock;

    /* IDs of the locked build user; filled in by acquire(). */
    uid_t uid = 0;
    gid_t gid = 0;
    std::vector<gid_t> supplementaryGIDs;

    void kill() override
    {
        killUser(uid);
    }

    /* A simple lock covers exactly one UID. */
    std::pair<uid_t, uid_t> getUIDRange() override
    {
        assert(uid);
        return {uid, uid};
    }

    gid_t getGID() override { assert(gid); return gid; }

    std::vector<gid_t> getSupplementaryGIDs() override { return supplementaryGIDs; }

    /* Lock the first member of 'build-users-group' that is not in use
       by another build.

       Returns nullptr if every member is currently locked. Throws Error
       if the group or one of its members does not exist, if the group
       has no members, if the calling user is itself a member, or if the
       supplementary groups cannot be determined; throws SysError if a
       lock file cannot be opened. */
    static std::unique_ptr<UserLock> acquire()
    {
        assert(settings.buildUsersGroup != "");
        createDirs(settings.nixStateDir + "/userpool");

        /* Get the members of the build-users-group. */
        struct group * gr = getgrnam(settings.buildUsersGroup.get().c_str());
        if (!gr)
            throw Error("the group '%s' specified in 'build-users-group' does not exist", settings.buildUsersGroup);

        /* Copy everything we need out of the result of getgrnam right
           away: it may point to static storage that later database
           lookups are allowed to overwrite. */
        const gid_t groupGid = gr->gr_gid;

        Strings users;
        for (char * * p = gr->gr_mem; *p; ++p) {
            debug("found build user '%s'", *p);
            users.push_back(*p);
        }

        if (users.empty())
            throw Error("the build users group '%s' has no members", settings.buildUsersGroup);

        /* Find a user account that isn't currently in use for another
           build. */
        for (auto & i : users) {
            debug("trying user '%s'", i);

            struct passwd * pw = getpwnam(i.c_str());
            if (!pw)
                throw Error("the user '%s' in the group '%s' does not exist", i, settings.buildUsersGroup);

            auto fnUserLock = fmt("%s/userpool/%s", settings.nixStateDir,pw->pw_uid);

            AutoCloseFD fd = open(fnUserLock.c_str(), O_RDWR | O_CREAT | O_CLOEXEC, 0600);
            if (!fd)
                throw SysError("opening user lock '%s'", fnUserLock);

            /* Non-blocking attempt: a user that is in use is skipped. */
            if (lockFile(fd.get(), ltWrite, false)) {
                auto lock = std::make_unique<SimpleUserLock>();

                lock->fdUserLock = std::move(fd);
                lock->uid = pw->pw_uid;
                lock->gid = groupGid;

                /* Sanity check... */
                if (lock->uid == getuid() || lock->uid == geteuid())
                    throw Error("the Nix user should not be a member of '%s'", settings.buildUsersGroup);

                #if __linux__
                /* Get the list of supplementary groups of this build
                   user. This is usually either empty or contains a
                   group such as "kvm". */
                lock->supplementaryGIDs.resize(10);
                int ngroups = lock->supplementaryGIDs.size();
                int err = getgrouplist(pw->pw_name, pw->pw_gid,
                    lock->supplementaryGIDs.data(), &ngroups);

                /* The initial size is only a guess. If it was too
                   small, getgrouplist() has stored the required size
                   in 'ngroups', so retry once with a buffer of that
                   size instead of failing. */
                if (err == -1 && ngroups > static_cast<int>(lock->supplementaryGIDs.size())) {
                    lock->supplementaryGIDs.resize(ngroups);
                    err = getgrouplist(pw->pw_name, pw->pw_gid,
                        lock->supplementaryGIDs.data(), &ngroups);
                }

                if (err == -1)
                    throw Error("failed to get list of supplementary groups for '%s'", pw->pw_name);

                lock->supplementaryGIDs.resize(ngroups);
                #endif

                return lock;
            }
        }

        return nullptr;
    }
};
#if __linux__
/* A lock on an automatically allocated slot of 'settings.idsPerBuild'
   consecutive UIDs/GIDs. The slot's lock file
   '<nixStateDir>/userpool2/slot-<n>' also stores the path of the cgroup
   created for the slot, so that whoever acquires the slot next can
   destroy a cgroup left behind by a previous owner. */
struct CgroupUserLock : UserLock
{
    /* Descriptor on which the lock on the slot's lock file is held. */
    AutoCloseFD fdUserLock;

    /* First UID (and GID) of the slot; filled in by acquire(). */
    uid_t uid = 0;

    /* The cgroup associated with this slot, if any. Either read back
       from the lock file by acquire() or created by getCgroup(). */
    std::optional<Path> cgroup;

    /* Destroy the slot's cgroup (if one is known) and forget about it. */
    void kill() override
    {
        if (cgroup) {
            destroyCgroup(*cgroup);
            cgroup.reset();
        }
    }

    std::pair<uid_t, uid_t> getUIDRange() override
    {
        assert(uid);
        return {uid, uid + settings.idsPerBuild - 1};
    }

    gid_t getGID() override
    {
        // We use the same GID ranges as for the UIDs.
        assert(uid);
        return uid;
    }

    std::vector<gid_t> getSupplementaryGIDs() override { return {}; } // FIXME

    /* Lock the first free slot in [startId, startId + uidCount).

       Requires the 'auto-allocate-uids' experimental feature. Returns
       nullptr if every slot is currently locked; throws SysError if a
       lock file cannot be opened. If the lock file names a cgroup from
       an earlier use of the slot, it is stored in 'cgroup' so that a
       following kill() destroys it. */
    static std::unique_ptr<UserLock> acquire()
    {
        settings.requireExperimentalFeature("auto-allocate-uids");

        /* The ID range must not start at 0, must be aligned to and a
           multiple of the slot size, and must fit into uid_t. */
        assert(settings.startId > 0);
        assert(settings.startId % settings.idsPerBuild == 0);
        assert(settings.uidCount % settings.idsPerBuild == 0);
        assert((uint64_t) settings.startId + (uint64_t) settings.uidCount <= std::numeric_limits<uid_t>::max());

        // FIXME: check whether the id range overlaps any known users

        /* Creating the pool directory once is enough; it does not
           depend on the slot. */
        createDirs(settings.nixStateDir + "/userpool2");

        size_t nrSlots = settings.uidCount / settings.idsPerBuild;

        for (size_t i = 0; i < nrSlots; i++) {
            debug("trying user slot '%d'", i);

            auto fnUserLock = fmt("%s/userpool2/slot-%d", settings.nixStateDir, i);

            AutoCloseFD fd = open(fnUserLock.c_str(), O_RDWR | O_CREAT | O_CLOEXEC, 0600);
            if (!fd)
                throw SysError("opening user lock '%s'", fnUserLock);

            /* Non-blocking attempt: a slot that is in use is skipped. */
            if (lockFile(fd.get(), ltWrite, false)) {
                auto lock = std::make_unique<CgroupUserLock>();
                lock->fdUserLock = std::move(fd);
                lock->uid = settings.startId + i * settings.idsPerBuild;

                /* Pick up the cgroup recorded by a previous owner. Note
                   that this leaves the file offset at end-of-file. */
                auto s = drainFD(lock->fdUserLock.get());
                if (s != "") lock->cgroup = s;

                return lock;
            }
        }

        return nullptr;
    }

    /* Return the slot's cgroup, creating 'nix-<uid>' below our own
       'systemd' cgroup on first use. Throws Error if the parent cgroup
       cannot be determined or does not exist, and SysError if creating
       the cgroup or updating the lock file fails. */
    std::optional<Path> getCgroup() override
    {
        if (!cgroup) {
            /* Create a systemd cgroup since that's the minimum
               required by systemd-nspawn. */
            auto ourCgroups = getCgroups("/proc/self/cgroup");
            auto systemdCgroup = ourCgroups["systemd"];
            if (systemdCgroup == "")
                throw Error("'systemd' cgroup does not exist");

            auto hostCgroup = canonPath("/sys/fs/cgroup/systemd/" + systemdCgroup);

            if (!pathExists(hostCgroup))
                throw Error("expected cgroup directory '%s'", hostCgroup);

            auto newCgroup = fmt("%s/nix-%d", hostCgroup, uid);

            destroyCgroup(newCgroup);

            if (mkdir(newCgroup.c_str(), 0755) == -1)
                throw SysError("creating cgroup '%s'", newCgroup);

            /* Only remember the cgroup once it really exists, so that a
               failed attempt above is retried on the next call. */
            cgroup = newCgroup;

            /* Record the cgroup in the lock file. This ensures that
               if we subsequently get executed under a different parent
               cgroup, we kill the previous cgroup first.

               acquire() has read the file to its end and ftruncate()
               does not move the file offset, so rewind explicitly;
               otherwise the path would be written behind a run of NUL
               bytes and could not be read back later. */
            if (ftruncate(fdUserLock.get(), 0) == -1)
                throw SysError("truncating user lock");
            if (lseek(fdUserLock.get(), 0, SEEK_SET) == -1)
                throw SysError("rewinding user lock");
            writeFull(fdUserLock.get(), *cgroup);
        }

        return cgroup;
    }
};
#endif
/* Acquire a build user lock using the configured strategy: on Linux
   with 'auto-allocate-uids' set, a CgroupUserLock; in every other case
   a SimpleUserLock. The result is nullptr when the chosen strategy has
   no free user. */
std::unique_ptr<UserLock> acquireUserLock()
{
#if __linux__
    if (settings.autoAllocateUids)
        return CgroupUserLock::acquire();
#endif

    return SimpleUserLock::acquire();
}
}

39
src/libstore/user-lock.hh Normal file
View file

@ -0,0 +1,39 @@
#pragma once
#include "types.hh"
namespace nix {
struct UserLock
{
virtual ~UserLock() { }
/* Get the first and last UID. */
virtual std::pair<uid_t, uid_t> getUIDRange() = 0;
/* Get the first UID. */
uid_t getUID()
{
return getUIDRange().first;
}
uid_t getUIDCount()
{
return getUIDRange().second - getUIDRange().first + 1;
}
virtual gid_t getGID() = 0;
virtual std::vector<gid_t> getSupplementaryGIDs() = 0;
/* Kill any processes currently executing as this user. */
virtual void kill() = 0;
virtual std::optional<Path> getCgroup() { return {}; };
};
/* Acquire a user lock. Note that this may return nullptr if no user
is available. */
std::unique_ptr<UserLock> acquireUserLock();
}