When using chroots, use a private PID namespace

In a private PID namespace, processes have PIDs that are separate from
the rest of the system.  The initial child gets PID 1.  Processes in
the chroot cannot see processes outside of the chroot.  This improves
isolation between builds.  However, processes on the outside can see
processes in the chroot and send signals to them (if they have
appropriate rights).

Since the builder gets PID 1, it serves as the reaper for zombies in
the chroot.  This might turn out to be a problem.  In that case we'll
need to have a small PID 1 process that sits in a loop calling wait().
This commit is contained in:
Eelco Dolstra 2012-06-25 15:45:16 -04:00
parent 5489086456
commit 1db38ae81b
2 changed files with 199 additions and 169 deletions

View file

@ -793,6 +793,11 @@ private:
typedef void (DerivationGoal::*GoalState)(); typedef void (DerivationGoal::*GoalState)();
GoalState state; GoalState state;
/* Stuff we need to pass to initChild(). */
PathSet dirsInChroot;
typedef map<string, string> Environment;
Environment env;
public: public:
DerivationGoal(const Path & drvPath, Worker & worker); DerivationGoal(const Path & drvPath, Worker & worker);
~DerivationGoal(); ~DerivationGoal();
@ -821,6 +826,11 @@ private:
/* Start building a derivation. */ /* Start building a derivation. */
void startBuilder(); void startBuilder();
/* Initialise the builder's process. */
void initChild();
friend int childEntry(void *);
/* Must be called after the output paths have become valid (either /* Must be called after the output paths have become valid (either
due to a successful build or hook, or because they already due to a successful build or hook, or because they already
were). */ were). */
@ -1468,6 +1478,13 @@ void chmod(const Path & path, mode_t mode)
} }
int childEntry(void * arg)
{
((DerivationGoal *) arg)->initChild();
return 1;
}
void DerivationGoal::startBuilder() void DerivationGoal::startBuilder()
{ {
startNest(nest, lvlInfo, startNest(nest, lvlInfo,
@ -1480,8 +1497,6 @@ void DerivationGoal::startBuilder()
% drv.platform % thisSystem % drvPath); % drv.platform % thisSystem % drvPath);
/* Construct the environment passed to the builder. */ /* Construct the environment passed to the builder. */
typedef map<string, string> Environment;
Environment env;
/* Most shells initialise PATH to some default (/bin:/usr/bin:...) when /* Most shells initialise PATH to some default (/bin:/usr/bin:...) when
PATH is not set. We don't want this, so we fill it in with some dummy PATH is not set. We don't want this, so we fill it in with some dummy
@ -1635,7 +1650,6 @@ void DerivationGoal::startBuilder()
work properly. Purity checking for fixed-output derivations work properly. Purity checking for fixed-output derivations
is somewhat pointless anyway. */ is somewhat pointless anyway. */
useChroot = queryBoolSetting("build-use-chroot", false); useChroot = queryBoolSetting("build-use-chroot", false);
PathSet dirsInChroot;
if (fixedOutput) useChroot = false; if (fixedOutput) useChroot = false;
@ -1691,7 +1705,6 @@ void DerivationGoal::startBuilder()
Paths defaultDirs; Paths defaultDirs;
defaultDirs.push_back("/dev"); defaultDirs.push_back("/dev");
defaultDirs.push_back("/dev/pts"); defaultDirs.push_back("/dev/pts");
defaultDirs.push_back("/proc");
Paths dirsInChroot_ = querySetting("build-chroot-dirs", defaultDirs); Paths dirsInChroot_ = querySetting("build-chroot-dirs", defaultDirs);
dirsInChroot.insert(dirsInChroot_.begin(), dirsInChroot_.end()); dirsInChroot.insert(dirsInChroot_.begin(), dirsInChroot_.end());
@ -1760,42 +1773,64 @@ void DerivationGoal::startBuilder()
/* Fork a child to build the package. Note that while we /* Fork a child to build the package. Note that while we
currently use forks to run and wait for the children, it currently use forks to run and wait for the children, it
shouldn't be hard to use threads for this on systems where shouldn't be hard to use threads for this on systems where
fork() is unavailable or inefficient. */ fork() is unavailable or inefficient.
If we're building in a chroot, then also set up private
namespaces for the build:
- The PID namespace causes the build to start as PID 1.
Processes outside of the chroot are not visible to those on
the inside, but processes inside the chroot are visible from
the outside (though with different PIDs).
- The private mount namespace ensures that all the bind mounts
we do will only show up in this process and its children, and
will disappear automatically when we're done.
- The private network namespace ensures that the builder cannot
talk to the outside world (or vice versa). It only has a
private loopback interface.
- The IPC namespace prevents the builder from communicating
with outside processes using SysV IPC mechanisms (shared
memory, message queues, semaphores). It also ensures that
all IPC objects are destroyed when the builder exits.
*/
#if CHROOT_ENABLED
if (useChroot) {
char stack[32 * 1024];
pid = clone(childEntry, stack + sizeof(stack) - 8,
CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS | SIGCHLD, this);
} else
#endif
{
pid = fork(); pid = fork();
switch (pid) { if (pid == 0) initChild();
else if (pid == -1) throw SysError("unable to fork");
}
case -1: /* parent */
throw SysError("unable to fork"); pid.setSeparatePG(true);
builderOut.writeSide.close();
worker.childStarted(shared_from_this(), pid,
singleton<set<int> >(builderOut.readSide), true, true);
case 0: if (printBuildTrace) {
printMsg(lvlError, format("@ build-started %1% %2% %3% %4%")
% drvPath % drv.outputs["out"].path % drv.platform % logFile);
}
}
/* Warning: in the child we should absolutely not make any
SQLite calls! */ void DerivationGoal::initChild()
{
/* Warning: in the child we should absolutely not make any SQLite
calls! */
try { /* child */ try { /* child */
#if CHROOT_ENABLED #if CHROOT_ENABLED
if (useChroot) { if (useChroot) {
/* Set up private namespaces for the build:
- The private mount namespace ensures that all the
bind mounts we do will only show up in this
process and its children, and will disappear
automatically when we're done.
- The private network namespace ensures that the
builder cannot talk to the outside world (or vice
versa). It only has a private loopback
interface.
- The IPC namespace prevents the builder from
communicating with outside processes using SysV
IPC mechanisms (shared memory, message queues,
semaphores). It also ensures that all IPC
objects are destroyed when the builder exits. */
if (unshare(CLONE_NEWNS | CLONE_NEWNET | CLONE_NEWIPC | CLONE_NEWUTS) == -1)
throw SysError("cannot set up private namespaces");
/* Initialise the loopback interface. */ /* Initialise the loopback interface. */
AutoCloseFD fd(socket(PF_INET, SOCK_DGRAM, IPPROTO_IP)); AutoCloseFD fd(socket(PF_INET, SOCK_DGRAM, IPPROTO_IP));
if (fd == -1) throw SysError("cannot open IP socket"); if (fd == -1) throw SysError("cannot open IP socket");
@ -1828,11 +1863,16 @@ void DerivationGoal::startBuilder()
throw SysError(format("bind mount from `%1%' to `%2%' failed") % source % target); throw SysError(format("bind mount from `%1%' to `%2%' failed") % source % target);
} }
/* Bind a new instance of procfs on /proc to reflect our
private PID namespace. */
if (mount("none", (chrootRootDir + "/proc").c_str(), "proc", 0, 0) == -1)
throw SysError("mounting /proc");
/* Do the chroot(). Below we do a chdir() to the /* Do the chroot(). Below we do a chdir() to the
temporary build directory to make sure the current temporary build directory to make sure the current
directory is in the chroot. (Actually the order directory is in the chroot. (Actually the order
doesn't matter, since due to the bind mount tmpDir doesn't matter, since due to the bind mount tmpDir and
and tmpRootDit/tmpDir are the same directories.) */ tmpRootDit/tmpDir are the same directories.) */
if (chroot(chrootRootDir.c_str()) == -1) if (chroot(chrootRootDir.c_str()) == -1)
throw SysError(format("cannot change root directory to `%1%'") % chrootRootDir); throw SysError(format("cannot change root directory to `%1%'") % chrootRootDir);
} }
@ -1854,8 +1894,8 @@ void DerivationGoal::startBuilder()
throw SysError("cannot set i686-linux personality"); throw SysError("cannot set i686-linux personality");
} }
/* Impersonate a Linux 2.6 machine to get some determinism /* Impersonate a Linux 2.6 machine to get some determinism in
in builds that depend on the kernel version. */ builds that depend on the kernel version. */
if ((drv.platform == "i686-linux" || drv.platform == "x86_64-linux") && if ((drv.platform == "i686-linux" || drv.platform == "x86_64-linux") &&
queryBoolSetting("build-impersonate-linux-26", true)) queryBoolSetting("build-impersonate-linux-26", true))
{ {
@ -1874,12 +1914,12 @@ void DerivationGoal::startBuilder()
std::vector<const char *> args; /* careful with c_str()! */ std::vector<const char *> args; /* careful with c_str()! */
string user; /* must be here for its c_str()! */ string user; /* must be here for its c_str()! */
/* If we are running in `build-users' mode, then switch to /* If we are running in `build-users' mode, then switch to the
the user we allocated above. Make sure that we drop user we allocated above. Make sure that we drop all root
all root privileges. Note that above we have closed privileges. Note that above we have closed all file
all file descriptors except std*, so that's safe. Also descriptors except std*, so that's safe. Also note that
note that setuid() when run as root sets the real, setuid() when run as root sets the real, effective and
effective and saved UIDs. */ saved UIDs. */
if (buildUser.enabled()) { if (buildUser.enabled()) {
printMsg(lvlChatty, format("switching to user `%1%'") % buildUser.getUser()); printMsg(lvlChatty, format("switching to user `%1%'") % buildUser.getUser());
@ -1931,19 +1971,6 @@ void DerivationGoal::startBuilder()
} }
/* parent */
pid.setSeparatePG(true);
builderOut.writeSide.close();
worker.childStarted(shared_from_this(), pid,
singleton<set<int> >(builderOut.readSide), true, true);
if (printBuildTrace) {
printMsg(lvlError, format("@ build-started %1% %2% %3% %4%")
% drvPath % drv.outputs["out"].path % drv.platform % logFile);
}
}
/* Parse a list of reference specifiers. Each element must either be /* Parse a list of reference specifiers. Each element must either be
a store path, or the symbolic name of the output of the derivation a store path, or the symbolic name of the output of the derivation
(such as `out'). */ (such as `out'). */

View file

@ -779,8 +779,11 @@ void Pid::kill()
int status; int status;
while (waitpid(pid, &status, 0) == -1) { while (waitpid(pid, &status, 0) == -1) {
checkInterrupt(); checkInterrupt();
if (errno != EINTR) printMsg(lvlError, if (errno != EINTR) {
printMsg(lvlError,
(SysError(format("waiting for process %1%") % pid).msg())); (SysError(format("waiting for process %1%") % pid).msg()));
break;
}
} }
pid = -1; pid = -1;