* Made the build hook mechanism more efficient. Rather than starting
  the hook every time we want to ask whether we can run a remote build
  (which can be very often), we now reuse a hook process for answering
  those queries until it accepts a build.  So if there are N
  derivations to be built, at most N hooks will be started.
This commit is contained in:
Eelco Dolstra 2010-08-25 20:44:28 +00:00
parent 1a396f3789
commit e437b08250
3 changed files with 332 additions and 346 deletions

View file

@ -31,36 +31,36 @@ $ENV{"DISPLAY"} = "";
$ENV{"SSH_ASKPASS"} = "";
my $loadIncreased = 0;
my ($amWilling, $localSystem, $neededSystem, $drvPath, $maxSilentTime) = @ARGV;
$maxSilentTime = 0 unless defined $maxSilentTime;
# Send a one-word reply to the caller by printing it on STDERR,
# prefixed with "# " and terminated by a newline.
sub sendReply {
my $reply = shift;
print STDERR "# $reply\n";
}
# Reply "decline" via sendReply and then terminate this process with
# exit status 0.
sub decline {
sendReply "decline";
exit 0;
}
# Initialisation.
my $loadIncreased = 0;
my ($localSystem, $maxSilentTime) = @ARGV;
$maxSilentTime = 0 unless defined $maxSilentTime;
my $currentLoad = $ENV{"NIX_CURRENT_LOAD"};
decline unless defined $currentLoad;
mkdir $currentLoad, 0777 or die unless -d $currentLoad;
my $conf = $ENV{"NIX_REMOTE_SYSTEMS"};
decline if !defined $conf || ! -e $conf;
my $canBuildLocally = $amWilling && ($localSystem eq $neededSystem);
# Open the lock file for slot $slot of $machine and return its handle.
# The file lives in $currentLoad and is named after the machine's
# system types (joined with '+'), its host name and the slot number.
# It is opened in append mode, so it is created if missing and never
# truncated.  This sub only opens the file; it does not take a lock
# on it.  Dies if the file cannot be opened.
sub openSlotLock {
my ($machine, $slot) = @_;
my $slotLockFn = "$currentLoad/" . (join '+', @{$machine->{systemTypes}}) . "-" . $machine->{hostName} . "-$slot";
my $slotLock = new IO::Handle;
open $slotLock, ">>$slotLockFn" or die;
return $slotLock;
}
# Read the list of machines.
my @machines;
open CONF, "< $conf" or die;
while (<CONF>) {
if (defined $conf && -e $conf) {
open CONF, "< $conf" or die;
while (<CONF>) {
chomp;
s/\#.*$//g;
next if /^\s*$/;
@ -73,33 +73,38 @@ while (<CONF>) {
, speedFactor => 1.0 * ($6 || 1)
, enabled => 1
};
}
close CONF;
# Acquire the exclusive lock on $currentLoad/main-lock.
my $mainLock = "$currentLoad/main-lock";
open MAINLOCK, ">>$mainLock" or die;
flock(MAINLOCK, LOCK_EX) or die;
# Open the lock file for slot $slot of $machine and return its handle.
# The file lives in $currentLoad and is named after the machine's
# system types (joined with '+'), its host name and the slot number.
# It is opened in append mode, so it is created if missing and never
# truncated.  This sub only opens the file; it does not take a lock
# on it.  Dies if the file cannot be opened.
sub openSlotLock {
my ($machine, $slot) = @_;
my $slotLockFn = "$currentLoad/" . (join '+', @{$machine->{systemTypes}}) . "-" . $machine->{hostName} . "-$slot";
my $slotLock = new IO::Handle;
open $slotLock, ">>$slotLockFn" or die;
return $slotLock;
}
close CONF;
}
my $hostName;
my $slotLock;
while (1) {
# Wait for the calling process to ask us whether we can build some derivation.
my ($drvPath, $hostName, $slotLock);
# Find all machines that can execute this build, i.e., that support
# builds for the given platform and are not at their job limit.
REQ: while (1) {
$_ = <STDIN> || exit 0;
my ($amWilling, $neededSystem);
($amWilling, $neededSystem, $drvPath) = split;
my $canBuildLocally = $amWilling && ($localSystem eq $neededSystem);
if (!defined $currentLoad) {
sendReply "decline";
next;
}
# Acquire the exclusive lock on $currentLoad/main-lock.
mkdir $currentLoad, 0777 or die unless -d $currentLoad;
my $mainLock = "$currentLoad/main-lock";
open MAINLOCK, ">>$mainLock" or die;
flock(MAINLOCK, LOCK_EX) or die;
while (1) {
# Find all machines that can execute this build, i.e., that
# support builds for the given platform and are not at their
# job limit.
my $rightType = 0;
my @available = ();
LOOP: foreach my $cur (@machines) {
@ -136,14 +141,15 @@ while (1) {
# Didn't find any available machine? Then decline or postpone.
if (scalar @available == 0) {
# Postpone if we have a machine of the right type, except if the
# local system can and wants to do the build.
# Postpone if we have a machine of the right type, except
# if the local system can and wants to do the build.
if ($rightType && !$canBuildLocally) {
sendReply "postpone";
exit 0;
} else {
decline;
sendReply "decline";
}
close MAINLOCK;
next REQ;
}
@ -175,27 +181,20 @@ while (1) {
# Connect to the selected machine.
@sshOpts = ("-i", $machine->{sshKeys}, "-x");
$hostName = $machine->{hostName};
last if openSSHConnection $hostName;
last REQ if openSSHConnection $hostName;
warn "unable to open SSH connection to $hostName, trying other available machines...\n";
$machine->{enabled} = 0;
}
}
# Tell Nix we've accepted the build.
sendReply "accept";
my $x = <STDIN>;
chomp $x;
if ($x ne "okay") {
exit 0;
}
print STDERR "building `$drvPath' on `$hostName'\n";
sendReply "accept";
my @inputs = split /\s/, readline(STDIN);
my @outputs = split /\s/, readline(STDIN);
my @inputs = split /\s/, do { local $/; local @ARGV = "inputs"; <> };
my @outputs = split /\s/, do { local $/; local @ARGV = "outputs"; <> };
my $maybeSign = "";
$maybeSign = "--sign" if -e "/nix/etc/nix/signing-key.sec";
@ -238,7 +237,7 @@ if (system("ssh $hostName @sshOpts -tt 'nix-store -r $drvPath $buildFlags > /dev
exit $res;
}
print "build of `$drvPath' on `$hostName' succeeded\n";
#print "build of `$drvPath' on `$hostName' succeeded\n";
# Copy the output from the build machine.
@ -246,7 +245,8 @@ foreach my $output (@outputs) {
my $maybeSignRemote = "";
$maybeSignRemote = "--sign" if $UID != 0;
system("ssh $hostName @sshOpts 'nix-store --export $maybeSignRemote $output' | @bindir@/nix-store --import > /dev/null") == 0
system("ssh $hostName @sshOpts 'nix-store --export $maybeSignRemote $output'" .
"| NIX_HELD_LOCKS=$output @bindir@/nix-store --import > /dev/null") == 0
or die "cannot copy $output from $hostName: $?";
}

View file

@ -64,6 +64,7 @@ static const uid_t rootUserId = 0;
/* Forward definition. */
class Worker;
class HookInstance;
/* A pointer to a goal. */
@ -215,6 +216,8 @@ public:
LocalStore & store;
boost::shared_ptr<HookInstance> hook;
Worker(LocalStore & store);
~Worker();
@ -615,6 +618,94 @@ void deletePathWrapped(const Path & path)
//////////////////////////////////////////////////////////////////////
/* A build hook process together with the pipes used to communicate
   with it.  The constructor starts the process; the destructor shuts
   it down. */
struct HookInstance
{
/* Pipes for talking to the build hook. */
Pipe toHook;
/* Pipe for the hook's standard output/error. */
Pipe fromHook;
/* The process ID of the hook. */
Pid pid;
HookInstance();
~HookInstance();
};
/* Start a build hook process.  The program named by the
   NIX_BUILD_HOOK environment variable is forked and exec'ed with
   thisSystem and maxSilentTime as its two arguments, and the read
   side of toHook is dup'ed onto its standard input.  Throws SysError
   if the fork fails. */
HookInstance::HookInstance()
{
debug("starting build hook");
Path buildHook = absPath(getEnv("NIX_BUILD_HOOK"));
/* Create a pipe to get the output of the child. */
fromHook.create();
/* Create the communication pipes. */
toHook.create();
/* Fork the hook. */
pid = fork();
switch (pid) {
case -1:
throw SysError("unable to fork");
case 0:
try { /* child */
commonChildInit(fromHook);
if (chdir("/") == -1) throw SysError("changing into `/");
/* Dup the communication pipes. */
toHook.writeSide.close();
if (dup2(toHook.readSide, STDIN_FILENO) == -1)
throw SysError("dupping to-hook read side");
execl(buildHook.c_str(), buildHook.c_str(), thisSystem.c_str(),
(format("%1%") % maxSilentTime).str().c_str(), NULL);
/* Only reached if execl() failed. */
throw SysError(format("executing `%1%'") % buildHook);
} catch (std::exception & e) {
std::cerr << format("build hook error: %1%") % e.what() << std::endl;
}
/* Any failure in the child ends here, after the error has been
   printed. */
quickExit(1);
}
/* parent */
pid.setSeparatePG(true);
pid.setKillSignal(SIGTERM);
/* Close the pipe ends that the parent does not use. */
fromHook.writeSide.close();
toHook.readSide.close();
}
/* Shut the hook down.  Any exception raised while doing so is handed
   to ignoreException() instead of being propagated out of the
   destructor. */
HookInstance::~HookInstance()
{
try {
/* Cleanly shut down the hook by closing its stdin if it's not
already building. Otherwise pid's destructor will kill
it. */
if (pid != -1 && toHook.writeSide != -1) {
toHook.writeSide.close();
pid.wait(true);
}
} catch (...) {
ignoreException();
}
}
//////////////////////////////////////////////////////////////////////
typedef enum {rpAccept, rpDecline, rpPostpone} HookReply;
class DerivationGoal : public Goal
{
private:
@ -651,11 +742,8 @@ private:
/* Pipe for the builder's standard output/error. */
Pipe logPipe;
/* Whether we're building using a build hook. */
bool usingBuildHook;
/* Pipes for talking to the build hook (if any). */
Pipe toHook;
/* The build hook. */
boost::shared_ptr<HookInstance> hook;
/* Whether we're currently doing a chroot build. */
bool useChroot;
@ -694,12 +782,8 @@ private:
void buildDone();
/* Is the build hook willing to perform the build? */
typedef enum {rpAccept, rpDecline, rpPostpone} HookReply;
HookReply tryBuildHook();
/* Synchronously wait for a build hook to finish. */
void terminateBuildHook(bool kill = false);
/* Start building a derivation. */
void startBuilder();
@ -711,10 +795,6 @@ private:
/* Open a log file and a pipe to it. */
Path openLogFile();
/* Common initialisation to be performed in child processes (i.e.,
both in builders and in build hooks). */
void initChild();
/* Delete the temporary directory, if we have one. */
void deleteTmpDir(bool force);
@ -742,6 +822,7 @@ DerivationGoal::DerivationGoal(const Path & drvPath, Worker & worker)
trace("created");
}
DerivationGoal::~DerivationGoal()
{
/* Careful: we should never ever throw an exception from a
@ -754,6 +835,7 @@ DerivationGoal::~DerivationGoal()
}
}
void DerivationGoal::killChild()
{
if (pid != -1) {
@ -778,6 +860,8 @@ void DerivationGoal::killChild()
assert(pid == -1);
}
hook.reset();
}
@ -1048,7 +1132,6 @@ void DerivationGoal::tryToBuild()
/* Is the build hook willing to accept this job? */
if (!preferLocalBuild) {
usingBuildHook = true;
switch (tryBuildHook()) {
case rpAccept:
/* Yes, it has started doing so. Wait until we get
@ -1056,7 +1139,8 @@ void DerivationGoal::tryToBuild()
state = &DerivationGoal::buildDone;
return;
case rpPostpone:
/* Not now; wait until at least one child finishes. */
/* Not now; wait until at least one child finishes or
the wake-up timeout expires. */
worker.waitForAWhile(shared_from_this());
outputLocks.unlock();
return;
@ -1066,8 +1150,6 @@ void DerivationGoal::tryToBuild()
}
}
usingBuildHook = false;
/* Make sure that we are allowed to start a build. If this
derivation prefers to be done locally, do it even if
maxBuildJobs is 0. */
@ -1108,10 +1190,17 @@ void DerivationGoal::buildDone()
to have terminated. In fact, the builder could also have
simply closed its end of the pipe --- just don't do that
:-) */
int status;
pid_t savedPid;
if (hook) {
savedPid = hook->pid;
status = hook->pid.wait(true);
} else {
/* !!! this could block! security problem! solution: kill the
child */
pid_t savedPid = pid;
int status = pid.wait(true);
savedPid = pid;
status = pid.wait(true);
}
debug(format("builder process for `%1%' finished") % drvPath);
@ -1192,11 +1281,11 @@ void DerivationGoal::buildDone()
/* When using a build hook, the hook will return a remote
build failure using exit code 100. Anything else is a hook
problem. */
bool hookError = usingBuildHook &&
bool hookError = hook &&
(!WIFEXITED(status) || WEXITSTATUS(status) != 100);
if (printBuildTrace) {
if (usingBuildHook && hookError)
if (hook && hookError)
printMsg(lvlError, format("@ hook-failed %1% %2% %3% %4%")
% drvPath % drv.outputs["out"].path % status % e.msg());
else
@ -1231,162 +1320,74 @@ void DerivationGoal::buildDone()
}
DerivationGoal::HookReply DerivationGoal::tryBuildHook()
HookReply DerivationGoal::tryBuildHook()
{
if (!useBuildHook) return rpDecline;
Path buildHook = getEnv("NIX_BUILD_HOOK");
if (buildHook == "") return rpDecline;
buildHook = absPath(buildHook);
if (!useBuildHook || getEnv("NIX_BUILD_HOOK") == "") return rpDecline;
/* Create a directory where we will store files used for
communication between us and the build hook. */
tmpDir = createTempDir();
if (!worker.hook)
worker.hook = boost::shared_ptr<HookInstance>(new HookInstance);
/* Create the log file and pipe. */
Path logFile = openLogFile();
/* Create the communication pipes. */
toHook.create();
/* Fork the hook. */
pid = fork();
switch (pid) {
case -1:
throw SysError("unable to fork");
case 0:
try { /* child */
initChild();
string s;
foreach (DerivationOutputs::const_iterator, i, drv.outputs)
s += i->second.path + " ";
if (setenv("NIX_HELD_LOCKS", s.c_str(), 1))
throw SysError("setting an environment variable");
execl(buildHook.c_str(), buildHook.c_str(),
(worker.getNrLocalBuilds() < maxBuildJobs ? (string) "1" : "0").c_str(),
thisSystem.c_str(),
drv.platform.c_str(),
drvPath.c_str(),
(format("%1%") % maxSilentTime).str().c_str(),
NULL);
throw SysError(format("executing `%1%'") % buildHook);
} catch (std::exception & e) {
std::cerr << format("build hook error: %1%") % e.what() << std::endl;
}
quickExit(1);
}
/* parent */
pid.setSeparatePG(true);
pid.setKillSignal(SIGTERM);
logPipe.writeSide.close();
worker.childStarted(shared_from_this(),
pid, singleton<set<int> >(logPipe.readSide), false, false);
toHook.readSide.close();
writeLine(worker.hook->toHook.writeSide, (format("%1% %2% %3%") %
(worker.getNrLocalBuilds() < maxBuildJobs ? "1" : "0") % drv.platform % drvPath).str());
/* Read the first line of input, which should be a word indicating
whether the hook wishes to perform the build. */
string reply;
try {
while (true) {
string s = readLine(logPipe.readSide);
string s = readLine(worker.hook->fromHook.readSide);
if (string(s, 0, 2) == "# ") {
reply = string(s, 2);
break;
}
handleChildOutput(logPipe.readSide, s + "\n");
}
} catch (Error & e) {
terminateBuildHook(true);
throw;
handleChildOutput(worker.hook->fromHook.readSide, s + "\n");
}
debug(format("hook reply is `%1%'") % reply);
if (reply == "decline" || reply == "postpone") {
/* Clean up the child. !!! hacky / should verify */
terminateBuildHook();
if (reply == "decline" || reply == "postpone")
return reply == "decline" ? rpDecline : rpPostpone;
}
else if (reply != "accept")
throw Error(format("bad hook reply `%1%'") % reply);
else if (reply == "accept") {
printMsg(lvlInfo, format("using hook to build path(s) %1%")
printMsg(lvlTalkative, format("using hook to build path(s) %1%")
% showPaths(outputPaths(drv.outputs)));
/* Write the information that the hook needs to perform the
build, i.e., the set of input paths, the set of output
paths, and the references (pointer graph) in the input
paths. */
hook = worker.hook;
worker.hook.reset();
Path inputListFN = tmpDir + "/inputs";
Path outputListFN = tmpDir + "/outputs";
Path referencesFN = tmpDir + "/references";
/* The `inputs' file lists all inputs that have to be copied
to the remote system. This unfortunately has to contain
the entire derivation closure to ensure that the validity
invariant holds on the remote system. (I.e., it's
unfortunate that we have to list it since the remote system
*probably* already has it.) */
/* Tell the hook all the inputs that have to be copied to the
remote system. This unfortunately has to contain the entire
derivation closure to ensure that the validity invariant holds
on the remote system. (I.e., it's unfortunate that we have to
list it since the remote system *probably* already has it.) */
PathSet allInputs;
allInputs.insert(inputPaths.begin(), inputPaths.end());
computeFSClosure(drvPath, allInputs);
string s;
foreach (PathSet::iterator, i, allInputs) s += *i + "\n";
foreach (PathSet::iterator, i, allInputs) s += *i + " ";
writeLine(hook->toHook.writeSide, s);
writeFile(inputListFN, s);
/* The `outputs' file lists all outputs that have to be copied
from the remote system. */
/* Tell the hooks the outputs that have to be copied back from the
remote system. */
s = "";
foreach (DerivationOutputs::iterator, i, drv.outputs)
s += i->second.path + "\n";
writeFile(outputListFN, s);
s += i->second.path + " ";
writeLine(hook->toHook.writeSide, s);
/* The `references' file has exactly the format accepted by
`nix-store --register-validity'. */
writeFile(referencesFN,
makeValidityRegistration(allInputs, true, false));
hook->toHook.writeSide.close();
/* Tell the hook to proceed. */
writeLine(toHook.writeSide, "okay");
toHook.writeSide.close();
/* Create the log file and pipe. */
Path logFile = openLogFile();
worker.childStarted(shared_from_this(),
hook->pid, singleton<set<int> >(hook->fromHook.readSide), false, false);
if (printBuildTrace)
printMsg(lvlError, format("@ build-started %1% %2% %3% %4%")
% drvPath % drv.outputs["out"].path % drv.platform % logFile);
return rpAccept;
}
else throw Error(format("bad hook reply `%1%'") % reply);
}
void DerivationGoal::terminateBuildHook(bool kill)
{
debug("terminating build hook");
pid_t savedPid = pid;
if (kill)
pid.kill();
else
pid.wait(true);
/* `false' means don't wake up waiting goals, since we want to
keep this build slot ourselves. */
worker.childTerminated(savedPid, false);
toHook.writeSide.close();
fdLogFile.close();
logPipe.readSide.close();
deleteTmpDir(true); /* get rid of the hook's temporary directory */
}
@ -1667,9 +1668,12 @@ void DerivationGoal::startBuilder()
printMsg(lvlChatty, format("executing builder `%1%'") %
drv.builder);
/* Create the log file and pipe. */
/* Create the log file. */
Path logFile = openLogFile();
/* Create a pipe to get the output of the child. */
logPipe.create();
/* Fork a child to build the package. Note that while we
currently use forks to run and wait for the children, it
shouldn't be hard to use threads for this on systems where
@ -1710,18 +1714,23 @@ void DerivationGoal::startBuilder()
throw SysError(format("bind mount from `%1%' to `%2%' failed") % source % target);
}
/* Do the chroot(). initChild() will do a chdir() to
the temporary build directory to make sure the
current directory is in the chroot. (Actually the
order doesn't matter, since due to the bind mount
tmpDir and tmpRootDit/tmpDir are the same
directories.) */
/* Do the chroot(). Below we do a chdir() to the
temporary build directory to make sure the current
directory is in the chroot. (Actually the order
doesn't matter, since due to the bind mount tmpDir
and tmpRootDit/tmpDir are the same directories.) */
if (chroot(chrootRootDir.c_str()) == -1)
throw SysError(format("cannot change root directory to `%1%'") % chrootRootDir);
}
#endif
initChild();
commonChildInit(logPipe);
if (chdir(tmpDir.c_str()) == -1)
throw SysError(format("changing into `%1%'") % tmpDir);
/* Close all other file descriptors. */
closeMostFDs(set<int>());
#ifdef CAN_DO_LINUX32_BUILDS
if (drv.platform == "i686-linux" && thisSystem == "x86_64-linux") {
@ -1742,10 +1751,10 @@ void DerivationGoal::startBuilder()
/* If we are running in `build-users' mode, then switch to
the user we allocated above. Make sure that we drop
all root privileges. Note that initChild() above has
closed all file descriptors except std*, so that's
safe. Also note that setuid() when run as root sets
the real, effective and saved UIDs. */
all root privileges. Note that above we have closed
all file descriptors except std*, so that's safe. Also
note that setuid() when run as root sets the real,
effective and saved UIDs. */
if (buildUser.enabled()) {
printMsg(lvlChatty, format("switching to user `%1%'") % buildUser.getUser());
@ -1838,7 +1847,7 @@ void DerivationGoal::computeClosure()
/* When using a build hook, the build hook can register the output
as valid (by doing `nix-store --import'). If so we don't have
to do anything here. */
if (usingBuildHook) {
if (hook) {
bool allValid = true;
foreach (DerivationOutputs::iterator, i, drv.outputs)
if (!worker.store.isValidPath(i->second.path)) allValid = false;
@ -1966,32 +1975,10 @@ Path DerivationGoal::openLogFile()
if (fdLogFile == -1)
throw SysError(format("creating log file `%1%'") % logFileName);
/* Create a pipe to get the output of the child. */
logPipe.create();
return logFileName;
}
void DerivationGoal::initChild()
{
commonChildInit(logPipe);
if (chdir(tmpDir.c_str()) == -1)
throw SysError(format("changing into `%1%'") % tmpDir);
/* When running a hook, dup the communication pipes. */
if (usingBuildHook) {
toHook.writeSide.close();
if (dup2(toHook.readSide, STDIN_FILENO) == -1)
throw SysError("dupping to-hook read side");
}
/* Close all other file descriptors. */
closeMostFDs(set<int>());
}
void DerivationGoal::deleteTmpDir(bool force)
{
if (tmpDir != "") {
@ -2011,19 +1998,16 @@ void DerivationGoal::deleteTmpDir(bool force)
void DerivationGoal::handleChildOutput(int fd, const string & data)
{
if (fd == logPipe.readSide) {
if (verbosity >= buildVerbosity)
writeToStderr((unsigned char *) data.c_str(), data.size());
if (fdLogFile != -1)
writeFull(fdLogFile, (unsigned char *) data.c_str(), data.size());
}
else abort();
}
void DerivationGoal::handleEOF(int fd)
{
if (fd == logPipe.readSide) worker.wakeUp(shared_from_this());
worker.wakeUp(shared_from_this());
}

View file

@ -2,20 +2,22 @@
#set -x
drv=$4
while read x y drv rest; do
echo "HOOK for $drv" >&2
echo "HOOK for $drv" >&2
outPath=`sed 's/Derive(\[("out",\"\([^\"]*\)\".*/\1/' $drv`
outPath=`sed 's/Derive(\[("out",\"\([^\"]*\)\".*/\1/' $drv`
echo "output path is $outPath" >&2
echo "output path is $outPath" >&2
if `echo $outPath | grep -q input-1`; then
if `echo $outPath | grep -q input-1`; then
echo "# accept" >&2
read x
echo "got $x"
read inputs
read outputs
mkdir $outPath
echo "BAR" > $outPath/foo
else
else
echo "# decline" >&2
fi
fi
done