lix-releng-staging/scripts/build-remote.pl.in

301 lines
10 KiB
Perl
Raw Normal View History

2011-10-11 11:45:36 +00:00
#! @perl@ -w @perlFlags@
use Fcntl qw(:DEFAULT :flock);
use English '-no_match_vars';
use IO::Handle;
use Nix::Config;
2011-10-11 11:45:36 +00:00
use Nix::SSH qw/sshOpts openSSHConnection/;
use Nix::CopyClosure;
use Nix::Store;
no warnings('once');
# General operation:
#
# Try to find a free machine of type $neededSystem. We do this as
# follows:
# - We acquire an exclusive lock on $currentLoad/main-lock.
# - For each machine $machine of type $neededSystem and for each $slot
# less than the maximum load for that machine, we try to get an
# exclusive lock on $currentLoad/$machine-$slot (without blocking).
# If we get such a lock, we send "accept" to the caller. Otherwise,
# we send "postpone" and exit.
# - We release the exclusive lock on $currentLoad/main-lock.
# - We perform the build on $neededSystem.
# - We release the exclusive lock on $currentLoad/$machine-$slot.
#
# The nice thing about this scheme is that if we die prematurely, the
# locks are released automatically.
# Make sure that we don't get any SSH passphrase or host key popups -
# if there is any problem it should fail, not do something
# interactive.
$ENV{"DISPLAY"} = "";
$ENV{"SSH_ASKPASS"} = "";
sub sendReply {
my $reply = shift;
print STDERR "# $reply\n";
}
sub all { $_ || return 0 for @_; 1 }
# Initialisation.
my $loadIncreased = 0;
my ($localSystem, $maxSilentTime, $printBuildTrace, $buildTimeout) = @ARGV;
my $currentLoad = $ENV{"NIX_CURRENT_LOAD"};
my $conf = $ENV{"NIX_REMOTE_SYSTEMS"};
sub openSlotLock {
my ($machine, $slot) = @_;
my $slotLockFn = "$currentLoad/" . (join '+', @{$machine->{systemTypes}}) . "-" . $machine->{hostName} . "-$slot";
my $slotLock = new IO::Handle;
sysopen $slotLock, "$slotLockFn", O_RDWR|O_CREAT, 0600 or die;
return $slotLock;
}
# Read the list of machines.
my @machines;
if (defined $conf && -e $conf) {
open CONF, "<$conf" or die;
while (<CONF>) {
chomp;
s/\#.*$//g;
next if /^\s*$/;
my @tokens = split /\s/, $_;
my @supportedFeatures = split(/,/, $tokens[5] || "");
my @mandatoryFeatures = split(/,/, $tokens[6] || "");
push @machines,
{ hostName => $tokens[0]
, systemTypes => [ split(/,/, $tokens[1]) ]
, sshKeys => $tokens[2]
, maxJobs => int($tokens[3])
, speedFactor => 1.0 * (defined $tokens[4] ? int($tokens[4]) : 1)
, supportedFeatures => [ @supportedFeatures, @mandatoryFeatures ]
, mandatoryFeatures => [ @mandatoryFeatures ]
, enabled => 1
};
}
close CONF;
}
# Wait for the calling process to ask us whether we can build some derivation.
my ($drvPath, $hostName, $slotLock);
REQ: while (1) {
$_ = <STDIN> || exit 0;
(my $amWilling, my $neededSystem, $drvPath, my $requiredFeatures) = split;
my @requiredFeatures = split /,/, $requiredFeatures;
my $canBuildLocally = $amWilling && ($localSystem eq $neededSystem);
if (!defined $currentLoad) {
sendReply "decline";
next;
}
# Acquire the exclusive lock on $currentLoad/main-lock.
mkdir $currentLoad, 0777 or die unless -d $currentLoad;
my $mainLock = "$currentLoad/main-lock";
sysopen MAINLOCK, "$mainLock", O_RDWR|O_CREAT, 0600 or die;
flock(MAINLOCK, LOCK_EX) or die;
while (1) {
# Find all machine that can execute this build, i.e., that
# support builds for the given platform and features, and are
# not at their job limit.
my $rightType = 0;
my @available = ();
LOOP: foreach my $cur (@machines) {
if ($cur->{enabled}
&& (grep { $neededSystem eq $_ } @{$cur->{systemTypes}})
&& all(map { my $f = $_; 0 != grep { $f eq $_ } @{$cur->{supportedFeatures}} } (@requiredFeatures, @mandatoryFeatures))
&& all(map { my $f = $_; 0 != grep { $f eq $_ } @requiredFeatures } @{$cur->{mandatoryFeatures}})
)
{
$rightType = 1;
# We have a machine of the right type. Determine the load on
# the machine.
my $slot = 0;
my $load = 0;
my $free;
while ($slot < $cur->{maxJobs}) {
my $slotLock = openSlotLock($cur, $slot);
if (flock($slotLock, LOCK_EX | LOCK_NB)) {
$free = $slot unless defined $free;
flock($slotLock, LOCK_UN) or die;
} else {
$load++;
}
close $slotLock;
$slot++;
}
push @available, { machine => $cur, load => $load, free => $free }
if $load < $cur->{maxJobs};
}
}
if (defined $ENV{NIX_DEBUG_HOOK}) {
print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n"
foreach @available;
}
# Didn't find any available machine? Then decline or postpone.
if (scalar @available == 0) {
# Postpone if we have a machine of the right type, except
# if the local system can and wants to do the build.
if ($rightType && !$canBuildLocally) {
sendReply "postpone";
} else {
sendReply "decline";
}
close MAINLOCK;
next REQ;
}
# Prioritise the available machines as follows:
# - First by load divided by speed factor, rounded to the nearest
# integer. This causes fast machines to be preferred over slow
# machines with similar loads.
# - Then by speed factor.
# - Finally by load.
sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); }
@available = sort
{ lf($a) <=> lf($b)
|| $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor}
|| $a->{load} <=> $b->{load}
} @available;
# Select the best available machine and lock a free slot.
my $selected = $available[0];
my $machine = $selected->{machine};
$slotLock = openSlotLock($machine, $selected->{free});
flock($slotLock, LOCK_EX | LOCK_NB) or die;
utime undef, undef, $slotLock;
close MAINLOCK;
# Connect to the selected machine.
@sshOpts = ("-i", $machine->{sshKeys}, "-x");
$hostName = $machine->{hostName};
last REQ if openSSHConnection $hostName;
warn "unable to open SSH connection to $hostName, trying other available machines...\n";
$machine->{enabled} = 0;
}
}
# Tell Nix we've accepted the build.
sendReply "accept";
my @inputs = split /\s/, readline(STDIN);
my @outputs = split /\s/, readline(STDIN);
print STDERR "building `$drvPath' on `$hostName'\n";
print STDERR "@ build-remote $drvPath $hostName\n" if $printBuildTrace;
my $maybeSign = "";
$maybeSign = "--sign" if -e "$Nix::Config::confDir/signing-key.sec";
# Register the derivation as a temporary GC root. Note that $PPID is
# the PID of the remote SSH process, which, due to the use of a
# persistant SSH connection, should be the same across all remote
# command invocations for this session.
my $rootsDir = "@localstatedir@/nix/gcroots/tmp";
system("ssh $hostName @sshOpts 'mkdir -m 1777 -p $rootsDir; ln -sfn $drvPath $rootsDir/\$PPID.drv'");
sub removeRoots {
system("ssh $hostName @sshOpts 'rm -f $rootsDir/\$PPID.drv $rootsDir/\$PPID.out'");
}
# Copy the derivation and its dependencies to the build machine. This
# is guarded by an exclusive lock per machine to prevent multiple
# build-remote instances from copying to a machine simultaneously.
# That's undesirable because we may end up with N instances uploading
# the same missing path simultaneously, causing the effective network
# bandwidth and target disk speed to be divided by N.
my $uploadLock = "$currentLoad/$hostName.upload-lock";
sysopen UPLOADLOCK, "$uploadLock", O_RDWR|O_CREAT, 0600 or die;
eval {
local $SIG{ALRM} = sub { die "alarm\n" };
# Don't wait forever, so that a process that gets stuck while
# holding the lock doesn't block everybody else indefinitely.
# It's safe to continue after a timeout, just (potentially)
# inefficient.
alarm 15 * 60;
flock(UPLOADLOCK, LOCK_EX);
alarm 0;
};
if ($@) {
die unless $@ eq "alarm\n";
print STDERR "somebody is hogging $uploadLock, continuing...\n";
unlink $uploadLock;
}
Nix::CopyClosure::copyTo($hostName, [ @sshOpts ], [ $drvPath, @inputs ], "", "", 0, 0, $maybeSign ne "", "");
close UPLOADLOCK;
# Perform the build.
my $buildFlags =
"--max-silent-time $maxSilentTime --option build-timeout $buildTimeout"
. " --fallback --add-root $rootsDir/\$PPID.out --option verbosity 0"
. " --option build-keep-log false";
# We let the remote side kill its process group when the connection is
# closed unexpectedly. This is necessary to ensure that no processes
# are left running on the remote system if the local Nix process is
# killed. (SSH itself doesn't kill child processes if the connection
# is interrupted unless the `-tt' flag is used to force a pseudo-tty,
# in which case every child receives SIGHUP; however, `-tt' doesn't
# work on some platforms when connection sharing is used.)
pipe STDIN, DUMMY; # make sure we have a readable STDIN
2011-11-29 10:51:22 +00:00
if (system("exec ssh $hostName @sshOpts '(read; kill -INT -\$\$) <&0 & nix-store -r $drvPath $buildFlags > /dev/null' 2>&4") != 0) {
# Note that if we get exit code 100 from `nix-store -r', it
# denotes a permanent build failure (as opposed to an SSH problem
# or a temporary Nix problem). We propagate this to the caller to
# allow it to distinguish between transient and permanent
# failures.
my $res = $? >> 8;
print STDERR "build of `$drvPath' on `$hostName' failed with exit code $res\n";
removeRoots;
exit $res;
}
#print "build of `$drvPath' on `$hostName' succeeded\n";
# Copy the output from the build machine.
2010-08-25 12:19:30 +00:00
foreach my $output (@outputs) {
my $maybeSignRemote = "";
$maybeSignRemote = "--sign" if $UID != 0;
next if isValidPath($output);
2011-11-29 10:51:22 +00:00
system("exec ssh $hostName @sshOpts 'nix-store --export $maybeSignRemote $output'" .
"| NIX_HELD_LOCKS=$output @bindir@/nix-store --import > /dev/null") == 0
or die "cannot copy $output from $hostName: $?";
}
# Get rid of the temporary GC roots.
removeRoots;