* In the build hook, if connecting to a machine fails, try the other
  machines of the right type (if available).  This makes the build
  farm more robust to failures.
This commit is contained in:
Eelco Dolstra 2010-02-03 20:35:37 +00:00
parent f56a039775
commit d0c32dc135
3 changed files with 83 additions and 79 deletions

View file

@ -71,6 +71,7 @@ while (<CONF>) {
, sshKeys => $3 , sshKeys => $3
, maxJobs => $4 , maxJobs => $4
, speedFactor => 1.0 * ($6 || 1) , speedFactor => 1.0 * ($6 || 1)
, enabled => 1
}; };
} }
@ -92,12 +93,16 @@ sub openSlotLock {
} }
# Find all machine that can execute this build, i.e., that support my $hostName;
# builds for the given platform and are not at their job limit.
my $rightType = 0; while (1) {
my @available = ();
LOOP: foreach my $cur (@machines) { # Find all machine that can execute this build, i.e., that support
if (grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) { # builds for the given platform and are not at their job limit.
my $rightType = 0;
my @available = ();
LOOP: foreach my $cur (@machines) {
if ($cur->{enabled} && grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) {
$rightType = 1; $rightType = 1;
# We have a machine of the right type. Determine the load on # We have a machine of the right type. Determine the load on
@ -120,16 +125,16 @@ LOOP: foreach my $cur (@machines) {
push @available, { machine => $cur, load => $load, free => $free } push @available, { machine => $cur, load => $load, free => $free }
if $load < $cur->{maxJobs}; if $load < $cur->{maxJobs};
} }
} }
if (defined $ENV{NIX_DEBUG_HOOK}) { if (defined $ENV{NIX_DEBUG_HOOK}) {
print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n" print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n"
foreach @available; foreach @available;
} }
# Didn't find any available machine? Then decline or postpone. # Didn't find any available machine? Then decline or postpone.
if (scalar @available == 0) { if (scalar @available == 0) {
# Postpone if we have a machine of the right type, except if the # Postpone if we have a machine of the right type, except if the
# local system can and wants to do the build. # local system can and wants to do the build.
if ($rightType && !$canBuildLocally) { if ($rightType && !$canBuildLocally) {
@ -138,43 +143,46 @@ if (scalar @available == 0) {
} else { } else {
decline; decline;
} }
} }
# Prioritise the available machines as follows: # Prioritise the available machines as follows:
# - First by load divided by speed factor, rounded to the nearest # - First by load divided by speed factor, rounded to the nearest
# integer. This causes fast machines to be preferred over slow # integer. This causes fast machines to be preferred over slow
# machines with similar loads. # machines with similar loads.
# - Then by speed factor. # - Then by speed factor.
# - Finally by load. # - Finally by load.
sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); } sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); }
@available = sort @available = sort
{ lf($a) <=> lf($b) { lf($a) <=> lf($b)
|| $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor} || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor}
|| $a->{load} <=> $b->{load} || $a->{load} <=> $b->{load}
} @available; } @available;
# Select the best available machine and lock a free slot. # Select the best available machine and lock a free slot.
my $selected = $available[0]; my $selected = $available[0];
my $machine = $selected->{machine}; my $machine = $selected->{machine};
my $slotLock = openSlotLock($machine, $selected->{free}); my $slotLock = openSlotLock($machine, $selected->{free});
flock($slotLock, LOCK_EX | LOCK_NB) or die; flock($slotLock, LOCK_EX | LOCK_NB) or die;
utime undef, undef, $slotLock; utime undef, undef, $slotLock;
close MAINLOCK; close MAINLOCK;
# Connect to the selected machine.
@sshOpts = ("-i", $machine->{sshKeys}, "-x");
$hostName = $machine->{hostName};
last if openSSHConnection $hostName;
warn "unable to open SSH connection to $hostName, trying other available machines...\n";
$machine->{enabled} = 0;
}
# Tell Nix we've accepted the build. # Tell Nix we've accepted the build.
sendReply "accept"; sendReply "accept";
if (defined $ENV{NIX_DEBUG_HOOK}) {
my $hostName = $machine->{hostName};
my $sp = $machine->{speedFactor};
print STDERR "building `$drvPath' on `$hostName' - $sp - " . $selected->{free} . "\n";
sleep 10;
exit 0;
}
my $x = <STDIN>; my $x = <STDIN>;
chomp $x; chomp $x;
@ -184,13 +192,8 @@ if ($x ne "okay") {
# Do the actual build. # Do the actual build.
my $hostName = $machine->{hostName};
print STDERR "building `$drvPath' on `$hostName'\n"; print STDERR "building `$drvPath' on `$hostName'\n";
push @sshOpts, "-i", $machine->{sshKeys}, "-x";
openSSHConnection $hostName;
my $inputs = `cat inputs`; die if ($? != 0); my $inputs = `cat inputs`; die if ($? != 0);
$inputs =~ s/\n/ /g; $inputs =~ s/\n/ /g;

View file

@ -53,7 +53,7 @@ while (@ARGV) {
} }
openSSHConnection $sshHost; openSSHConnection $sshHost or die "$0: unable to start SSH\n";
if ($toMode) { # Copy TO the remote machine. if ($toMode) { # Copy TO the remote machine.

View file

@ -12,15 +12,16 @@ sub openSSHConnection {
my ($host) = @_; my ($host) = @_;
die if $sshStarted; die if $sshStarted;
$sshHost = $host; $sshHost = $host;
return if system("ssh $sshHost @sshOpts -O check 2> /dev/null") == 0; return 1 if system("ssh $sshHost @sshOpts -O check 2> /dev/null") == 0;
my $tmpDir = tempdir("nix-ssh.XXXXXX", CLEANUP => 1, TMPDIR => 1) my $tmpDir = tempdir("nix-ssh.XXXXXX", CLEANUP => 1, TMPDIR => 1)
or die "cannot create a temporary directory"; or die "cannot create a temporary directory";
push @sshOpts, "-S", "$tmpDir/control"; push @sshOpts, "-S", "$tmpDir/control";
system("ssh $sshHost @sshOpts -M -N -f") == 0 system("ssh $sshHost @sshOpts -M -N -f") == 0
or die "unable to start SSH: $?"; or return 0;
$sshStarted = 1; $sshStarted = 1;
return 1;
} }
# Tell the master SSH client to exit. # Tell the master SSH client to exit.