#! @perl@ -w @perlFlags@ use utf8; use Fcntl qw(:DEFAULT :flock); use English '-no_match_vars'; use IO::Handle; use Nix::Config; use Nix::SSH; use Nix::CopyClosure; use Nix::Store; use Encode; no warnings('once'); STDERR->autoflush(1); binmode STDERR, ":encoding(utf8)"; my $debug = defined $ENV{NIX_DEBUG_HOOK}; # General operation: # # Try to find a free machine of type $neededSystem. We do this as # follows: # - We acquire an exclusive lock on $currentLoad/main-lock. # - For each machine $machine of type $neededSystem and for each $slot # less than the maximum load for that machine, we try to get an # exclusive lock on $currentLoad/$machine-$slot (without blocking). # If we get such a lock, we send "accept" to the caller. Otherwise, # we send "postpone" and exit. # - We release the exclusive lock on $currentLoad/main-lock. # - We perform the build on $neededSystem. # - We release the exclusive lock on $currentLoad/$machine-$slot. # # The nice thing about this scheme is that if we die prematurely, the # locks are released automatically. # Make sure that we don't get any SSH passphrase or host key popups - # if there is any problem it should fail, not do something # interactive. $ENV{"DISPLAY"} = ""; $ENV{"SSH_ASKPASS"} = ""; sub sendReply { my $reply = shift; print STDERR "# $reply\n"; } sub all { $_ || return 0 for @_; 1 } # Initialisation. my $loadIncreased = 0; my ($localSystem, $maxSilentTime, $buildTimeout) = @ARGV; my $currentLoad = $ENV{"NIX_CURRENT_LOAD"} // "/run/nix/current-load"; my $conf = $ENV{"NIX_REMOTE_SYSTEMS"} // "@sysconfdir@/nix/machines"; sub openSlotLock { my ($machine, $slot) = @_; my $slotLockFn = "$currentLoad/" . (join '+', @{$machine->{systemTypes}}) . "-" . $machine->{hostName} . "-$slot"; my $slotLock = new IO::Handle; sysopen $slotLock, "$slotLockFn", O_RDWR|O_CREAT, 0600 or die; return $slotLock; } # Read the list of machines. my @machines; if (defined $conf && -e $conf) { open CONF, "<$conf" or die; while (<CONF>) { chomp; s/\#.*$//g; next if /^\s*$/; my @tokens = split /\s/, $_; my @supportedFeatures = split(/,/, $tokens[5] || ""); my @mandatoryFeatures = split(/,/, $tokens[6] || ""); push @machines, { hostName => $tokens[0] , systemTypes => [ split(/,/, $tokens[1]) ] , sshKey => $tokens[2] , maxJobs => int($tokens[3]) , speedFactor => 1.0 * (defined $tokens[4] ? int($tokens[4]) : 1) , supportedFeatures => [ @supportedFeatures, @mandatoryFeatures ] , mandatoryFeatures => [ @mandatoryFeatures ] , enabled => 1 }; } close CONF; } # Wait for the calling process to ask us whether we can build some derivation. my ($drvPath, $hostName, $slotLock); my ($from, $to); REQ: while (1) { $_ = <STDIN> || exit 0; (my $amWilling, my $neededSystem, $drvPath, my $requiredFeatures) = split; my @requiredFeatures = split /,/, $requiredFeatures; my $canBuildLocally = $amWilling && ($localSystem eq $neededSystem); if (!defined $currentLoad) { sendReply "decline"; next; } # Acquire the exclusive lock on $currentLoad/main-lock. mkdir $currentLoad, 0777 or die unless -d $currentLoad; my $mainLock = "$currentLoad/main-lock"; sysopen MAINLOCK, "$mainLock", O_RDWR|O_CREAT, 0600 or die; flock(MAINLOCK, LOCK_EX) or die; while (1) { # Find all machine that can execute this build, i.e., that # support builds for the given platform and features, and are # not at their job limit. my $rightType = 0; my @available = (); LOOP: foreach my $cur (@machines) { if ($cur->{enabled} && (grep { $neededSystem eq $_ } @{$cur->{systemTypes}}) && all(map { my $f = $_; 0 != grep { $f eq $_ } @{$cur->{supportedFeatures}} } (@requiredFeatures, @mandatoryFeatures)) && all(map { my $f = $_; 0 != grep { $f eq $_ } @requiredFeatures } @{$cur->{mandatoryFeatures}}) ) { $rightType = 1; # We have a machine of the right type. Determine the load on # the machine. my $slot = 0; my $load = 0; my $free; while ($slot < $cur->{maxJobs}) { my $slotLock = openSlotLock($cur, $slot); if (flock($slotLock, LOCK_EX | LOCK_NB)) { $free = $slot unless defined $free; flock($slotLock, LOCK_UN) or die; } else { $load++; } close $slotLock; $slot++; } push @available, { machine => $cur, load => $load, free => $free } if $load < $cur->{maxJobs}; } } if ($debug) { print STDERR "load on " . $_->{machine}->{hostName} . " = " . $_->{load} . "\n" foreach @available; } # Didn't find any available machine? Then decline or postpone. if (scalar @available == 0) { # Postpone if we have a machine of the right type, except # if the local system can and wants to do the build. if ($rightType && !$canBuildLocally) { sendReply "postpone"; } else { sendReply "decline"; } close MAINLOCK; next REQ; } # Prioritise the available machines as follows: # - First by load divided by speed factor, rounded to the nearest # integer. This causes fast machines to be preferred over slow # machines with similar loads. # - Then by speed factor. # - Finally by load. sub lf { my $x = shift; return int($x->{load} / $x->{machine}->{speedFactor} + 0.4999); } @available = sort { lf($a) <=> lf($b) || $b->{machine}->{speedFactor} <=> $a->{machine}->{speedFactor} || $a->{load} <=> $b->{load} } @available; # Select the best available machine and lock a free slot. my $selected = $available[0]; my $machine = $selected->{machine}; $slotLock = openSlotLock($machine, $selected->{free}); flock($slotLock, LOCK_EX | LOCK_NB) or die; utime undef, undef, $slotLock; close MAINLOCK; # Connect to the selected machine. my @sshOpts = ("-i", $machine->{sshKey}); $hostName = $machine->{hostName}; eval { ($from, $to) = connectToRemoteNix($hostName, \@sshOpts, "2>&4"); # FIXME: check if builds are inhibited. }; last REQ unless $@; print STDERR "$@"; warn "unable to open SSH connection to ‘$hostName’, trying other available machines...\n"; $from = undef; $to = undef; $machine->{enabled} = 0; } } # Tell Nix we've accepted the build. sendReply "accept"; my @inputs = split /\s/, readline(STDIN); my @outputs = split /\s/, readline(STDIN); # Copy the derivation and its dependencies to the build machine. This # is guarded by an exclusive lock per machine to prevent multiple # build-remote instances from copying to a machine simultaneously. # That's undesirable because we may end up with N instances uploading # the same missing path simultaneously, causing the effective network # bandwidth and target disk speed to be divided by N. my $uploadLock = "$currentLoad/$hostName.upload-lock"; sysopen UPLOADLOCK, "$uploadLock", O_RDWR|O_CREAT, 0600 or die; eval { local $SIG{ALRM} = sub { die "alarm\n" }; # Don't wait forever, so that a process that gets stuck while # holding the lock doesn't block everybody else indefinitely. # It's safe to continue after a timeout, just (potentially) # inefficient. alarm 15 * 60; flock(UPLOADLOCK, LOCK_EX); alarm 0; }; if ($@) { die unless $@ eq "alarm\n"; print STDERR "somebody is hogging $uploadLock, continuing...\n"; unlink $uploadLock; } Nix::CopyClosure::copyToOpen($from, $to, $hostName, [ $drvPath, @inputs ], 0, 0); close UPLOADLOCK; # Perform the build. print STDERR "building ‘$drvPath’ on ‘$hostName’\n"; writeInt(6, $to) or die; # == cmdBuildPaths writeStrings([$drvPath], $to); writeInt($maxSilentTime, $to); writeInt($buildTimeout, $to); my $res = readInt($from); if ($res != 0) { my $msg = decode("utf-8", readString($from)); print STDERR "error: $msg on ‘$hostName’\n"; exit $res; } # Copy the output from the build machine. my @outputs2 = grep { !isValidPath($_) } @outputs; if (scalar @outputs2 > 0) { writeInt(5, $to); # == cmdExportPaths writeInt(0, $to); # don't sign writeStrings(\@outputs2, $to); $ENV{'NIX_HELD_LOCKS'} = "@outputs2"; # FIXME: ugly importPaths(fileno($from), 1); }