From e4407f8c9324d459e0cf676a5a3b29d807a738ee Mon Sep 17 00:00:00 2001 From: Graham Christensen Date: Fri, 21 Jan 2022 15:12:10 -0500 Subject: [PATCH 1/5] HydraTestContext: expose the nix state dir --- t/lib/HydraTestContext.pm | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/t/lib/HydraTestContext.pm b/t/lib/HydraTestContext.pm index 9f9db4f2..b8c254d0 100644 --- a/t/lib/HydraTestContext.pm +++ b/t/lib/HydraTestContext.pm @@ -69,6 +69,7 @@ sub new { _db => undef, db_handle => $pgsql, tmpdir => $dir, + nix_state_dir => "$dir/nix/var/nix", testdir => abs_path(dirname(__FILE__) . "/.."), jobsdir => abs_path(dirname(__FILE__) . "/../jobs") }; @@ -115,6 +116,12 @@ sub jobsdir { return $self->{jobsdir}; } +sub nix_state_dir { + my ($self) = @_; + + return $self->{nix_state_dir}; +} + # Create a jobset, evaluate it, and optionally build the jobs. # # In return, you get a hash of all the Builds records, keyed From 5c3e48fd0dc334efb4aafffc22c26a4db99d390a Mon Sep 17 00:00:00 2001 From: Graham Christensen Date: Fri, 21 Jan 2022 15:17:47 -0500 Subject: [PATCH 2/5] CliRunners: decode UTF8 before printing stderr/stdout MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes yath output. It used to say: step â is now runnable Now it says: step ‘/run...2ipqz6hbc41m4c5w5bkq-dependent-job.drv’ is now runnable --- t/lib/CliRunners.pm | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/t/lib/CliRunners.pm b/t/lib/CliRunners.pm index e693eeb7..7ae936c0 100644 --- a/t/lib/CliRunners.pm +++ b/t/lib/CliRunners.pm @@ -27,6 +27,8 @@ sub evalSucceeds { $jobset->discard_changes; # refresh from DB if ($res) { chomp $stdout; chomp $stderr; + utf8::decode($stdout) or die "Invalid unicode in stdout."; + utf8::decode($stderr) or die "Invalid unicode in stderr."; print STDERR "Evaluation unexpectedly failed for jobset ".$jobset->project->name.":".$jobset->name.": \n".$jobset->errormsg."\n" if $jobset->errormsg; print STDERR "STDOUT: $stdout\n" if $stdout ne ""; print STDERR "STDERR: $stderr\n" if $stderr ne ""; @@ -40,6 +42,8 @@ sub evalFails { $jobset->discard_changes; # refresh from DB if (!$res) { chomp $stdout; chomp $stderr; + utf8::decode($stdout) or die "Invalid unicode in stdout."; + utf8::decode($stderr) or die "Invalid unicode in stderr."; print STDERR "Evaluation unexpectedly succeeded for jobset ".$jobset->project->name.":".$jobset->name.": \n".$jobset->errormsg."\n" if $jobset->errormsg; print STDERR "STDOUT: $stdout\n" if $stdout ne ""; print STDERR "STDERR: $stderr\n" if $stderr ne ""; @@ -51,6 +55,8 @@ sub runBuild { my ($build) = @_; my ($res, $stdout, $stderr) = captureStdoutStderr(60, ("hydra-queue-runner", "-vvvv", "--build-one", $build->id)); if ($res) { + utf8::decode($stdout) or die "Invalid unicode in stdout."; + utf8::decode($stderr) or die "Invalid unicode in stderr."; print STDERR "Queue runner stdout: $stdout\n" if $stdout ne ""; print STDERR "Queue runner stderr: $stderr\n" if $stderr ne ""; } @@ -60,6 +66,8 @@ sub runBuild { sub sendNotifications() { my ($res, $stdout, $stderr) = captureStdoutStderr(60, ("hydra-notify", "--queued-only")); if ($res) { + utf8::decode($stdout) or die "Invalid unicode in stdout."; + utf8::decode($stderr) or die "Invalid unicode in stderr."; print STDERR "hydra notify stdout: $stdout\n" if $stdout ne ""; print STDERR "hydra notify stderr: $stderr\n" if $stderr ne ""; } From 952f629b7cf1ea918023747a198efff1d1686457 Mon Sep 17 00:00:00 2001 From: Graham Christensen Date: Fri, 21 Jan 2022 15:13:52 -0500 Subject: [PATCH 3/5] Test the queue runner in the scenario where a dependency is available in the cache but GC'd locally, where we're building locally --- t/jobs/dependencies/dependency.nix | 17 +++++++ t/jobs/dependencies/dependentOnly.nix | 4 ++ t/jobs/dependencies/underlyingOnly.nix | 4 ++ .../build-locally-with-substitutable-path.t | 51 +++++++++++++++++++ 4 files changed, 76 insertions(+) create mode 100644 t/jobs/dependencies/dependency.nix create mode 100644 t/jobs/dependencies/dependentOnly.nix create mode 100644 t/jobs/dependencies/underlyingOnly.nix create mode 100644 t/queue-runner/build-locally-with-substitutable-path.t diff --git a/t/jobs/dependencies/dependency.nix b/t/jobs/dependencies/dependency.nix new file mode 100644 index 00000000..f528bdc8 --- /dev/null +++ b/t/jobs/dependencies/dependency.nix @@ -0,0 +1,17 @@ +{ exposeUnderlyingJob, exposeDependentJob }: +with import ../config.nix; +let + underlyingJob = mkDerivation { + name = "underlying-job"; + builder = ../empty-dir-builder.sh; + }; + + dependentJob = mkDerivation { + name = "dependent-job"; + builder = ../empty-dir-builder.sh; + inherit underlyingJob; + }; +in +(if exposeUnderlyingJob then { inherit underlyingJob; } else { }) // +(if exposeDependentJob then { inherit dependentJob; } else { }) // +{ } diff --git a/t/jobs/dependencies/dependentOnly.nix b/t/jobs/dependencies/dependentOnly.nix new file mode 100644 index 00000000..edf8a912 --- /dev/null +++ b/t/jobs/dependencies/dependentOnly.nix @@ -0,0 +1,4 @@ +import ./dependency.nix { + exposeUnderlyingJob = false; + exposeDependentJob = true; +} diff --git a/t/jobs/dependencies/underlyingOnly.nix b/t/jobs/dependencies/underlyingOnly.nix new file mode 100644 index 00000000..106d17fe --- /dev/null +++ b/t/jobs/dependencies/underlyingOnly.nix @@ -0,0 +1,4 @@ +import ./dependency.nix { + exposeUnderlyingJob = true; + exposeDependentJob = false; +} diff --git a/t/queue-runner/build-locally-with-substitutable-path.t b/t/queue-runner/build-locally-with-substitutable-path.t new file mode 100644 index 00000000..f335d284 --- /dev/null +++ b/t/queue-runner/build-locally-with-substitutable-path.t @@ -0,0 +1,51 @@ +use strict; +use warnings; +use Setup; +use Data::Dumper; +use Test2::V0; +my $ctx = test_context( + use_external_destination_store => 1 +); + +# This test is regarding https://github.com/NixOS/hydra/pull/1126 +# +# A hydra instance was regularly failing to build derivations with: +# +# possibly transient failure building ‘/nix/store/X.drv’ on ‘localhost’: +# dependency '/nix/store/Y' of '/nix/store/Y.drv' does not exist, +# and substitution is disabled +# +# However it would only fail when building on localhost, and it would only +# fail if the build output was already in the binary cache. +# +# This test replicates this scenario by having two jobs, underlyingJob and +# dependentJob. dependentJob depends on underlyingJob. We first build +# underlyingJob and copy it to an external cache. Then forcefully delete +# the output of underlyingJob, and build dependentJob. In order to pass +# it must either rebuild underlyingJob or fetch it from the cache. + + +subtest "Building, caching, and then garbage collecting the underlying job" => sub { + my $builds = $ctx->makeAndEvaluateJobset( + expression => "dependencies/underlyingOnly.nix", + build => 1 + ); + + my $path = $builds->{"underlyingJob"}->buildoutputs->find({ name => "out" })->path; + + ok(unlink(Hydra::Helper::Nix::gcRootFor($path)), "Unlinking the GC root for underlying Dependency succeeds"); + + (my $ret, my $stdout, my $stderr) = captureStdoutStderr(1, "nix-store", "--delete", $path); + is($ret, 0, "Deleting the underlying dependency should succeed"); +}; + +subtest "Building the dependent job should now succeed, even though we're missing a local dependency" => sub { + my $builds = $ctx->makeAndEvaluateJobset( + expression => "dependencies/dependentOnly.nix" + ); + + ok(runBuild($builds->{"dependentJob"}), "building the job should succeed"); +}; + + +done_testing; From 7e9e82398de272967c90c3c10970d5b0f5901a61 Mon Sep 17 00:00:00 2001 From: Graham Christensen Date: Thu, 20 Jan 2022 15:04:57 -0500 Subject: [PATCH 4/5] build-remote: copy missing paths from the binary cache to localhost MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In a Hydra instance I saw: possibly transient failure building ‘/nix/store/X.drv’ on ‘localhost’: dependency '/nix/store/Y' of '/nix/store/Y.drv' does not exist, and substitution is disabled This is confusing because the Hydra in question does have substitution enabled. This instance uses: keep-outputs = true keep-derivations = true and an S3 binary cache which is not configured as a substituter in the nix.conf. It appears this instance encountered a situation where store path Y was built and present in the binary cache, and Y.drv was GC rooted on the instance, however Y was not on the host. When Hydra would try to build this path locally, it would look in the binary cache to see if it was cached: (nix) 439 bool valid = isValidPathUncached(storePath); 440 441 if (diskCache && !valid) 442 // FIXME: handle valid = true case. 443 diskCache->upsertNarInfo(getUri(), hashPart, 0); 444 445 return valid; Since it was cached, the store path was considered Valid. The queue monitor would then not put this input in for substitution, because the path is valid: (hydra) 470 if (!destStore->isValidPath(*i.second.path(*localStore, step->drv->name, i.first))) { 471 valid = false; 472 missing.insert_or_assign(i.first, i.second); 473 } Hydra appears to correctly handle the case of missing paths that need to be substituted from the binary cache already, but since most Hydra instances use `keep-outputs` *and* all paths in the binary cache originate from that machine, it is not common for a path to be cached and not GC rooted locally. I'll run Hydra with this patch for a while and see if we run in to the problem again. A big thanks to John Ericson who helped debug this particular issue. --- src/hydra-queue-runner/build-remote.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 9f789978..a82e911e 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -271,7 +271,11 @@ void State::buildRemote(ref destStore, } /* Copy the input closure. */ - if (!machine->isLocalhost()) { + if (machine->isLocalhost()) { + StorePathSet closure; + destStore->computeFSClosure(inputs, closure); + copyPaths(*destStore, *localStore, closure, NoRepair, NoCheckSigs, NoSubstitute); + } else { auto mc1 = std::make_shared>(nrStepsWaiting); mc1.reset(); MaintainCount mc2(nrStepsCopyingTo); From ba96a13407fcca5d637c0dd162cfb8b2c9cb8bd0 Mon Sep 17 00:00:00 2001 From: Graham Christensen Date: Fri, 21 Jan 2022 15:35:47 -0500 Subject: [PATCH 5/5] Record metrics when getting the closure to localhost --- src/hydra-queue-runner/build-remote.cc | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index a82e911e..e9fdb293 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -270,21 +270,24 @@ void State::buildRemote(ref destStore, copyPaths(*localStore, *destStore, closure, NoRepair, NoCheckSigs, NoSubstitute); } - /* Copy the input closure. */ - if (machine->isLocalhost()) { - StorePathSet closure; - destStore->computeFSClosure(inputs, closure); - copyPaths(*destStore, *localStore, closure, NoRepair, NoCheckSigs, NoSubstitute); - } else { + { auto mc1 = std::make_shared>(nrStepsWaiting); mc1.reset(); MaintainCount mc2(nrStepsCopyingTo); + printMsg(lvlDebug, "sending closure of ‘%s’ to ‘%s’", localStore->printStorePath(step->drvPath), machine->sshName); auto now1 = std::chrono::steady_clock::now(); - copyClosureTo(machine->state->sendLock, destStore, from, to, inputs, true); + /* Copy the input closure. */ + if (machine->isLocalhost()) { + StorePathSet closure; + destStore->computeFSClosure(inputs, closure); + copyPaths(*destStore, *localStore, closure, NoRepair, NoCheckSigs, NoSubstitute); + } else { + copyClosureTo(machine->state->sendLock, destStore, from, to, inputs, true); + } auto now2 = std::chrono::steady_clock::now();