From 6d738a31bf7b0c01191433c625229bb6c0a56efc Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 10 Jun 2015 14:57:16 +0200 Subject: [PATCH] Keep track of failed paths in the Hydra database I.e. don't use Nix's failed paths feature anymore. Easier to keep everything in one place. --- src/hydra-queue-runner/hydra-queue-runner.cc | 92 ++++++++++++++------ src/lib/Hydra/Controller/Admin.pm | 2 +- src/lib/Hydra/Helper/Nix.pm | 5 +- src/lib/Hydra/Schema/FailedPaths.pm | 65 ++++++++++++++ src/sql/hydra.sql | 16 ++++ 5 files changed, 149 insertions(+), 31 deletions(-) create mode 100644 src/lib/Hydra/Schema/FailedPaths.pm diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 4f1fc48c..844b19ee 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -780,7 +780,7 @@ void State::builder(Step::ptr step, MachineReservation::ptr reservation) auto store = openStore(); // FIXME: pool doBuildStep(store, step, reservation->machine); } catch (std::exception & e) { - printMsg(lvlError, format("build thread for ‘%1%’: %2%") % step->drvPath % e.what()); + printMsg(lvlError, format("error building ‘%1%’: %2%") % step->drvPath % e.what()); // FIXME: put step back in runnable and retry } @@ -828,35 +828,55 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, printMsg(lvlInfo, format("performing build step ‘%1%’ (needed by %2% builds)") % step->drvPath % dependents.size()); } - /* Create a build step record indicating that we started - building. Also, mark the selected build as busy. */ auto conn(dbPool.get()); + RemoteResult result; + BuildResult res; + int stepNr = 0; + result.startTime = time(0); - int stepNr; + + /* If any of the outputs have previously failed, then don't + retry. */ + bool cachedFailure = false; { pqxx::work txn(*conn); - stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bssBusy); - txn.parameterized("update Builds set busy = 1 where id = $1")(build->id).exec(); - txn.commit(); + for (auto & path : outputPaths(step->drv)) + if (!txn.parameterized("select 1 from FailedPaths where path = $1")(path).exec().empty()) { + cachedFailure = true; + break; + } } - try { - buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, result); - } catch (Error & e) { - result.status = RemoteResult::rrMiscFailure; - result.errorMsg = e.msg(); - printMsg(lvlError, format("ERROR: %1%") % e.msg()); - abort(); + if (cachedFailure) + result.status = RemoteResult::rrPermanentFailure; + else { + + /* Create a build step record indicating that we started + building. Also, mark the selected build as busy. */ + { + pqxx::work txn(*conn); + stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bssBusy); + txn.parameterized("update Builds set busy = 1 where id = $1")(build->id).exec(); + txn.commit(); + } + + try { + buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, result); + } catch (Error & e) { + result.status = RemoteResult::rrMiscFailure; + result.errorMsg = e.msg(); + printMsg(lvlError, format("ERROR: %1%") % e.msg()); + abort(); + } + + if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv); + + // FIXME: handle failed-with-output } if (!result.stopTime) result.stopTime = time(0); - BuildResult res; - if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv); - - // FIXME: handle failed-with-output - /* Remove this step. After this, incoming builds that depend on drvPath will either see that the output paths exist, or will create a new build step for drvPath. The latter is fine - it @@ -894,26 +914,42 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, markSucceededBuild(txn, build2, res, false, result.startTime, result.stopTime); } else { - /* Create failed build steps for every build that depends - on this. */ - for (auto build2 : dependents) { - if (build == build2) continue; - createBuildStep(txn, result.stopTime, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id); - } + /* Failure case. */ - finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg); + /* For regular failures, we don't care about the error + message. */ + if (result.status != RemoteResult::rrMiscFailure) result.errorMsg = ""; + + if (!cachedFailure) { + + /* Create failed build steps for every build that depends + on this. */ + for (auto build2 : dependents) { + if (build == build2) continue; + createBuildStep(txn, result.stopTime, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id); + } + + finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg); + } /* Mark all builds that depend on this derivation as failed. */ for (auto build2 : dependents) { printMsg(lvlError, format("marking build %1% as failed") % build2->id); txn.parameterized - ("update Builds set finished = 1, busy = 0, isCachedBuild = 0, buildStatus = $2, startTime = $3, stopTime = $4 where id = $1") + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1") (build2->id) ((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed)) (result.startTime) - (result.stopTime).exec(); + (result.stopTime) + (cachedFailure ? 1 : 0).exec(); build2->finishedInDB = true; // FIXME: txn might fail } + + /* Remember failed paths in the database so that they + won't be built again. */ + if (!cachedFailure && result.status == RemoteResult::rrPermanentFailure) + for (auto & path : outputPaths(step->drv)) + txn.parameterized("insert into FailedPaths values ($1)")(path).exec(); } txn.commit(); diff --git a/src/lib/Hydra/Controller/Admin.pm b/src/lib/Hydra/Controller/Admin.pm index 95666f07..a09a5edb 100644 --- a/src/lib/Hydra/Controller/Admin.pm +++ b/src/lib/Hydra/Controller/Admin.pm @@ -45,7 +45,7 @@ sub clear_queue_non_current : Chained('admin') PathPart('clear-queue-non-current sub clearfailedcache : Chained('admin') PathPart('clear-failed-cache') Args(0) { my ($self, $c) = @_; - my $r = `nix-store --clear-failed-paths '*'`; + $c->model('DB::FailedPaths')->delete; $c->res->redirect($c->request->referer // "/"); } diff --git a/src/lib/Hydra/Helper/Nix.pm b/src/lib/Hydra/Helper/Nix.pm index 087e66cb..7a68490f 100644 --- a/src/lib/Hydra/Helper/Nix.pm +++ b/src/lib/Hydra/Helper/Nix.pm @@ -465,9 +465,10 @@ sub restartBuilds($$) { # !!! Should do this in a trigger. $db->resultset('JobsetEvals')->search({ build => \@buildIds }, { join => 'buildIds' })->update({ nrsucceeded => undef }); - # Clear Nix's negative failure cache. + # Clear the failed paths cache. # FIXME: Add this to the API. - system("nix-store", "--clear-failed-paths", @paths); + # FIXME: clear the dependencies? + $db->resultset('FailedPaths')->search({ path => [ @paths ]})->delete; }); return scalar(@buildIds); diff --git a/src/lib/Hydra/Schema/FailedPaths.pm b/src/lib/Hydra/Schema/FailedPaths.pm new file mode 100644 index 00000000..082b989d --- /dev/null +++ b/src/lib/Hydra/Schema/FailedPaths.pm @@ -0,0 +1,65 @@ +use utf8; +package Hydra::Schema::FailedPaths; + +# Created by DBIx::Class::Schema::Loader +# DO NOT MODIFY THE FIRST PART OF THIS FILE + +=head1 NAME + +Hydra::Schema::FailedPaths + +=cut + +use strict; +use warnings; + +use base 'DBIx::Class::Core'; + +=head1 COMPONENTS LOADED + +=over 4 + +=item * L + +=back + +=cut + +__PACKAGE__->load_components("+Hydra::Component::ToJSON"); + +=head1 TABLE: C + +=cut + +__PACKAGE__->table("FailedPaths"); + +=head1 ACCESSORS + +=head2 path + + data_type: 'text' + is_nullable: 0 + +=cut + +__PACKAGE__->add_columns("path", { data_type => "text", is_nullable => 0 }); + +=head1 PRIMARY KEY + +=over 4 + +=item * L + +=back + +=cut + +__PACKAGE__->set_primary_key("path"); + + +# Created by DBIx::Class::Schema::Loader v0.07033 @ 2015-06-10 14:48:16 +# DO NOT MODIFY THIS OR ANYTHING ABOVE! md5sum:WFgjfjH+szE6Ntcicmaflw + + +# You can replace this text with custom code or comments, and it will be preserved on regeneration +1; diff --git a/src/sql/hydra.sql b/src/sql/hydra.sql index 4def1825..83178c25 100644 --- a/src/sql/hydra.sql +++ b/src/sql/hydra.sql @@ -511,6 +511,22 @@ create table StarredJobs ( ); +-- The output paths that have permanently failed. +create table FailedPaths ( + path text primary key not null +); + +#ifdef POSTGRESQL + +-- Needed because Postgres doesn't have "ignore duplicate" or upsert +-- yet. +create rule IdempotentInsert as on insert to FailedPaths + where exists (select 1 from FailedPaths where path = new.path) + do instead nothing; + +#endif + + -- Cache of the number of finished builds. create table NrBuilds ( what text primary key not null,