Keep track of failed paths in the Hydra database

I.e. don't use Nix's failed paths feature anymore. Easier to keep
everything in one place.
This commit is contained in:
Eelco Dolstra 2015-06-10 14:57:16 +02:00
parent c68036f8b0
commit 6d738a31bf
5 changed files with 149 additions and 31 deletions

View file

@ -780,7 +780,7 @@ void State::builder(Step::ptr step, MachineReservation::ptr reservation)
auto store = openStore(); // FIXME: pool auto store = openStore(); // FIXME: pool
doBuildStep(store, step, reservation->machine); doBuildStep(store, step, reservation->machine);
} catch (std::exception & e) { } catch (std::exception & e) {
printMsg(lvlError, format("build thread for %1%: %2%") % step->drvPath % e.what()); printMsg(lvlError, format("error building %1%: %2%") % step->drvPath % e.what());
// FIXME: put step back in runnable and retry // FIXME: put step back in runnable and retry
} }
@ -828,35 +828,55 @@ void State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
printMsg(lvlInfo, format("performing build step %1% (needed by %2% builds)") % step->drvPath % dependents.size()); printMsg(lvlInfo, format("performing build step %1% (needed by %2% builds)") % step->drvPath % dependents.size());
} }
/* Create a build step record indicating that we started
building. Also, mark the selected build as busy. */
auto conn(dbPool.get()); auto conn(dbPool.get());
RemoteResult result; RemoteResult result;
BuildResult res;
int stepNr = 0;
result.startTime = time(0); result.startTime = time(0);
int stepNr;
/* If any of the outputs have previously failed, then don't
retry. */
bool cachedFailure = false;
{ {
pqxx::work txn(*conn); pqxx::work txn(*conn);
stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bssBusy); for (auto & path : outputPaths(step->drv))
txn.parameterized("update Builds set busy = 1 where id = $1")(build->id).exec(); if (!txn.parameterized("select 1 from FailedPaths where path = $1")(path).exec().empty()) {
txn.commit(); cachedFailure = true;
break;
}
} }
try { if (cachedFailure)
buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, result); result.status = RemoteResult::rrPermanentFailure;
} catch (Error & e) { else {
result.status = RemoteResult::rrMiscFailure;
result.errorMsg = e.msg(); /* Create a build step record indicating that we started
printMsg(lvlError, format("ERROR: %1%") % e.msg()); building. Also, mark the selected build as busy. */
abort(); {
pqxx::work txn(*conn);
stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bssBusy);
txn.parameterized("update Builds set busy = 1 where id = $1")(build->id).exec();
txn.commit();
}
try {
buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, result);
} catch (Error & e) {
result.status = RemoteResult::rrMiscFailure;
result.errorMsg = e.msg();
printMsg(lvlError, format("ERROR: %1%") % e.msg());
abort();
}
if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv);
// FIXME: handle failed-with-output
} }
if (!result.stopTime) result.stopTime = time(0); if (!result.stopTime) result.stopTime = time(0);
BuildResult res;
if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv);
// FIXME: handle failed-with-output
/* Remove this step. After this, incoming builds that depend on /* Remove this step. After this, incoming builds that depend on
drvPath will either see that the output paths exist, or will drvPath will either see that the output paths exist, or will
create a new build step for drvPath. The latter is fine - it create a new build step for drvPath. The latter is fine - it
@ -894,26 +914,42 @@ void State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
markSucceededBuild(txn, build2, res, false, result.startTime, result.stopTime); markSucceededBuild(txn, build2, res, false, result.startTime, result.stopTime);
} else { } else {
/* Create failed build steps for every build that depends /* Failure case. */
on this. */
for (auto build2 : dependents) {
if (build == build2) continue;
createBuildStep(txn, result.stopTime, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id);
}
finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg); /* For regular failures, we don't care about the error
message. */
if (result.status != RemoteResult::rrMiscFailure) result.errorMsg = "";
if (!cachedFailure) {
/* Create failed build steps for every build that depends
on this. */
for (auto build2 : dependents) {
if (build == build2) continue;
createBuildStep(txn, result.stopTime, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id);
}
finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg);
}
/* Mark all builds that depend on this derivation as failed. */ /* Mark all builds that depend on this derivation as failed. */
for (auto build2 : dependents) { for (auto build2 : dependents) {
printMsg(lvlError, format("marking build %1% as failed") % build2->id); printMsg(lvlError, format("marking build %1% as failed") % build2->id);
txn.parameterized txn.parameterized
("update Builds set finished = 1, busy = 0, isCachedBuild = 0, buildStatus = $2, startTime = $3, stopTime = $4 where id = $1") ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")
(build2->id) (build2->id)
((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed)) ((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed))
(result.startTime) (result.startTime)
(result.stopTime).exec(); (result.stopTime)
(cachedFailure ? 1 : 0).exec();
build2->finishedInDB = true; // FIXME: txn might fail build2->finishedInDB = true; // FIXME: txn might fail
} }
/* Remember failed paths in the database so that they
won't be built again. */
if (!cachedFailure && result.status == RemoteResult::rrPermanentFailure)
for (auto & path : outputPaths(step->drv))
txn.parameterized("insert into FailedPaths values ($1)")(path).exec();
} }
txn.commit(); txn.commit();

View file

@ -45,7 +45,7 @@ sub clear_queue_non_current : Chained('admin') PathPart('clear-queue-non-current
sub clearfailedcache : Chained('admin') PathPart('clear-failed-cache') Args(0) { sub clearfailedcache : Chained('admin') PathPart('clear-failed-cache') Args(0) {
my ($self, $c) = @_; my ($self, $c) = @_;
my $r = `nix-store --clear-failed-paths '*'`; $c->model('DB::FailedPaths')->delete;
$c->res->redirect($c->request->referer // "/"); $c->res->redirect($c->request->referer // "/");
} }

View file

@ -465,9 +465,10 @@ sub restartBuilds($$) {
# !!! Should do this in a trigger. # !!! Should do this in a trigger.
$db->resultset('JobsetEvals')->search({ build => \@buildIds }, { join => 'buildIds' })->update({ nrsucceeded => undef }); $db->resultset('JobsetEvals')->search({ build => \@buildIds }, { join => 'buildIds' })->update({ nrsucceeded => undef });
# Clear Nix's negative failure cache. # Clear the failed paths cache.
# FIXME: Add this to the API. # FIXME: Add this to the API.
system("nix-store", "--clear-failed-paths", @paths); # FIXME: clear the dependencies?
$db->resultset('FailedPaths')->search({ path => [ @paths ]})->delete;
}); });
return scalar(@buildIds); return scalar(@buildIds);

View file

@ -0,0 +1,65 @@
use utf8;
package Hydra::Schema::FailedPaths;
# Created by DBIx::Class::Schema::Loader
# DO NOT MODIFY THE FIRST PART OF THIS FILE
=head1 NAME
Hydra::Schema::FailedPaths
=cut
use strict;
use warnings;
use base 'DBIx::Class::Core';
=head1 COMPONENTS LOADED
=over 4
=item * L<Hydra::Component::ToJSON>
=back
=cut
__PACKAGE__->load_components("+Hydra::Component::ToJSON");
=head1 TABLE: C<FailedPaths>
=cut
__PACKAGE__->table("FailedPaths");
=head1 ACCESSORS
=head2 path
data_type: 'text'
is_nullable: 0
=cut
__PACKAGE__->add_columns("path", { data_type => "text", is_nullable => 0 });
=head1 PRIMARY KEY
=over 4
=item * L</path>
=back
=cut
__PACKAGE__->set_primary_key("path");
# Created by DBIx::Class::Schema::Loader v0.07033 @ 2015-06-10 14:48:16
# DO NOT MODIFY THIS OR ANYTHING ABOVE! md5sum:WFgjfjH+szE6Ntcicmaflw
# Result class for the FailedPaths table: a single non-null text column
# "path", which is also the primary key. Within Hydra's Perl code, rows are
# removed through this class by the admin "clear-failed-cache" action
# (deletes every row) and by restartBuilds() (deletes the rows matching the
# output paths of the restarted builds). No custom methods are defined here.
1;

View file

@ -511,6 +511,22 @@ create table StarredJobs (
); );
-- The output paths that have permanently failed.
-- One row per output path. The queue runner looks up every output path of
-- a build step here before building; if any of them is present, the step
-- is treated as a cached permanent failure and is not built again. Rows
-- are added by the queue runner after a permanent build failure, and are
-- removed by the "clear-failed-cache" admin action and by restartBuilds.
create table FailedPaths (
    path text primary key not null
);
#ifdef POSTGRESQL
-- Needed because Postgres doesn't have "ignore duplicate" or upsert
-- yet.
-- Effect: an insert into FailedPaths whose path is already present is
-- replaced by a no-op, so recording the same failed path twice does not
-- raise a primary key violation. Inserts of new paths are unaffected.
-- NOTE(review): the existence check is evaluated as part of the rewritten
-- insert; presumably two concurrent transactions inserting the same new
-- path can still conflict on the primary key -- confirm before relying on
-- this under concurrency.
create rule IdempotentInsert as on insert to FailedPaths
  where exists (select 1 from FailedPaths where path = new.path)
  do instead nothing;
#endif
-- Cache of the number of finished builds. -- Cache of the number of finished builds.
create table NrBuilds ( create table NrBuilds (
what text primary key not null, what text primary key not null,