diff --git a/src/hydra-queue-runner/builder.cc b/src/hydra-queue-runner/builder.cc index 782c1710..642d6ba5 100644 --- a/src/hydra-queue-runner/builder.cc +++ b/src/hydra-queue-runner/builder.cc @@ -35,10 +35,18 @@ void State::builder(MachineReservation::ptr reservation) activeSteps_.lock()->erase(activeStep); }); + auto conn(dbPool.get()); + try { auto destStore = getDestStore(); // Might release the reservation. - res = doBuildStep(destStore, reservation, activeStep); + res = doBuildStep(destStore, reservation, *conn, activeStep); + } catch (pqxx::broken_connection & e) { + printMsg(lvlError, "db lost while building ‘%s’ on ‘%s’: %s (retriable)", + localStore->printStorePath(activeStep->step->drvPath), + reservation ? reservation->machine->sshName : std::string("(no machine)"), + e.what()); + conn.markBad(); } catch (std::exception & e) { printMsg(lvlError, "uncaught exception building ‘%s’ on ‘%s’: %s", localStore->printStorePath(activeStep->step->drvPath), @@ -76,6 +84,7 @@ void State::builder(MachineReservation::ptr reservation) State::StepResult State::doBuildStep(nix::ref destStore, MachineReservation::ptr & reservation, + Connection & conn, std::shared_ptr activeStep) { auto step(reservation->step); @@ -106,8 +115,6 @@ State::StepResult State::doBuildStep(nix::ref destStore, buildOptions.maxLogSize = maxLogSize; buildOptions.enforceDeterminism = step->isDeterministic; - auto conn(dbPool.get()); - { std::set dependents; std::set steps; @@ -132,7 +139,7 @@ State::StepResult State::doBuildStep(nix::ref destStore, for (auto build2 : dependents) { if (build2->drvPath == step->drvPath) { build = build2; - pqxx::work txn(*conn); + pqxx::work txn(conn); notifyBuildStarted(txn, build->id); txn.commit(); } @@ -187,7 +194,7 @@ State::StepResult State::doBuildStep(nix::ref destStore, /* If any of the outputs have previously failed, then don't bother building again. */ - if (checkCachedFailure(step, *conn)) + if (checkCachedFailure(step, conn)) result.stepStatus = bsCachedFailure; else { @@ -195,13 +202,13 @@ State::StepResult State::doBuildStep(nix::ref destStore, building. */ { auto mc = startDbUpdate(); - pqxx::work txn(*conn); + pqxx::work txn(conn); stepNr = createBuildStep(txn, result.startTime, buildId, step, machine->sshName, bsBusy); txn.commit(); } auto updateStep = [&](StepState stepState) { - pqxx::work txn(*conn); + pqxx::work txn(conn); updateBuildStep(txn, buildId, stepNr, stepState); txn.commit(); }; @@ -252,7 +259,7 @@ State::StepResult State::doBuildStep(nix::ref destStore, /* Finish the step in the database. */ if (stepNr) { - pqxx::work txn(*conn); + pqxx::work txn(conn); finishBuildStep(txn, result, buildId, stepNr, machine->sshName); txn.commit(); } @@ -328,7 +335,7 @@ State::StepResult State::doBuildStep(nix::ref destStore, { auto mc = startDbUpdate(); - pqxx::work txn(*conn); + pqxx::work txn(conn); for (auto & b : direct) { printInfo("marking build %1% as succeeded", b->id); @@ -356,7 +363,7 @@ State::StepResult State::doBuildStep(nix::ref destStore, /* Send notification about the builds that have this step as the top-level. */ { - pqxx::work txn(*conn); + pqxx::work txn(conn); for (auto id : buildIDs) notifyBuildFinished(txn, id, {}); txn.commit(); @@ -385,7 +392,7 @@ State::StepResult State::doBuildStep(nix::ref destStore, } } else - failStep(*conn, step, buildId, result, machine, stepFinished); + failStep(conn, step, buildId, result, machine, stepFinished); // FIXME: keep stats about aborted steps? nrStepsDone++; diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh index 29349c9b..d0316b2a 100644 --- a/src/hydra-queue-runner/state.hh +++ b/src/hydra-queue-runner/state.hh @@ -594,6 +594,7 @@ private: enum StepResult { sDone, sRetry, sMaybeCancelled }; StepResult doBuildStep(nix::ref destStore, MachineReservation::ptr & reservation, + Connection & conn, std::shared_ptr activeStep); void buildRemote(nix::ref destStore,