queue-runner: handle broken pg pool connections in builder code

Completes 9b62c52e5c with another location
that was initially missed.
This commit is contained in:
Pierre Bourdon 2024-08-25 22:05:13 +02:00
parent 3ee51dbe58
commit 44b9a7b95d
Signed by: delroth
GPG key ID: 6FB80DCD84DA0F1C
2 changed files with 19 additions and 11 deletions

View file

@ -35,10 +35,18 @@ void State::builder(MachineReservation::ptr reservation)
activeSteps_.lock()->erase(activeStep); activeSteps_.lock()->erase(activeStep);
}); });
auto conn(dbPool.get());
try { try {
auto destStore = getDestStore(); auto destStore = getDestStore();
// Might release the reservation. // Might release the reservation.
res = doBuildStep(destStore, reservation, activeStep); res = doBuildStep(destStore, reservation, *conn, activeStep);
} catch (pqxx::broken_connection & e) {
printMsg(lvlError, "db lost while building %s on %s: %s (retriable)",
localStore->printStorePath(activeStep->step->drvPath),
reservation ? reservation->machine->sshName : std::string("(no machine)"),
e.what());
conn.markBad();
} catch (std::exception & e) { } catch (std::exception & e) {
printMsg(lvlError, "uncaught exception building %s on %s: %s", printMsg(lvlError, "uncaught exception building %s on %s: %s",
localStore->printStorePath(activeStep->step->drvPath), localStore->printStorePath(activeStep->step->drvPath),
@ -76,6 +84,7 @@ void State::builder(MachineReservation::ptr reservation)
State::StepResult State::doBuildStep(nix::ref<Store> destStore, State::StepResult State::doBuildStep(nix::ref<Store> destStore,
MachineReservation::ptr & reservation, MachineReservation::ptr & reservation,
Connection & conn,
std::shared_ptr<ActiveStep> activeStep) std::shared_ptr<ActiveStep> activeStep)
{ {
auto step(reservation->step); auto step(reservation->step);
@ -106,8 +115,6 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
buildOptions.maxLogSize = maxLogSize; buildOptions.maxLogSize = maxLogSize;
buildOptions.enforceDeterminism = step->isDeterministic; buildOptions.enforceDeterminism = step->isDeterministic;
auto conn(dbPool.get());
{ {
std::set<Build::ptr> dependents; std::set<Build::ptr> dependents;
std::set<Step::ptr> steps; std::set<Step::ptr> steps;
@ -132,7 +139,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
for (auto build2 : dependents) { for (auto build2 : dependents) {
if (build2->drvPath == step->drvPath) { if (build2->drvPath == step->drvPath) {
build = build2; build = build2;
pqxx::work txn(*conn); pqxx::work txn(conn);
notifyBuildStarted(txn, build->id); notifyBuildStarted(txn, build->id);
txn.commit(); txn.commit();
} }
@ -187,7 +194,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
/* If any of the outputs have previously failed, then don't bother /* If any of the outputs have previously failed, then don't bother
building again. */ building again. */
if (checkCachedFailure(step, *conn)) if (checkCachedFailure(step, conn))
result.stepStatus = bsCachedFailure; result.stepStatus = bsCachedFailure;
else { else {
@ -195,13 +202,13 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
building. */ building. */
{ {
auto mc = startDbUpdate(); auto mc = startDbUpdate();
pqxx::work txn(*conn); pqxx::work txn(conn);
stepNr = createBuildStep(txn, result.startTime, buildId, step, machine->sshName, bsBusy); stepNr = createBuildStep(txn, result.startTime, buildId, step, machine->sshName, bsBusy);
txn.commit(); txn.commit();
} }
auto updateStep = [&](StepState stepState) { auto updateStep = [&](StepState stepState) {
pqxx::work txn(*conn); pqxx::work txn(conn);
updateBuildStep(txn, buildId, stepNr, stepState); updateBuildStep(txn, buildId, stepNr, stepState);
txn.commit(); txn.commit();
}; };
@ -252,7 +259,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
/* Finish the step in the database. */ /* Finish the step in the database. */
if (stepNr) { if (stepNr) {
pqxx::work txn(*conn); pqxx::work txn(conn);
finishBuildStep(txn, result, buildId, stepNr, machine->sshName); finishBuildStep(txn, result, buildId, stepNr, machine->sshName);
txn.commit(); txn.commit();
} }
@ -328,7 +335,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
{ {
auto mc = startDbUpdate(); auto mc = startDbUpdate();
pqxx::work txn(*conn); pqxx::work txn(conn);
for (auto & b : direct) { for (auto & b : direct) {
printInfo("marking build %1% as succeeded", b->id); printInfo("marking build %1% as succeeded", b->id);
@ -356,7 +363,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
/* Send notification about the builds that have this step as /* Send notification about the builds that have this step as
the top-level. */ the top-level. */
{ {
pqxx::work txn(*conn); pqxx::work txn(conn);
for (auto id : buildIDs) for (auto id : buildIDs)
notifyBuildFinished(txn, id, {}); notifyBuildFinished(txn, id, {});
txn.commit(); txn.commit();
@ -385,7 +392,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
} }
} else } else
failStep(*conn, step, buildId, result, machine, stepFinished); failStep(conn, step, buildId, result, machine, stepFinished);
// FIXME: keep stats about aborted steps? // FIXME: keep stats about aborted steps?
nrStepsDone++; nrStepsDone++;

View file

@ -594,6 +594,7 @@ private:
enum StepResult { sDone, sRetry, sMaybeCancelled }; enum StepResult { sDone, sRetry, sMaybeCancelled };
StepResult doBuildStep(nix::ref<nix::Store> destStore, StepResult doBuildStep(nix::ref<nix::Store> destStore,
MachineReservation::ptr & reservation, MachineReservation::ptr & reservation,
Connection & conn,
std::shared_ptr<ActiveStep> activeStep); std::shared_ptr<ActiveStep> activeStep);
void buildRemote(nix::ref<nix::Store> destStore, void buildRemote(nix::ref<nix::Store> destStore,