queue-runner: handle broken pg pool connections in builder code

Completes 9b62c52e5c with another location
that was initially missed.
This commit is contained in:
Pierre Bourdon 2024-08-25 22:05:13 +02:00
parent 3ee51dbe58
commit 44b9a7b95d
Signed by untrusted user: delroth
GPG key ID: 6FB80DCD84DA0F1C
2 changed files with 19 additions and 11 deletions

View file

@ -35,10 +35,18 @@ void State::builder(MachineReservation::ptr reservation)
activeSteps_.lock()->erase(activeStep);
});
auto conn(dbPool.get());
try {
auto destStore = getDestStore();
// Might release the reservation.
res = doBuildStep(destStore, reservation, activeStep);
res = doBuildStep(destStore, reservation, *conn, activeStep);
} catch (pqxx::broken_connection & e) {
printMsg(lvlError, "db lost while building %s on %s: %s (retriable)",
localStore->printStorePath(activeStep->step->drvPath),
reservation ? reservation->machine->sshName : std::string("(no machine)"),
e.what());
conn.markBad();
} catch (std::exception & e) {
printMsg(lvlError, "uncaught exception building %s on %s: %s",
localStore->printStorePath(activeStep->step->drvPath),
@ -76,6 +84,7 @@ void State::builder(MachineReservation::ptr reservation)
State::StepResult State::doBuildStep(nix::ref<Store> destStore,
MachineReservation::ptr & reservation,
Connection & conn,
std::shared_ptr<ActiveStep> activeStep)
{
auto step(reservation->step);
@ -106,8 +115,6 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
buildOptions.maxLogSize = maxLogSize;
buildOptions.enforceDeterminism = step->isDeterministic;
auto conn(dbPool.get());
{
std::set<Build::ptr> dependents;
std::set<Step::ptr> steps;
@ -132,7 +139,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
for (auto build2 : dependents) {
if (build2->drvPath == step->drvPath) {
build = build2;
pqxx::work txn(*conn);
pqxx::work txn(conn);
notifyBuildStarted(txn, build->id);
txn.commit();
}
@ -187,7 +194,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
/* If any of the outputs have previously failed, then don't bother
building again. */
if (checkCachedFailure(step, *conn))
if (checkCachedFailure(step, conn))
result.stepStatus = bsCachedFailure;
else {
@ -195,13 +202,13 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
building. */
{
auto mc = startDbUpdate();
pqxx::work txn(*conn);
pqxx::work txn(conn);
stepNr = createBuildStep(txn, result.startTime, buildId, step, machine->sshName, bsBusy);
txn.commit();
}
auto updateStep = [&](StepState stepState) {
pqxx::work txn(*conn);
pqxx::work txn(conn);
updateBuildStep(txn, buildId, stepNr, stepState);
txn.commit();
};
@ -252,7 +259,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
/* Finish the step in the database. */
if (stepNr) {
pqxx::work txn(*conn);
pqxx::work txn(conn);
finishBuildStep(txn, result, buildId, stepNr, machine->sshName);
txn.commit();
}
@ -328,7 +335,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
{
auto mc = startDbUpdate();
pqxx::work txn(*conn);
pqxx::work txn(conn);
for (auto & b : direct) {
printInfo("marking build %1% as succeeded", b->id);
@ -356,7 +363,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
/* Send notification about the builds that have this step as
the top-level. */
{
pqxx::work txn(*conn);
pqxx::work txn(conn);
for (auto id : buildIDs)
notifyBuildFinished(txn, id, {});
txn.commit();
@ -385,7 +392,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
}
} else
failStep(*conn, step, buildId, result, machine, stepFinished);
failStep(conn, step, buildId, result, machine, stepFinished);
// FIXME: keep stats about aborted steps?
nrStepsDone++;

View file

@ -594,6 +594,7 @@ private:
enum StepResult { sDone, sRetry, sMaybeCancelled };
StepResult doBuildStep(nix::ref<nix::Store> destStore,
MachineReservation::ptr & reservation,
Connection & conn,
std::shared_ptr<ActiveStep> activeStep);
void buildRemote(nix::ref<nix::Store> destStore,