From 1ecc8a4f4099dcfd148aecbbe9277d5486b9eef0 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra <edolstra@gmail.com> Date: Tue, 8 Nov 2016 11:42:31 +0100 Subject: [PATCH] hydra-queue-runner: Fix a race keeping cancelled steps alive If a step is cancelled just as its builder thread is starting, doBuildStep() will return sRetry. This causes builder() to make the step runnable again, since the queue monitor may have added new builds referencing it. The idea is that if the latter condition is not true, the step's reference count will drop to zero and it will be deleted. However, if the dispatcher thread sees and locks the step before the reference count can drop to zero in the builder thread, the dispatcher thread will start a new builder thread for the step. Thus the step can be kept alive for an indefinite amount of time. The fix is for State::builder() to use a weak pointer to the step, to ensure that the step's reference count can drop to zero *before* it's added to the runnable queue. --- src/hydra-queue-runner/builder.cc | 32 +++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/src/hydra-queue-runner/builder.cc b/src/hydra-queue-runner/builder.cc index ee3ac779..f943ea6e 100644 --- a/src/hydra-queue-runner/builder.cc +++ b/src/hydra-queue-runner/builder.cc @@ -13,22 +13,24 @@ void State::builder(MachineReservation::ptr reservation) nrStepsStarted++; - auto activeStep = std::make_shared<ActiveStep>(); - activeStep->step = reservation->step; - activeSteps_.lock()->insert(activeStep); + Step::wptr wstep = reservation->step; - Finally removeActiveStep([&]() { - activeSteps_.lock()->erase(activeStep); - }); + { + auto activeStep = std::make_shared<ActiveStep>(); + activeStep->step = reservation->step; + activeSteps_.lock()->insert(activeStep); - auto step = reservation->step; + Finally removeActiveStep([&]() { + activeSteps_.lock()->erase(activeStep); + }); - try { - auto destStore = getDestStore(); - res = doBuildStep(destStore, 
reservation, activeStep); - } catch (std::exception & e) { - printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%") - % step->drvPath % reservation->machine->sshName % e.what()); + try { + auto destStore = getDestStore(); + res = doBuildStep(destStore, reservation, activeStep); + } catch (std::exception & e) { + printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%") + % reservation->step->drvPath % reservation->machine->sshName % e.what()); + } } /* Release the machine and wake up the dispatcher. */ @@ -38,7 +40,9 @@ void State::builder(MachineReservation::ptr reservation) /* If there was a temporary failure, retry the step after an exponentially increasing interval. */ - if (res != sDone) { + Step::ptr step = wstep.lock(); + if (res != sDone && step) { + if (res == sRetry) { auto step_(step->state.lock()); step_->tries++;