hydra-queue-runner: Fix a race keeping cancelled steps alive

If a step is cancelled just as its builder step is starting,
doBuildStep() will return sRetry. This causes builder() to make the
step runnable again, since the queue monitor may have added new builds
referencing it. The idea is that if the latter condition is not true,
the step's reference count will drop to zero and it will be
deleted. However, if the dispatcher thread sees and locks the step
before the reference count can drop to zero in the builder thread, the
dispatcher thread will start a new builder thread for the step. Thus
the step can be kept alive for an indefinite amount of time.

The fix is for State::builder() to use a weak pointer to the step, to
ensure that the step's reference count can drop to zero *before* it's
added to the runnable queue.
This commit is contained in:
Eelco Dolstra 2016-11-08 11:42:31 +01:00
parent de9d7bcf25
commit 1ecc8a4f40

View file

@ -13,6 +13,9 @@ void State::builder(MachineReservation::ptr reservation)
nrStepsStarted++; nrStepsStarted++;
Step::wptr wstep = reservation->step;
{
auto activeStep = std::make_shared<ActiveStep>(); auto activeStep = std::make_shared<ActiveStep>();
activeStep->step = reservation->step; activeStep->step = reservation->step;
activeSteps_.lock()->insert(activeStep); activeSteps_.lock()->insert(activeStep);
@ -21,14 +24,13 @@ void State::builder(MachineReservation::ptr reservation)
activeSteps_.lock()->erase(activeStep); activeSteps_.lock()->erase(activeStep);
}); });
auto step = reservation->step;
try { try {
auto destStore = getDestStore(); auto destStore = getDestStore();
res = doBuildStep(destStore, reservation, activeStep); res = doBuildStep(destStore, reservation, activeStep);
} catch (std::exception & e) { } catch (std::exception & e) {
printMsg(lvlError, format("uncaught exception building %1% on %2%: %3%") printMsg(lvlError, format("uncaught exception building %1% on %2%: %3%")
% step->drvPath % reservation->machine->sshName % e.what()); % reservation->step->drvPath % reservation->machine->sshName % e.what());
}
} }
/* Release the machine and wake up the dispatcher. */ /* Release the machine and wake up the dispatcher. */
@ -38,7 +40,9 @@ void State::builder(MachineReservation::ptr reservation)
/* If there was a temporary failure, retry the step after an /* If there was a temporary failure, retry the step after an
exponentially increasing interval. */ exponentially increasing interval. */
if (res != sDone) { Step::ptr step = wstep.lock();
if (res != sDone && step) {
if (res == sRetry) { if (res == sRetry) {
auto step_(step->state.lock()); auto step_(step->state.lock());
step_->tries++; step_->tries++;