From 5c3e508e55d99b6d9e50aee48b8bf514e15ad006 Mon Sep 17 00:00:00 2001 From: Pierre Bourdon Date: Sun, 21 Apr 2024 01:46:41 +0200 Subject: [PATCH] queue-runner: release machine reservation while copying outputs This allows for better builder usage when the queue runner is busy. To avoid running into uncontrollable imbalances between builder/queue runner, we only release the machine reservation after the local throttler has found a slot to start copying the outputs for that build. --- src/hydra-queue-runner/build-remote.cc | 10 ++++++++++ src/hydra-queue-runner/builder.cc | 18 ++++++++++-------- src/hydra-queue-runner/state.hh | 3 ++- 3 files changed, 22 insertions(+), 9 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index b4a08d7d..02b82d2b 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -506,6 +506,7 @@ private: }; void State::buildRemote(ref destStore, + MachineReservation::ptr & reservation, ::Machine::ptr machine, Step::ptr step, const BuildOptions & buildOptions, RemoteResult & result, std::shared_ptr activeStep, @@ -630,6 +631,15 @@ void State::buildRemote(ref destStore, } SemaphoreReleaser releaser(&localWorkThrottler); + /* Once we've started copying outputs, release the machine reservation + * so further builds can happen. We do not release the machine earlier + * to avoid situations where the queue runner is bottlenecked on + * copying outputs and we end up building too many things that we + * haven't been able to allow copy slots for. */ + assert(reservation.unique()); + reservation = 0; + wakeDispatcher(); + StorePathSet outputs; for (auto & [_, realisation] : buildResult.builtOutputs) outputs.insert(realisation.outPath); diff --git a/src/hydra-queue-runner/builder.cc b/src/hydra-queue-runner/builder.cc index 0ef77068..eecf43c4 100644 --- a/src/hydra-queue-runner/builder.cc +++ b/src/hydra-queue-runner/builder.cc @@ -46,10 +46,12 @@ void State::builder(MachineReservation::ptr reservation) } } - /* Release the machine and wake up the dispatcher. */ - assert(reservation.unique()); - reservation = 0; - wakeDispatcher(); + /* If the machine hasn't been released yet, release and wake up the dispatcher. */ + if (reservation) { + assert(reservation.unique()); + reservation = 0; + wakeDispatcher(); + } /* If there was a temporary failure, retry the step after an exponentially increasing interval. */ @@ -72,11 +74,11 @@ void State::builder(MachineReservation::ptr reservation) State::StepResult State::doBuildStep(nix::ref destStore, - MachineReservation::ptr reservation, + MachineReservation::ptr & reservation, std::shared_ptr activeStep) { - auto & step(reservation->step); - auto & machine(reservation->machine); + auto step(reservation->step); + auto machine(reservation->machine); { auto step_(step->state.lock()); @@ -208,7 +210,7 @@ State::StepResult State::doBuildStep(nix::ref destStore, try { /* FIXME: referring builds may have conflicting timeouts. */ - buildRemote(destStore, machine, step, buildOptions, result, activeStep, updateStep, narMembers); + buildRemote(destStore, reservation, machine, step, buildOptions, result, activeStep, updateStep, narMembers); } catch (Error & e) { if (activeStep->state_.lock()->cancelled) { printInfo("marking step %d of build %d as cancelled", stepNr, buildId); diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh index dd1c8931..49968558 100644 --- a/src/hydra-queue-runner/state.hh +++ b/src/hydra-queue-runner/state.hh @@ -594,10 +594,11 @@ private: retried. */ enum StepResult { sDone, sRetry, sMaybeCancelled }; StepResult doBuildStep(nix::ref destStore, - MachineReservation::ptr reservation, + MachineReservation::ptr & reservation, std::shared_ptr activeStep); void buildRemote(nix::ref destStore, + MachineReservation::ptr & reservation, Machine::ptr machine, Step::ptr step, const BuildOptions & buildOptions, RemoteResult & result, std::shared_ptr activeStep,