forked from lix-project/hydra
Keep stats for the Hydra auto scaler
"hydra-queue-runner --status" now prints how many runnable and running build steps exist for each machine type. This allows additional machines to be provisioned based on the Hydra load.
This commit is contained in:
parent
69e9f73cf6
commit
d571e44b86
5 changed files with 91 additions and 8 deletions
|
@ -6,18 +6,20 @@
|
|||
using namespace nix;
|
||||
|
||||
|
||||
void State::builder(Step::ptr step, Machine::ptr machine, std::shared_ptr<MaintainCount> reservation)
|
||||
void State::builder(MachineReservation::ptr reservation)
|
||||
{
|
||||
bool retry = true;
|
||||
|
||||
MaintainCount mc(nrActiveSteps);
|
||||
|
||||
auto step = reservation->step;
|
||||
|
||||
try {
|
||||
auto store = openStore(); // FIXME: pool
|
||||
retry = doBuildStep(store, step, machine);
|
||||
retry = doBuildStep(store, step, reservation->machine);
|
||||
} catch (std::exception & e) {
|
||||
printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")
|
||||
% step->drvPath % machine->sshName % e.what());
|
||||
% step->drvPath % reservation->machine->sshName % e.what());
|
||||
}
|
||||
|
||||
/* Release the machine and wake up the dispatcher. */
|
||||
|
|
|
@ -8,5 +8,5 @@ struct MaintainCount
|
|||
{
|
||||
counter & c;
|
||||
MaintainCount(counter & c) : c(c) { c++; }
|
||||
~MaintainCount() { c--; }
|
||||
~MaintainCount() { auto prev = c--; assert(prev); }
|
||||
};
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
#include <algorithm>
|
||||
#include <thread>
|
||||
#include <unordered_map>
|
||||
|
||||
#include "state.hh"
|
||||
|
||||
|
@ -142,6 +143,7 @@ system_time State::doDispatch()
|
|||
FIXME: O(n lg n); obviously, it would be better to keep a
|
||||
runnable queue sorted by priority. */
|
||||
std::vector<Step::ptr> runnableSorted;
|
||||
std::unordered_map<std::string, unsigned int> runnablePerType;
|
||||
{
|
||||
auto runnable_(runnable.lock());
|
||||
runnableSorted.reserve(runnable_->size());
|
||||
|
@ -156,6 +158,8 @@ system_time State::doDispatch()
|
|||
|
||||
++i;
|
||||
|
||||
runnablePerType[step->drv.platform]++;
|
||||
|
||||
/* Skip previously failed steps that aren't ready
|
||||
to be retried. */
|
||||
{
|
||||
|
@ -215,13 +219,14 @@ system_time State::doDispatch()
|
|||
break;
|
||||
} else ++i;
|
||||
assert(removed);
|
||||
assert(runnablePerType[step->drv.platform]);
|
||||
runnablePerType[step->drv.platform]--;
|
||||
}
|
||||
|
||||
/* Make a slot reservation and start a thread to
|
||||
do the build. */
|
||||
auto reservation = std::make_shared<MaintainCount>(mi.machine->state->currentJobs);
|
||||
|
||||
auto builderThread = std::thread(&State::builder, this, step, mi.machine, reservation);
|
||||
auto builderThread = std::thread(&State::builder, this,
|
||||
std::make_shared<MachineReservation>(*this, step, mi.machine));
|
||||
builderThread.detach(); // FIXME?
|
||||
|
||||
keepGoing = true;
|
||||
|
@ -231,6 +236,17 @@ system_time State::doDispatch()
|
|||
if (keepGoing) break;
|
||||
}
|
||||
|
||||
/* Update the stats for the auto-scaler. */
|
||||
{
|
||||
auto machineTypes_(machineTypes.lock());
|
||||
|
||||
for (auto & i : *machineTypes_)
|
||||
i.second.runnable = 0;
|
||||
|
||||
for (auto & i : runnablePerType)
|
||||
(*machineTypes_)[i.first].runnable = i.second;
|
||||
}
|
||||
|
||||
} while (keepGoing);
|
||||
|
||||
return sleepUntil;
|
||||
|
@ -266,3 +282,33 @@ void Jobset::pruneSteps()
|
|||
steps_->erase(i);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
State::MachineReservation::MachineReservation(State & state, Step::ptr step, Machine::ptr machine)
|
||||
: state(state), step(step), machine(machine)
|
||||
{
|
||||
machine->state->currentJobs++;
|
||||
|
||||
{
|
||||
auto machineTypes_(state.machineTypes.lock());
|
||||
(*machineTypes_)[step->drv.platform].running++;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
State::MachineReservation::~MachineReservation()
|
||||
{
|
||||
auto prev = machine->state->currentJobs--;
|
||||
assert(prev);
|
||||
if (prev == 1)
|
||||
machine->state->idleSince = time(0);
|
||||
|
||||
{
|
||||
auto machineTypes_(state.machineTypes.lock());
|
||||
auto & machineType = (*machineTypes_)[step->drv.platform];
|
||||
assert(machineType.running);
|
||||
machineType.running--;
|
||||
if (machineType.running == 0)
|
||||
machineType.lastActive = time(0);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -453,6 +453,8 @@ void State::dumpStatus(Connection & conn, bool log)
|
|||
nested.attr(m->sshName);
|
||||
JSONObject nested2(out);
|
||||
nested2.attr("currentJobs", s->currentJobs);
|
||||
if (s->currentJobs == 0)
|
||||
nested2.attr("idleSince", s->idleSince);
|
||||
nested2.attr("nrStepsDone", s->nrStepsDone);
|
||||
if (m->state->nrStepsDone) {
|
||||
nested2.attr("totalStepTime", s->totalStepTime);
|
||||
|
@ -473,6 +475,19 @@ void State::dumpStatus(Connection & conn, bool log)
|
|||
nested2.attr("seconds", jobset.second->getSeconds());
|
||||
}
|
||||
}
|
||||
{
|
||||
root.attr("machineTypes");
|
||||
JSONObject nested(out);
|
||||
auto machineTypes_(machineTypes.lock());
|
||||
for (auto & i : *machineTypes_) {
|
||||
nested.attr(i.first);
|
||||
JSONObject nested2(out);
|
||||
nested2.attr("runnable", i.second.runnable);
|
||||
nested2.attr("running", i.second.running);
|
||||
if (i.second.running == 0)
|
||||
nested2.attr("lastActive", i.second.lastActive);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (log) printMsg(lvlInfo, format("status: %1%") % out.str());
|
||||
|
|
|
@ -208,6 +208,7 @@ struct Machine
|
|||
counter nrStepsDone{0};
|
||||
counter totalStepTime{0}; // total time for steps, including closure copying
|
||||
counter totalStepBuildTime{0}; // total build time for steps
|
||||
std::atomic<time_t> idleSince{0};
|
||||
|
||||
struct ConnectInfo
|
||||
{
|
||||
|
@ -318,6 +319,25 @@ private:
|
|||
/* Specific build to do for --build-one (testing only). */
|
||||
BuildID buildOne;
|
||||
|
||||
/* Statistics per machine type for the Hydra auto-scaler. */
|
||||
struct MachineType
|
||||
{
|
||||
unsigned int runnable{0}, running{0};
|
||||
time_t lastActive{0};
|
||||
};
|
||||
|
||||
Sync<std::map<std::string, MachineType>> machineTypes;
|
||||
|
||||
struct MachineReservation
|
||||
{
|
||||
typedef std::shared_ptr<MachineReservation> ptr;
|
||||
State & state;
|
||||
Step::ptr step;
|
||||
Machine::ptr machine;
|
||||
MachineReservation(State & state, Step::ptr step, Machine::ptr machine);
|
||||
~MachineReservation();
|
||||
};
|
||||
|
||||
public:
|
||||
State();
|
||||
|
||||
|
@ -369,7 +389,7 @@ private:
|
|||
|
||||
void wakeDispatcher();
|
||||
|
||||
void builder(Step::ptr step, Machine::ptr machine, std::shared_ptr<MaintainCount> reservation);
|
||||
void builder(MachineReservation::ptr reservation);
|
||||
|
||||
/* Perform the given build step. Return true if the step is to be
|
||||
retried. */
|
||||
|
|
Loading…
Reference in a new issue