Keep stats for the Hydra auto scaler

"hydra-queue-runner --status" now prints how many runnable and running
build steps exist for each machine type. This allows additional
machines to be provisioned based on the Hydra load.
This commit is contained in:
Eelco Dolstra 2015-08-17 13:50:41 +02:00
parent 69e9f73cf6
commit d571e44b86
5 changed files with 91 additions and 8 deletions

View file

@ -6,18 +6,20 @@
using namespace nix; using namespace nix;
void State::builder(Step::ptr step, Machine::ptr machine, std::shared_ptr<MaintainCount> reservation) void State::builder(MachineReservation::ptr reservation)
{ {
bool retry = true; bool retry = true;
MaintainCount mc(nrActiveSteps); MaintainCount mc(nrActiveSteps);
auto step = reservation->step;
try { try {
auto store = openStore(); // FIXME: pool auto store = openStore(); // FIXME: pool
retry = doBuildStep(store, step, machine); retry = doBuildStep(store, step, reservation->machine);
} catch (std::exception & e) { } catch (std::exception & e) {
printMsg(lvlError, format("uncaught exception building %1% on %2%: %3%") printMsg(lvlError, format("uncaught exception building %1% on %2%: %3%")
% step->drvPath % machine->sshName % e.what()); % step->drvPath % reservation->machine->sshName % e.what());
} }
/* Release the machine and wake up the dispatcher. */ /* Release the machine and wake up the dispatcher. */

View file

@ -8,5 +8,5 @@ struct MaintainCount
{ {
counter & c; counter & c;
MaintainCount(counter & c) : c(c) { c++; } MaintainCount(counter & c) : c(c) { c++; }
~MaintainCount() { c--; } ~MaintainCount() { auto prev = c--; assert(prev); }
}; };

View file

@ -1,5 +1,6 @@
#include <algorithm> #include <algorithm>
#include <thread> #include <thread>
#include <unordered_map>
#include "state.hh" #include "state.hh"
@ -142,6 +143,7 @@ system_time State::doDispatch()
FIXME: O(n lg n); obviously, it would be better to keep a FIXME: O(n lg n); obviously, it would be better to keep a
runnable queue sorted by priority. */ runnable queue sorted by priority. */
std::vector<Step::ptr> runnableSorted; std::vector<Step::ptr> runnableSorted;
std::unordered_map<std::string, unsigned int> runnablePerType;
{ {
auto runnable_(runnable.lock()); auto runnable_(runnable.lock());
runnableSorted.reserve(runnable_->size()); runnableSorted.reserve(runnable_->size());
@ -156,6 +158,8 @@ system_time State::doDispatch()
++i; ++i;
runnablePerType[step->drv.platform]++;
/* Skip previously failed steps that aren't ready /* Skip previously failed steps that aren't ready
to be retried. */ to be retried. */
{ {
@ -215,13 +219,14 @@ system_time State::doDispatch()
break; break;
} else ++i; } else ++i;
assert(removed); assert(removed);
assert(runnablePerType[step->drv.platform]);
runnablePerType[step->drv.platform]--;
} }
/* Make a slot reservation and start a thread to /* Make a slot reservation and start a thread to
do the build. */ do the build. */
auto reservation = std::make_shared<MaintainCount>(mi.machine->state->currentJobs); auto builderThread = std::thread(&State::builder, this,
std::make_shared<MachineReservation>(*this, step, mi.machine));
auto builderThread = std::thread(&State::builder, this, step, mi.machine, reservation);
builderThread.detach(); // FIXME? builderThread.detach(); // FIXME?
keepGoing = true; keepGoing = true;
@ -231,6 +236,17 @@ system_time State::doDispatch()
if (keepGoing) break; if (keepGoing) break;
} }
/* Update the stats for the auto-scaler. */
{
auto machineTypes_(machineTypes.lock());
for (auto & i : *machineTypes_)
i.second.runnable = 0;
for (auto & i : runnablePerType)
(*machineTypes_)[i.first].runnable = i.second;
}
} while (keepGoing); } while (keepGoing);
return sleepUntil; return sleepUntil;
@ -266,3 +282,33 @@ void Jobset::pruneSteps()
steps_->erase(i); steps_->erase(i);
} }
} }
State::MachineReservation::MachineReservation(State & state, Step::ptr step, Machine::ptr machine)
: state(state), step(step), machine(machine)
{
machine->state->currentJobs++;
{
auto machineTypes_(state.machineTypes.lock());
(*machineTypes_)[step->drv.platform].running++;
}
}
State::MachineReservation::~MachineReservation()
{
auto prev = machine->state->currentJobs--;
assert(prev);
if (prev == 1)
machine->state->idleSince = time(0);
{
auto machineTypes_(state.machineTypes.lock());
auto & machineType = (*machineTypes_)[step->drv.platform];
assert(machineType.running);
machineType.running--;
if (machineType.running == 0)
machineType.lastActive = time(0);
}
}

View file

@ -453,6 +453,8 @@ void State::dumpStatus(Connection & conn, bool log)
nested.attr(m->sshName); nested.attr(m->sshName);
JSONObject nested2(out); JSONObject nested2(out);
nested2.attr("currentJobs", s->currentJobs); nested2.attr("currentJobs", s->currentJobs);
if (s->currentJobs == 0)
nested2.attr("idleSince", s->idleSince);
nested2.attr("nrStepsDone", s->nrStepsDone); nested2.attr("nrStepsDone", s->nrStepsDone);
if (m->state->nrStepsDone) { if (m->state->nrStepsDone) {
nested2.attr("totalStepTime", s->totalStepTime); nested2.attr("totalStepTime", s->totalStepTime);
@ -473,6 +475,19 @@ void State::dumpStatus(Connection & conn, bool log)
nested2.attr("seconds", jobset.second->getSeconds()); nested2.attr("seconds", jobset.second->getSeconds());
} }
} }
{
root.attr("machineTypes");
JSONObject nested(out);
auto machineTypes_(machineTypes.lock());
for (auto & i : *machineTypes_) {
nested.attr(i.first);
JSONObject nested2(out);
nested2.attr("runnable", i.second.runnable);
nested2.attr("running", i.second.running);
if (i.second.running == 0)
nested2.attr("lastActive", i.second.lastActive);
}
}
} }
if (log) printMsg(lvlInfo, format("status: %1%") % out.str()); if (log) printMsg(lvlInfo, format("status: %1%") % out.str());

View file

@ -208,6 +208,7 @@ struct Machine
counter nrStepsDone{0}; counter nrStepsDone{0};
counter totalStepTime{0}; // total time for steps, including closure copying counter totalStepTime{0}; // total time for steps, including closure copying
counter totalStepBuildTime{0}; // total build time for steps counter totalStepBuildTime{0}; // total build time for steps
std::atomic<time_t> idleSince{0};
struct ConnectInfo struct ConnectInfo
{ {
@ -318,6 +319,25 @@ private:
/* Specific build to do for --build-one (testing only). */ /* Specific build to do for --build-one (testing only). */
BuildID buildOne; BuildID buildOne;
/* Statistics per machine type for the Hydra auto-scaler. */
struct MachineType
{
unsigned int runnable{0}, running{0};
time_t lastActive{0};
};
Sync<std::map<std::string, MachineType>> machineTypes;
struct MachineReservation
{
typedef std::shared_ptr<MachineReservation> ptr;
State & state;
Step::ptr step;
Machine::ptr machine;
MachineReservation(State & state, Step::ptr step, Machine::ptr machine);
~MachineReservation();
};
public: public:
State(); State();
@ -369,7 +389,7 @@ private:
void wakeDispatcher(); void wakeDispatcher();
void builder(Step::ptr step, Machine::ptr machine, std::shared_ptr<MaintainCount> reservation); void builder(MachineReservation::ptr reservation);
/* Perform the given build step. Return true if the step is to be /* Perform the given build step. Return true if the step is to be
retried. */ retried. */