forked from lix-project/hydra
Keep stats for the Hydra auto scaler
"hydra-queue-runner --status" now prints how many runnable and running build steps exist for each machine type. This allows additional machines to be provisioned based on the Hydra load.
This commit is contained in:
parent
69e9f73cf6
commit
d571e44b86
|
@ -6,18 +6,20 @@
|
||||||
using namespace nix;
|
using namespace nix;
|
||||||
|
|
||||||
|
|
||||||
void State::builder(Step::ptr step, Machine::ptr machine, std::shared_ptr<MaintainCount> reservation)
|
void State::builder(MachineReservation::ptr reservation)
|
||||||
{
|
{
|
||||||
bool retry = true;
|
bool retry = true;
|
||||||
|
|
||||||
MaintainCount mc(nrActiveSteps);
|
MaintainCount mc(nrActiveSteps);
|
||||||
|
|
||||||
|
auto step = reservation->step;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
auto store = openStore(); // FIXME: pool
|
auto store = openStore(); // FIXME: pool
|
||||||
retry = doBuildStep(store, step, machine);
|
retry = doBuildStep(store, step, reservation->machine);
|
||||||
} catch (std::exception & e) {
|
} catch (std::exception & e) {
|
||||||
printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")
|
printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")
|
||||||
% step->drvPath % machine->sshName % e.what());
|
% step->drvPath % reservation->machine->sshName % e.what());
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Release the machine and wake up the dispatcher. */
|
/* Release the machine and wake up the dispatcher. */
|
||||||
|
|
|
@ -8,5 +8,5 @@ struct MaintainCount
|
||||||
{
|
{
|
||||||
counter & c;
|
counter & c;
|
||||||
MaintainCount(counter & c) : c(c) { c++; }
|
MaintainCount(counter & c) : c(c) { c++; }
|
||||||
~MaintainCount() { c--; }
|
~MaintainCount() { auto prev = c--; assert(prev); }
|
||||||
};
|
};
|
||||||
|
|
|
@ -1,5 +1,6 @@
|
||||||
#include <algorithm>
|
#include <algorithm>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
|
#include <unordered_map>
|
||||||
|
|
||||||
#include "state.hh"
|
#include "state.hh"
|
||||||
|
|
||||||
|
@ -142,6 +143,7 @@ system_time State::doDispatch()
|
||||||
FIXME: O(n lg n); obviously, it would be better to keep a
|
FIXME: O(n lg n); obviously, it would be better to keep a
|
||||||
runnable queue sorted by priority. */
|
runnable queue sorted by priority. */
|
||||||
std::vector<Step::ptr> runnableSorted;
|
std::vector<Step::ptr> runnableSorted;
|
||||||
|
std::unordered_map<std::string, unsigned int> runnablePerType;
|
||||||
{
|
{
|
||||||
auto runnable_(runnable.lock());
|
auto runnable_(runnable.lock());
|
||||||
runnableSorted.reserve(runnable_->size());
|
runnableSorted.reserve(runnable_->size());
|
||||||
|
@ -156,6 +158,8 @@ system_time State::doDispatch()
|
||||||
|
|
||||||
++i;
|
++i;
|
||||||
|
|
||||||
|
runnablePerType[step->drv.platform]++;
|
||||||
|
|
||||||
/* Skip previously failed steps that aren't ready
|
/* Skip previously failed steps that aren't ready
|
||||||
to be retried. */
|
to be retried. */
|
||||||
{
|
{
|
||||||
|
@ -215,13 +219,14 @@ system_time State::doDispatch()
|
||||||
break;
|
break;
|
||||||
} else ++i;
|
} else ++i;
|
||||||
assert(removed);
|
assert(removed);
|
||||||
|
assert(runnablePerType[step->drv.platform]);
|
||||||
|
runnablePerType[step->drv.platform]--;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Make a slot reservation and start a thread to
|
/* Make a slot reservation and start a thread to
|
||||||
do the build. */
|
do the build. */
|
||||||
auto reservation = std::make_shared<MaintainCount>(mi.machine->state->currentJobs);
|
auto builderThread = std::thread(&State::builder, this,
|
||||||
|
std::make_shared<MachineReservation>(*this, step, mi.machine));
|
||||||
auto builderThread = std::thread(&State::builder, this, step, mi.machine, reservation);
|
|
||||||
builderThread.detach(); // FIXME?
|
builderThread.detach(); // FIXME?
|
||||||
|
|
||||||
keepGoing = true;
|
keepGoing = true;
|
||||||
|
@ -231,6 +236,17 @@ system_time State::doDispatch()
|
||||||
if (keepGoing) break;
|
if (keepGoing) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Update the stats for the auto-scaler. */
|
||||||
|
{
|
||||||
|
auto machineTypes_(machineTypes.lock());
|
||||||
|
|
||||||
|
for (auto & i : *machineTypes_)
|
||||||
|
i.second.runnable = 0;
|
||||||
|
|
||||||
|
for (auto & i : runnablePerType)
|
||||||
|
(*machineTypes_)[i.first].runnable = i.second;
|
||||||
|
}
|
||||||
|
|
||||||
} while (keepGoing);
|
} while (keepGoing);
|
||||||
|
|
||||||
return sleepUntil;
|
return sleepUntil;
|
||||||
|
@ -266,3 +282,33 @@ void Jobset::pruneSteps()
|
||||||
steps_->erase(i);
|
steps_->erase(i);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
State::MachineReservation::MachineReservation(State & state, Step::ptr step, Machine::ptr machine)
|
||||||
|
: state(state), step(step), machine(machine)
|
||||||
|
{
|
||||||
|
machine->state->currentJobs++;
|
||||||
|
|
||||||
|
{
|
||||||
|
auto machineTypes_(state.machineTypes.lock());
|
||||||
|
(*machineTypes_)[step->drv.platform].running++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
State::MachineReservation::~MachineReservation()
|
||||||
|
{
|
||||||
|
auto prev = machine->state->currentJobs--;
|
||||||
|
assert(prev);
|
||||||
|
if (prev == 1)
|
||||||
|
machine->state->idleSince = time(0);
|
||||||
|
|
||||||
|
{
|
||||||
|
auto machineTypes_(state.machineTypes.lock());
|
||||||
|
auto & machineType = (*machineTypes_)[step->drv.platform];
|
||||||
|
assert(machineType.running);
|
||||||
|
machineType.running--;
|
||||||
|
if (machineType.running == 0)
|
||||||
|
machineType.lastActive = time(0);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
|
@ -453,6 +453,8 @@ void State::dumpStatus(Connection & conn, bool log)
|
||||||
nested.attr(m->sshName);
|
nested.attr(m->sshName);
|
||||||
JSONObject nested2(out);
|
JSONObject nested2(out);
|
||||||
nested2.attr("currentJobs", s->currentJobs);
|
nested2.attr("currentJobs", s->currentJobs);
|
||||||
|
if (s->currentJobs == 0)
|
||||||
|
nested2.attr("idleSince", s->idleSince);
|
||||||
nested2.attr("nrStepsDone", s->nrStepsDone);
|
nested2.attr("nrStepsDone", s->nrStepsDone);
|
||||||
if (m->state->nrStepsDone) {
|
if (m->state->nrStepsDone) {
|
||||||
nested2.attr("totalStepTime", s->totalStepTime);
|
nested2.attr("totalStepTime", s->totalStepTime);
|
||||||
|
@ -473,6 +475,19 @@ void State::dumpStatus(Connection & conn, bool log)
|
||||||
nested2.attr("seconds", jobset.second->getSeconds());
|
nested2.attr("seconds", jobset.second->getSeconds());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
{
|
||||||
|
root.attr("machineTypes");
|
||||||
|
JSONObject nested(out);
|
||||||
|
auto machineTypes_(machineTypes.lock());
|
||||||
|
for (auto & i : *machineTypes_) {
|
||||||
|
nested.attr(i.first);
|
||||||
|
JSONObject nested2(out);
|
||||||
|
nested2.attr("runnable", i.second.runnable);
|
||||||
|
nested2.attr("running", i.second.running);
|
||||||
|
if (i.second.running == 0)
|
||||||
|
nested2.attr("lastActive", i.second.lastActive);
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (log) printMsg(lvlInfo, format("status: %1%") % out.str());
|
if (log) printMsg(lvlInfo, format("status: %1%") % out.str());
|
||||||
|
|
|
@ -208,6 +208,7 @@ struct Machine
|
||||||
counter nrStepsDone{0};
|
counter nrStepsDone{0};
|
||||||
counter totalStepTime{0}; // total time for steps, including closure copying
|
counter totalStepTime{0}; // total time for steps, including closure copying
|
||||||
counter totalStepBuildTime{0}; // total build time for steps
|
counter totalStepBuildTime{0}; // total build time for steps
|
||||||
|
std::atomic<time_t> idleSince{0};
|
||||||
|
|
||||||
struct ConnectInfo
|
struct ConnectInfo
|
||||||
{
|
{
|
||||||
|
@ -318,6 +319,25 @@ private:
|
||||||
/* Specific build to do for --build-one (testing only). */
|
/* Specific build to do for --build-one (testing only). */
|
||||||
BuildID buildOne;
|
BuildID buildOne;
|
||||||
|
|
||||||
|
/* Statistics per machine type for the Hydra auto-scaler. */
|
||||||
|
struct MachineType
|
||||||
|
{
|
||||||
|
unsigned int runnable{0}, running{0};
|
||||||
|
time_t lastActive{0};
|
||||||
|
};
|
||||||
|
|
||||||
|
Sync<std::map<std::string, MachineType>> machineTypes;
|
||||||
|
|
||||||
|
struct MachineReservation
|
||||||
|
{
|
||||||
|
typedef std::shared_ptr<MachineReservation> ptr;
|
||||||
|
State & state;
|
||||||
|
Step::ptr step;
|
||||||
|
Machine::ptr machine;
|
||||||
|
MachineReservation(State & state, Step::ptr step, Machine::ptr machine);
|
||||||
|
~MachineReservation();
|
||||||
|
};
|
||||||
|
|
||||||
public:
|
public:
|
||||||
State();
|
State();
|
||||||
|
|
||||||
|
@ -369,7 +389,7 @@ private:
|
||||||
|
|
||||||
void wakeDispatcher();
|
void wakeDispatcher();
|
||||||
|
|
||||||
void builder(Step::ptr step, Machine::ptr machine, std::shared_ptr<MaintainCount> reservation);
|
void builder(MachineReservation::ptr reservation);
|
||||||
|
|
||||||
/* Perform the given build step. Return true if the step is to be
|
/* Perform the given build step. Return true if the step is to be
|
||||||
retried. */
|
retried. */
|
||||||
|
|
Loading…
Reference in a new issue