forked from lix-project/hydra
Automatically reload $NIX_REMOTE_SYSTEMS when it changes
Otherwise, you'd have to restart the queue runner to add or remove machines.
This commit is contained in:
parent
1a0e1eb5a0
commit
32210905d8
|
@ -180,10 +180,15 @@ struct Machine
|
||||||
unsigned int maxJobs = 1;
|
unsigned int maxJobs = 1;
|
||||||
float speedFactor = 1.0;
|
float speedFactor = 1.0;
|
||||||
|
|
||||||
|
struct State {
|
||||||
|
typedef std::shared_ptr<State> ptr;
|
||||||
counter currentJobs{0};
|
counter currentJobs{0};
|
||||||
counter nrStepsDone{0};
|
counter nrStepsDone{0};
|
||||||
counter totalStepTime{0}; // total time for steps, including closure copying
|
counter totalStepTime{0}; // total time for steps, including closure copying
|
||||||
counter totalStepBuildTime{0}; // total build time for steps
|
counter totalStepBuildTime{0}; // total build time for steps
|
||||||
|
};
|
||||||
|
|
||||||
|
State::ptr state;
|
||||||
|
|
||||||
bool supportsStep(Step::ptr step)
|
bool supportsStep(Step::ptr step)
|
||||||
{
|
{
|
||||||
|
@ -197,23 +202,6 @@ struct Machine
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
/* A RAII helper that manages the currentJobs field of Machine
|
|
||||||
objects. */
|
|
||||||
struct MachineReservation
|
|
||||||
{
|
|
||||||
typedef std::shared_ptr<MachineReservation> ptr;
|
|
||||||
Machine::ptr machine;
|
|
||||||
MachineReservation(Machine::ptr machine) : machine(machine)
|
|
||||||
{
|
|
||||||
machine->currentJobs++;
|
|
||||||
}
|
|
||||||
~MachineReservation()
|
|
||||||
{
|
|
||||||
machine->currentJobs--;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
|
|
||||||
class State
|
class State
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
|
@ -243,9 +231,12 @@ private:
|
||||||
Pool<Connection> dbPool;
|
Pool<Connection> dbPool;
|
||||||
|
|
||||||
/* The build machines. */
|
/* The build machines. */
|
||||||
typedef std::list<Machine::ptr> Machines;
|
typedef std::map<string, Machine::ptr> Machines;
|
||||||
Sync<Machines> machines;
|
Sync<Machines> machines;
|
||||||
|
|
||||||
|
Path machinesFile;
|
||||||
|
struct stat machinesFileStat;
|
||||||
|
|
||||||
/* Token server limiting the number of threads copying closures in
|
/* Token server limiting the number of threads copying closures in
|
||||||
parallel to prevent excessive I/O load. */
|
parallel to prevent excessive I/O load. */
|
||||||
TokenServer copyClosureTokenServer{maxParallelCopyClosure};
|
TokenServer copyClosureTokenServer{maxParallelCopyClosure};
|
||||||
|
@ -285,7 +276,11 @@ private:
|
||||||
|
|
||||||
void clearBusy(Connection & conn, time_t stopTime);
|
void clearBusy(Connection & conn, time_t stopTime);
|
||||||
|
|
||||||
void loadMachines();
|
/* (Re)load /etc/nix/machines. */
|
||||||
|
void loadMachinesFile();
|
||||||
|
|
||||||
|
/* Thread to reload /etc/nix/machines periodically. */
|
||||||
|
void monitorMachinesFile();
|
||||||
|
|
||||||
int createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step,
|
int createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step,
|
||||||
const std::string & machine, BuildStepStatus status, const std::string & errorMsg = "",
|
const std::string & machine, BuildStepStatus status, const std::string & errorMsg = "",
|
||||||
|
@ -316,7 +311,7 @@ private:
|
||||||
|
|
||||||
void wakeDispatcher();
|
void wakeDispatcher();
|
||||||
|
|
||||||
void builder(Step::ptr step, MachineReservation::ptr reservation);
|
void builder(Step::ptr step, Machine::ptr machine, std::shared_ptr<MaintainCount> reservation);
|
||||||
|
|
||||||
/* Perform the given build step. Return true if the step is to be
|
/* Perform the given build step. Return true if the step is to be
|
||||||
retried. */
|
retried. */
|
||||||
|
@ -357,18 +352,40 @@ State::State()
|
||||||
if (hydraData == "") throw Error("$HYDRA_DATA must be set");
|
if (hydraData == "") throw Error("$HYDRA_DATA must be set");
|
||||||
|
|
||||||
logDir = canonPath(hydraData + "/build-logs");
|
logDir = canonPath(hydraData + "/build-logs");
|
||||||
|
|
||||||
|
machinesFile = getEnv("NIX_REMOTE_SYSTEMS", "/etc/nix/machines");
|
||||||
|
machinesFileStat.st_ino = 0;
|
||||||
|
machinesFileStat.st_mtime = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void State::loadMachines()
|
void State::loadMachinesFile()
|
||||||
{
|
{
|
||||||
Path machinesFile = getEnv("NIX_REMOTE_SYSTEMS", "/etc/nix/machines");
|
string contents;
|
||||||
|
|
||||||
Machines newMachines;
|
|
||||||
|
|
||||||
if (pathExists(machinesFile)) {
|
if (pathExists(machinesFile)) {
|
||||||
|
struct stat st;
|
||||||
|
if (stat(machinesFile.c_str(), &st) != 0)
|
||||||
|
throw SysError(format("getting stats about ‘%1%’") % machinesFile);
|
||||||
|
if (st.st_ino == machinesFileStat.st_ino && st.st_mtime == machinesFileStat.st_mtime)
|
||||||
|
return;
|
||||||
|
printMsg(lvlDebug, "reloading machines");
|
||||||
|
contents = readFile(machinesFile);
|
||||||
|
machinesFileStat = st;
|
||||||
|
} else {
|
||||||
|
StringSet systems = StringSet({settings.thisSystem});
|
||||||
|
if (settings.thisSystem == "x86_64-linux")
|
||||||
|
systems.insert("i686-linux");
|
||||||
|
contents = "localhost " + concatStringsSep(",", systems)
|
||||||
|
+ " - " + int2String(settings.maxBuildJobs) + " 1";
|
||||||
|
}
|
||||||
|
|
||||||
for (auto line : tokenizeString<Strings>(readFile(machinesFile), "\n")) {
|
Machines newMachines, oldMachines;
|
||||||
|
{
|
||||||
|
auto machines_(machines.lock());
|
||||||
|
oldMachines = *machines_;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto line : tokenizeString<Strings>(contents, "\n")) {
|
||||||
line = trim(string(line, 0, line.find('#')));
|
line = trim(string(line, 0, line.find('#')));
|
||||||
auto tokens = tokenizeString<std::vector<std::string>>(line);
|
auto tokens = tokenizeString<std::vector<std::string>>(line);
|
||||||
if (tokens.size() < 3) continue;
|
if (tokens.size() < 3) continue;
|
||||||
|
@ -387,24 +404,43 @@ void State::loadMachines()
|
||||||
machine->mandatoryFeatures = tokenizeString<StringSet>(tokens[6], ",");
|
machine->mandatoryFeatures = tokenizeString<StringSet>(tokens[6], ",");
|
||||||
for (auto & f : machine->mandatoryFeatures)
|
for (auto & f : machine->mandatoryFeatures)
|
||||||
machine->supportedFeatures.insert(f);
|
machine->supportedFeatures.insert(f);
|
||||||
newMachines.push_back(machine);
|
|
||||||
|
/* Re-use the State object of the previous machine with the
|
||||||
|
same name. */
|
||||||
|
auto i = oldMachines.find(machine->sshName);
|
||||||
|
if (i == oldMachines.end())
|
||||||
|
printMsg(lvlChatty, format("adding new machine ‘%1%’") % machine->sshName);
|
||||||
|
else
|
||||||
|
printMsg(lvlChatty, format("updating machine ‘%1%’") % machine->sshName);
|
||||||
|
machine->state = i == oldMachines.end()
|
||||||
|
? std::make_shared<Machine::State>()
|
||||||
|
: i->second->state;
|
||||||
|
newMachines[machine->sshName] = machine;
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
for (auto & m : oldMachines)
|
||||||
auto machine = std::make_shared<Machine>();
|
if (newMachines.find(m.first) == newMachines.end())
|
||||||
machine->sshName = "localhost";
|
printMsg(lvlInfo, format("removing machine ‘%1%’") % m.first);
|
||||||
machine->systemTypes = StringSet({settings.thisSystem});
|
|
||||||
if (settings.thisSystem == "x86_64-linux")
|
|
||||||
machine->systemTypes.insert("i686-linux");
|
|
||||||
machine->maxJobs = settings.maxBuildJobs;
|
|
||||||
newMachines.push_back(machine);
|
|
||||||
}
|
|
||||||
|
|
||||||
auto machines_(machines.lock());
|
auto machines_(machines.lock());
|
||||||
*machines_ = newMachines;
|
*machines_ = newMachines;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void State::monitorMachinesFile()
|
||||||
|
{
|
||||||
|
while (true) {
|
||||||
|
try {
|
||||||
|
// FIXME: use inotify.
|
||||||
|
sleep(60);
|
||||||
|
loadMachinesFile();
|
||||||
|
} catch (std::exception & e) {
|
||||||
|
printMsg(lvlError, format("reloading machines file: %1%") % e.what());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void State::clearBusy(Connection & conn, time_t stopTime)
|
void State::clearBusy(Connection & conn, time_t stopTime)
|
||||||
{
|
{
|
||||||
pqxx::work txn(conn);
|
pqxx::work txn(conn);
|
||||||
|
@ -619,7 +655,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store,
|
||||||
{
|
{
|
||||||
auto machines_(machines.lock()); // FIXME: use shared_mutex
|
auto machines_(machines.lock()); // FIXME: use shared_mutex
|
||||||
for (auto & m : *machines_)
|
for (auto & m : *machines_)
|
||||||
if (m->supportsStep(r)) { supported = true; break; }
|
if (m.second->supportsStep(r)) { supported = true; break; }
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!supported) {
|
if (!supported) {
|
||||||
|
@ -896,7 +932,7 @@ void State::dispatcher()
|
||||||
{
|
{
|
||||||
auto machines_(machines.lock());
|
auto machines_(machines.lock());
|
||||||
for (auto & m : *machines_)
|
for (auto & m : *machines_)
|
||||||
machinesSorted.push_back({m, m->currentJobs});
|
machinesSorted.push_back({m.second, m.second->state->currentJobs});
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Sort the machines by a combination of speed factor and
|
/* Sort the machines by a combination of speed factor and
|
||||||
|
@ -929,7 +965,7 @@ void State::dispatcher()
|
||||||
|
|
||||||
for (auto & mi : machinesSorted) {
|
for (auto & mi : machinesSorted) {
|
||||||
// FIXME: can we lose a wakeup if a builder exits concurrently?
|
// FIXME: can we lose a wakeup if a builder exits concurrently?
|
||||||
if (mi.machine->currentJobs >= mi.machine->maxJobs) continue;
|
if (mi.machine->state->currentJobs >= mi.machine->maxJobs) continue;
|
||||||
|
|
||||||
auto runnable_(runnable.lock());
|
auto runnable_(runnable.lock());
|
||||||
//printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size());
|
//printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size());
|
||||||
|
@ -966,10 +1002,10 @@ void State::dispatcher()
|
||||||
|
|
||||||
/* Make a slot reservation and start a thread to
|
/* Make a slot reservation and start a thread to
|
||||||
do the build. */
|
do the build. */
|
||||||
auto reservation = std::make_shared<MachineReservation>(mi.machine);
|
auto reservation = std::make_shared<MaintainCount>(mi.machine->state->currentJobs);
|
||||||
i = runnable_->erase(i);
|
i = runnable_->erase(i);
|
||||||
|
|
||||||
auto builderThread = std::thread(&State::builder, this, step, reservation);
|
auto builderThread = std::thread(&State::builder, this, step, mi.machine, reservation);
|
||||||
builderThread.detach(); // FIXME?
|
builderThread.detach(); // FIXME?
|
||||||
|
|
||||||
keepGoing = true;
|
keepGoing = true;
|
||||||
|
@ -1003,7 +1039,7 @@ void State::wakeDispatcher()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void State::builder(Step::ptr step, MachineReservation::ptr reservation)
|
void State::builder(Step::ptr step, Machine::ptr machine, std::shared_ptr<MaintainCount> reservation)
|
||||||
{
|
{
|
||||||
bool retry = true;
|
bool retry = true;
|
||||||
|
|
||||||
|
@ -1011,10 +1047,10 @@ void State::builder(Step::ptr step, MachineReservation::ptr reservation)
|
||||||
|
|
||||||
try {
|
try {
|
||||||
auto store = openStore(); // FIXME: pool
|
auto store = openStore(); // FIXME: pool
|
||||||
retry = doBuildStep(store, step, reservation->machine);
|
retry = doBuildStep(store, step, machine);
|
||||||
} catch (std::exception & e) {
|
} catch (std::exception & e) {
|
||||||
printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")
|
printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")
|
||||||
% step->drvPath % reservation->machine->sshName % e.what());
|
% step->drvPath % machine->sshName % e.what());
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Release the machine and wake up the dispatcher. */
|
/* Release the machine and wake up the dispatcher. */
|
||||||
|
@ -1359,9 +1395,9 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
|
||||||
nrStepsDone++;
|
nrStepsDone++;
|
||||||
totalStepTime += stepStopTime - stepStartTime;
|
totalStepTime += stepStopTime - stepStartTime;
|
||||||
totalStepBuildTime += result.stopTime - result.startTime;
|
totalStepBuildTime += result.stopTime - result.startTime;
|
||||||
machine->nrStepsDone++;
|
machine->state->nrStepsDone++;
|
||||||
machine->totalStepTime += stepStopTime - stepStartTime;
|
machine->state->totalStepTime += stepStopTime - stepStartTime;
|
||||||
machine->totalStepBuildTime += result.stopTime - result.startTime;
|
machine->state->totalStepBuildTime += result.stopTime - result.startTime;
|
||||||
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
@ -1564,16 +1600,18 @@ void State::dumpStatus(Connection & conn, bool log)
|
||||||
root.attr("machines");
|
root.attr("machines");
|
||||||
JSONObject nested(out);
|
JSONObject nested(out);
|
||||||
auto machines_(machines.lock());
|
auto machines_(machines.lock());
|
||||||
for (auto & m : *machines_) {
|
for (auto & i : *machines_) {
|
||||||
|
auto & m(i.second);
|
||||||
|
auto & s(m->state);
|
||||||
nested.attr(m->sshName);
|
nested.attr(m->sshName);
|
||||||
JSONObject nested2(out);
|
JSONObject nested2(out);
|
||||||
nested2.attr("currentJobs", m->currentJobs);
|
nested2.attr("currentJobs", s->currentJobs);
|
||||||
nested2.attr("nrStepsDone", m->nrStepsDone);
|
nested2.attr("nrStepsDone", s->nrStepsDone);
|
||||||
if (m->nrStepsDone) {
|
if (m->state->nrStepsDone) {
|
||||||
nested2.attr("totalStepTime", m->totalStepTime);
|
nested2.attr("totalStepTime", s->totalStepTime);
|
||||||
nested2.attr("totalStepBuildTime", m->totalStepBuildTime);
|
nested2.attr("totalStepBuildTime", s->totalStepBuildTime);
|
||||||
nested2.attr("avgStepTime"); out << (float) m->totalStepTime / m->nrStepsDone;
|
nested2.attr("avgStepTime"); out << (float) s->totalStepTime / s->nrStepsDone;
|
||||||
nested2.attr("avgStepBuildTime"); out << (float) m->totalStepBuildTime / m->nrStepsDone;
|
nested2.attr("avgStepBuildTime"); out << (float) s->totalStepBuildTime / s->nrStepsDone;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1670,7 +1708,9 @@ void State::run()
|
||||||
dumpStatus(*conn, false);
|
dumpStatus(*conn, false);
|
||||||
}
|
}
|
||||||
|
|
||||||
loadMachines();
|
loadMachinesFile();
|
||||||
|
|
||||||
|
std::thread(&State::monitorMachinesFile, this).detach();
|
||||||
|
|
||||||
std::thread(&State::queueMonitor, this).detach();
|
std::thread(&State::queueMonitor, this).detach();
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue