2015-07-21 13:14:17 +00:00
|
|
|
|
#include <algorithm>
|
2016-09-30 15:05:07 +00:00
|
|
|
|
#include <cmath>
|
2015-07-21 13:14:17 +00:00
|
|
|
|
#include <thread>
|
2015-08-17 11:50:41 +00:00
|
|
|
|
#include <unordered_map>
|
2015-07-21 13:14:17 +00:00
|
|
|
|
|
|
|
|
|
#include "state.hh"
|
|
|
|
|
|
|
|
|
|
using namespace nix;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Put `step` on the runnable queue and wake the dispatcher so it can
   be scheduled. The step must already be created, must not be
   finished, and must have no outstanding dependencies (asserted). */
void State::makeRunnable(Step::ptr step)
{
    printMsg(lvlChatty, "step ‘%s’ is now runnable", localStore->printStorePath(step->drvPath));

    /* Check the invariants and record when the step became runnable,
       all under the step's own state lock. */
    {
        auto stepState(step->state.lock());
        assert(stepState->created);
        assert(!step->finished);
        assert(stepState->deps.empty());
        stepState->runnableSince = std::chrono::system_clock::now();
    }

    /* Append to the shared queue; the queue lock is released before
       the dispatcher is woken. */
    {
        auto queue(runnable.lock());
        queue->push_back(step);
    }

    wakeDispatcher();
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void State::dispatcher()
|
|
|
|
|
{
|
|
|
|
|
while (true) {
|
2016-03-02 13:18:39 +00:00
|
|
|
|
|
2016-11-08 10:25:43 +00:00
|
|
|
|
try {
|
|
|
|
|
printMsg(lvlDebug, "dispatcher woken up");
|
|
|
|
|
nrDispatcherWakeups++;
|
2015-07-21 13:14:17 +00:00
|
|
|
|
|
2016-11-08 10:25:43 +00:00
|
|
|
|
auto now1 = std::chrono::steady_clock::now();
|
2015-07-21 13:14:17 +00:00
|
|
|
|
|
2016-11-08 10:25:43 +00:00
|
|
|
|
auto sleepUntil = doDispatch();
|
2016-03-02 13:18:39 +00:00
|
|
|
|
|
2016-11-08 10:25:43 +00:00
|
|
|
|
auto now2 = std::chrono::steady_clock::now();
|
2016-03-02 13:18:39 +00:00
|
|
|
|
|
2016-11-08 10:25:43 +00:00
|
|
|
|
dispatchTimeMs += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();
|
|
|
|
|
|
|
|
|
|
/* Sleep until we're woken up (either because a runnable build
|
|
|
|
|
is added, or because a build finishes). */
|
|
|
|
|
{
|
|
|
|
|
auto dispatcherWakeup_(dispatcherWakeup.lock());
|
|
|
|
|
if (!*dispatcherWakeup_) {
|
|
|
|
|
printMsg(lvlDebug, format("dispatcher sleeping for %1%s") %
|
|
|
|
|
std::chrono::duration_cast<std::chrono::seconds>(sleepUntil - std::chrono::system_clock::now()).count());
|
|
|
|
|
dispatcherWakeup_.wait_until(dispatcherWakeupCV, sleepUntil);
|
|
|
|
|
}
|
|
|
|
|
*dispatcherWakeup_ = false;
|
2015-08-10 09:58:33 +00:00
|
|
|
|
}
|
2016-11-08 10:25:43 +00:00
|
|
|
|
|
|
|
|
|
} catch (std::exception & e) {
|
|
|
|
|
printMsg(lvlError, format("dispatcher: %1%") % e.what());
|
|
|
|
|
sleep(1);
|
2015-08-10 09:26:30 +00:00
|
|
|
|
}
|
2016-11-08 10:25:43 +00:00
|
|
|
|
|
2015-08-10 09:26:30 +00:00
|
|
|
|
}
|
2015-07-21 13:14:17 +00:00
|
|
|
|
|
2015-08-10 09:26:30 +00:00
|
|
|
|
printMsg(lvlError, "dispatcher exits");
|
|
|
|
|
}
|
2015-07-21 13:53:27 +00:00
|
|
|
|
|
2015-08-10 09:26:30 +00:00
|
|
|
|
|
|
|
|
|
/* Run dispatch passes until no further step can be started.

   Each pass snapshots the usable machines and the runnable steps,
   sorts both, and starts at most one step on one machine; the pass is
   then repeated because the machine ordering will have changed.

   Returns the earliest time at which a temporarily disabled machine or
   a step awaiting retry becomes eligible again, or system_time::max()
   if there is none. */
system_time State::doDispatch()
{
    /* Prune old historical build step info from the jobsets, logging
       any change in the used share. */
    {
        auto allJobsets(jobsets.lock());
        for (auto & entry : *allJobsets) {
            auto & jobset = entry.second;
            auto shareBefore = jobset->shareUsed();
            jobset->pruneSteps();
            auto shareAfter = jobset->shareUsed();
            if (shareBefore == shareAfter) continue;
            printMsg(lvlDebug, format("pruned scheduling window of ‘%1%:%2%’ from %3% to %4%")
                % entry.first.first % entry.first.second % shareBefore % shareAfter);
        }
    }

    /* Start steps until we're out of steps or slots. */
    auto sleepUntil = system_time::max();
    bool startedStep;

    do {
        system_time now = std::chrono::system_clock::now();

        /* Snapshot the currentJobs field of every usable machine. The
           copy keeps the sort comparator below a consistent ordering
           (std::sort() can segfault if it isn't). Disabled machines
           and machines that are temporarily backing off after failures
           are left out. */
        struct MachineInfo
        {
            Machine::ptr machine;
            unsigned long currentJobs;
        };
        std::vector<MachineInfo> candidates;
        {
            auto allMachines(machines.lock());
            for (auto & entry : *allMachines) {
                auto & machine = entry.second;
                auto connectInfo(machine->state->connectInfo.lock());
                if (!machine->enabled) continue;
                if (connectInfo->consecutiveFailures && connectInfo->disabledUntil > now) {
                    /* Make sure we wake up when this machine becomes
                       usable again. */
                    if (sleepUntil > connectInfo->disabledUntil)
                        sleepUntil = connectInfo->disabledUntil;
                    continue;
                }
                candidates.push_back({machine, machine->state->currentJobs});
            }
        }

        /* Order the machines:

           - First by load divided by speed factor, rounded to the
             nearest integer, so that fast machines are preferred over
             slow machines with similar loads.

           - Then by speed factor (higher first).

           - Finally by load. */
        sort(candidates.begin(), candidates.end(),
            [](const MachineInfo & lhs, const MachineInfo & rhs) -> bool
            {
                float scaledLhs = std::round(lhs.currentJobs / lhs.machine->speedFactor);
                float scaledRhs = std::round(rhs.currentJobs / rhs.machine->speedFactor);
                if (scaledLhs != scaledRhs)
                    return scaledLhs < scaledRhs;
                if (lhs.machine->speedFactor != rhs.machine->speedFactor)
                    return lhs.machine->speedFactor > rhs.machine->speedFactor;
                return lhs.currentJobs > rhs.currentJobs;
            });

        /* Sort the runnable steps by priority. Priority is established
           as follows (in order of precedence):

           - The global priority of the builds that depend on the
             step. This allows admins to bump a build to the front of
             the queue.

           - The lowest used scheduling share of the jobsets depending
             on the step.

           - The local priority of the build, as set via the build's
             meta.schedulingPriority field. Note that this is not quite
             correct: the local priority should only be used to
             establish priority between builds in the same jobset, but
             here it's used between steps in different jobsets if they
             happen to have the same lowest used scheduling share. But
             that's not very likely.

           - The lowest ID of the builds depending on the step;
             i.e. older builds take priority over new ones.

           FIXME: O(n lg n); obviously, it would be better to keep a
           runnable queue sorted by priority. */
        struct StepInfo
        {
            Step::ptr step;

            /* The lowest share used of any jobset depending on this
               step. */
            double lowestShareUsed = 1e9;

            /* Values copied out of step->state so that the comparator
               sees a stable snapshot (same reason as MachineInfo). */
            int highestGlobalPriority;
            int highestLocalPriority;
            BuildID lowestBuildID;

            StepInfo(Step::ptr step, Step::State & step_)
                : step(step)
                , highestGlobalPriority(step_.highestGlobalPriority)
                , highestLocalPriority(step_.highestLocalPriority)
                , lowestBuildID(step_.lowestBuildID)
            {
                for (auto & jobset : step_.jobsets)
                    lowestShareUsed = std::min(lowestShareUsed, jobset->shareUsed());
            }
        };

        std::vector<StepInfo> runnableByPriority;

        /* Per system type: number of runnable steps and their summed
           waiting time, reported to the auto-scaler stats below. */
        struct RunnablePerType
        {
            unsigned int count{0};
            std::chrono::seconds waitTime{0};
        };

        std::unordered_map<std::string, RunnablePerType> runnablePerType;

        {
            auto queue(runnable.lock());
            runnableByPriority.reserve(queue->size());
            for (auto it = queue->begin(); it != queue->end(); ) {
                auto step = it->lock();

                /* Remove dead steps. */
                if (!step) {
                    it = queue->erase(it);
                    continue;
                }

                ++it;

                auto & typeStats = runnablePerType[step->systemType];
                typeStats.count++;

                auto stepState(step->state.lock());
                typeStats.waitTime += std::chrono::duration_cast<std::chrono::seconds>(now - stepState->runnableSince);

                /* Skip previously failed steps that aren't ready to be
                   retried, but remember when the earliest one is. */
                if (stepState->tries > 0 && stepState->after > now) {
                    if (sleepUntil > stepState->after)
                        sleepUntil = stepState->after;
                    continue;
                }

                runnableByPriority.emplace_back(step, *stepState);
            }
        }

        sort(runnableByPriority.begin(), runnableByPriority.end(),
            [](const StepInfo & lhs, const StepInfo & rhs)
            {
                if (lhs.highestGlobalPriority != rhs.highestGlobalPriority)
                    return lhs.highestGlobalPriority > rhs.highestGlobalPriority;
                if (lhs.lowestShareUsed != rhs.lowestShareUsed)
                    return lhs.lowestShareUsed < rhs.lowestShareUsed;
                if (lhs.highestLocalPriority != rhs.highestLocalPriority)
                    return lhs.highestLocalPriority > rhs.highestLocalPriority;
                return lhs.lowestBuildID < rhs.lowestBuildID;
            });

        /* Find a machine with a free slot and find a step to run on
           it. Once we find such a pair, we restart the outer loop
           because the machine sorting will have changed. */
        startedStep = false;

        for (auto & candidate : candidates) {
            /* Re-check the live job count, not the snapshot. */
            if (candidate.machine->state->currentJobs >= candidate.machine->maxJobs) continue;

            for (auto & pending : runnableByPriority) {
                auto & step(pending.step);

                /* Can this machine do this step? */
                if (!candidate.machine->supportsStep(step)) {
                    debug("machine '%s' does not support step '%s' (system type '%s')",
                        candidate.machine->sshName, localStore->printStorePath(step->drvPath), step->drv->platform);
                    continue;
                }

                /* Let's do this step. Remove it from the runnable
                   list. FIXME: O(n). */
                {
                    auto queue(runnable.lock());
                    auto pos = std::find_if(queue->begin(), queue->end(),
                        [&](const auto & weakStep) { return weakStep.lock() == step; });
                    bool removed = pos != queue->end();
                    if (removed) queue->erase(pos);
                    assert(removed);

                    auto & typeStats = runnablePerType[step->systemType];
                    assert(typeStats.count);
                    typeStats.count--;
                }

                /* Make a slot reservation and start a thread to do
                   the build. The thread is detached (FIXME?). */
                std::thread(&State::builder, this,
                    std::make_shared<MachineReservation>(*this, step, candidate.machine)).detach();

                startedStep = true;
                break;
            }

            if (startedStep) break;
        }

        /* Update the stats for the auto-scaler: reset every known
           type to zero runnable steps, then fill in what we counted. */
        {
            auto types(machineTypes.lock());

            for (auto & known : *types)
                known.second.runnable = 0;

            for (auto & counted : runnablePerType) {
                auto & typeEntry = (*types)[counted.first];
                typeEntry.runnable = counted.second.count;
                typeEntry.waitTime = counted.second.waitTime;
            }
        }

        lastDispatcherCheck = std::chrono::system_clock::to_time_t(now);

    } while (startedStep);

    return sleepUntil;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void State::wakeDispatcher()
|
|
|
|
|
{
|
2015-08-10 09:58:33 +00:00
|
|
|
|
{
|
|
|
|
|
auto dispatcherWakeup_(dispatcherWakeup.lock());
|
|
|
|
|
*dispatcherWakeup_ = true;
|
|
|
|
|
}
|
|
|
|
|
dispatcherWakeupCV.notify_one();
|
2015-07-21 13:14:17 +00:00
|
|
|
|
}
|
2015-08-10 23:30:24 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void Jobset::addStep(time_t startTime, time_t duration)
|
|
|
|
|
{
|
|
|
|
|
auto steps_(steps.lock());
|
|
|
|
|
(*steps_)[startTime] = duration;
|
|
|
|
|
seconds += duration;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
void Jobset::pruneSteps()
|
|
|
|
|
{
|
|
|
|
|
time_t now = time(0);
|
|
|
|
|
auto steps_(steps.lock());
|
|
|
|
|
while (!steps_->empty()) {
|
|
|
|
|
auto i = steps_->begin();
|
|
|
|
|
if (i->first > now - schedulingWindow) break;
|
|
|
|
|
seconds -= i->second;
|
|
|
|
|
steps_->erase(i);
|
|
|
|
|
}
|
|
|
|
|
}
|
2015-08-17 11:50:41 +00:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Reserve a slot on `machine` for `step`: bump the machine's current
   job count and the `running` counter of the step's system type. */
State::MachineReservation::MachineReservation(State & state, Step::ptr step, Machine::ptr machine)
    : state(state), step(step), machine(machine)
{
    machine->state->currentJobs++;

    /* The per-type statistics are updated under their own lock. */
    {
        auto types(state.machineTypes.lock());
        auto & typeEntry = (*types)[step->systemType];
        typeEntry.running++;
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
/* Release the slot: decrement the machine's current job count and the
   `running` counter of the step's system type, recording the time at
   which either of them dropped to zero. */
State::MachineReservation::~MachineReservation()
{
    /* Post-decrement yields the count *before* this release. */
    const auto jobsBefore = machine->state->currentJobs--;
    assert(jobsBefore);
    if (jobsBefore == 1)
        machine->state->idleSince = time(0);

    {
        auto types(state.machineTypes.lock());
        auto & typeEntry = (*types)[step->systemType];
        assert(typeEntry.running);
        if (--typeEntry.running == 0)
            typeEntry.lastActive = std::chrono::system_clock::now();
    }
}
|