forked from lix-project/hydra
Split hydra-queue-runner.cc more
This commit is contained in:
parent
6ddcd37df1
commit
7e026d35f7
6 changed files with 921 additions and 904 deletions
|
@ -1,6 +1,7 @@
|
||||||
bin_PROGRAMS = hydra-queue-runner
|
bin_PROGRAMS = hydra-queue-runner
|
||||||
|
|
||||||
hydra_queue_runner_SOURCES = hydra-queue-runner.cc build-result.cc build-remote.cc \
|
hydra_queue_runner_SOURCES = hydra-queue-runner.cc queue-monitor.cc dispatcher.cc \
|
||||||
|
builder.cc build-result.cc build-remote.cc \
|
||||||
build-result.hh counter.hh pool.hh sync.hh token-server.hh state.hh db.hh
|
build-result.hh counter.hh pool.hh sync.hh token-server.hh state.hh db.hh
|
||||||
hydra_queue_runner_LDADD = $(NIX_LIBS) -lpqxx
|
hydra_queue_runner_LDADD = $(NIX_LIBS) -lpqxx
|
||||||
|
|
||||||
|
|
378
src/hydra-queue-runner/builder.cc
Normal file
378
src/hydra-queue-runner/builder.cc
Normal file
|
@ -0,0 +1,378 @@
|
||||||
|
#include <cmath>
|
||||||
|
|
||||||
|
#include "state.hh"
|
||||||
|
#include "build-result.hh"
|
||||||
|
|
||||||
|
using namespace nix;
|
||||||
|
|
||||||
|
|
||||||
|
/* Entry point of a builder thread: perform a single build step on the
   given machine and, if the attempt has to be repeated, put the step
   back on the runnable queue after a back-off delay.

   step        - the step to build.
   machine     - the machine the dispatcher selected for this step.
   reservation - the slot reservation for ‘machine’; this function must
                 be its only owner, and it is dropped as soon as the
                 build attempt is over. */
void State::builder(Step::ptr step, Machine::ptr machine, std::shared_ptr<MaintainCount> reservation)
{
    /* Start out pessimistic: if doBuildStep() throws, ‘retry’ keeps
       this value and the step is rescheduled below. */
    bool retry = true;

    MaintainCount mc(nrActiveSteps);

    try {
        auto store = openStore(); // FIXME: pool
        retry = doBuildStep(store, step, machine);
    } catch (const std::exception & e) {
        printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")
            % step->drvPath % machine->sshName % e.what());
    }

    /* Release the machine and wake up the dispatcher.
       shared_ptr::unique() is deprecated since C++17 and removed in
       C++20, so check the use count explicitly. */
    assert(reservation.use_count() == 1);
    reservation.reset();
    wakeDispatcher();

    /* If there was a temporary failure, retry the step after an
       exponentially increasing interval. */
    if (retry) {
        {
            auto step_(step->state.lock());
            step_->tries++;
            nrRetries++;
            if (step_->tries > maxNrRetries) maxNrRetries = step_->tries; // yeah yeah, not atomic
            /* Delay in seconds: retryInterval * retryBackoff^(tries - 1). */
            int delta = retryInterval * powf(retryBackoff, step_->tries - 1);
            printMsg(lvlInfo, format("will retry ‘%1%’ after %2%s") % step->drvPath % delta);
            step_->after = std::chrono::system_clock::now() + std::chrono::seconds(delta);
        }

        makeRunnable(step);
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
|
||||||
|
Machine::ptr machine)
|
||||||
|
{
|
||||||
|
{
|
||||||
|
auto step_(step->state.lock());
|
||||||
|
assert(step_->created);
|
||||||
|
assert(!step->finished);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* There can be any number of builds in the database that depend
|
||||||
|
on this derivation. Arbitrarily pick one (though preferring a
|
||||||
|
build of which this is the top-level derivation) for the
|
||||||
|
purpose of creating build steps. We could create a build step
|
||||||
|
record for every build, but that could be very expensive
|
||||||
|
(e.g. a stdenv derivation can be a dependency of tens of
|
||||||
|
thousands of builds), so we don't. */
|
||||||
|
Build::ptr build;
|
||||||
|
|
||||||
|
{
|
||||||
|
std::set<Build::ptr> dependents;
|
||||||
|
std::set<Step::ptr> steps;
|
||||||
|
getDependents(step, dependents, steps);
|
||||||
|
|
||||||
|
if (dependents.empty()) {
|
||||||
|
/* Apparently all builds that depend on this derivation
|
||||||
|
are gone (e.g. cancelled). So don't bother. This is
|
||||||
|
very unlikely to happen, because normally Steps are
|
||||||
|
only kept alive by being reachable from a
|
||||||
|
Build. However, it's possible that a new Build just
|
||||||
|
created a reference to this step. So to handle that
|
||||||
|
possibility, we retry this step (putting it back in
|
||||||
|
the runnable queue). If there are really no strong
|
||||||
|
pointers to the step, it will be deleted. */
|
||||||
|
printMsg(lvlInfo, format("maybe cancelling build step ‘%1%’") % step->drvPath);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (auto build2 : dependents)
|
||||||
|
if (build2->drvPath == step->drvPath) { build = build2; break; }
|
||||||
|
|
||||||
|
if (!build) build = *dependents.begin();
|
||||||
|
|
||||||
|
printMsg(lvlInfo, format("performing step ‘%1%’ on ‘%2%’ (needed by build %3% and %4% others)")
|
||||||
|
% step->drvPath % machine->sshName % build->id % (dependents.size() - 1));
|
||||||
|
}
|
||||||
|
|
||||||
|
bool quit = build->id == buildOne;
|
||||||
|
|
||||||
|
auto conn(dbPool.get());
|
||||||
|
|
||||||
|
RemoteResult result;
|
||||||
|
BuildOutput res;
|
||||||
|
int stepNr = 0;
|
||||||
|
|
||||||
|
time_t stepStartTime = result.startTime = time(0);
|
||||||
|
|
||||||
|
/* If any of the outputs have previously failed, then don't bother
|
||||||
|
building again. */
|
||||||
|
bool cachedFailure = checkCachedFailure(step, *conn);
|
||||||
|
|
||||||
|
if (cachedFailure)
|
||||||
|
result.status = BuildResult::CachedFailure;
|
||||||
|
else {
|
||||||
|
|
||||||
|
/* Create a build step record indicating that we started
|
||||||
|
building. Also, mark the selected build as busy. */
|
||||||
|
{
|
||||||
|
pqxx::work txn(*conn);
|
||||||
|
stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bssBusy);
|
||||||
|
txn.parameterized("update Builds set busy = 1 where id = $1")(build->id).exec();
|
||||||
|
txn.commit();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Do the build. */
|
||||||
|
try {
|
||||||
|
/* FIXME: referring builds may have conflicting timeouts. */
|
||||||
|
buildRemote(store, machine, step, build->maxSilentTime, build->buildTimeout, result);
|
||||||
|
} catch (Error & e) {
|
||||||
|
result.status = BuildResult::MiscFailure;
|
||||||
|
result.errorMsg = e.msg();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (result.success()) res = getBuildOutput(store, step->drv);
|
||||||
|
}
|
||||||
|
|
||||||
|
time_t stepStopTime = time(0);
|
||||||
|
if (!result.stopTime) result.stopTime = stepStopTime;
|
||||||
|
|
||||||
|
/* Asynchronously compress the log. */
|
||||||
|
if (result.logFile != "") {
|
||||||
|
{
|
||||||
|
auto logCompressorQueue_(logCompressorQueue.lock());
|
||||||
|
logCompressorQueue_->push(result.logFile);
|
||||||
|
}
|
||||||
|
logCompressorWakeup.notify_one();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* The step had a hopefully temporary failure (e.g. network
|
||||||
|
issue). Retry a number of times. */
|
||||||
|
if (result.canRetry()) {
|
||||||
|
printMsg(lvlError, format("possibly transient failure building ‘%1%’ on ‘%2%’: %3%")
|
||||||
|
% step->drvPath % machine->sshName % result.errorMsg);
|
||||||
|
bool retry;
|
||||||
|
{
|
||||||
|
auto step_(step->state.lock());
|
||||||
|
retry = step_->tries + 1 < maxTries;
|
||||||
|
}
|
||||||
|
if (retry) {
|
||||||
|
pqxx::work txn(*conn);
|
||||||
|
finishBuildStep(txn, result.startTime, result.stopTime, build->id,
|
||||||
|
stepNr, machine->sshName, bssAborted, result.errorMsg);
|
||||||
|
txn.commit();
|
||||||
|
if (quit) exit(1);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (result.success()) {
|
||||||
|
|
||||||
|
/* Register success in the database for all Build objects that
|
||||||
|
have this step as the top-level step. Since the queue
|
||||||
|
monitor thread may be creating new referring Builds
|
||||||
|
concurrently, and updating the database may fail, we do
|
||||||
|
this in a loop, marking all known builds, repeating until
|
||||||
|
there are no unmarked builds.
|
||||||
|
*/
|
||||||
|
|
||||||
|
std::vector<BuildID> buildIDs;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
|
||||||
|
/* Get the builds that have this one as the top-level. */
|
||||||
|
std::vector<Build::ptr> direct;
|
||||||
|
{
|
||||||
|
auto steps_(steps.lock());
|
||||||
|
auto step_(step->state.lock());
|
||||||
|
|
||||||
|
for (auto & b_ : step_->builds) {
|
||||||
|
auto b = b_.lock();
|
||||||
|
if (b && !b->finishedInDB) direct.push_back(b);
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If there are no builds left to update in the DB,
|
||||||
|
then we're done (except for calling
|
||||||
|
finishBuildStep()). Delete the step from
|
||||||
|
‘steps’. Since we've been holding the ‘steps’ lock,
|
||||||
|
no new referrers can have been added in the
|
||||||
|
meantime or be added afterwards. */
|
||||||
|
if (direct.empty()) {
|
||||||
|
printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath);
|
||||||
|
steps_->erase(step->drvPath);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Update the database. */
|
||||||
|
{
|
||||||
|
pqxx::work txn(*conn);
|
||||||
|
|
||||||
|
finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssSuccess);
|
||||||
|
|
||||||
|
for (auto & b : direct)
|
||||||
|
markSucceededBuild(txn, b, res, build != b || result.status != BuildResult::Built,
|
||||||
|
result.startTime, result.stopTime);
|
||||||
|
|
||||||
|
txn.commit();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (direct.empty()) break;
|
||||||
|
|
||||||
|
/* Remove the direct dependencies from ‘builds’. This will
|
||||||
|
cause them to be destroyed. */
|
||||||
|
for (auto & b : direct) {
|
||||||
|
auto builds_(builds.lock());
|
||||||
|
b->finishedInDB = true;
|
||||||
|
builds_->erase(b->id);
|
||||||
|
buildIDs.push_back(b->id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Send notification about the builds that have this step as
|
||||||
|
the top-level. */
|
||||||
|
for (auto id : buildIDs) {
|
||||||
|
{
|
||||||
|
auto notificationSenderQueue_(notificationSenderQueue.lock());
|
||||||
|
notificationSenderQueue_->push(NotificationItem(id, std::vector<BuildID>()));
|
||||||
|
}
|
||||||
|
notificationSenderWakeup.notify_one();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Wake up any dependent steps that have no other
|
||||||
|
dependencies. */
|
||||||
|
{
|
||||||
|
auto step_(step->state.lock());
|
||||||
|
for (auto & rdepWeak : step_->rdeps) {
|
||||||
|
auto rdep = rdepWeak.lock();
|
||||||
|
if (!rdep) continue;
|
||||||
|
|
||||||
|
bool runnable = false;
|
||||||
|
{
|
||||||
|
auto rdep_(rdep->state.lock());
|
||||||
|
rdep_->deps.erase(step);
|
||||||
|
/* Note: if the step has not finished
|
||||||
|
initialisation yet, it will be made runnable in
|
||||||
|
createStep(), if appropriate. */
|
||||||
|
if (rdep_->deps.empty() && rdep_->created) runnable = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (runnable) makeRunnable(rdep);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
} else {
|
||||||
|
|
||||||
|
/* Register failure in the database for all Build objects that
|
||||||
|
directly or indirectly depend on this step. */
|
||||||
|
|
||||||
|
std::vector<BuildID> dependentIDs;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
|
||||||
|
/* Get the builds and steps that depend on this step. */
|
||||||
|
std::set<Build::ptr> indirect;
|
||||||
|
{
|
||||||
|
auto steps_(steps.lock());
|
||||||
|
std::set<Step::ptr> steps;
|
||||||
|
getDependents(step, indirect, steps);
|
||||||
|
|
||||||
|
/* If there are no builds left, delete all referring
|
||||||
|
steps from ‘steps’. As for the success case, we can
|
||||||
|
be certain no new referrers can be added. */
|
||||||
|
if (indirect.empty()) {
|
||||||
|
for (auto & s : steps) {
|
||||||
|
printMsg(lvlDebug, format("finishing build step ‘%1%’") % s->drvPath);
|
||||||
|
steps_->erase(s->drvPath);
|
||||||
|
}
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Update the database. */
|
||||||
|
{
|
||||||
|
pqxx::work txn(*conn);
|
||||||
|
|
||||||
|
BuildStatus buildStatus =
|
||||||
|
result.status == BuildResult::TimedOut ? bsTimedOut :
|
||||||
|
result.canRetry() ? bsAborted :
|
||||||
|
bsFailed;
|
||||||
|
BuildStepStatus buildStepStatus =
|
||||||
|
result.status == BuildResult::TimedOut ? bssTimedOut :
|
||||||
|
result.canRetry() ? bssAborted :
|
||||||
|
bssFailed;
|
||||||
|
|
||||||
|
/* For standard failures, we don't care about the error
|
||||||
|
message. */
|
||||||
|
if (result.status == BuildResult::PermanentFailure ||
|
||||||
|
result.status == BuildResult::TransientFailure ||
|
||||||
|
result.status == BuildResult::CachedFailure ||
|
||||||
|
result.status == BuildResult::TimedOut)
|
||||||
|
result.errorMsg = "";
|
||||||
|
|
||||||
|
/* Create failed build steps for every build that depends
|
||||||
|
on this. For cached failures, only create a step for
|
||||||
|
builds that don't have this step as top-level
|
||||||
|
(otherwise the user won't be able to see what caused
|
||||||
|
the build to fail). */
|
||||||
|
for (auto & build2 : indirect) {
|
||||||
|
if ((cachedFailure && build2->drvPath == step->drvPath) ||
|
||||||
|
(!cachedFailure && build == build2) ||
|
||||||
|
build2->finishedInDB)
|
||||||
|
continue;
|
||||||
|
createBuildStep(txn, 0, build2, step, machine->sshName,
|
||||||
|
buildStepStatus, result.errorMsg, build == build2 ? 0 : build->id);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!cachedFailure)
|
||||||
|
finishBuildStep(txn, result.startTime, result.stopTime, build->id,
|
||||||
|
stepNr, machine->sshName, buildStepStatus, result.errorMsg);
|
||||||
|
|
||||||
|
/* Mark all builds that depend on this derivation as failed. */
|
||||||
|
for (auto & build2 : indirect) {
|
||||||
|
if (build2->finishedInDB) continue;
|
||||||
|
printMsg(lvlError, format("marking build %1% as failed") % build2->id);
|
||||||
|
txn.parameterized
|
||||||
|
("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1 and finished = 0")
|
||||||
|
(build2->id)
|
||||||
|
((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus))
|
||||||
|
(result.startTime)
|
||||||
|
(result.stopTime)
|
||||||
|
(cachedFailure ? 1 : 0).exec();
|
||||||
|
nrBuildsDone++;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Remember failed paths in the database so that they
|
||||||
|
won't be built again. */
|
||||||
|
if (!cachedFailure && result.status == BuildResult::PermanentFailure)
|
||||||
|
for (auto & path : outputPaths(step->drv))
|
||||||
|
txn.parameterized("insert into FailedPaths values ($1)")(path).exec();
|
||||||
|
|
||||||
|
txn.commit();
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Remove the indirect dependencies from ‘builds’. This
|
||||||
|
will cause them to be destroyed. */
|
||||||
|
for (auto & b : indirect) {
|
||||||
|
auto builds_(builds.lock());
|
||||||
|
b->finishedInDB = true;
|
||||||
|
builds_->erase(b->id);
|
||||||
|
dependentIDs.push_back(b->id);
|
||||||
|
if (buildOne == b->id) quit = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Send notification about this build and its dependents. */
|
||||||
|
{
|
||||||
|
auto notificationSenderQueue_(notificationSenderQueue.lock());
|
||||||
|
notificationSenderQueue_->push(NotificationItem(build->id, dependentIDs));
|
||||||
|
}
|
||||||
|
notificationSenderWakeup.notify_one();
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
// FIXME: keep stats about aborted steps?
|
||||||
|
nrStepsDone++;
|
||||||
|
totalStepTime += stepStopTime - stepStartTime;
|
||||||
|
totalStepBuildTime += result.stopTime - result.startTime;
|
||||||
|
machine->state->nrStepsDone++;
|
||||||
|
machine->state->totalStepTime += stepStopTime - stepStartTime;
|
||||||
|
machine->state->totalStepBuildTime += result.stopTime - result.startTime;
|
||||||
|
|
||||||
|
if (quit) exit(0); // testing hack
|
||||||
|
|
||||||
|
return false;
|
||||||
|
}
|
155
src/hydra-queue-runner/dispatcher.cc
Normal file
155
src/hydra-queue-runner/dispatcher.cc
Normal file
|
@ -0,0 +1,155 @@
|
||||||
|
#include <algorithm>
|
||||||
|
#include <thread>
|
||||||
|
|
||||||
|
#include "state.hh"
|
||||||
|
|
||||||
|
using namespace nix;
|
||||||
|
|
||||||
|
|
||||||
|
/* Append ‘step’ to the runnable queue and wake up the dispatcher.
   The step must already be created, must not be finished, and must
   have no remaining dependencies. */
void State::makeRunnable(Step::ptr step)
{
    printMsg(lvlChatty, format("step ‘%1%’ is now runnable") % step->drvPath);

    /* Sanity-check the step while holding its state lock. */
    {
        auto stepState(step->state.lock());
        assert(stepState->created);
        assert(!step->finished);
        assert(stepState->deps.empty());
    }

    /* Enqueue under the queue lock; release it before signalling. */
    {
        auto queue(runnable.lock());
        queue->push_back(step);
    }

    wakeDispatcher();
}
|
||||||
|
|
||||||
|
|
||||||
|
void State::dispatcher()
|
||||||
|
{
|
||||||
|
while (true) {
|
||||||
|
printMsg(lvlDebug, "dispatcher woken up");
|
||||||
|
|
||||||
|
auto sleepUntil = system_time::max();
|
||||||
|
|
||||||
|
bool keepGoing;
|
||||||
|
|
||||||
|
do {
|
||||||
|
/* Copy the currentJobs field of each machine. This is
|
||||||
|
necessary to ensure that the sort comparator below is
|
||||||
|
an ordering. std::sort() can segfault if it isn't. */
|
||||||
|
struct MachineInfo
|
||||||
|
{
|
||||||
|
Machine::ptr machine;
|
||||||
|
unsigned int currentJobs;
|
||||||
|
};
|
||||||
|
std::vector<MachineInfo> machinesSorted;
|
||||||
|
{
|
||||||
|
auto machines_(machines.lock());
|
||||||
|
for (auto & m : *machines_)
|
||||||
|
machinesSorted.push_back({m.second, m.second->state->currentJobs});
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Sort the machines by a combination of speed factor and
|
||||||
|
available slots. Prioritise the available machines as
|
||||||
|
follows:
|
||||||
|
|
||||||
|
- First by load divided by speed factor, rounded to the
|
||||||
|
nearest integer. This causes fast machines to be
|
||||||
|
preferred over slow machines with similar loads.
|
||||||
|
|
||||||
|
- Then by speed factor.
|
||||||
|
|
||||||
|
- Finally by load. */
|
||||||
|
sort(machinesSorted.begin(), machinesSorted.end(),
|
||||||
|
[](const MachineInfo & a, const MachineInfo & b) -> bool
|
||||||
|
{
|
||||||
|
float ta = roundf(a.currentJobs / a.machine->speedFactor);
|
||||||
|
float tb = roundf(b.currentJobs / b.machine->speedFactor);
|
||||||
|
return
|
||||||
|
ta != tb ? ta < tb :
|
||||||
|
a.machine->speedFactor != b.machine->speedFactor ? a.machine->speedFactor > b.machine->speedFactor :
|
||||||
|
a.currentJobs > b.currentJobs;
|
||||||
|
});
|
||||||
|
|
||||||
|
/* Find a machine with a free slot and find a step to run
|
||||||
|
on it. Once we find such a pair, we restart the outer
|
||||||
|
loop because the machine sorting will have changed. */
|
||||||
|
keepGoing = false;
|
||||||
|
system_time now = std::chrono::system_clock::now();
|
||||||
|
|
||||||
|
for (auto & mi : machinesSorted) {
|
||||||
|
// FIXME: can we lose a wakeup if a builder exits concurrently?
|
||||||
|
if (mi.machine->state->currentJobs >= mi.machine->maxJobs) continue;
|
||||||
|
|
||||||
|
auto runnable_(runnable.lock());
|
||||||
|
//printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size());
|
||||||
|
|
||||||
|
/* FIXME: we're holding the runnable lock too long
|
||||||
|
here. This could be more efficient. */
|
||||||
|
|
||||||
|
for (auto i = runnable_->begin(); i != runnable_->end(); ) {
|
||||||
|
auto step = i->lock();
|
||||||
|
|
||||||
|
/* Delete dead steps. */
|
||||||
|
if (!step) {
|
||||||
|
i = runnable_->erase(i);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Can this machine do this step? */
|
||||||
|
if (!mi.machine->supportsStep(step)) {
|
||||||
|
++i;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Skip previously failed steps that aren't ready
|
||||||
|
to be retried. */
|
||||||
|
{
|
||||||
|
auto step_(step->state.lock());
|
||||||
|
if (step_->tries > 0 && step_->after > now) {
|
||||||
|
if (step_->after < sleepUntil)
|
||||||
|
sleepUntil = step_->after;
|
||||||
|
++i;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Make a slot reservation and start a thread to
|
||||||
|
do the build. */
|
||||||
|
auto reservation = std::make_shared<MaintainCount>(mi.machine->state->currentJobs);
|
||||||
|
i = runnable_->erase(i);
|
||||||
|
|
||||||
|
auto builderThread = std::thread(&State::builder, this, step, mi.machine, reservation);
|
||||||
|
builderThread.detach(); // FIXME?
|
||||||
|
|
||||||
|
keepGoing = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (keepGoing) break;
|
||||||
|
}
|
||||||
|
|
||||||
|
} while (keepGoing);
|
||||||
|
|
||||||
|
/* Sleep until we're woken up (either because a runnable build
|
||||||
|
is added, or because a build finishes). */
|
||||||
|
{
|
||||||
|
std::unique_lock<std::mutex> lock(dispatcherMutex);
|
||||||
|
printMsg(lvlDebug, format("dispatcher sleeping for %1%s") %
|
||||||
|
std::chrono::duration_cast<std::chrono::seconds>(sleepUntil - std::chrono::system_clock::now()).count());
|
||||||
|
dispatcherWakeup.wait_until(lock, sleepUntil);
|
||||||
|
nrDispatcherWakeups++;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
printMsg(lvlError, "dispatcher exits");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void State::wakeDispatcher()
|
||||||
|
{
|
||||||
|
{ std::lock_guard<std::mutex> lock(dispatcherMutex); } // barrier
|
||||||
|
dispatcherWakeup.notify_one();
|
||||||
|
}
|
|
@ -1,14 +1,12 @@
|
||||||
#include <iostream>
|
#include <iostream>
|
||||||
#include <thread>
|
#include <thread>
|
||||||
#include <cmath>
|
|
||||||
#include <algorithm>
|
|
||||||
|
|
||||||
#include <sys/types.h>
|
#include <sys/types.h>
|
||||||
#include <sys/stat.h>
|
#include <sys/stat.h>
|
||||||
#include <fcntl.h>
|
#include <fcntl.h>
|
||||||
|
|
||||||
#include "build-result.hh"
|
|
||||||
#include "state.hh"
|
#include "state.hh"
|
||||||
|
#include "build-result.hh"
|
||||||
|
|
||||||
#include "shared.hh"
|
#include "shared.hh"
|
||||||
#include "globals.hh"
|
#include "globals.hh"
|
||||||
|
@ -17,20 +15,6 @@
|
||||||
using namespace nix;
|
using namespace nix;
|
||||||
|
|
||||||
|
|
||||||
// FIXME: Make configurable.
|
|
||||||
const unsigned int maxTries = 5; // retry limit: doBuildStep() asks for a retry only while tries + 1 < maxTries
|
|
||||||
const unsigned int retryInterval = 60; // seconds; base delay used by State::builder() before a retried step may run again
|
|
||||||
const float retryBackoff = 3.0; // State::builder() multiplies retryInterval by retryBackoff^(tries - 1)
|
|
||||||
const unsigned int maxParallelCopyClosure = 4; // NOTE(review): not referenced in the visible code; presumably limits concurrent closure copies -- confirm
|
|
||||||
|
|
||||||
|
|
||||||
/* Return true if container ‘c’ contains ‘v’, as determined by the
   container's own find() member. */
template <class C, class V>
bool has(const C & c, const V & v)
{
    const auto pos = c.find(v);
    return !(pos == c.end());
}
|
|
||||||
|
|
||||||
|
|
||||||
State::State()
|
State::State()
|
||||||
{
|
{
|
||||||
hydraData = getEnv("HYDRA_DATA");
|
hydraData = getEnv("HYDRA_DATA");
|
||||||
|
@ -186,371 +170,6 @@ void State::finishBuildStep(pqxx::work & txn, time_t startTime, time_t stopTime,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void State::queueMonitor()
|
|
||||||
{
|
|
||||||
while (true) {
|
|
||||||
try {
|
|
||||||
queueMonitorLoop();
|
|
||||||
} catch (std::exception & e) {
|
|
||||||
printMsg(lvlError, format("queue monitor: %1%") % e.what());
|
|
||||||
sleep(10); // probably a DB problem, so don't retry right away
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void State::queueMonitorLoop()
|
|
||||||
{
|
|
||||||
auto conn(dbPool.get());
|
|
||||||
|
|
||||||
receiver buildsAdded(*conn, "builds_added");
|
|
||||||
receiver buildsRestarted(*conn, "builds_restarted");
|
|
||||||
receiver buildsCancelled(*conn, "builds_cancelled");
|
|
||||||
receiver buildsDeleted(*conn, "builds_deleted");
|
|
||||||
|
|
||||||
auto store = openStore(); // FIXME: pool
|
|
||||||
|
|
||||||
unsigned int lastBuildId = 0;
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
getQueuedBuilds(*conn, store, lastBuildId);
|
|
||||||
|
|
||||||
/* Sleep until we get notification from the database about an
|
|
||||||
event. */
|
|
||||||
conn->await_notification();
|
|
||||||
nrQueueWakeups++;
|
|
||||||
|
|
||||||
if (buildsAdded.get())
|
|
||||||
printMsg(lvlTalkative, "got notification: new builds added to the queue");
|
|
||||||
if (buildsRestarted.get()) {
|
|
||||||
printMsg(lvlTalkative, "got notification: builds restarted");
|
|
||||||
lastBuildId = 0; // check all builds
|
|
||||||
}
|
|
||||||
if (buildsCancelled.get() || buildsDeleted.get()) {
|
|
||||||
printMsg(lvlTalkative, "got notification: builds cancelled");
|
|
||||||
removeCancelledBuilds(*conn);
|
|
||||||
}
|
|
||||||
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* Load unfinished builds with an ID greater than ‘lastBuildId’ from
   the database, create the corresponding Build and Step objects, and
   make the resulting runnable steps available to the dispatcher.
   ‘lastBuildId’ is advanced to the highest build ID seen. */
void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store, unsigned int & lastBuildId)
{
    printMsg(lvlInfo, format("checking the queue for builds > %1%...") % lastBuildId);

    /* Fetch the queued builds from the database first and process
       them afterwards, so that the transaction stays short. */
    std::multimap<Path, Build::ptr> newBuilds;

    {
        pqxx::work txn(conn);

        auto res = txn.parameterized("select id, project, jobset, job, drvPath, maxsilent, timeout from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec();

        for (auto const & row : res) {
            auto builds_(builds.lock());
            BuildID id = row["id"].as<BuildID>();
            if (buildOne && id != buildOne) continue;
            if (id > lastBuildId) lastBuildId = id;
            if (has(*builds_, id)) continue;

            auto build = std::make_shared<Build>();
            build->id = id;
            build->drvPath = row["drvPath"].as<string>();
            build->fullJobName = row["project"].as<string>() + ":" + row["jobset"].as<string>() + ":" + row["job"].as<string>();
            build->maxSilentTime = row["maxsilent"].as<int>();
            build->buildTimeout = row["timeout"].as<int>();

            newBuilds.emplace(std::make_pair(build->drvPath, build));
        }
    }

    std::set<Step::ptr> newRunnable;
    unsigned int nrAdded;
    std::function<void(Build::ptr)> createBuild;

    /* Instantiate one build. The lambda calls itself recursively for
       queued builds whose top-level derivation is one of the steps
       that were just created. */
    createBuild = [&](Build::ptr build) {
        printMsg(lvlTalkative, format("loading build %1% (%2%)") % build->id % build->fullJobName);
        nrAdded++;

        if (!store->isValidPath(build->drvPath)) {
            /* Derivation has been GC'ed prematurely. */
            printMsg(lvlError, format("aborting GC'ed build %1%") % build->id);
            if (!build->finishedInDB) {
                pqxx::work txn(conn);
                txn.parameterized
                    ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1 and finished = 0")
                    (build->id)
                    ((int) bsAborted)
                    (time(0))
                    ("derivation was garbage-collected prior to build").exec();
                txn.commit();
                build->finishedInDB = true;
                nrBuildsDone++;
            }
            return;
        }

        std::set<Step::ptr> newSteps;
        std::set<Path> finishedDrvs; // FIXME: re-use?
        Step::ptr step = createStep(store, build->drvPath, build, 0, finishedDrvs, newSteps, newRunnable);

        /* Some of the new steps may be the top level of builds that
           we haven't processed yet. So do them now. This ensures that
           if build A depends on build B with top-level step X, then X
           will be "accounted" to B in doBuildStep(). */
        for (auto & newStep : newSteps) {
            while (true) {
                auto i = newBuilds.find(newStep->drvPath);
                if (i == newBuilds.end()) break;
                Build::ptr b = i->second;
                newBuilds.erase(i);
                createBuild(b);
            }
        }

        /* If we didn't get a step, it means the step's outputs are
           all valid. So we mark this as a finished, cached build. */
        if (!step) {
            Derivation drv = readDerivation(build->drvPath);
            BuildOutput res = getBuildOutput(store, drv);

            pqxx::work txn(conn);
            time_t now = time(0);
            markSucceededBuild(txn, build, res, true, now, now);
            txn.commit();

            build->finishedInDB = true;

            return;
        }

        /* If any step has an unsupported system type or has a
           previously failed output path, then fail the build right
           away. */
        bool badStep = false;
        for (auto & newStep : newSteps) {
            BuildStatus buildStatus = bsSuccess;
            BuildStepStatus buildStepStatus = bssFailed;

            if (checkCachedFailure(newStep, conn)) {
                printMsg(lvlError, format("marking build %1% as cached failure") % build->id);
                buildStatus = step == newStep ? bsFailed : bsDepFailed;
                buildStepStatus = bssFailed;
            }

            if (buildStatus == bsSuccess) {
                bool supported = false;
                {
                    auto machines_(machines.lock()); // FIXME: use shared_mutex
                    for (auto & m : *machines_)
                        if (m.second->supportsStep(newStep)) { supported = true; break; }
                }

                if (!supported) {
                    printMsg(lvlError, format("aborting unsupported build %1%") % build->id);
                    buildStatus = bsUnsupported;
                    buildStepStatus = bssUnsupported;
                }
            }

            if (buildStatus != bsSuccess) {
                time_t now = time(0);
                if (!build->finishedInDB) {
                    pqxx::work txn(conn);
                    createBuildStep(txn, 0, build, newStep, "", buildStepStatus);
                    txn.parameterized
                        ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = $4 where id = $1 and finished = 0")
                        (build->id)
                        ((int) buildStatus)
                        (now)
                        (buildStatus != bsUnsupported ? 1 : 0).exec();
                    txn.commit();
                    build->finishedInDB = true;
                    nrBuildsDone++;
                }
                badStep = true;
                break;
            }
        }

        if (badStep) return;

        /* Note: if we exit this scope prior to this, the build and
           all newly created steps are destroyed. */

        {
            auto builds_(builds.lock());
            if (!build->finishedInDB) // FIXME: can this happen?
                (*builds_)[build->id] = build;
            build->toplevel = step;
        }

        printMsg(lvlChatty, format("added build %1% (top-level step %2%, %3% new steps)")
            % build->id % step->drvPath % newSteps.size());
    };

    /* Now instantiate build steps for each new build. The builder
       threads can start building the runnable build steps right away,
       even while we're still processing other new builds. */
    while (!newBuilds.empty()) {
        auto build = newBuilds.begin()->second;
        newBuilds.erase(newBuilds.begin());

        newRunnable.clear();
        nrAdded = 0;
        try {
            createBuild(build);
        } catch (Error & e) {
            e.addPrefix(format("while loading build %1%: ") % build->id);
            throw;
        }

        /* Add the new runnable build steps to ‘runnable’ and wake up
           the builder threads. */
        printMsg(lvlChatty, format("got %1% new runnable steps from %2% new builds") % newRunnable.size() % nrAdded);
        for (auto & r : newRunnable)
            makeRunnable(r);

        nrBuildsRead += nrAdded;
    }
}
|
|
||||||
|
|
||||||
|
|
||||||
/* Drop every entry from ‘builds’ whose ID is not among the builds
   that the database currently reports as unfinished. */
void State::removeCancelledBuilds(Connection & conn)
{
    /* Get the current set of queued builds. */
    std::set<BuildID> currentIds;
    {
        pqxx::work txn(conn);
        auto res = txn.exec("select id from Builds where finished = 0");
        for (auto const & row : res)
            currentIds.insert(row["id"].as<BuildID>());
    }

    auto builds_(builds.lock());

    auto i = builds_->begin();
    while (i != builds_->end()) {
        /* Still queued in the database: keep it. */
        if (currentIds.find(i->first) != currentIds.end()) {
            ++i;
            continue;
        }

        printMsg(lvlInfo, format("discarding cancelled build %1%") % i->first);
        i = builds_->erase(i);
        // FIXME: ideally we would interrupt active build steps here.
    }
}
|
|
||||||
|
|
||||||
|
|
||||||
/* Return the Step for derivation ‘drvPath’, creating it (and,
   recursively, the steps for its input derivations) if it does not
   exist yet.

   - ‘referringBuild’ / ‘referringStep’, if non-null, are registered
     on the step as a dependent build / reverse dependency.
   - ‘finishedDrvs’ collects derivations whose outputs are all valid;
     for those (now or on a later call) a null pointer is returned.
   - Every step that actually needs building is added to ‘newSteps’;
     those without remaining dependencies are added to ‘newRunnable’.

   For a step that already existed, the existing object is returned
   as soon as the referrer has been registered. */
Step::ptr State::createStep(std::shared_ptr<StoreAPI> store, const Path & drvPath,
    Build::ptr referringBuild, Step::ptr referringStep, std::set<Path> & finishedDrvs,
    std::set<Step::ptr> & newSteps, std::set<Step::ptr> & newRunnable)
{
    /* Already known to be fully built: nothing to do. */
    if (finishedDrvs.count(drvPath)) return nullptr;

    /* Check if the requested step already exists. If not, create a
       new step. In any case, make the step reachable from
       referringBuild or referringStep. This is done atomically (with
       ‘steps’ locked), to ensure that this step can never become
       reachable from a new build after doBuildStep has removed it
       from ‘steps’. */
    Step::ptr step;
    bool isNew = false;

    {
        auto steps_(steps.lock());

        /* Look for a live entry in ‘steps’. The map holds weak
           pointers, so an entry may refer to a Step that is gone. */
        auto existing = steps_->find(drvPath);
        if (existing != steps_->end()) {
            /* Once ‘step’ holds a strong pointer, the Step object
               cannot be deleted underneath us. */
            step = existing->second.lock();
            if (!step) steps_->erase(existing); // remove stale entry
        }

        /* Nothing usable found: make a fresh step. */
        if (!step) {
            step = std::make_shared<Step>();
            step->drvPath = drvPath;
            isNew = true;
        }

        auto stepState(step->state.lock());

        /* A new step must not be marked created yet; a pre-existing
           one must be. */
        assert(stepState->created != isNew);

        if (referringBuild)
            stepState->builds.push_back(referringBuild);

        if (referringStep)
            stepState->rdeps.push_back(referringStep);

        (*steps_)[drvPath] = step;
    }

    if (!isNew) return step;

    printMsg(lvlDebug, format("considering derivation ‘%1%’") % drvPath);

    /* Initialize the step. Note that the step may be visible in
       ‘steps’ before this point, but that doesn't matter because
       it's not runnable yet, and other threads won't make it
       runnable while step->created == false. */
    step->drv = readDerivation(drvPath);

    auto features = step->drv.env.find("requiredSystemFeatures");
    if (features != step->drv.env.end())
        step->requiredSystemFeatures = tokenizeString<std::set<std::string>>(features->second);

    auto preferLocal = step->drv.env.find("preferLocalBuild");
    step->preferLocalBuild =
        preferLocal != step->drv.env.end() && preferLocal->second == "1"
        && has(localPlatforms, step->drv.platform);

    /* Are all outputs valid? Stop at the first invalid one. */
    bool allOutputsValid = true;
    for (auto & output : step->drv.outputs)
        if (!store->isValidPath(output.second.path)) {
            allOutputsValid = false;
            break;
        }

    // FIXME: check whether all outputs are in the binary cache.
    if (allOutputsValid) {
        finishedDrvs.insert(drvPath);
        return nullptr;
    }

    /* No, we need to build. */
    printMsg(lvlDebug, format("creating build step ‘%1%’") % drvPath);
    newSteps.insert(step);

    /* Create steps for the dependencies. A null result means the
       input needs no step, so it is not recorded as a dependency. */
    for (auto & input : step->drv.inputDrvs) {
        auto dep = createStep(store, input.first, nullptr, step, finishedDrvs, newSteps, newRunnable);
        if (!dep) continue;
        auto stepState(step->state.lock());
        stepState->deps.insert(dep);
    }

    /* Mark the step as fully created. If the step has no (remaining)
       dependencies, make it runnable. */
    {
        auto stepState(step->state.lock());
        assert(!stepState->created);
        stepState->created = true;
        if (stepState->deps.empty())
            newRunnable.insert(step);
    }

    return step;
}
|
|
||||||
|
|
||||||
|
|
||||||
/* Get the steps and unfinished builds that depend on the given step. */
|
/* Get the steps and unfinished builds that depend on the given step. */
|
||||||
void getDependents(Step::ptr step, std::set<Build::ptr> & builds, std::set<Step::ptr> & steps)
|
void getDependents(Step::ptr step, std::set<Build::ptr> & builds, std::set<Step::ptr> & steps)
|
||||||
{
|
{
|
||||||
|
@ -585,527 +204,6 @@ void getDependents(Step::ptr step, std::set<Build::ptr> & builds, std::set<Step:
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
/* Append ‘step’ to the ‘runnable’ queue and wake up the dispatcher.
   The step must be fully created, unfinished and without remaining
   dependencies; this is checked with assertions. */
void State::makeRunnable(Step::ptr step)
{
    printMsg(lvlChatty, format("step ‘%1%’ is now runnable") % step->drvPath);

    /* Check the preconditions while holding the step's state lock. */
    {
        auto stepState(step->state.lock());
        assert(stepState->created);
        assert(!step->finished);
        assert(stepState->deps.empty());
    }

    /* Enqueue; the queue lock is released before the dispatcher is
       notified. */
    {
        auto queue(runnable.lock());
        queue->push_back(step);
    }

    wakeDispatcher();
}
|
|
||||||
|
|
||||||
|
|
||||||
void State::dispatcher()
|
|
||||||
{
|
|
||||||
while (true) {
|
|
||||||
printMsg(lvlDebug, "dispatcher woken up");
|
|
||||||
|
|
||||||
auto sleepUntil = system_time::max();
|
|
||||||
|
|
||||||
bool keepGoing;
|
|
||||||
|
|
||||||
do {
|
|
||||||
/* Copy the currentJobs field of each machine. This is
|
|
||||||
necessary to ensure that the sort comparator below is
|
|
||||||
an ordering. std::sort() can segfault if it isn't. */
|
|
||||||
struct MachineInfo
|
|
||||||
{
|
|
||||||
Machine::ptr machine;
|
|
||||||
unsigned int currentJobs;
|
|
||||||
};
|
|
||||||
std::vector<MachineInfo> machinesSorted;
|
|
||||||
{
|
|
||||||
auto machines_(machines.lock());
|
|
||||||
for (auto & m : *machines_)
|
|
||||||
machinesSorted.push_back({m.second, m.second->state->currentJobs});
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Sort the machines by a combination of speed factor and
|
|
||||||
available slots. Prioritise the available machines as
|
|
||||||
follows:
|
|
||||||
|
|
||||||
- First by load divided by speed factor, rounded to the
|
|
||||||
nearest integer. This causes fast machines to be
|
|
||||||
preferred over slow machines with similar loads.
|
|
||||||
|
|
||||||
- Then by speed factor.
|
|
||||||
|
|
||||||
- Finally by load. */
|
|
||||||
sort(machinesSorted.begin(), machinesSorted.end(),
|
|
||||||
[](const MachineInfo & a, const MachineInfo & b) -> bool
|
|
||||||
{
|
|
||||||
float ta = roundf(a.currentJobs / a.machine->speedFactor);
|
|
||||||
float tb = roundf(b.currentJobs / b.machine->speedFactor);
|
|
||||||
return
|
|
||||||
ta != tb ? ta < tb :
|
|
||||||
a.machine->speedFactor != b.machine->speedFactor ? a.machine->speedFactor > b.machine->speedFactor :
|
|
||||||
a.currentJobs > b.currentJobs;
|
|
||||||
});
|
|
||||||
|
|
||||||
/* Find a machine with a free slot and find a step to run
|
|
||||||
on it. Once we find such a pair, we restart the outer
|
|
||||||
loop because the machine sorting will have changed. */
|
|
||||||
keepGoing = false;
|
|
||||||
system_time now = std::chrono::system_clock::now();
|
|
||||||
|
|
||||||
for (auto & mi : machinesSorted) {
|
|
||||||
// FIXME: can we lose a wakeup if a builder exits concurrently?
|
|
||||||
if (mi.machine->state->currentJobs >= mi.machine->maxJobs) continue;
|
|
||||||
|
|
||||||
auto runnable_(runnable.lock());
|
|
||||||
//printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size());
|
|
||||||
|
|
||||||
/* FIXME: we're holding the runnable lock too long
|
|
||||||
here. This could be more efficient. */
|
|
||||||
|
|
||||||
for (auto i = runnable_->begin(); i != runnable_->end(); ) {
|
|
||||||
auto step = i->lock();
|
|
||||||
|
|
||||||
/* Delete dead steps. */
|
|
||||||
if (!step) {
|
|
||||||
i = runnable_->erase(i);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Can this machine do this step? */
|
|
||||||
if (!mi.machine->supportsStep(step)) {
|
|
||||||
++i;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Skip previously failed steps that aren't ready
|
|
||||||
to be retried. */
|
|
||||||
{
|
|
||||||
auto step_(step->state.lock());
|
|
||||||
if (step_->tries > 0 && step_->after > now) {
|
|
||||||
if (step_->after < sleepUntil)
|
|
||||||
sleepUntil = step_->after;
|
|
||||||
++i;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Make a slot reservation and start a thread to
|
|
||||||
do the build. */
|
|
||||||
auto reservation = std::make_shared<MaintainCount>(mi.machine->state->currentJobs);
|
|
||||||
i = runnable_->erase(i);
|
|
||||||
|
|
||||||
auto builderThread = std::thread(&State::builder, this, step, mi.machine, reservation);
|
|
||||||
builderThread.detach(); // FIXME?
|
|
||||||
|
|
||||||
keepGoing = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (keepGoing) break;
|
|
||||||
}
|
|
||||||
|
|
||||||
} while (keepGoing);
|
|
||||||
|
|
||||||
/* Sleep until we're woken up (either because a runnable build
|
|
||||||
is added, or because a build finishes). */
|
|
||||||
{
|
|
||||||
std::unique_lock<std::mutex> lock(dispatcherMutex);
|
|
||||||
printMsg(lvlDebug, format("dispatcher sleeping for %1%s") %
|
|
||||||
std::chrono::duration_cast<std::chrono::seconds>(sleepUntil - std::chrono::system_clock::now()).count());
|
|
||||||
dispatcherWakeup.wait_until(lock, sleepUntil);
|
|
||||||
nrDispatcherWakeups++;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
printMsg(lvlError, "dispatcher exits");
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void State::wakeDispatcher()
|
|
||||||
{
|
|
||||||
{ std::lock_guard<std::mutex> lock(dispatcherMutex); } // barrier
|
|
||||||
dispatcherWakeup.notify_one();
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
/* Body of a builder thread: run ‘step’ on ‘machine’ via
   doBuildStep(), release the machine slot held by ‘reservation’,
   and, if the step is to be retried, put it back in the runnable
   queue with an exponentially growing delay.

   Any std::exception escaping doBuildStep() is logged and treated
   as a reason to retry (‘retry’ stays true). */
void State::builder(Step::ptr step, Machine::ptr machine, std::shared_ptr<MaintainCount> reservation)
{
    bool retry = true;

    /* Counts this thread in nrActiveSteps for as long as it runs. */
    MaintainCount mc(nrActiveSteps);

    try {
        auto store = openStore(); // FIXME: pool
        retry = doBuildStep(store, step, machine);
    } catch (const std::exception & e) {
        printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")
            % step->drvPath % machine->sshName % e.what());
    }

    /* Release the machine and wake up the dispatcher. The assertion
       expects that nobody else (in particular the dispatcher) still
       holds a copy of the reservation at this point.
       use_count() == 1 replaces shared_ptr::unique(), which is
       deprecated since C++17 and removed in C++20. */
    assert(reservation.use_count() == 1);
    reservation.reset();
    wakeDispatcher();

    /* If there was a temporary failure, retry the step after an
       exponentially increasing interval. */
    if (retry) {
        {
            auto step_(step->state.lock());
            step_->tries++;
            nrRetries++;
            if (step_->tries > maxNrRetries) maxNrRetries = step_->tries; // yeah yeah, not atomic
            /* retryInterval * retryBackoff^(tries - 1), truncated to
               whole seconds. */
            int delta = retryInterval * powf(retryBackoff, step_->tries - 1);
            printMsg(lvlInfo, format("will retry ‘%1%’ after %2%s") % step->drvPath % delta);
            step_->after = std::chrono::system_clock::now() + std::chrono::seconds(delta);
        }

        makeRunnable(step);
    }
}
|
|
||||||
|
|
||||||
|
|
||||||
bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
|
|
||||||
Machine::ptr machine)
|
|
||||||
{
|
|
||||||
{
|
|
||||||
auto step_(step->state.lock());
|
|
||||||
assert(step_->created);
|
|
||||||
assert(!step->finished);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* There can be any number of builds in the database that depend
|
|
||||||
on this derivation. Arbitrarily pick one (though preferring a
|
|
||||||
build of which this is the top-level derivation) for the
|
|
||||||
purpose of creating build steps. We could create a build step
|
|
||||||
record for every build, but that could be very expensive
|
|
||||||
(e.g. a stdenv derivation can be a dependency of tens of
|
|
||||||
thousands of builds), so we don't. */
|
|
||||||
Build::ptr build;
|
|
||||||
|
|
||||||
{
|
|
||||||
std::set<Build::ptr> dependents;
|
|
||||||
std::set<Step::ptr> steps;
|
|
||||||
getDependents(step, dependents, steps);
|
|
||||||
|
|
||||||
if (dependents.empty()) {
|
|
||||||
/* Apparently all builds that depend on this derivation
|
|
||||||
are gone (e.g. cancelled). So don't bother. This is
|
|
||||||
very unlikely to happen, because normally Steps are
|
|
||||||
only kept alive by being reachable from a
|
|
||||||
Build. However, it's possible that a new Build just
|
|
||||||
created a reference to this step. So to handle that
|
|
||||||
possibility, we retry this step (putting it back in
|
|
||||||
the runnable queue). If there are really no strong
|
|
||||||
pointers to the step, it will be deleted. */
|
|
||||||
printMsg(lvlInfo, format("maybe cancelling build step ‘%1%’") % step->drvPath);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
for (auto build2 : dependents)
|
|
||||||
if (build2->drvPath == step->drvPath) { build = build2; break; }
|
|
||||||
|
|
||||||
if (!build) build = *dependents.begin();
|
|
||||||
|
|
||||||
printMsg(lvlInfo, format("performing step ‘%1%’ on ‘%2%’ (needed by build %3% and %4% others)")
|
|
||||||
% step->drvPath % machine->sshName % build->id % (dependents.size() - 1));
|
|
||||||
}
|
|
||||||
|
|
||||||
bool quit = build->id == buildOne;
|
|
||||||
|
|
||||||
auto conn(dbPool.get());
|
|
||||||
|
|
||||||
RemoteResult result;
|
|
||||||
BuildOutput res;
|
|
||||||
int stepNr = 0;
|
|
||||||
|
|
||||||
time_t stepStartTime = result.startTime = time(0);
|
|
||||||
|
|
||||||
/* If any of the outputs have previously failed, then don't bother
|
|
||||||
building again. */
|
|
||||||
bool cachedFailure = checkCachedFailure(step, *conn);
|
|
||||||
|
|
||||||
if (cachedFailure)
|
|
||||||
result.status = BuildResult::CachedFailure;
|
|
||||||
else {
|
|
||||||
|
|
||||||
/* Create a build step record indicating that we started
|
|
||||||
building. Also, mark the selected build as busy. */
|
|
||||||
{
|
|
||||||
pqxx::work txn(*conn);
|
|
||||||
stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bssBusy);
|
|
||||||
txn.parameterized("update Builds set busy = 1 where id = $1")(build->id).exec();
|
|
||||||
txn.commit();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Do the build. */
|
|
||||||
try {
|
|
||||||
/* FIXME: referring builds may have conflicting timeouts. */
|
|
||||||
buildRemote(store, machine, step, build->maxSilentTime, build->buildTimeout, result);
|
|
||||||
} catch (Error & e) {
|
|
||||||
result.status = BuildResult::MiscFailure;
|
|
||||||
result.errorMsg = e.msg();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (result.success()) res = getBuildOutput(store, step->drv);
|
|
||||||
}
|
|
||||||
|
|
||||||
time_t stepStopTime = time(0);
|
|
||||||
if (!result.stopTime) result.stopTime = stepStopTime;
|
|
||||||
|
|
||||||
/* Asynchronously compress the log. */
|
|
||||||
if (result.logFile != "") {
|
|
||||||
{
|
|
||||||
auto logCompressorQueue_(logCompressorQueue.lock());
|
|
||||||
logCompressorQueue_->push(result.logFile);
|
|
||||||
}
|
|
||||||
logCompressorWakeup.notify_one();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* The step had a hopefully temporary failure (e.g. network
|
|
||||||
issue). Retry a number of times. */
|
|
||||||
if (result.canRetry()) {
|
|
||||||
printMsg(lvlError, format("possibly transient failure building ‘%1%’ on ‘%2%’: %3%")
|
|
||||||
% step->drvPath % machine->sshName % result.errorMsg);
|
|
||||||
bool retry;
|
|
||||||
{
|
|
||||||
auto step_(step->state.lock());
|
|
||||||
retry = step_->tries + 1 < maxTries;
|
|
||||||
}
|
|
||||||
if (retry) {
|
|
||||||
pqxx::work txn(*conn);
|
|
||||||
finishBuildStep(txn, result.startTime, result.stopTime, build->id,
|
|
||||||
stepNr, machine->sshName, bssAborted, result.errorMsg);
|
|
||||||
txn.commit();
|
|
||||||
if (quit) exit(1);
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (result.success()) {
|
|
||||||
|
|
||||||
/* Register success in the database for all Build objects that
|
|
||||||
have this step as the top-level step. Since the queue
|
|
||||||
monitor thread may be creating new referring Builds
|
|
||||||
concurrently, and updating the database may fail, we do
|
|
||||||
this in a loop, marking all known builds, repeating until
|
|
||||||
there are no unmarked builds.
|
|
||||||
*/
|
|
||||||
|
|
||||||
std::vector<BuildID> buildIDs;
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
|
|
||||||
/* Get the builds that have this one as the top-level. */
|
|
||||||
std::vector<Build::ptr> direct;
|
|
||||||
{
|
|
||||||
auto steps_(steps.lock());
|
|
||||||
auto step_(step->state.lock());
|
|
||||||
|
|
||||||
for (auto & b_ : step_->builds) {
|
|
||||||
auto b = b_.lock();
|
|
||||||
if (b && !b->finishedInDB) direct.push_back(b);
|
|
||||||
}
|
|
||||||
|
|
||||||
/* If there are no builds left to update in the DB,
|
|
||||||
then we're done (except for calling
|
|
||||||
finishBuildStep()). Delete the step from
|
|
||||||
‘steps’. Since we've been holding the ‘steps’ lock,
|
|
||||||
no new referrers can have been added in the
|
|
||||||
meantime or be added afterwards. */
|
|
||||||
if (direct.empty()) {
|
|
||||||
printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath);
|
|
||||||
steps_->erase(step->drvPath);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Update the database. */
|
|
||||||
{
|
|
||||||
pqxx::work txn(*conn);
|
|
||||||
|
|
||||||
finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssSuccess);
|
|
||||||
|
|
||||||
for (auto & b : direct)
|
|
||||||
markSucceededBuild(txn, b, res, build != b || result.status != BuildResult::Built,
|
|
||||||
result.startTime, result.stopTime);
|
|
||||||
|
|
||||||
txn.commit();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (direct.empty()) break;
|
|
||||||
|
|
||||||
/* Remove the direct dependencies from ‘builds’. This will
|
|
||||||
cause them to be destroyed. */
|
|
||||||
for (auto & b : direct) {
|
|
||||||
auto builds_(builds.lock());
|
|
||||||
b->finishedInDB = true;
|
|
||||||
builds_->erase(b->id);
|
|
||||||
buildIDs.push_back(b->id);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Send notification about the builds that have this step as
|
|
||||||
the top-level. */
|
|
||||||
for (auto id : buildIDs) {
|
|
||||||
{
|
|
||||||
auto notificationSenderQueue_(notificationSenderQueue.lock());
|
|
||||||
notificationSenderQueue_->push(NotificationItem(id, std::vector<BuildID>()));
|
|
||||||
}
|
|
||||||
notificationSenderWakeup.notify_one();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Wake up any dependent steps that have no other
|
|
||||||
dependencies. */
|
|
||||||
{
|
|
||||||
auto step_(step->state.lock());
|
|
||||||
for (auto & rdepWeak : step_->rdeps) {
|
|
||||||
auto rdep = rdepWeak.lock();
|
|
||||||
if (!rdep) continue;
|
|
||||||
|
|
||||||
bool runnable = false;
|
|
||||||
{
|
|
||||||
auto rdep_(rdep->state.lock());
|
|
||||||
rdep_->deps.erase(step);
|
|
||||||
/* Note: if the step has not finished
|
|
||||||
initialisation yet, it will be made runnable in
|
|
||||||
createStep(), if appropriate. */
|
|
||||||
if (rdep_->deps.empty() && rdep_->created) runnable = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (runnable) makeRunnable(rdep);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
} else {
|
|
||||||
|
|
||||||
/* Register failure in the database for all Build objects that
|
|
||||||
directly or indirectly depend on this step. */
|
|
||||||
|
|
||||||
std::vector<BuildID> dependentIDs;
|
|
||||||
|
|
||||||
while (true) {
|
|
||||||
|
|
||||||
/* Get the builds and steps that depend on this step. */
|
|
||||||
std::set<Build::ptr> indirect;
|
|
||||||
{
|
|
||||||
auto steps_(steps.lock());
|
|
||||||
std::set<Step::ptr> steps;
|
|
||||||
getDependents(step, indirect, steps);
|
|
||||||
|
|
||||||
/* If there are no builds left, delete all referring
|
|
||||||
steps from ‘steps’. As for the success case, we can
|
|
||||||
be certain no new referrers can be added. */
|
|
||||||
if (indirect.empty()) {
|
|
||||||
for (auto & s : steps) {
|
|
||||||
printMsg(lvlDebug, format("finishing build step ‘%1%’") % s->drvPath);
|
|
||||||
steps_->erase(s->drvPath);
|
|
||||||
}
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Update the database. */
|
|
||||||
{
|
|
||||||
pqxx::work txn(*conn);
|
|
||||||
|
|
||||||
BuildStatus buildStatus =
|
|
||||||
result.status == BuildResult::TimedOut ? bsTimedOut :
|
|
||||||
result.canRetry() ? bsAborted :
|
|
||||||
bsFailed;
|
|
||||||
BuildStepStatus buildStepStatus =
|
|
||||||
result.status == BuildResult::TimedOut ? bssTimedOut :
|
|
||||||
result.canRetry() ? bssAborted :
|
|
||||||
bssFailed;
|
|
||||||
|
|
||||||
/* For standard failures, we don't care about the error
|
|
||||||
message. */
|
|
||||||
if (result.status == BuildResult::PermanentFailure ||
|
|
||||||
result.status == BuildResult::TransientFailure ||
|
|
||||||
result.status == BuildResult::CachedFailure ||
|
|
||||||
result.status == BuildResult::TimedOut)
|
|
||||||
result.errorMsg = "";
|
|
||||||
|
|
||||||
/* Create failed build steps for every build that depends
|
|
||||||
on this. For cached failures, only create a step for
|
|
||||||
builds that don't have this step as top-level
|
|
||||||
(otherwise the user won't be able to see what caused
|
|
||||||
the build to fail). */
|
|
||||||
for (auto & build2 : indirect) {
|
|
||||||
if ((cachedFailure && build2->drvPath == step->drvPath) ||
|
|
||||||
(!cachedFailure && build == build2) ||
|
|
||||||
build2->finishedInDB)
|
|
||||||
continue;
|
|
||||||
createBuildStep(txn, 0, build2, step, machine->sshName,
|
|
||||||
buildStepStatus, result.errorMsg, build == build2 ? 0 : build->id);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!cachedFailure)
|
|
||||||
finishBuildStep(txn, result.startTime, result.stopTime, build->id,
|
|
||||||
stepNr, machine->sshName, buildStepStatus, result.errorMsg);
|
|
||||||
|
|
||||||
/* Mark all builds that depend on this derivation as failed. */
|
|
||||||
for (auto & build2 : indirect) {
|
|
||||||
if (build2->finishedInDB) continue;
|
|
||||||
printMsg(lvlError, format("marking build %1% as failed") % build2->id);
|
|
||||||
txn.parameterized
|
|
||||||
("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1 and finished = 0")
|
|
||||||
(build2->id)
|
|
||||||
((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus))
|
|
||||||
(result.startTime)
|
|
||||||
(result.stopTime)
|
|
||||||
(cachedFailure ? 1 : 0).exec();
|
|
||||||
nrBuildsDone++;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Remember failed paths in the database so that they
|
|
||||||
won't be built again. */
|
|
||||||
if (!cachedFailure && result.status == BuildResult::PermanentFailure)
|
|
||||||
for (auto & path : outputPaths(step->drv))
|
|
||||||
txn.parameterized("insert into FailedPaths values ($1)")(path).exec();
|
|
||||||
|
|
||||||
txn.commit();
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Remove the indirect dependencies from ‘builds’. This
|
|
||||||
will cause them to be destroyed. */
|
|
||||||
for (auto & b : indirect) {
|
|
||||||
auto builds_(builds.lock());
|
|
||||||
b->finishedInDB = true;
|
|
||||||
builds_->erase(b->id);
|
|
||||||
dependentIDs.push_back(b->id);
|
|
||||||
if (buildOne == b->id) quit = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Send notification about this build and its dependents. */
|
|
||||||
{
|
|
||||||
auto notificationSenderQueue_(notificationSenderQueue.lock());
|
|
||||||
notificationSenderQueue_->push(NotificationItem(build->id, dependentIDs));
|
|
||||||
}
|
|
||||||
notificationSenderWakeup.notify_one();
|
|
||||||
|
|
||||||
}
|
|
||||||
|
|
||||||
// FIXME: keep stats about aborted steps?
|
|
||||||
nrStepsDone++;
|
|
||||||
totalStepTime += stepStopTime - stepStartTime;
|
|
||||||
totalStepBuildTime += result.stopTime - result.startTime;
|
|
||||||
machine->state->nrStepsDone++;
|
|
||||||
machine->state->totalStepTime += stepStopTime - stepStartTime;
|
|
||||||
machine->state->totalStepBuildTime += result.stopTime - result.startTime;
|
|
||||||
|
|
||||||
if (quit) exit(0); // testing hack
|
|
||||||
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
void State::markSucceededBuild(pqxx::work & txn, Build::ptr build,
|
void State::markSucceededBuild(pqxx::work & txn, Build::ptr build,
|
||||||
const BuildOutput & res, bool isCachedBuild, time_t startTime, time_t stopTime)
|
const BuildOutput & res, bool isCachedBuild, time_t startTime, time_t stopTime)
|
||||||
{
|
{
|
||||||
|
|
369
src/hydra-queue-runner/queue-monitor.cc
Normal file
369
src/hydra-queue-runner/queue-monitor.cc
Normal file
|
@ -0,0 +1,369 @@
|
||||||
|
#include "state.hh"
|
||||||
|
#include "build-result.hh"
|
||||||
|
|
||||||
|
using namespace nix;
|
||||||
|
|
||||||
|
|
||||||
|
void State::queueMonitor()
|
||||||
|
{
|
||||||
|
while (true) {
|
||||||
|
try {
|
||||||
|
queueMonitorLoop();
|
||||||
|
} catch (std::exception & e) {
|
||||||
|
printMsg(lvlError, format("queue monitor: %1%") % e.what());
|
||||||
|
sleep(10); // probably a DB problem, so don't retry right away
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void State::queueMonitorLoop()
|
||||||
|
{
|
||||||
|
auto conn(dbPool.get());
|
||||||
|
|
||||||
|
receiver buildsAdded(*conn, "builds_added");
|
||||||
|
receiver buildsRestarted(*conn, "builds_restarted");
|
||||||
|
receiver buildsCancelled(*conn, "builds_cancelled");
|
||||||
|
receiver buildsDeleted(*conn, "builds_deleted");
|
||||||
|
|
||||||
|
auto store = openStore(); // FIXME: pool
|
||||||
|
|
||||||
|
unsigned int lastBuildId = 0;
|
||||||
|
|
||||||
|
while (true) {
|
||||||
|
getQueuedBuilds(*conn, store, lastBuildId);
|
||||||
|
|
||||||
|
/* Sleep until we get notification from the database about an
|
||||||
|
event. */
|
||||||
|
conn->await_notification();
|
||||||
|
nrQueueWakeups++;
|
||||||
|
|
||||||
|
if (buildsAdded.get())
|
||||||
|
printMsg(lvlTalkative, "got notification: new builds added to the queue");
|
||||||
|
if (buildsRestarted.get()) {
|
||||||
|
printMsg(lvlTalkative, "got notification: builds restarted");
|
||||||
|
lastBuildId = 0; // check all builds
|
||||||
|
}
|
||||||
|
if (buildsCancelled.get() || buildsDeleted.get()) {
|
||||||
|
printMsg(lvlTalkative, "got notification: builds cancelled");
|
||||||
|
removeCancelledBuilds(*conn);
|
||||||
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store, unsigned int & lastBuildId)
|
||||||
|
{
|
||||||
|
printMsg(lvlInfo, format("checking the queue for builds > %1%...") % lastBuildId);
|
||||||
|
|
||||||
|
/* Grab the queued builds from the database, but don't process
|
||||||
|
them yet (since we don't want a long-running transaction). */
|
||||||
|
std::multimap<Path, Build::ptr> newBuilds;
|
||||||
|
|
||||||
|
{
|
||||||
|
pqxx::work txn(conn);
|
||||||
|
|
||||||
|
auto res = txn.parameterized("select id, project, jobset, job, drvPath, maxsilent, timeout from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec();
|
||||||
|
|
||||||
|
for (auto const & row : res) {
|
||||||
|
auto builds_(builds.lock());
|
||||||
|
BuildID id = row["id"].as<BuildID>();
|
||||||
|
if (buildOne && id != buildOne) continue;
|
||||||
|
if (id > lastBuildId) lastBuildId = id;
|
||||||
|
if (has(*builds_, id)) continue;
|
||||||
|
|
||||||
|
auto build = std::make_shared<Build>();
|
||||||
|
build->id = id;
|
||||||
|
build->drvPath = row["drvPath"].as<string>();
|
||||||
|
build->fullJobName = row["project"].as<string>() + ":" + row["jobset"].as<string>() + ":" + row["job"].as<string>();
|
||||||
|
build->maxSilentTime = row["maxsilent"].as<int>();
|
||||||
|
build->buildTimeout = row["timeout"].as<int>();
|
||||||
|
|
||||||
|
newBuilds.emplace(std::make_pair(build->drvPath, build));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
std::set<Step::ptr> newRunnable;
|
||||||
|
unsigned int nrAdded;
|
||||||
|
std::function<void(Build::ptr)> createBuild;
|
||||||
|
|
||||||
|
createBuild = [&](Build::ptr build) {
|
||||||
|
printMsg(lvlTalkative, format("loading build %1% (%2%)") % build->id % build->fullJobName);
|
||||||
|
nrAdded++;
|
||||||
|
|
||||||
|
if (!store->isValidPath(build->drvPath)) {
|
||||||
|
/* Derivation has been GC'ed prematurely. */
|
||||||
|
printMsg(lvlError, format("aborting GC'ed build %1%") % build->id);
|
||||||
|
if (!build->finishedInDB) {
|
||||||
|
pqxx::work txn(conn);
|
||||||
|
txn.parameterized
|
||||||
|
("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1 and finished = 0")
|
||||||
|
(build->id)
|
||||||
|
((int) bsAborted)
|
||||||
|
(time(0))
|
||||||
|
("derivation was garbage-collected prior to build").exec();
|
||||||
|
txn.commit();
|
||||||
|
build->finishedInDB = true;
|
||||||
|
nrBuildsDone++;
|
||||||
|
}
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
std::set<Step::ptr> newSteps;
|
||||||
|
std::set<Path> finishedDrvs; // FIXME: re-use?
|
||||||
|
Step::ptr step = createStep(store, build->drvPath, build, 0, finishedDrvs, newSteps, newRunnable);
|
||||||
|
|
||||||
|
/* Some of the new steps may be the top level of builds that
|
||||||
|
we haven't processed yet. So do them now. This ensures that
|
||||||
|
if build A depends on build B with top-level step X, then X
|
||||||
|
will be "accounted" to B in doBuildStep(). */
|
||||||
|
for (auto & r : newSteps) {
|
||||||
|
while (true) {
|
||||||
|
auto i = newBuilds.find(r->drvPath);
|
||||||
|
if (i == newBuilds.end()) break;
|
||||||
|
Build::ptr b = i->second;
|
||||||
|
newBuilds.erase(i);
|
||||||
|
createBuild(b);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If we didn't get a step, it means the step's outputs are
|
||||||
|
all valid. So we mark this as a finished, cached build. */
|
||||||
|
if (!step) {
|
||||||
|
Derivation drv = readDerivation(build->drvPath);
|
||||||
|
BuildOutput res = getBuildOutput(store, drv);
|
||||||
|
|
||||||
|
pqxx::work txn(conn);
|
||||||
|
time_t now = time(0);
|
||||||
|
markSucceededBuild(txn, build, res, true, now, now);
|
||||||
|
txn.commit();
|
||||||
|
|
||||||
|
build->finishedInDB = true;
|
||||||
|
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If any step has an unsupported system type or has a
|
||||||
|
previously failed output path, then fail the build right
|
||||||
|
away. */
|
||||||
|
bool badStep = false;
|
||||||
|
for (auto & r : newSteps) {
|
||||||
|
BuildStatus buildStatus = bsSuccess;
|
||||||
|
BuildStepStatus buildStepStatus = bssFailed;
|
||||||
|
|
||||||
|
if (checkCachedFailure(r, conn)) {
|
||||||
|
printMsg(lvlError, format("marking build %1% as cached failure") % build->id);
|
||||||
|
buildStatus = step == r ? bsFailed : bsDepFailed;
|
||||||
|
buildStepStatus = bssFailed;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (buildStatus == bsSuccess) {
|
||||||
|
bool supported = false;
|
||||||
|
{
|
||||||
|
auto machines_(machines.lock()); // FIXME: use shared_mutex
|
||||||
|
for (auto & m : *machines_)
|
||||||
|
if (m.second->supportsStep(r)) { supported = true; break; }
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!supported) {
|
||||||
|
printMsg(lvlError, format("aborting unsupported build %1%") % build->id);
|
||||||
|
buildStatus = bsUnsupported;
|
||||||
|
buildStepStatus = bssUnsupported;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (buildStatus != bsSuccess) {
|
||||||
|
time_t now = time(0);
|
||||||
|
if (!build->finishedInDB) {
|
||||||
|
pqxx::work txn(conn);
|
||||||
|
createBuildStep(txn, 0, build, r, "", buildStepStatus);
|
||||||
|
txn.parameterized
|
||||||
|
("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = $4 where id = $1 and finished = 0")
|
||||||
|
(build->id)
|
||||||
|
((int) buildStatus)
|
||||||
|
(now)
|
||||||
|
(buildStatus != bsUnsupported ? 1 : 0).exec();
|
||||||
|
txn.commit();
|
||||||
|
build->finishedInDB = true;
|
||||||
|
nrBuildsDone++;
|
||||||
|
}
|
||||||
|
badStep = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (badStep) return;
|
||||||
|
|
||||||
|
/* Note: if we exit this scope prior to this, the build and
|
||||||
|
all newly created steps are destroyed. */
|
||||||
|
|
||||||
|
{
|
||||||
|
auto builds_(builds.lock());
|
||||||
|
if (!build->finishedInDB) // FIXME: can this happen?
|
||||||
|
(*builds_)[build->id] = build;
|
||||||
|
build->toplevel = step;
|
||||||
|
}
|
||||||
|
|
||||||
|
printMsg(lvlChatty, format("added build %1% (top-level step %2%, %3% new steps)")
|
||||||
|
% build->id % step->drvPath % newSteps.size());
|
||||||
|
};
|
||||||
|
|
||||||
|
/* Now instantiate build steps for each new build. The builder
|
||||||
|
threads can start building the runnable build steps right away,
|
||||||
|
even while we're still processing other new builds. */
|
||||||
|
while (!newBuilds.empty()) {
|
||||||
|
auto build = newBuilds.begin()->second;
|
||||||
|
newBuilds.erase(newBuilds.begin());
|
||||||
|
|
||||||
|
newRunnable.clear();
|
||||||
|
nrAdded = 0;
|
||||||
|
try {
|
||||||
|
createBuild(build);
|
||||||
|
} catch (Error & e) {
|
||||||
|
e.addPrefix(format("while loading build %1%: ") % build->id);
|
||||||
|
throw;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Add the new runnable build steps to ‘runnable’ and wake up
|
||||||
|
the builder threads. */
|
||||||
|
printMsg(lvlChatty, format("got %1% new runnable steps from %2% new builds") % newRunnable.size() % nrAdded);
|
||||||
|
for (auto & r : newRunnable)
|
||||||
|
makeRunnable(r);
|
||||||
|
|
||||||
|
nrBuildsRead += nrAdded;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Forget every in-memory build whose ID is no longer returned by the
   database query for unfinished builds ("finished = 0"). Such builds
   are logged as cancelled and erased from the 'builds' map. */
void State::removeCancelledBuilds(Connection & conn)
{
    /* Collect the IDs of all builds the database still reports as
       unfinished. The transaction lives only inside this scope, so it
       is gone before the 'builds' lock is taken below. */
    std::set<BuildID> queuedIds;
    {
        pqxx::work txn(conn);
        auto rows = txn.exec("select id from Builds where finished = 0");
        for (auto const & row : rows)
            queuedIds.insert(row["id"].as<BuildID>());
    }

    auto builds_(builds.lock());

    /* Walk the map by hand: erase() hands back the iterator to
       continue from, so entries can be dropped while iterating. */
    auto cur = builds_->begin();
    while (cur != builds_->end()) {
        if (queuedIds.count(cur->first)) {
            /* Still queued in the database; keep it. */
            ++cur;
            continue;
        }
        printMsg(lvlInfo, format("discarding cancelled build %1%") % cur->first);
        // FIXME: ideally we would interrupt active build steps here.
        cur = builds_->erase(cur);
    }
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Look up or create the build step for derivation 'drvPath' and link
   it to whoever asked for it.

   - 'referringBuild' / 'referringStep', when non-null, are appended to
     the step's 'builds' / 'rdeps' lists.
   - 'finishedDrvs' caches derivations whose outputs were all found
     valid; it is consulted first and extended when such a derivation
     is discovered.
   - 'newSteps' receives every step created here that actually needs
     building; 'newRunnable' receives those of them that ended up with
     no dependencies.

   Returns a null pointer when no step is needed (the derivation is in
   'finishedDrvs' or all of its outputs are valid in 'store');
   otherwise returns the existing or newly created step. Steps for the
   input derivations are created recursively. */
Step::ptr State::createStep(std::shared_ptr<StoreAPI> store, const Path & drvPath,
    Build::ptr referringBuild, Step::ptr referringStep, std::set<Path> & finishedDrvs,
    std::set<Step::ptr> & newSteps, std::set<Step::ptr> & newRunnable)
{
    /* Already known to be fully built: nothing to do. */
    if (has(finishedDrvs, drvPath)) return nullptr;

    /* Find the step in 'steps' or create a fresh one, and in either
       case make it reachable from referringBuild / referringStep.
       All of this happens with 'steps' locked, so that the step can
       never become reachable from a new build after doBuildStep has
       removed it from 'steps'. */
    Step::ptr step;
    bool isNew = false;
    {
        auto steps_(steps.lock());

        /* Is there a live (non-stale) entry for this derivation? */
        auto known = steps_->find(drvPath);
        if (known != steps_->end()) {
            /* Once we hold a strong pointer, the Step object can no
               longer be deleted underneath us. */
            step = known->second.lock();
            if (!step) steps_->erase(drvPath); // remove stale entry
        }

        /* No live entry: start a new step. */
        isNew = !step;
        if (isNew) {
            step = std::make_shared<Step>();
            step->drvPath = drvPath;
        }

        auto step_(step->state.lock());

        /* A step we just made must not be marked created yet, and a
           step we found must already be. */
        assert(step_->created != isNew);

        if (referringBuild) step_->builds.push_back(referringBuild);
        if (referringStep) step_->rdeps.push_back(referringStep);

        (*steps_)[drvPath] = step;
    }

    /* Pre-existing steps are already initialised. */
    if (!isNew) return step;

    printMsg(lvlDebug, format("considering derivation ‘%1%’") % drvPath);

    /* Fill in the new step. It may already be visible through 'steps'
       at this point, but that is harmless: it is not runnable yet,
       and other threads won't make it runnable while
       step->created == false. */
    step->drv = readDerivation(drvPath);

    auto features = step->drv.env.find("requiredSystemFeatures");
    if (features != step->drv.env.end())
        step->requiredSystemFeatures = tokenizeString<std::set<std::string>>(features->second);

    auto preferLocal = step->drv.env.find("preferLocalBuild");
    const bool wantsLocal = preferLocal != step->drv.env.end() && preferLocal->second == "1";
    step->preferLocalBuild = wantsLocal && has(localPlatforms, step->drv.platform);

    /* Scan the outputs, stopping at the first one that is not valid
       in the store. */
    // FIXME: check whether all outputs are in the binary cache.
    auto out = step->drv.outputs.begin();
    while (out != step->drv.outputs.end() && store->isValidPath(out->second.path))
        ++out;

    if (out == step->drv.outputs.end()) {
        /* Every output is valid, so there is nothing to build. */
        finishedDrvs.insert(drvPath);
        return nullptr;
    }

    /* At least one output is missing: this step has to be built. */
    printMsg(lvlDebug, format("creating build step ‘%1%’") % drvPath);
    newSteps.insert(step);

    /* Recurse into the input derivations, recording each one that
       itself still needs a step as a dependency. */
    for (auto & input : step->drv.inputDrvs) {
        auto dep = createStep(store, input.first, nullptr, step, finishedDrvs, newSteps, newRunnable);
        if (!dep) continue;
        auto step_(step->state.lock());
        step_->deps.insert(dep);
    }

    /* Mark the step as fully created; if no dependencies remain, it
       can run right away. */
    {
        auto step_(step->state.lock());
        assert(!step_->created);
        step_->created = true;
        if (step_->deps.empty())
            newRunnable.insert(step);
    }

    return step;
}
|
|
@ -118,6 +118,9 @@ struct Step
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
void getDependents(Step::ptr step, std::set<Build::ptr> & builds, std::set<Step::ptr> & steps);
|
||||||
|
|
||||||
|
|
||||||
struct Machine
|
struct Machine
|
||||||
{
|
{
|
||||||
typedef std::shared_ptr<Machine> ptr;
|
typedef std::shared_ptr<Machine> ptr;
|
||||||
|
@ -159,6 +162,12 @@ class State
|
||||||
{
|
{
|
||||||
private:
|
private:
|
||||||
|
|
||||||
|
// FIXME: Make configurable.
|
||||||
|
const unsigned int maxTries = 5;
|
||||||
|
const unsigned int retryInterval = 60; // seconds
|
||||||
|
const float retryBackoff = 3.0;
|
||||||
|
const unsigned int maxParallelCopyClosure = 4;
|
||||||
|
|
||||||
nix::Path hydraData, logDir;
|
nix::Path hydraData, logDir;
|
||||||
|
|
||||||
nix::StringSet localPlatforms;
|
nix::StringSet localPlatforms;
|
||||||
|
@ -306,3 +315,10 @@ public:
|
||||||
|
|
||||||
void run(BuildID buildOne = 0);
|
void run(BuildID buildOne = 0);
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/* Return true if looking up 'v' in container 'c' with c.find()
   yields something other than c.end(). */
template <class C, class V>
bool has(const C & c, const V & v)
{
    const auto pos = c.find(v);
    return pos != c.end();
}
|
||||||
|
|
Loading…
Reference in a new issue