forked from lix-project/hydra
Merge pull request #727 from NixOS/fail-unsupported
Abort unsupported build steps
This commit is contained in:
commit
a7540b141d
|
@ -376,8 +376,32 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
} else {
|
} else
|
||||||
|
failStep(*conn, step, buildId, result, machine, stepFinished, quit);
|
||||||
|
|
||||||
|
// FIXME: keep stats about aborted steps?
|
||||||
|
nrStepsDone++;
|
||||||
|
totalStepTime += stepStopTime - stepStartTime;
|
||||||
|
totalStepBuildTime += result.stopTime - result.startTime;
|
||||||
|
machine->state->nrStepsDone++;
|
||||||
|
machine->state->totalStepTime += stepStopTime - stepStartTime;
|
||||||
|
machine->state->totalStepBuildTime += result.stopTime - result.startTime;
|
||||||
|
|
||||||
|
if (quit) exit(0); // testing hack; FIXME: this won't run plugins
|
||||||
|
|
||||||
|
return sDone;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void State::failStep(
|
||||||
|
Connection & conn,
|
||||||
|
Step::ptr step,
|
||||||
|
BuildID buildId,
|
||||||
|
const RemoteResult & result,
|
||||||
|
Machine::ptr machine,
|
||||||
|
bool & stepFinished,
|
||||||
|
bool & quit)
|
||||||
|
{
|
||||||
/* Register failure in the database for all Build objects that
|
/* Register failure in the database for all Build objects that
|
||||||
directly or indirectly depend on this step. */
|
directly or indirectly depend on this step. */
|
||||||
|
|
||||||
|
@ -409,29 +433,30 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
|
||||||
{
|
{
|
||||||
auto mc = startDbUpdate();
|
auto mc = startDbUpdate();
|
||||||
|
|
||||||
pqxx::work txn(*conn);
|
pqxx::work txn(conn);
|
||||||
|
|
||||||
/* Create failed build steps for every build that
|
/* Create failed build steps for every build that
|
||||||
depends on this, except when this step is cached
|
depends on this, except when this step is cached
|
||||||
and is the top-level of that build (since then it's
|
and is the top-level of that build (since then it's
|
||||||
redundant with the build's isCachedBuild field). */
|
redundant with the build's isCachedBuild field). */
|
||||||
for (auto & build2 : indirect) {
|
for (auto & build : indirect) {
|
||||||
if ((result.stepStatus == bsCachedFailure && build2->drvPath == step->drvPath) ||
|
if ((result.stepStatus == bsCachedFailure && build->drvPath == step->drvPath) ||
|
||||||
(result.stepStatus != bsCachedFailure && buildId == build2->id) ||
|
((result.stepStatus != bsCachedFailure && result.stepStatus != bsUnsupported) && buildId == build->id) ||
|
||||||
build2->finishedInDB)
|
build->finishedInDB)
|
||||||
continue;
|
continue;
|
||||||
createBuildStep(txn, 0, build2->id, step, machine->sshName,
|
createBuildStep(txn,
|
||||||
result.stepStatus, result.errorMsg, buildId == build2->id ? 0 : buildId);
|
0, build->id, step, machine ? machine->sshName : "",
|
||||||
|
result.stepStatus, result.errorMsg, buildId == build->id ? 0 : buildId);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Mark all builds that depend on this derivation as failed. */
|
/* Mark all builds that depend on this derivation as failed. */
|
||||||
for (auto & build2 : indirect) {
|
for (auto & build : indirect) {
|
||||||
if (build2->finishedInDB) continue;
|
if (build->finishedInDB) continue;
|
||||||
printMsg(lvlError, format("marking build %1% as failed") % build2->id);
|
printMsg(lvlError, format("marking build %1% as failed") % build->id);
|
||||||
txn.parameterized
|
txn.parameterized
|
||||||
("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5, notificationPendingSince = $4 where id = $1 and finished = 0")
|
("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5, notificationPendingSince = $4 where id = $1 and finished = 0")
|
||||||
(build2->id)
|
(build->id)
|
||||||
((int) (build2->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus()))
|
((int) (build->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus()))
|
||||||
(result.startTime)
|
(result.startTime)
|
||||||
(result.stopTime)
|
(result.stopTime)
|
||||||
(result.stepStatus == bsCachedFailure ? 1 : 0).exec();
|
(result.stepStatus == bsCachedFailure ? 1 : 0).exec();
|
||||||
|
@ -462,23 +487,10 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
|
||||||
|
|
||||||
/* Send notification about this build and its dependents. */
|
/* Send notification about this build and its dependents. */
|
||||||
{
|
{
|
||||||
pqxx::work txn(*conn);
|
pqxx::work txn(conn);
|
||||||
notifyBuildFinished(txn, buildId, dependentIDs);
|
notifyBuildFinished(txn, buildId, dependentIDs);
|
||||||
txn.commit();
|
txn.commit();
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// FIXME: keep stats about aborted steps?
|
|
||||||
nrStepsDone++;
|
|
||||||
totalStepTime += stepStopTime - stepStartTime;
|
|
||||||
totalStepBuildTime += result.stopTime - result.startTime;
|
|
||||||
machine->state->nrStepsDone++;
|
|
||||||
machine->state->totalStepTime += stepStopTime - stepStartTime;
|
|
||||||
machine->state->totalStepBuildTime += result.stopTime - result.startTime;
|
|
||||||
|
|
||||||
if (quit) exit(0); // testing hack; FIXME: this won't run plugins
|
|
||||||
|
|
||||||
return sDone;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
@ -300,6 +300,8 @@ system_time State::doDispatch()
|
||||||
|
|
||||||
} while (keepGoing);
|
} while (keepGoing);
|
||||||
|
|
||||||
|
abortUnsupported();
|
||||||
|
|
||||||
return sleepUntil;
|
return sleepUntil;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -314,6 +316,96 @@ void State::wakeDispatcher()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
void State::abortUnsupported()
|
||||||
|
{
|
||||||
|
/* Make a copy of 'runnable' and 'machines' so we don't block them
|
||||||
|
very long. */
|
||||||
|
auto runnable2 = *runnable.lock();
|
||||||
|
auto machines2 = *machines.lock();
|
||||||
|
|
||||||
|
system_time now = std::chrono::system_clock::now();
|
||||||
|
auto now2 = time(0);
|
||||||
|
|
||||||
|
std::unordered_set<Step::ptr> aborted;
|
||||||
|
|
||||||
|
size_t count = 0;
|
||||||
|
|
||||||
|
for (auto & wstep : runnable2) {
|
||||||
|
auto step(wstep.lock());
|
||||||
|
if (!step) continue;
|
||||||
|
|
||||||
|
bool supported = false;
|
||||||
|
for (auto & machine : machines2) {
|
||||||
|
if (machine.second->supportsStep(step)) {
|
||||||
|
step->state.lock()->lastSupported = now;
|
||||||
|
supported = true;
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!supported)
|
||||||
|
count++;
|
||||||
|
|
||||||
|
if (!supported
|
||||||
|
&& std::chrono::duration_cast<std::chrono::seconds>(now - step->state.lock()->lastSupported).count() >= maxUnsupportedTime)
|
||||||
|
{
|
||||||
|
printError("aborting unsupported build step '%s' (type '%s')",
|
||||||
|
localStore->printStorePath(step->drvPath),
|
||||||
|
step->systemType);
|
||||||
|
|
||||||
|
aborted.insert(step);
|
||||||
|
|
||||||
|
auto conn(dbPool.get());
|
||||||
|
|
||||||
|
std::set<Build::ptr> dependents;
|
||||||
|
std::set<Step::ptr> steps;
|
||||||
|
getDependents(step, dependents, steps);
|
||||||
|
|
||||||
|
/* Maybe the step got cancelled. */
|
||||||
|
if (dependents.empty()) continue;
|
||||||
|
|
||||||
|
/* Find the build that has this step as the top-level (if
|
||||||
|
any). */
|
||||||
|
Build::ptr build;
|
||||||
|
for (auto build2 : dependents) {
|
||||||
|
if (build2->drvPath == step->drvPath)
|
||||||
|
build = build2;
|
||||||
|
}
|
||||||
|
if (!build) build = *dependents.begin();
|
||||||
|
|
||||||
|
bool stepFinished = false;
|
||||||
|
bool quit = false;
|
||||||
|
|
||||||
|
failStep(
|
||||||
|
*conn, step, build->id,
|
||||||
|
RemoteResult {
|
||||||
|
.stepStatus = bsUnsupported,
|
||||||
|
.errorMsg = fmt("unsupported system type '%s'",
|
||||||
|
step->systemType),
|
||||||
|
.startTime = now2,
|
||||||
|
.stopTime = now2,
|
||||||
|
},
|
||||||
|
nullptr, stepFinished, quit);
|
||||||
|
|
||||||
|
if (quit) exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Clean up 'runnable'. */
|
||||||
|
{
|
||||||
|
auto runnable_(runnable.lock());
|
||||||
|
for (auto i = runnable_->begin(); i != runnable_->end(); ) {
|
||||||
|
if (aborted.count(i->lock()))
|
||||||
|
i = runnable_->erase(i);
|
||||||
|
else
|
||||||
|
++i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
nrUnsupportedSteps = count;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
void Jobset::addStep(time_t startTime, time_t duration)
|
void Jobset::addStep(time_t startTime, time_t duration)
|
||||||
{
|
{
|
||||||
auto steps_(steps.lock());
|
auto steps_(steps.lock());
|
||||||
|
|
|
@ -46,6 +46,7 @@ std::string getEnvOrDie(const std::string & key)
|
||||||
|
|
||||||
State::State()
|
State::State()
|
||||||
: config(std::make_unique<::Config>())
|
: config(std::make_unique<::Config>())
|
||||||
|
, maxUnsupportedTime(config->getIntOption("max_unsupported_time", 0))
|
||||||
, dbPool(config->getIntOption("max_db_connections", 128))
|
, dbPool(config->getIntOption("max_db_connections", 128))
|
||||||
, memoryTokens(config->getIntOption("nar_buffer_size", getMemSize() / 2))
|
, memoryTokens(config->getIntOption("nar_buffer_size", getMemSize() / 2))
|
||||||
, maxOutputSize(config->getIntOption("max_output_size", 2ULL << 30))
|
, maxOutputSize(config->getIntOption("max_output_size", 2ULL << 30))
|
||||||
|
@ -490,7 +491,7 @@ std::shared_ptr<PathLocks> State::acquireGlobalLock()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void State::dumpStatus(Connection & conn, bool log)
|
void State::dumpStatus(Connection & conn)
|
||||||
{
|
{
|
||||||
std::ostringstream out;
|
std::ostringstream out;
|
||||||
|
|
||||||
|
@ -522,6 +523,7 @@ void State::dumpStatus(Connection & conn, bool log)
|
||||||
root.attr("nrStepsCopyingTo", nrStepsCopyingTo);
|
root.attr("nrStepsCopyingTo", nrStepsCopyingTo);
|
||||||
root.attr("nrStepsCopyingFrom", nrStepsCopyingFrom);
|
root.attr("nrStepsCopyingFrom", nrStepsCopyingFrom);
|
||||||
root.attr("nrStepsWaiting", nrStepsWaiting);
|
root.attr("nrStepsWaiting", nrStepsWaiting);
|
||||||
|
root.attr("nrUnsupportedSteps", nrUnsupportedSteps);
|
||||||
root.attr("bytesSent", bytesSent);
|
root.attr("bytesSent", bytesSent);
|
||||||
root.attr("bytesReceived", bytesReceived);
|
root.attr("bytesReceived", bytesReceived);
|
||||||
root.attr("nrBuildsRead", nrBuildsRead);
|
root.attr("nrBuildsRead", nrBuildsRead);
|
||||||
|
@ -670,11 +672,6 @@ void State::dumpStatus(Connection & conn, bool log)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (log && time(0) >= lastStatusLogged + statusLogInterval) {
|
|
||||||
printMsg(lvlInfo, format("status: %1%") % out.str());
|
|
||||||
lastStatusLogged = time(0);
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
{
|
||||||
auto mc = startDbUpdate();
|
auto mc = startDbUpdate();
|
||||||
pqxx::work txn(conn);
|
pqxx::work txn(conn);
|
||||||
|
@ -783,7 +780,7 @@ void State::run(BuildID buildOne)
|
||||||
{
|
{
|
||||||
auto conn(dbPool.get());
|
auto conn(dbPool.get());
|
||||||
clearBusy(*conn, 0);
|
clearBusy(*conn, 0);
|
||||||
dumpStatus(*conn, false);
|
dumpStatus(*conn);
|
||||||
}
|
}
|
||||||
|
|
||||||
std::thread(&State::monitorMachinesFile, this).detach();
|
std::thread(&State::monitorMachinesFile, this).detach();
|
||||||
|
@ -846,8 +843,8 @@ void State::run(BuildID buildOne)
|
||||||
auto conn(dbPool.get());
|
auto conn(dbPool.get());
|
||||||
receiver dumpStatus_(*conn, "dump_status");
|
receiver dumpStatus_(*conn, "dump_status");
|
||||||
while (true) {
|
while (true) {
|
||||||
conn->await_notification(statusLogInterval / 2 + 1, 0);
|
conn->await_notification();
|
||||||
dumpStatus(*conn, true);
|
dumpStatus(*conn);
|
||||||
}
|
}
|
||||||
} catch (std::exception & e) {
|
} catch (std::exception & e) {
|
||||||
printMsg(lvlError, format("main thread: %1%") % e.what());
|
printMsg(lvlError, format("main thread: %1%") % e.what());
|
||||||
|
|
|
@ -68,7 +68,7 @@ struct RemoteResult
|
||||||
std::unique_ptr<nix::TokenServer::Token> tokens;
|
std::unique_ptr<nix::TokenServer::Token> tokens;
|
||||||
std::shared_ptr<nix::FSAccessor> accessor;
|
std::shared_ptr<nix::FSAccessor> accessor;
|
||||||
|
|
||||||
BuildStatus buildStatus()
|
BuildStatus buildStatus() const
|
||||||
{
|
{
|
||||||
return stepStatus == bsCachedFailure ? bsFailed : stepStatus;
|
return stepStatus == bsCachedFailure ? bsFailed : stepStatus;
|
||||||
}
|
}
|
||||||
|
@ -198,6 +198,10 @@ struct Step
|
||||||
|
|
||||||
/* The time at which this step became runnable. */
|
/* The time at which this step became runnable. */
|
||||||
system_time runnableSince;
|
system_time runnableSince;
|
||||||
|
|
||||||
|
/* The time that we last saw a machine that supports this
|
||||||
|
step. */
|
||||||
|
system_time lastSupported = std::chrono::system_clock::now();
|
||||||
};
|
};
|
||||||
|
|
||||||
std::atomic_bool finished{false}; // debugging
|
std::atomic_bool finished{false}; // debugging
|
||||||
|
@ -303,6 +307,9 @@ private:
|
||||||
const float retryBackoff = 3.0;
|
const float retryBackoff = 3.0;
|
||||||
const unsigned int maxParallelCopyClosure = 4;
|
const unsigned int maxParallelCopyClosure = 4;
|
||||||
|
|
||||||
|
/* Time in seconds before unsupported build steps are aborted. */
|
||||||
|
const unsigned int maxUnsupportedTime = 0;
|
||||||
|
|
||||||
nix::Path hydraData, logDir;
|
nix::Path hydraData, logDir;
|
||||||
|
|
||||||
bool useSubstitutes = false;
|
bool useSubstitutes = false;
|
||||||
|
@ -348,6 +355,7 @@ private:
|
||||||
counter nrStepsCopyingTo{0};
|
counter nrStepsCopyingTo{0};
|
||||||
counter nrStepsCopyingFrom{0};
|
counter nrStepsCopyingFrom{0};
|
||||||
counter nrStepsWaiting{0};
|
counter nrStepsWaiting{0};
|
||||||
|
counter nrUnsupportedSteps{0};
|
||||||
counter nrRetries{0};
|
counter nrRetries{0};
|
||||||
counter maxNrRetries{0};
|
counter maxNrRetries{0};
|
||||||
counter totalStepTime{0}; // total time for steps, including closure copying
|
counter totalStepTime{0}; // total time for steps, including closure copying
|
||||||
|
@ -412,9 +420,6 @@ private:
|
||||||
size_t maxOutputSize;
|
size_t maxOutputSize;
|
||||||
size_t maxLogSize;
|
size_t maxLogSize;
|
||||||
|
|
||||||
time_t lastStatusLogged = 0;
|
|
||||||
const int statusLogInterval = 300;
|
|
||||||
|
|
||||||
/* Steps that were busy while we encounted a PostgreSQL
|
/* Steps that were busy while we encounted a PostgreSQL
|
||||||
error. These need to be cleared at a later time to prevent them
|
error. These need to be cleared at a later time to prevent them
|
||||||
from showing up as busy until the queue runner is restarted. */
|
from showing up as busy until the queue runner is restarted. */
|
||||||
|
@ -483,6 +488,15 @@ private:
|
||||||
Build::ptr referringBuild, Step::ptr referringStep, std::set<nix::StorePath> & finishedDrvs,
|
Build::ptr referringBuild, Step::ptr referringStep, std::set<nix::StorePath> & finishedDrvs,
|
||||||
std::set<Step::ptr> & newSteps, std::set<Step::ptr> & newRunnable);
|
std::set<Step::ptr> & newSteps, std::set<Step::ptr> & newRunnable);
|
||||||
|
|
||||||
|
void failStep(
|
||||||
|
Connection & conn,
|
||||||
|
Step::ptr step,
|
||||||
|
BuildID buildId,
|
||||||
|
const RemoteResult & result,
|
||||||
|
Machine::ptr machine,
|
||||||
|
bool & stepFinished,
|
||||||
|
bool & quit);
|
||||||
|
|
||||||
Jobset::ptr createJobset(pqxx::work & txn,
|
Jobset::ptr createJobset(pqxx::work & txn,
|
||||||
const std::string & projectName, const std::string & jobsetName);
|
const std::string & projectName, const std::string & jobsetName);
|
||||||
|
|
||||||
|
@ -497,6 +511,8 @@ private:
|
||||||
|
|
||||||
void wakeDispatcher();
|
void wakeDispatcher();
|
||||||
|
|
||||||
|
void abortUnsupported();
|
||||||
|
|
||||||
void builder(MachineReservation::ptr reservation);
|
void builder(MachineReservation::ptr reservation);
|
||||||
|
|
||||||
/* Perform the given build step. Return true if the step is to be
|
/* Perform the given build step. Return true if the step is to be
|
||||||
|
@ -527,7 +543,7 @@ private:
|
||||||
has it. */
|
has it. */
|
||||||
std::shared_ptr<nix::PathLocks> acquireGlobalLock();
|
std::shared_ptr<nix::PathLocks> acquireGlobalLock();
|
||||||
|
|
||||||
void dumpStatus(Connection & conn, bool log);
|
void dumpStatus(Connection & conn);
|
||||||
|
|
||||||
void addRoot(const nix::StorePath & storePath);
|
void addRoot(const nix::StorePath & storePath);
|
||||||
|
|
||||||
|
|
|
@ -34,6 +34,7 @@ sub sendQueueRunnerStats {
|
||||||
gauge("hydra.queue.steps.unfinished", $json->{nrUnfinishedSteps});
|
gauge("hydra.queue.steps.unfinished", $json->{nrUnfinishedSteps});
|
||||||
gauge("hydra.queue.steps.finished", $json->{nrStepsDone});
|
gauge("hydra.queue.steps.finished", $json->{nrStepsDone});
|
||||||
gauge("hydra.queue.steps.retries", $json->{nrRetries});
|
gauge("hydra.queue.steps.retries", $json->{nrRetries});
|
||||||
|
gauge("hydra.queue.steps.unsupported", $json->{nrUnsupportedSteps});
|
||||||
gauge("hydra.queue.steps.max_retries", $json->{maxNrRetries});
|
gauge("hydra.queue.steps.max_retries", $json->{maxNrRetries});
|
||||||
if ($json->{nrStepsDone}) {
|
if ($json->{nrStepsDone}) {
|
||||||
gauge("hydra.queue.steps.avg_total_time", $json->{avgStepTime});
|
gauge("hydra.queue.steps.avg_total_time", $json->{avgStepTime});
|
||||||
|
|
Loading…
Reference in a new issue