Abort unsupported build steps

If we don't see machine that supports a build step for
'max_unsupported_time' seconds, the step is aborted. The default is 0,
which is appropriate for Hydra installations that don't provision
missing machines dynamically.

(cherry picked from commit f5cdbfe21d)
This commit is contained in:
Eelco Dolstra 2020-03-26 15:00:04 +01:00
parent 68a59f34a0
commit 4417f9f260
4 changed files with 208 additions and 92 deletions

View file

@ -376,8 +376,32 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
} }
} }
} else { } else
failStep(*conn, step, buildId, result, machine, stepFinished, quit);
// FIXME: keep stats about aborted steps?
nrStepsDone++;
totalStepTime += stepStopTime - stepStartTime;
totalStepBuildTime += result.stopTime - result.startTime;
machine->state->nrStepsDone++;
machine->state->totalStepTime += stepStopTime - stepStartTime;
machine->state->totalStepBuildTime += result.stopTime - result.startTime;
if (quit) exit(0); // testing hack; FIXME: this won't run plugins
return sDone;
}
void State::failStep(
Connection & conn,
Step::ptr step,
BuildID buildId,
const RemoteResult & result,
Machine::ptr machine,
bool & stepFinished,
bool & quit)
{
/* Register failure in the database for all Build objects that /* Register failure in the database for all Build objects that
directly or indirectly depend on this step. */ directly or indirectly depend on this step. */
@ -409,29 +433,30 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
{ {
auto mc = startDbUpdate(); auto mc = startDbUpdate();
pqxx::work txn(*conn); pqxx::work txn(conn);
/* Create failed build steps for every build that /* Create failed build steps for every build that
depends on this, except when this step is cached depends on this, except when this step is cached
and is the top-level of that build (since then it's and is the top-level of that build (since then it's
redundant with the build's isCachedBuild field). */ redundant with the build's isCachedBuild field). */
for (auto & build2 : indirect) { for (auto & build : indirect) {
if ((result.stepStatus == bsCachedFailure && build2->drvPath == step->drvPath) || if ((result.stepStatus == bsCachedFailure && build->drvPath == step->drvPath) ||
(result.stepStatus != bsCachedFailure && buildId == build2->id) || ((result.stepStatus != bsCachedFailure && result.stepStatus != bsUnsupported) && buildId == build->id) ||
build2->finishedInDB) build->finishedInDB)
continue; continue;
createBuildStep(txn, 0, build2->id, step, machine->sshName, createBuildStep(txn,
result.stepStatus, result.errorMsg, buildId == build2->id ? 0 : buildId); 0, build->id, step, machine ? machine->sshName : "",
result.stepStatus, result.errorMsg, buildId == build->id ? 0 : buildId);
} }
/* Mark all builds that depend on this derivation as failed. */ /* Mark all builds that depend on this derivation as failed. */
for (auto & build2 : indirect) { for (auto & build : indirect) {
if (build2->finishedInDB) continue; if (build->finishedInDB) continue;
printMsg(lvlError, format("marking build %1% as failed") % build2->id); printMsg(lvlError, format("marking build %1% as failed") % build->id);
txn.parameterized txn.parameterized
("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5, notificationPendingSince = $4 where id = $1 and finished = 0") ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5, notificationPendingSince = $4 where id = $1 and finished = 0")
(build2->id) (build->id)
((int) (build2->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus())) ((int) (build->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus()))
(result.startTime) (result.startTime)
(result.stopTime) (result.stopTime)
(result.stepStatus == bsCachedFailure ? 1 : 0).exec(); (result.stepStatus == bsCachedFailure ? 1 : 0).exec();
@ -462,25 +487,12 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
/* Send notification about this build and its dependents. */ /* Send notification about this build and its dependents. */
{ {
pqxx::work txn(*conn); pqxx::work txn(conn);
notifyBuildFinished(txn, buildId, dependentIDs); notifyBuildFinished(txn, buildId, dependentIDs);
txn.commit(); txn.commit();
} }
} }
// FIXME: keep stats about aborted steps?
nrStepsDone++;
totalStepTime += stepStopTime - stepStartTime;
totalStepBuildTime += result.stopTime - result.startTime;
machine->state->nrStepsDone++;
machine->state->totalStepTime += stepStopTime - stepStartTime;
machine->state->totalStepBuildTime += result.stopTime - result.startTime;
if (quit) exit(0); // testing hack; FIXME: this won't run plugins
return sDone;
}
void State::addRoot(const StorePath & storePath) void State::addRoot(const StorePath & storePath)
{ {

View file

@ -300,6 +300,8 @@ system_time State::doDispatch()
} while (keepGoing); } while (keepGoing);
abortUnsupported();
return sleepUntil; return sleepUntil;
} }
@ -314,6 +316,89 @@ void State::wakeDispatcher()
} }
void State::abortUnsupported()
{
/* Make a copy of 'runnable' and 'machines' so we don't block them
very long. */
auto runnable2 = *runnable.lock();
auto machines2 = *machines.lock();
system_time now = std::chrono::system_clock::now();
auto now2 = time(0);
std::unordered_set<Step::ptr> aborted;
for (auto & wstep : runnable2) {
auto step(wstep.lock());
if (!step) continue;
bool supported = false;
for (auto & machine : machines2) {
if (machine.second->supportsStep(step)) {
step->state.lock()->lastSupported = now;
supported = true;
break;
}
}
if (!supported
&& std::chrono::duration_cast<std::chrono::seconds>(now - step->state.lock()->lastSupported).count() >= maxUnsupportedTime)
{
printError("aborting unsupported build step '%s' (type '%s')",
localStore->printStorePath(step->drvPath),
step->systemType);
aborted.insert(step);
auto conn(dbPool.get());
std::set<Build::ptr> dependents;
std::set<Step::ptr> steps;
getDependents(step, dependents, steps);
/* Maybe the step got cancelled. */
if (dependents.empty()) continue;
/* Find the build that has this step as the top-level (if
any). */
Build::ptr build;
for (auto build2 : dependents) {
if (build2->drvPath == step->drvPath)
build = build2;
}
if (!build) build = *dependents.begin();
bool stepFinished = false;
bool quit = false;
failStep(
*conn, step, build->id,
RemoteResult {
.stepStatus = bsUnsupported,
.errorMsg = fmt("unsupported system type '%s'",
step->systemType),
.startTime = now2,
.stopTime = now2,
},
nullptr, stepFinished, quit);
if (quit) exit(1);
}
}
/* Clean up 'runnable'. */
{
auto runnable_(runnable.lock());
for (auto i = runnable_->begin(); i != runnable_->end(); ) {
if (aborted.count(i->lock()))
i = runnable_->erase(i);
else
++i;
}
}
}
void Jobset::addStep(time_t startTime, time_t duration) void Jobset::addStep(time_t startTime, time_t duration)
{ {
auto steps_(steps.lock()); auto steps_(steps.lock());

View file

@ -46,6 +46,7 @@ std::string getEnvOrDie(const std::string & key)
State::State() State::State()
: config(std::make_unique<::Config>()) : config(std::make_unique<::Config>())
, maxUnsupportedTime(config->getIntOption("max_unsupported_time", 0))
, dbPool(config->getIntOption("max_db_connections", 128)) , dbPool(config->getIntOption("max_db_connections", 128))
, memoryTokens(config->getIntOption("nar_buffer_size", getMemSize() / 2)) , memoryTokens(config->getIntOption("nar_buffer_size", getMemSize() / 2))
, maxOutputSize(config->getIntOption("max_output_size", 2ULL << 30)) , maxOutputSize(config->getIntOption("max_output_size", 2ULL << 30))

View file

@ -68,7 +68,7 @@ struct RemoteResult
std::unique_ptr<nix::TokenServer::Token> tokens; std::unique_ptr<nix::TokenServer::Token> tokens;
std::shared_ptr<nix::FSAccessor> accessor; std::shared_ptr<nix::FSAccessor> accessor;
BuildStatus buildStatus() BuildStatus buildStatus() const
{ {
return stepStatus == bsCachedFailure ? bsFailed : stepStatus; return stepStatus == bsCachedFailure ? bsFailed : stepStatus;
} }
@ -198,6 +198,10 @@ struct Step
/* The time at which this step became runnable. */ /* The time at which this step became runnable. */
system_time runnableSince; system_time runnableSince;
/* The time that we last saw a machine that supports this
step. */
system_time lastSupported = std::chrono::system_clock::now();
}; };
std::atomic_bool finished{false}; // debugging std::atomic_bool finished{false}; // debugging
@ -303,6 +307,9 @@ private:
const float retryBackoff = 3.0; const float retryBackoff = 3.0;
const unsigned int maxParallelCopyClosure = 4; const unsigned int maxParallelCopyClosure = 4;
/* Time in seconds before unsupported build steps are aborted. */
const unsigned int maxUnsupportedTime = 0;
nix::Path hydraData, logDir; nix::Path hydraData, logDir;
bool useSubstitutes = false; bool useSubstitutes = false;
@ -483,6 +490,15 @@ private:
Build::ptr referringBuild, Step::ptr referringStep, std::set<nix::StorePath> & finishedDrvs, Build::ptr referringBuild, Step::ptr referringStep, std::set<nix::StorePath> & finishedDrvs,
std::set<Step::ptr> & newSteps, std::set<Step::ptr> & newRunnable); std::set<Step::ptr> & newSteps, std::set<Step::ptr> & newRunnable);
void failStep(
Connection & conn,
Step::ptr step,
BuildID buildId,
const RemoteResult & result,
Machine::ptr machine,
bool & stepFinished,
bool & quit);
Jobset::ptr createJobset(pqxx::work & txn, Jobset::ptr createJobset(pqxx::work & txn,
const std::string & projectName, const std::string & jobsetName); const std::string & projectName, const std::string & jobsetName);
@ -497,6 +513,8 @@ private:
void wakeDispatcher(); void wakeDispatcher();
void abortUnsupported();
void builder(MachineReservation::ptr reservation); void builder(MachineReservation::ptr reservation);
/* Perform the given build step. Return true if the step is to be /* Perform the given build step. Return true if the step is to be