From 5b731004da89a9b94f4ea7945ea6538837a64e41 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 26 Mar 2020 14:59:52 +0100 Subject: [PATCH 1/4] Remove tabs --- src/root/common.tt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/root/common.tt b/src/root/common.tt index 83b1c736..bc777ede 100644 --- a/src/root/common.tt +++ b/src/root/common.tt @@ -584,10 +584,10 @@ BLOCK renderJobsetOverview %] [% IF showProject; INCLUDE renderFullJobsetName project=j.get_column('project') jobset=j.name inRow=1; ELSE; INCLUDE renderJobsetName project=j.get_column('project') jobset=j.name inRow=1; END %] [% HTML.escape(j.description) %] [% IF j.lastcheckedtime; - INCLUDE renderDateTime timestamp = j.lastcheckedtime; - IF j.errormsg || j.fetcherrormsg; %] Error[% END; - ELSE; "-"; - END %] + INCLUDE renderDateTime timestamp = j.lastcheckedtime; + IF j.errormsg || j.fetcherrormsg; %] Error[% END; + ELSE; "-"; + END %] [% IF j.get_column('nrtotal') > 0 %] [% successrate = ( j.get_column('nrsucceeded') / j.get_column('nrtotal') )*100 %] [% IF j.get_column('nrscheduled') > 0 %] From f5cdbfe21d930db43d3812c7d8e87746d6378ef9 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 26 Mar 2020 15:00:04 +0100 Subject: [PATCH 2/4] Abort unsupported build steps If we don't see machine that supports a build step for 'max_unsupported_time' seconds, the step is aborted. The default is 0, which is appropriate for Hydra installations that don't provision missing machines dynamically. --- src/hydra-queue-runner/builder.cc | 194 ++++++++++--------- src/hydra-queue-runner/dispatcher.cc | 85 ++++++++ src/hydra-queue-runner/hydra-queue-runner.cc | 1 + src/hydra-queue-runner/state.hh | 20 +- 4 files changed, 208 insertions(+), 92 deletions(-) diff --git a/src/hydra-queue-runner/builder.cc b/src/hydra-queue-runner/builder.cc index a0fc01c3..0a005321 100644 --- a/src/hydra-queue-runner/builder.cc +++ b/src/hydra-queue-runner/builder.cc @@ -376,97 +376,8 @@ State::StepResult State::doBuildStep(nix::ref destStore, } } - } else { - - /* Register failure in the database for all Build objects that - directly or indirectly depend on this step. */ - - std::vector dependentIDs; - - while (true) { - /* Get the builds and steps that depend on this step. */ - std::set indirect; - { - auto steps_(steps.lock()); - std::set steps; - getDependents(step, indirect, steps); - - /* If there are no builds left, delete all referring - steps from ‘steps’. As for the success case, we can - be certain no new referrers can be added. */ - if (indirect.empty()) { - for (auto & s : steps) { - printMsg(lvlDebug, "finishing build step ‘%s’", - localStore->printStorePath(s->drvPath)); - steps_->erase(s->drvPath); - } - } - } - - if (indirect.empty() && stepFinished) break; - - /* Update the database. */ - { - auto mc = startDbUpdate(); - - pqxx::work txn(*conn); - - /* Create failed build steps for every build that - depends on this, except when this step is cached - and is the top-level of that build (since then it's - redundant with the build's isCachedBuild field). */ - for (auto & build2 : indirect) { - if ((result.stepStatus == bsCachedFailure && build2->drvPath == step->drvPath) || - (result.stepStatus != bsCachedFailure && buildId == build2->id) || - build2->finishedInDB) - continue; - createBuildStep(txn, 0, build2->id, step, machine->sshName, - result.stepStatus, result.errorMsg, buildId == build2->id ? 0 : buildId); - } - - /* Mark all builds that depend on this derivation as failed. */ - for (auto & build2 : indirect) { - if (build2->finishedInDB) continue; - printMsg(lvlError, format("marking build %1% as failed") % build2->id); - txn.parameterized - ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5, notificationPendingSince = $4 where id = $1 and finished = 0") - (build2->id) - ((int) (build2->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus())) - (result.startTime) - (result.stopTime) - (result.stepStatus == bsCachedFailure ? 1 : 0).exec(); - nrBuildsDone++; - } - - /* Remember failed paths in the database so that they - won't be built again. */ - if (result.stepStatus != bsCachedFailure && result.canCache) - for (auto & path : step->drv->outputPaths()) - txn.parameterized("insert into FailedPaths values ($1)")(localStore->printStorePath(path)).exec(); - - txn.commit(); - } - - stepFinished = true; - - /* Remove the indirect dependencies from ‘builds’. This - will cause them to be destroyed. */ - for (auto & b : indirect) { - auto builds_(builds.lock()); - b->finishedInDB = true; - builds_->erase(b->id); - dependentIDs.push_back(b->id); - if (buildOne == b->id) quit = true; - } - } - - /* Send notification about this build and its dependents. */ - { - pqxx::work txn(*conn); - notifyBuildFinished(txn, buildId, dependentIDs); - txn.commit(); - } - } + } else + failStep(*conn, step, buildId, result, machine, stepFinished, quit); // FIXME: keep stats about aborted steps? nrStepsDone++; @@ -482,6 +393,107 @@ State::StepResult State::doBuildStep(nix::ref destStore, } +void State::failStep( + Connection & conn, + Step::ptr step, + BuildID buildId, + const RemoteResult & result, + Machine::ptr machine, + bool & stepFinished, + bool & quit) +{ + /* Register failure in the database for all Build objects that + directly or indirectly depend on this step. */ + + std::vector dependentIDs; + + while (true) { + /* Get the builds and steps that depend on this step. */ + std::set indirect; + { + auto steps_(steps.lock()); + std::set steps; + getDependents(step, indirect, steps); + + /* If there are no builds left, delete all referring + steps from ‘steps’. As for the success case, we can + be certain no new referrers can be added. */ + if (indirect.empty()) { + for (auto & s : steps) { + printMsg(lvlDebug, "finishing build step ‘%s’", + localStore->printStorePath(s->drvPath)); + steps_->erase(s->drvPath); + } + } + } + + if (indirect.empty() && stepFinished) break; + + /* Update the database. */ + { + auto mc = startDbUpdate(); + + pqxx::work txn(conn); + + /* Create failed build steps for every build that + depends on this, except when this step is cached + and is the top-level of that build (since then it's + redundant with the build's isCachedBuild field). */ + for (auto & build : indirect) { + if ((result.stepStatus == bsCachedFailure && build->drvPath == step->drvPath) || + ((result.stepStatus != bsCachedFailure && result.stepStatus != bsUnsupported) && buildId == build->id) || + build->finishedInDB) + continue; + createBuildStep(txn, + 0, build->id, step, machine ? machine->sshName : "", + result.stepStatus, result.errorMsg, buildId == build->id ? 0 : buildId); + } + + /* Mark all builds that depend on this derivation as failed. */ + for (auto & build : indirect) { + if (build->finishedInDB) continue; + printMsg(lvlError, format("marking build %1% as failed") % build->id); + txn.parameterized + ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5, notificationPendingSince = $4 where id = $1 and finished = 0") + (build->id) + ((int) (build->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus())) + (result.startTime) + (result.stopTime) + (result.stepStatus == bsCachedFailure ? 1 : 0).exec(); + nrBuildsDone++; + } + + /* Remember failed paths in the database so that they + won't be built again. */ + if (result.stepStatus != bsCachedFailure && result.canCache) + for (auto & path : step->drv->outputPaths()) + txn.parameterized("insert into FailedPaths values ($1)")(localStore->printStorePath(path)).exec(); + + txn.commit(); + } + + stepFinished = true; + + /* Remove the indirect dependencies from ‘builds’. This + will cause them to be destroyed. */ + for (auto & b : indirect) { + auto builds_(builds.lock()); + b->finishedInDB = true; + builds_->erase(b->id); + dependentIDs.push_back(b->id); + if (buildOne == b->id) quit = true; + } + } + + /* Send notification about this build and its dependents. */ + { + pqxx::work txn(conn); + notifyBuildFinished(txn, buildId, dependentIDs); + txn.commit(); + } +} + + void State::addRoot(const StorePath & storePath) { auto root = rootsDir + "/" + std::string(storePath.to_string()); diff --git a/src/hydra-queue-runner/dispatcher.cc b/src/hydra-queue-runner/dispatcher.cc index 86596ff5..700fdd8e 100644 --- a/src/hydra-queue-runner/dispatcher.cc +++ b/src/hydra-queue-runner/dispatcher.cc @@ -300,6 +300,8 @@ system_time State::doDispatch() } while (keepGoing); + abortUnsupported(); + return sleepUntil; } @@ -314,6 +316,89 @@ void State::wakeDispatcher() } +void State::abortUnsupported() +{ + /* Make a copy of 'runnable' and 'machines' so we don't block them + very long. */ + auto runnable2 = *runnable.lock(); + auto machines2 = *machines.lock(); + + system_time now = std::chrono::system_clock::now(); + auto now2 = time(0); + + std::unordered_set aborted; + + for (auto & wstep : runnable2) { + auto step(wstep.lock()); + if (!step) continue; + + bool supported = false; + for (auto & machine : machines2) { + if (machine.second->supportsStep(step)) { + step->state.lock()->lastSupported = now; + supported = true; + break; + } + } + + if (!supported + && std::chrono::duration_cast(now - step->state.lock()->lastSupported).count() >= maxUnsupportedTime) + { + printError("aborting unsupported build step '%s' (type '%s')", + localStore->printStorePath(step->drvPath), + step->systemType); + + aborted.insert(step); + + auto conn(dbPool.get()); + + std::set dependents; + std::set steps; + getDependents(step, dependents, steps); + + /* Maybe the step got cancelled. */ + if (dependents.empty()) continue; + + /* Find the build that has this step as the top-level (if + any). */ + Build::ptr build; + for (auto build2 : dependents) { + if (build2->drvPath == step->drvPath) + build = build2; + } + if (!build) build = *dependents.begin(); + + bool stepFinished = false; + bool quit = false; + + failStep( + *conn, step, build->id, + RemoteResult { + .stepStatus = bsUnsupported, + .errorMsg = fmt("unsupported system type '%s'", + step->systemType), + .startTime = now2, + .stopTime = now2, + }, + nullptr, stepFinished, quit); + + if (quit) exit(1); + } + } + + /* Clean up 'runnable'. */ + { + auto runnable_(runnable.lock()); + for (auto i = runnable_->begin(); i != runnable_->end(); ) { + if (aborted.count(i->lock())) + i = runnable_->erase(i); + else + ++i; + } + } +} + + void Jobset::addStep(time_t startTime, time_t duration) { auto steps_(steps.lock()); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 3d5dad4d..f7ecf268 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -46,6 +46,7 @@ std::string getEnvOrDie(const std::string & key) State::State() : config(std::make_unique<::Config>()) + , maxUnsupportedTime(config->getIntOption("max_unsupported_time", 0)) , dbPool(config->getIntOption("max_db_connections", 128)) , memoryTokens(config->getIntOption("nar_buffer_size", getMemSize() / 2)) , maxOutputSize(config->getIntOption("max_output_size", 2ULL << 30)) diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh index 180907e9..4d87a3a7 100644 --- a/src/hydra-queue-runner/state.hh +++ b/src/hydra-queue-runner/state.hh @@ -68,7 +68,7 @@ struct RemoteResult std::unique_ptr tokens; std::shared_ptr accessor; - BuildStatus buildStatus() + BuildStatus buildStatus() const { return stepStatus == bsCachedFailure ? bsFailed : stepStatus; } @@ -198,6 +198,10 @@ struct Step /* The time at which this step became runnable. */ system_time runnableSince; + + /* The time that we last saw a machine that supports this + step. */ + system_time lastSupported = std::chrono::system_clock::now(); }; std::atomic_bool finished{false}; // debugging @@ -303,6 +307,9 @@ private: const float retryBackoff = 3.0; const unsigned int maxParallelCopyClosure = 4; + /* Time in seconds before unsupported build steps are aborted. */ + const unsigned int maxUnsupportedTime = 0; + nix::Path hydraData, logDir; bool useSubstitutes = false; @@ -483,6 +490,15 @@ private: Build::ptr referringBuild, Step::ptr referringStep, std::set & finishedDrvs, std::set & newSteps, std::set & newRunnable); + void failStep( + Connection & conn, + Step::ptr step, + BuildID buildId, + const RemoteResult & result, + Machine::ptr machine, + bool & stepFinished, + bool & quit); + Jobset::ptr createJobset(pqxx::work & txn, const std::string & projectName, const std::string & jobsetName); @@ -497,6 +513,8 @@ private: void wakeDispatcher(); + void abortUnsupported(); + void builder(MachineReservation::ptr reservation); /* Perform the given build step. Return true if the step is to be From 45ffe578b695f9de101b30d44d46f12aa0654f10 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 26 Mar 2020 15:26:12 +0100 Subject: [PATCH 3/4] Keep track of the number of unsupported steps --- src/hydra-queue-runner/dispatcher.cc | 7 +++++++ src/hydra-queue-runner/hydra-queue-runner.cc | 1 + src/hydra-queue-runner/state.hh | 1 + src/script/hydra-send-stats | 1 + 4 files changed, 10 insertions(+) diff --git a/src/hydra-queue-runner/dispatcher.cc b/src/hydra-queue-runner/dispatcher.cc index 700fdd8e..6dc7f700 100644 --- a/src/hydra-queue-runner/dispatcher.cc +++ b/src/hydra-queue-runner/dispatcher.cc @@ -328,6 +328,8 @@ void State::abortUnsupported() std::unordered_set aborted; + size_t count = 0; + for (auto & wstep : runnable2) { auto step(wstep.lock()); if (!step) continue; @@ -341,6 +343,9 @@ void State::abortUnsupported() } } + if (!supported) + count++; + if (!supported && std::chrono::duration_cast(now - step->state.lock()->lastSupported).count() >= maxUnsupportedTime) { @@ -396,6 +401,8 @@ void State::abortUnsupported() ++i; } } + + nrUnsupportedSteps = count; } diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index f7ecf268..8d729bd1 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -523,6 +523,7 @@ void State::dumpStatus(Connection & conn, bool log) root.attr("nrStepsCopyingTo", nrStepsCopyingTo); root.attr("nrStepsCopyingFrom", nrStepsCopyingFrom); root.attr("nrStepsWaiting", nrStepsWaiting); + root.attr("nrUnsupportedSteps", nrUnsupportedSteps); root.attr("bytesSent", bytesSent); root.attr("bytesReceived", bytesReceived); root.attr("nrBuildsRead", nrBuildsRead); diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh index 4d87a3a7..c702e8a3 100644 --- a/src/hydra-queue-runner/state.hh +++ b/src/hydra-queue-runner/state.hh @@ -355,6 +355,7 @@ private: counter nrStepsCopyingTo{0}; counter nrStepsCopyingFrom{0}; counter nrStepsWaiting{0}; + counter nrUnsupportedSteps{0}; counter nrRetries{0}; counter maxNrRetries{0}; counter totalStepTime{0}; // total time for steps, including closure copying diff --git a/src/script/hydra-send-stats b/src/script/hydra-send-stats index 2b8c550b..cf653865 100755 --- a/src/script/hydra-send-stats +++ b/src/script/hydra-send-stats @@ -34,6 +34,7 @@ sub sendQueueRunnerStats { gauge("hydra.queue.steps.unfinished", $json->{nrUnfinishedSteps}); gauge("hydra.queue.steps.finished", $json->{nrStepsDone}); gauge("hydra.queue.steps.retries", $json->{nrRetries}); + gauge("hydra.queue.steps.unsupported", $json->{nrUnsupportedSteps}); gauge("hydra.queue.steps.max_retries", $json->{maxNrRetries}); if ($json->{nrStepsDone}) { gauge("hydra.queue.steps.avg_total_time", $json->{avgStepTime}); From 15ae932488512ba235ed2f6f841cc5eb56ba9314 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 26 Mar 2020 15:30:37 +0100 Subject: [PATCH 4/4] Don't spam the journal with hydra-queue-runner status dumps --- src/hydra-queue-runner/hydra-queue-runner.cc | 13 ++++--------- src/hydra-queue-runner/state.hh | 5 +---- 2 files changed, 5 insertions(+), 13 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 8d729bd1..65d93a04 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -491,7 +491,7 @@ std::shared_ptr State::acquireGlobalLock() } -void State::dumpStatus(Connection & conn, bool log) +void State::dumpStatus(Connection & conn) { std::ostringstream out; @@ -672,11 +672,6 @@ void State::dumpStatus(Connection & conn, bool log) } } - if (log && time(0) >= lastStatusLogged + statusLogInterval) { - printMsg(lvlInfo, format("status: %1%") % out.str()); - lastStatusLogged = time(0); - } - { auto mc = startDbUpdate(); pqxx::work txn(conn); @@ -785,7 +780,7 @@ void State::run(BuildID buildOne) { auto conn(dbPool.get()); clearBusy(*conn, 0); - dumpStatus(*conn, false); + dumpStatus(*conn); } std::thread(&State::monitorMachinesFile, this).detach(); @@ -848,8 +843,8 @@ void State::run(BuildID buildOne) auto conn(dbPool.get()); receiver dumpStatus_(*conn, "dump_status"); while (true) { - conn->await_notification(statusLogInterval / 2 + 1, 0); - dumpStatus(*conn, true); + conn->await_notification(); + dumpStatus(*conn); } } catch (std::exception & e) { printMsg(lvlError, format("main thread: %1%") % e.what()); diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh index c702e8a3..31c7fddf 100644 --- a/src/hydra-queue-runner/state.hh +++ b/src/hydra-queue-runner/state.hh @@ -420,9 +420,6 @@ private: size_t maxOutputSize; size_t maxLogSize; - time_t lastStatusLogged = 0; - const int statusLogInterval = 300; - /* Steps that were busy while we encounted a PostgreSQL error. These need to be cleared at a later time to prevent them from showing up as busy until the queue runner is restarted. */ @@ -546,7 +543,7 @@ private: has it. */ std::shared_ptr acquireGlobalLock(); - void dumpStatus(Connection & conn, bool log); + void dumpStatus(Connection & conn); void addRoot(const nix::StorePath & storePath);