From 745efce828937934a2240e5170fdee3d63ee66af Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 17 Jun 2015 13:32:06 +0200 Subject: [PATCH] hydra-queue-runner: Implement timeouts Also, keep track of timeouts in the database as a distinct build status. --- src/hydra-queue-runner/build-remote.cc | 8 +++--- src/hydra-queue-runner/build-remote.hh | 3 ++- src/hydra-queue-runner/hydra-queue-runner.cc | 26 +++++++++++++++----- src/root/common.tt | 4 +++ src/sql/hydra.sql | 1 + 5 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 6fddaf82..762a1058 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -109,7 +109,8 @@ static void copyClosureFrom(std::shared_ptr store, void buildRemote(std::shared_ptr store, const string & sshName, const string & sshKey, const Path & drvPath, const Derivation & drv, - const nix::Path & logDir, RemoteResult & result) + const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout, + RemoteResult & result) { string base = baseNameOf(drvPath); Path logFile = logDir + "/" + string(base, 0, 2) + "/" + string(base, 2); @@ -152,8 +153,9 @@ void buildRemote(std::shared_ptr store, printMsg(lvlDebug, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName); writeInt(cmdBuildPaths, to); writeStrings(PathSet({drvPath}), to); - writeInt(3600, to); // == maxSilentTime, FIXME - writeInt(7200, to); // == buildTimeout, FIXME + writeInt(maxSilentTime, to); + writeInt(buildTimeout, to); + // FIXME: send maxLogSize. to.flush(); result.startTime = time(0); int res = readInt(from); diff --git a/src/hydra-queue-runner/build-remote.hh b/src/hydra-queue-runner/build-remote.hh index 6406bc58..99e79c8c 100644 --- a/src/hydra-queue-runner/build-remote.hh +++ b/src/hydra-queue-runner/build-remote.hh @@ -18,4 +18,5 @@ struct RemoteResult void buildRemote(std::shared_ptr store, const std::string & sshName, const std::string & sshKey, const nix::Path & drvPath, const nix::Derivation & drv, - const nix::Path & logDir, RemoteResult & result); + const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout, + RemoteResult & result); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 489ec241..192e75a9 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -43,6 +43,7 @@ typedef enum { bsDepFailed = 2, bsAborted = 3, bsFailedWithOutput = 6, + bsTimedOut = 7, bsUnsupported = 9, } BuildStatus; @@ -51,6 +52,7 @@ typedef enum { bssSuccess = 0, bssFailed = 1, bssAborted = 4, + bssTimedOut = 7, bssUnsupported = 9, bssBusy = 100, // not stored } BuildStepStatus; @@ -77,6 +79,7 @@ struct Build Path drvPath; std::map outputs; std::string fullJobName; + unsigned int maxSilentTime, buildTimeout; std::shared_ptr toplevel; @@ -481,7 +484,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, { pqxx::work txn(conn); - auto res = txn.parameterized("select id, project, jobset, job, drvPath from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec(); + auto res = txn.parameterized("select id, project, jobset, job, drvPath, maxsilent, timeout from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec(); for (auto const & row : res) { auto builds_(builds.lock()); @@ -493,6 +496,9 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, build->id = id; build->drvPath = row["drvPath"].as(); build->fullJobName = row["project"].as() + ":" + row["jobset"].as() + ":" + row["job"].as(); + build->maxSilentTime = row["maxsilent"].as(); + build->buildTimeout = row["timeout"].as(); + std::cerr << build->id << " " << build->buildTimeout << std::endl; newBuilds.push_back(build); } @@ -975,8 +981,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, if (!build) build = *dependents.begin(); - printMsg(lvlInfo, format("performing step ‘%1%’ on ‘%2%’ (needed by %3% builds)") - % step->drvPath % machine->sshName % dependents.size()); + printMsg(lvlInfo, format("performing step ‘%1%’ on ‘%2%’ (needed by build %3% and %4% others)") + % step->drvPath % machine->sshName % build->id % (dependents.size() - 1)); } auto conn(dbPool.get()); @@ -1005,7 +1011,9 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, } try { - buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, result); + /* FIXME: referring builds may have conflicting timeouts. */ + buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, + logDir, build->maxSilentTime, build->buildTimeout, result); } catch (Error & e) { result.status = RemoteResult::rrMiscFailure; result.errorMsg = e.msg(); @@ -1066,9 +1074,13 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, /* Failure case. */ BuildStatus buildStatus = - result.status == RemoteResult::rrPermanentFailure ? bsFailed : bsAborted; + result.status == RemoteResult::rrPermanentFailure ? bsFailed : + result.status == RemoteResult::rrTimedOut ? bsTimedOut : + bsAborted; BuildStepStatus buildStepStatus = - result.status == RemoteResult::rrPermanentFailure ? bssFailed : bssAborted; + result.status == RemoteResult::rrPermanentFailure ? bssFailed : + result.status == RemoteResult::rrTimedOut ? bssTimedOut : + bssAborted; /* For regular failures, we don't care about the error message. */ @@ -1223,6 +1235,8 @@ void State::run() auto queueMonitorThread = std::thread(&State::queueMonitor, this); + sleep(5); + std::thread(&State::dispatcher, this).detach(); queueMonitorThread.join(); diff --git a/src/root/common.tt b/src/root/common.tt index f4425772..f02ed838 100644 --- a/src/root/common.tt +++ b/src/root/common.tt @@ -204,6 +204,8 @@ BLOCK renderBuildStatusIcon; Cancelled [% ELSIF buildstatus == 6 %] Failed (with result) + [% ELSIF buildstatus == 7 %] + Timed out [% ELSE %] Failed [% END; @@ -229,6 +231,8 @@ BLOCK renderStatus; Cancelled by user [% ELSIF buildstatus == 6 %] Build failed (with result) + [% ELSIF buildstatus == 7 %] + Timed out [% ELSIF buildstatus == 9 %] Unsupported system type [% ELSE %] diff --git a/src/sql/hydra.sql b/src/sql/hydra.sql index feeb9452..10f2d614 100644 --- a/src/sql/hydra.sql +++ b/src/sql/hydra.sql @@ -180,6 +180,7 @@ create table Builds ( -- 4 = build cancelled (removed from queue; never built) -- 5 = build not done because a dependency failed previously (obsolete) -- 6 = failure with output + -- 7 = timed out -- 9 = unsupported system type buildStatus integer,