hydra-queue-runner: Implement timeouts

Also, keep track of timeouts in the database as a distinct build
status.
This commit is contained in:
Eelco Dolstra 2015-06-17 13:32:06 +02:00
parent 2da4987bc2
commit 745efce828
5 changed files with 32 additions and 10 deletions

View file

@ -109,7 +109,8 @@ static void copyClosureFrom(std::shared_ptr<StoreAPI> store,
void buildRemote(std::shared_ptr<StoreAPI> store, void buildRemote(std::shared_ptr<StoreAPI> store,
const string & sshName, const string & sshKey, const string & sshName, const string & sshKey,
const Path & drvPath, const Derivation & drv, const Path & drvPath, const Derivation & drv,
const nix::Path & logDir, RemoteResult & result) const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout,
RemoteResult & result)
{ {
string base = baseNameOf(drvPath); string base = baseNameOf(drvPath);
Path logFile = logDir + "/" + string(base, 0, 2) + "/" + string(base, 2); Path logFile = logDir + "/" + string(base, 0, 2) + "/" + string(base, 2);
@ -152,8 +153,9 @@ void buildRemote(std::shared_ptr<StoreAPI> store,
printMsg(lvlDebug, format("building %1% on %2%") % drvPath % sshName); printMsg(lvlDebug, format("building %1% on %2%") % drvPath % sshName);
writeInt(cmdBuildPaths, to); writeInt(cmdBuildPaths, to);
writeStrings(PathSet({drvPath}), to); writeStrings(PathSet({drvPath}), to);
writeInt(3600, to); // == maxSilentTime, FIXME writeInt(maxSilentTime, to);
writeInt(7200, to); // == buildTimeout, FIXME writeInt(buildTimeout, to);
// FIXME: send maxLogSize.
to.flush(); to.flush();
result.startTime = time(0); result.startTime = time(0);
int res = readInt(from); int res = readInt(from);

View file

@ -18,4 +18,5 @@ struct RemoteResult
void buildRemote(std::shared_ptr<nix::StoreAPI> store, void buildRemote(std::shared_ptr<nix::StoreAPI> store,
const std::string & sshName, const std::string & sshKey, const std::string & sshName, const std::string & sshKey,
const nix::Path & drvPath, const nix::Derivation & drv, const nix::Path & drvPath, const nix::Derivation & drv,
const nix::Path & logDir, RemoteResult & result); const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout,
RemoteResult & result);

View file

@ -43,6 +43,7 @@ typedef enum {
bsDepFailed = 2, bsDepFailed = 2,
bsAborted = 3, bsAborted = 3,
bsFailedWithOutput = 6, bsFailedWithOutput = 6,
bsTimedOut = 7,
bsUnsupported = 9, bsUnsupported = 9,
} BuildStatus; } BuildStatus;
@ -51,6 +52,7 @@ typedef enum {
bssSuccess = 0, bssSuccess = 0,
bssFailed = 1, bssFailed = 1,
bssAborted = 4, bssAborted = 4,
bssTimedOut = 7,
bssUnsupported = 9, bssUnsupported = 9,
bssBusy = 100, // not stored bssBusy = 100, // not stored
} BuildStepStatus; } BuildStepStatus;
@ -77,6 +79,7 @@ struct Build
Path drvPath; Path drvPath;
std::map<string, Path> outputs; std::map<string, Path> outputs;
std::string fullJobName; std::string fullJobName;
unsigned int maxSilentTime, buildTimeout;
std::shared_ptr<Step> toplevel; std::shared_ptr<Step> toplevel;
@ -481,7 +484,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store,
{ {
pqxx::work txn(conn); pqxx::work txn(conn);
auto res = txn.parameterized("select id, project, jobset, job, drvPath from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec(); auto res = txn.parameterized("select id, project, jobset, job, drvPath, maxsilent, timeout from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec();
for (auto const & row : res) { for (auto const & row : res) {
auto builds_(builds.lock()); auto builds_(builds.lock());
@ -493,6 +496,9 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store,
build->id = id; build->id = id;
build->drvPath = row["drvPath"].as<string>(); build->drvPath = row["drvPath"].as<string>();
build->fullJobName = row["project"].as<string>() + ":" + row["jobset"].as<string>() + ":" + row["job"].as<string>(); build->fullJobName = row["project"].as<string>() + ":" + row["jobset"].as<string>() + ":" + row["job"].as<string>();
build->maxSilentTime = row["maxsilent"].as<int>();
build->buildTimeout = row["timeout"].as<int>();
std::cerr << build->id << " " << build->buildTimeout << std::endl;
newBuilds.push_back(build); newBuilds.push_back(build);
} }
@ -975,8 +981,8 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
if (!build) build = *dependents.begin(); if (!build) build = *dependents.begin();
printMsg(lvlInfo, format("performing step %1% on %2% (needed by %3% builds)") printMsg(lvlInfo, format("performing step %1% on %2% (needed by build %3% and %4% others)")
% step->drvPath % machine->sshName % dependents.size()); % step->drvPath % machine->sshName % build->id % (dependents.size() - 1));
} }
auto conn(dbPool.get()); auto conn(dbPool.get());
@ -1005,7 +1011,9 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
} }
try { try {
buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, result); /* FIXME: referring builds may have conflicting timeouts. */
buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv,
logDir, build->maxSilentTime, build->buildTimeout, result);
} catch (Error & e) { } catch (Error & e) {
result.status = RemoteResult::rrMiscFailure; result.status = RemoteResult::rrMiscFailure;
result.errorMsg = e.msg(); result.errorMsg = e.msg();
@ -1066,9 +1074,13 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
/* Failure case. */ /* Failure case. */
BuildStatus buildStatus = BuildStatus buildStatus =
result.status == RemoteResult::rrPermanentFailure ? bsFailed : bsAborted; result.status == RemoteResult::rrPermanentFailure ? bsFailed :
result.status == RemoteResult::rrTimedOut ? bsTimedOut :
bsAborted;
BuildStepStatus buildStepStatus = BuildStepStatus buildStepStatus =
result.status == RemoteResult::rrPermanentFailure ? bssFailed : bssAborted; result.status == RemoteResult::rrPermanentFailure ? bssFailed :
result.status == RemoteResult::rrTimedOut ? bssTimedOut :
bssAborted;
/* For regular failures, we don't care about the error /* For regular failures, we don't care about the error
message. */ message. */
@ -1223,6 +1235,8 @@ void State::run()
auto queueMonitorThread = std::thread(&State::queueMonitor, this); auto queueMonitorThread = std::thread(&State::queueMonitor, this);
sleep(5);
std::thread(&State::dispatcher, this).detach(); std::thread(&State::dispatcher, this).detach();
queueMonitorThread.join(); queueMonitorThread.join();

View file

@ -204,6 +204,8 @@ BLOCK renderBuildStatusIcon;
<img src="[% c.uri_for("/static/images/forbidden_${size}.png") %]" alt="Cancelled" class="build-status" /> <img src="[% c.uri_for("/static/images/forbidden_${size}.png") %]" alt="Cancelled" class="build-status" />
[% ELSIF buildstatus == 6 %] [% ELSIF buildstatus == 6 %]
<img src="[% c.uri_for("/static/images/error_${size}.png") %]" alt="Failed (with result)" class="build-status" /> <img src="[% c.uri_for("/static/images/error_${size}.png") %]" alt="Failed (with result)" class="build-status" />
[% ELSIF buildstatus == 7 %]
<img src="[% c.uri_for("/static/images/warning_${size}.png") %]" alt="Timed out" class="build-status" />
[% ELSE %] [% ELSE %]
<img src="[% c.uri_for("/static/images/error_${size}.png") %]" alt="Failed" class="build-status" /> <img src="[% c.uri_for("/static/images/error_${size}.png") %]" alt="Failed" class="build-status" />
[% END; [% END;
@ -229,6 +231,8 @@ BLOCK renderStatus;
<span class="error">Cancelled by user</span> <span class="error">Cancelled by user</span>
[% ELSIF buildstatus == 6 %] [% ELSIF buildstatus == 6 %]
<span class="error">Build failed (with result)</span> <span class="error">Build failed (with result)</span>
[% ELSIF buildstatus == 7 %]
<span class="error">Timed out</span>
[% ELSIF buildstatus == 9 %] [% ELSIF buildstatus == 9 %]
<span class="error">Unsupported system type</span> <span class="error">Unsupported system type</span>
[% ELSE %] [% ELSE %]

View file

@ -180,6 +180,7 @@ create table Builds (
-- 4 = build cancelled (removed from queue; never built) -- 4 = build cancelled (removed from queue; never built)
-- 5 = build not done because a dependency failed previously (obsolete) -- 5 = build not done because a dependency failed previously (obsolete)
-- 6 = failure with output -- 6 = failure with output
-- 7 = timed out
-- 9 = unsupported system type -- 9 = unsupported system type
buildStatus integer, buildStatus integer,