hydra-queue-runner: Implement timeouts

Also, keep track of timeouts in the database as a distinct build
status.
This commit is contained in:
Eelco Dolstra 2015-06-17 13:32:06 +02:00
parent 2da4987bc2
commit 745efce828
5 changed files with 32 additions and 10 deletions

View file

@ -109,7 +109,8 @@ static void copyClosureFrom(std::shared_ptr<StoreAPI> store,
void buildRemote(std::shared_ptr<StoreAPI> store,
const string & sshName, const string & sshKey,
const Path & drvPath, const Derivation & drv,
const nix::Path & logDir, RemoteResult & result)
const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout,
RemoteResult & result)
{
string base = baseNameOf(drvPath);
Path logFile = logDir + "/" + string(base, 0, 2) + "/" + string(base, 2);
@ -152,8 +153,9 @@ void buildRemote(std::shared_ptr<StoreAPI> store,
printMsg(lvlDebug, format("building %1% on %2%") % drvPath % sshName);
writeInt(cmdBuildPaths, to);
writeStrings(PathSet({drvPath}), to);
writeInt(3600, to); // == maxSilentTime, FIXME
writeInt(7200, to); // == buildTimeout, FIXME
writeInt(maxSilentTime, to);
writeInt(buildTimeout, to);
// FIXME: send maxLogSize.
to.flush();
result.startTime = time(0);
int res = readInt(from);

View file

@ -18,4 +18,5 @@ struct RemoteResult
void buildRemote(std::shared_ptr<nix::StoreAPI> store,
const std::string & sshName, const std::string & sshKey,
const nix::Path & drvPath, const nix::Derivation & drv,
const nix::Path & logDir, RemoteResult & result);
const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout,
RemoteResult & result);

View file

@ -43,6 +43,7 @@ typedef enum {
bsDepFailed = 2,
bsAborted = 3,
bsFailedWithOutput = 6,
bsTimedOut = 7,
bsUnsupported = 9,
} BuildStatus;
@ -51,6 +52,7 @@ typedef enum {
bssSuccess = 0,
bssFailed = 1,
bssAborted = 4,
bssTimedOut = 7,
bssUnsupported = 9,
bssBusy = 100, // not stored
} BuildStepStatus;
@ -77,6 +79,7 @@ struct Build
Path drvPath;
std::map<string, Path> outputs;
std::string fullJobName;
unsigned int maxSilentTime, buildTimeout;
std::shared_ptr<Step> toplevel;
@ -481,7 +484,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store,
{
pqxx::work txn(conn);
auto res = txn.parameterized("select id, project, jobset, job, drvPath from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec();
auto res = txn.parameterized("select id, project, jobset, job, drvPath, maxsilent, timeout from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec();
for (auto const & row : res) {
auto builds_(builds.lock());
@ -493,6 +496,9 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store,
build->id = id;
build->drvPath = row["drvPath"].as<string>();
build->fullJobName = row["project"].as<string>() + ":" + row["jobset"].as<string>() + ":" + row["job"].as<string>();
build->maxSilentTime = row["maxsilent"].as<int>();
build->buildTimeout = row["timeout"].as<int>();
std::cerr << build->id << " " << build->buildTimeout << std::endl;
newBuilds.push_back(build);
}
@ -975,8 +981,8 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
if (!build) build = *dependents.begin();
printMsg(lvlInfo, format("performing step %1% on %2% (needed by %3% builds)")
% step->drvPath % machine->sshName % dependents.size());
printMsg(lvlInfo, format("performing step %1% on %2% (needed by build %3% and %4% others)")
% step->drvPath % machine->sshName % build->id % (dependents.size() - 1));
}
auto conn(dbPool.get());
@ -1005,7 +1011,9 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
}
try {
buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, result);
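/* FIXME: the builds referring to this step may have conflicting timeouts; only the maxSilentTime and buildTimeout of the single build selected above are passed on. */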
buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv,
logDir, build->maxSilentTime, build->buildTimeout, result);
} catch (Error & e) {
result.status = RemoteResult::rrMiscFailure;
result.errorMsg = e.msg();
@ -1066,9 +1074,13 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
/* Failure case. */
BuildStatus buildStatus =
result.status == RemoteResult::rrPermanentFailure ? bsFailed : bsAborted;
result.status == RemoteResult::rrPermanentFailure ? bsFailed :
result.status == RemoteResult::rrTimedOut ? bsTimedOut :
bsAborted;
BuildStepStatus buildStepStatus =
result.status == RemoteResult::rrPermanentFailure ? bssFailed : bssAborted;
result.status == RemoteResult::rrPermanentFailure ? bssFailed :
result.status == RemoteResult::rrTimedOut ? bssTimedOut :
bssAborted;
/* For regular failures, we don't care about the error
message. */
@ -1223,6 +1235,8 @@ void State::run()
auto queueMonitorThread = std::thread(&State::queueMonitor, this);
sleep(5);
std::thread(&State::dispatcher, this).detach();
queueMonitorThread.join();

View file

@ -204,6 +204,8 @@ BLOCK renderBuildStatusIcon;
<img src="[% c.uri_for("/static/images/forbidden_${size}.png") %]" alt="Cancelled" class="build-status" />
[% ELSIF buildstatus == 6 %]
<img src="[% c.uri_for("/static/images/error_${size}.png") %]" alt="Failed (with result)" class="build-status" />
[% ELSIF buildstatus == 7 %]
<img src="[% c.uri_for("/static/images/warning_${size}.png") %]" alt="Timed out" class="build-status" />
[% ELSE %]
<img src="[% c.uri_for("/static/images/error_${size}.png") %]" alt="Failed" class="build-status" />
[% END;
@ -229,6 +231,8 @@ BLOCK renderStatus;
<span class="error">Cancelled by user</span>
[% ELSIF buildstatus == 6 %]
<span class="error">Build failed (with result)</span>
[% ELSIF buildstatus == 7 %]
<span class="error">Timed out</span>
[% ELSIF buildstatus == 9 %]
<span class="error">Unsupported system type</span>
[% ELSE %]

View file

@ -180,6 +180,7 @@ create table Builds (
-- 4 = build cancelled (removed from queue; never built)
-- 5 = build not done because a dependency failed previously (obsolete)
-- 6 = failure with output
-- 7 = timed out
-- 9 = unsupported system type
buildStatus integer,