Make the output size limit configurable

The maximum output size per build step (the sum of the NAR sizes of all
its outputs, in bytes) can be set via hydra.conf, e.g.

  max-output-size = 1000000000

The default is 2 GiB.

Also refactored the build error / status handling a bit.
This commit is contained in:
Eelco Dolstra 2016-03-09 16:59:38 +01:00
parent dc790c5f7e
commit 4151be7e69
7 changed files with 111 additions and 43 deletions

View file

@ -261,21 +261,74 @@ void State::buildRemote(ref<Store> destStore,
if (sendDerivation) { if (sendDerivation) {
if (res) { if (res) {
result.errorMsg = (format("%1% on %2%") % readString(from) % machine->sshName).str(); result.errorMsg = (format("%1% on %2%") % readString(from) % machine->sshName).str();
if (res == 100) result.status = BuildResult::PermanentFailure; if (res == 100) {
else if (res == 101) result.status = BuildResult::TimedOut; result.stepStatus = bsFailed;
else result.status = BuildResult::MiscFailure; result.canCache = true;
}
else if (res == 101) {
result.stepStatus = bsTimedOut;
}
else {
result.stepStatus = bsAborted;
result.canRetry = true;
}
return; return;
} }
result.status = BuildResult::Built; result.stepStatus = bsSuccess;
} else { } else {
result.status = (BuildResult::Status) res;
result.errorMsg = readString(from); result.errorMsg = readString(from);
if (!result.success()) return; switch ((BuildResult::Status) res) {
case BuildResult::Built:
result.stepStatus = bsSuccess;
break;
case BuildResult::Substituted:
case BuildResult::AlreadyValid:
result.stepStatus = bsSuccess;
result.isCached = true;
break;
case BuildResult::PermanentFailure:
result.stepStatus = bsFailed;
result.canCache = true;
result.errorMsg = "";
break;
case BuildResult::InputRejected:
case BuildResult::OutputRejected:
result.stepStatus = bsFailed;
result.canCache = true;
break;
case BuildResult::TransientFailure:
result.stepStatus = bsFailed;
result.canRetry = true;
result.errorMsg = "";
break;
case BuildResult::CachedFailure: // cached on the build machine
result.stepStatus = bsCachedFailure;
result.canCache = true;
result.errorMsg = "";
break;
case BuildResult::TimedOut:
result.stepStatus = bsTimedOut;
result.errorMsg = "";
break;
case BuildResult::MiscFailure:
result.stepStatus = bsAborted;
result.canRetry = true;
break;
case BuildResult::LogLimitExceeded:
result.stepStatus = bsLogLimitExceeded;
break;
default:
result.stepStatus = bsAborted;
break;
} }
if (result.stepStatus != bsSuccess) return;
}
result.errorMsg = "";
/* If the path was substituted or already valid, then we didn't /* If the path was substituted or already valid, then we didn't
get a build log. */ get a build log. */
if (result.status == BuildResult::Substituted || result.status == BuildResult::AlreadyValid) { if (result.isCached) {
printMsg(lvlInfo, format("outputs of %1% substituted or already valid on %2%") % step->drvPath % machine->sshName); printMsg(lvlInfo, format("outputs of %1% substituted or already valid on %2%") % step->drvPath % machine->sshName);
unlink(result.logFile.c_str()); unlink(result.logFile.c_str());
result.logFile = ""; result.logFile = "";
@ -303,6 +356,11 @@ void State::buildRemote(ref<Store> destStore,
totalNarSize += readLongLong(from); totalNarSize += readLongLong(from);
} }
if (totalNarSize > maxOutputSize) {
result.stepStatus = bsNarSizeLimitExceeded;
return;
}
printMsg(lvlDebug, format("copying outputs of %s from %s (%d bytes)") printMsg(lvlDebug, format("copying outputs of %s from %s (%d bytes)")
% step->drvPath % machine->sshName % totalNarSize); % step->drvPath % machine->sshName % totalNarSize);

View file

@ -105,10 +105,8 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
/* If any of the outputs have previously failed, then don't bother /* If any of the outputs have previously failed, then don't bother
building again. */ building again. */
bool cachedFailure = checkCachedFailure(step, *conn); if (checkCachedFailure(step, *conn))
result.stepStatus = bsCachedFailure;
if (cachedFailure)
result.status = BuildResult::CachedFailure;
else { else {
/* Create a build step record indicating that we started /* Create a build step record indicating that we started
@ -124,12 +122,14 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
try { try {
/* FIXME: referring builds may have conflicting timeouts. */ /* FIXME: referring builds may have conflicting timeouts. */
buildRemote(destStore, machine, step, build->maxSilentTime, build->buildTimeout, result); buildRemote(destStore, machine, step, build->maxSilentTime, build->buildTimeout, result);
} catch (NoTokens & e) {
result.stepStatus = bsNarSizeLimitExceeded;
} catch (Error & e) { } catch (Error & e) {
result.status = BuildResult::MiscFailure; result.stepStatus = bsAborted;
result.errorMsg = e.msg(); result.errorMsg = e.msg();
} }
if (result.success()) if (result.stepStatus == bsSuccess)
res = getBuildOutput(destStore, ref<FSAccessor>(result.accessor), step->drv); res = getBuildOutput(destStore, ref<FSAccessor>(result.accessor), step->drv);
} }
@ -159,7 +159,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
/* The step had a hopefully temporary failure (e.g. network /* The step had a hopefully temporary failure (e.g. network
issue). Retry a number of times. */ issue). Retry a number of times. */
if (result.canRetry()) { if (result.canRetry) {
printMsg(lvlError, format("possibly transient failure building %1% on %2%: %3%") printMsg(lvlError, format("possibly transient failure building %1% on %2%: %3%")
% step->drvPath % machine->sshName % result.errorMsg); % step->drvPath % machine->sshName % result.errorMsg);
bool retry; bool retry;
@ -178,7 +178,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
} }
} }
if (result.success()) { if (result.stepStatus == bsSuccess) {
/* Register success in the database for all Build objects that /* Register success in the database for all Build objects that
have this step as the top-level step. Since the queue have this step as the top-level step. Since the queue
@ -225,7 +225,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
build->id, stepNr, machine->sshName, bsSuccess); build->id, stepNr, machine->sshName, bsSuccess);
for (auto & b : direct) for (auto & b : direct)
markSucceededBuild(txn, b, res, build != b || result.status != BuildResult::Built, markSucceededBuild(txn, b, res, build != b || result.isCached,
result.startTime, result.stopTime); result.startTime, result.stopTime);
txn.commit(); txn.commit();
@ -309,38 +309,27 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
pqxx::work txn(*conn); pqxx::work txn(*conn);
BuildStatus buildStatus =
result.status == BuildResult::TimedOut ? bsTimedOut :
result.status == BuildResult::LogLimitExceeded ? bsLogLimitExceeded :
result.canRetry() ? bsAborted :
bsFailed;
/* For standard failures, we don't care about the error /* For standard failures, we don't care about the error
message. */ message. */
if (result.status == BuildResult::PermanentFailure || if (result.stepStatus != bsAborted)
result.status == BuildResult::TransientFailure ||
result.status == BuildResult::CachedFailure ||
result.status == BuildResult::TimedOut ||
result.status == BuildResult::LogLimitExceeded)
result.errorMsg = ""; result.errorMsg = "";
/* Create failed build steps for every build that depends /* Create failed build steps for every build that
on this. For cached failures, only create a step for depends on this, except when this step is cached
builds that don't have this step as top-level and is the top-level of that build (since then it's
(otherwise the user won't be able to see what caused redundant with the build's isCachedBuild field). */
the build to fail). */
for (auto & build2 : indirect) { for (auto & build2 : indirect) {
if ((cachedFailure && build2->drvPath == step->drvPath) || if ((result.stepStatus == bsCachedFailure && build2->drvPath == step->drvPath) ||
(!cachedFailure && build == build2) || (result.stepStatus != bsCachedFailure && build == build2) ||
build2->finishedInDB) build2->finishedInDB)
continue; continue;
createBuildStep(txn, 0, build2, step, machine->sshName, createBuildStep(txn, 0, build2, step, machine->sshName,
buildStatus, result.errorMsg, build == build2 ? 0 : build->id); result.stepStatus, result.errorMsg, build == build2 ? 0 : build->id);
} }
if (!cachedFailure) if (result.stepStatus != bsCachedFailure)
finishBuildStep(txn, result.startTime, result.stopTime, result.overhead, finishBuildStep(txn, result.startTime, result.stopTime, result.overhead,
build->id, stepNr, machine->sshName, buildStatus, result.errorMsg); build->id, stepNr, machine->sshName, result.stepStatus, result.errorMsg);
/* Mark all builds that depend on this derivation as failed. */ /* Mark all builds that depend on this derivation as failed. */
for (auto & build2 : indirect) { for (auto & build2 : indirect) {
@ -349,16 +338,16 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
txn.parameterized txn.parameterized
("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1 and finished = 0") ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1 and finished = 0")
(build2->id) (build2->id)
((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus)) ((int) (build2->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus()))
(result.startTime) (result.startTime)
(result.stopTime) (result.stopTime)
(cachedFailure ? 1 : 0).exec(); (result.stepStatus == bsCachedFailure ? 1 : 0).exec();
nrBuildsDone++; nrBuildsDone++;
} }
/* Remember failed paths in the database so that they /* Remember failed paths in the database so that they
won't be built again. */ won't be built again. */
if (!cachedFailure && result.status == BuildResult::PermanentFailure) if (result.stepStatus != bsCachedFailure && result.canCache)
for (auto & path : step->drv.outputPaths()) for (auto & path : step->drv.outputPaths())
txn.parameterized("insert into FailedPaths values ($1)")(path).exec(); txn.parameterized("insert into FailedPaths values ($1)")(path).exec();

View file

@ -41,6 +41,11 @@ State::State()
} }
} }
{
std::string s = hydraConfig["max-output-size"];
if (s != "") string2Int(s, maxOutputSize);
}
logDir = canonPath(hydraData + "/build-logs"); logDir = canonPath(hydraData + "/build-logs");
} }

View file

@ -33,20 +33,27 @@ typedef enum {
bsCachedFailure = 8, // steps only bsCachedFailure = 8, // steps only
bsUnsupported = 9, bsUnsupported = 9,
bsLogLimitExceeded = 10, bsLogLimitExceeded = 10,
bsNarSizeLimitExceeded = 11,
bsBusy = 100, // not stored bsBusy = 100, // not stored
} BuildStatus; } BuildStatus;
struct RemoteResult : nix::BuildResult struct RemoteResult
{ {
BuildStatus stepStatus = bsAborted;
bool canRetry = false; // for bsAborted
bool isCached = false; // for bsSuccess
bool canCache = false; // for bsFailed
std::string errorMsg; // for bsAborted
time_t startTime = 0, stopTime = 0; time_t startTime = 0, stopTime = 0;
unsigned int overhead = 0; unsigned int overhead = 0;
nix::Path logFile; nix::Path logFile;
std::shared_ptr<nix::FSAccessor> accessor; std::shared_ptr<nix::FSAccessor> accessor;
bool canRetry() BuildStatus buildStatus()
{ {
return status == TransientFailure || status == MiscFailure; return stepStatus == bsCachedFailure ? bsFailed : stepStatus;
} }
}; };
@ -350,6 +357,8 @@ private:
tokens are available. */ tokens are available. */
nix::TokenServer memoryTokens; nix::TokenServer memoryTokens;
size_t maxOutputSize = 2ULL << 30;
public: public:
State(); State();

View file

@ -65,6 +65,8 @@ FOR step IN steps; IF step.busy; busy = 1; END; END;
<span class="error">Unsupported system type</span> <span class="error">Unsupported system type</span>
[% ELSIF step.status == 10 %] [% ELSIF step.status == 10 %]
<span class="error">Log limit exceeded</span> <span class="error">Log limit exceeded</span>
[% ELSIF step.status == 11 %]
<span class="error">Output limit exceeded</span>
[% ELSIF step.errormsg %] [% ELSIF step.errormsg %]
<span class="error">Failed: [% HTML.escape(step.errormsg) %]</span> <span class="error">Failed: [% HTML.escape(step.errormsg) %]</span>
[% ELSE %] [% ELSE %]

View file

@ -207,6 +207,8 @@ BLOCK renderBuildStatusIcon;
<img src="[% c.uri_for("/static/images/warning_${size}.png") %]" alt="Timed out" class="build-status" /> <img src="[% c.uri_for("/static/images/warning_${size}.png") %]" alt="Timed out" class="build-status" />
[% ELSIF buildstatus == 10 %] [% ELSIF buildstatus == 10 %]
<img src="[% c.uri_for("/static/images/warning_${size}.png") %]" alt="Log limit exceeded" class="build-status" /> <img src="[% c.uri_for("/static/images/warning_${size}.png") %]" alt="Log limit exceeded" class="build-status" />
[% ELSIF buildstatus == 11 %]
<img src="[% c.uri_for("/static/images/warning_${size}.png") %]" alt="Output size limit exceeded" class="build-status" />
[% ELSE %] [% ELSE %]
<img src="[% c.uri_for("/static/images/error_${size}.png") %]" alt="Failed" class="build-status" /> <img src="[% c.uri_for("/static/images/error_${size}.png") %]" alt="Failed" class="build-status" />
[% END; [% END;
@ -236,6 +238,8 @@ BLOCK renderStatus;
<span class="error">Unsupported system type</span> <span class="error">Unsupported system type</span>
[% ELSIF buildstatus == 10 %] [% ELSIF buildstatus == 10 %]
<span class="error">Log limit exceeded</span> <span class="error">Log limit exceeded</span>
[% ELSIF buildstatus == 11 %]
<span class="error">Output limit exceeded</span>
[% ELSE %] [% ELSE %]
<span class="error">Aborted</span> <span class="error">Aborted</span>
(Hydra failure; see <a href="#nix-error">below</a>) (Hydra failure; see <a href="#nix-error">below</a>)

View file

@ -192,6 +192,7 @@ create table Builds (
-- 8 = cached failure [steps only; builds use isCachedBuild] -- 8 = cached failure [steps only; builds use isCachedBuild]
-- 9 = unsupported system type -- 9 = unsupported system type
-- 10 = log limit exceeded -- 10 = log limit exceeded
-- 11 = NAR size limit exceeded
buildStatus integer, buildStatus integer,
size bigint, size bigint,