From 4151be7e69957d22af712dd5410b5ad8aa3a2289 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 9 Mar 2016 16:59:38 +0100 Subject: [PATCH] Make the output size limit configurable The maximum output size per build step (as the sum of the NARs of each output) can be set via hydra.conf, e.g. max-output-size = 1000000000 The default is 2 GiB. Also refactored the build error / status handling a bit. --- src/hydra-queue-runner/build-remote.cc | 72 ++++++++++++++++++-- src/hydra-queue-runner/builder.cc | 55 ++++++--------- src/hydra-queue-runner/hydra-queue-runner.cc | 5 ++ src/hydra-queue-runner/state.hh | 15 +++- src/root/build.tt | 2 + src/root/common.tt | 4 ++ src/sql/hydra.sql | 1 + 7 files changed, 111 insertions(+), 43 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 3c9a8f79..f3576a0c 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -261,21 +261,74 @@ void State::buildRemote(ref destStore, if (sendDerivation) { if (res) { result.errorMsg = (format("%1% on ‘%2%’") % readString(from) % machine->sshName).str(); - if (res == 100) result.status = BuildResult::PermanentFailure; - else if (res == 101) result.status = BuildResult::TimedOut; - else result.status = BuildResult::MiscFailure; + if (res == 100) { + result.stepStatus = bsFailed; + result.canCache = true; + } + else if (res == 101) { + result.stepStatus = bsTimedOut; + } + else { + result.stepStatus = bsAborted; + result.canRetry = true; + } return; } - result.status = BuildResult::Built; + result.stepStatus = bsSuccess; } else { - result.status = (BuildResult::Status) res; result.errorMsg = readString(from); - if (!result.success()) return; + switch ((BuildResult::Status) res) { + case BuildResult::Built: + result.stepStatus = bsSuccess; + break; + case BuildResult::Substituted: + case BuildResult::AlreadyValid: + result.stepStatus = bsSuccess; + result.isCached = true; + break; + case BuildResult::PermanentFailure: + result.stepStatus = bsFailed; + result.canCache = true; + result.errorMsg = ""; + break; + case BuildResult::InputRejected: + case BuildResult::OutputRejected: + result.stepStatus = bsFailed; + result.canCache = true; + break; + case BuildResult::TransientFailure: + result.stepStatus = bsFailed; + result.canRetry = true; + result.errorMsg = ""; + break; + case BuildResult::CachedFailure: // cached on the build machine + result.stepStatus = bsCachedFailure; + result.canCache = true; + result.errorMsg = ""; + break; + case BuildResult::TimedOut: + result.stepStatus = bsTimedOut; + result.errorMsg = ""; + break; + case BuildResult::MiscFailure: + result.stepStatus = bsAborted; + result.canRetry = true; + break; + case BuildResult::LogLimitExceeded: + result.stepStatus = bsLogLimitExceeded; + break; + default: + result.stepStatus = bsAborted; + break; + } + if (result.stepStatus != bsSuccess) return; } + result.errorMsg = ""; + /* If the path was substituted or already valid, then we didn't get a build log. */ - if (result.status == BuildResult::Substituted || result.status == BuildResult::AlreadyValid) { + if (result.isCached) { printMsg(lvlInfo, format("outputs of ‘%1%’ substituted or already valid on ‘%2%’") % step->drvPath % machine->sshName); unlink(result.logFile.c_str()); result.logFile = ""; @@ -303,6 +356,11 @@ void State::buildRemote(ref destStore, totalNarSize += readLongLong(from); } + if (totalNarSize > maxOutputSize) { + result.stepStatus = bsNarSizeLimitExceeded; + return; + } + printMsg(lvlDebug, format("copying outputs of ‘%s’ from ‘%s’ (%d bytes)") % step->drvPath % machine->sshName % totalNarSize); diff --git a/src/hydra-queue-runner/builder.cc b/src/hydra-queue-runner/builder.cc index a0d076b7..70302b87 100644 --- a/src/hydra-queue-runner/builder.cc +++ b/src/hydra-queue-runner/builder.cc @@ -105,10 +105,8 @@ State::StepResult State::doBuildStep(nix::ref destStore, Step::ptr step, /* If any of the outputs have previously failed, then don't bother building again. */ - bool cachedFailure = checkCachedFailure(step, *conn); - - if (cachedFailure) - result.status = BuildResult::CachedFailure; + if (checkCachedFailure(step, *conn)) + result.stepStatus = bsCachedFailure; else { /* Create a build step record indicating that we started @@ -124,12 +122,14 @@ State::StepResult State::doBuildStep(nix::ref destStore, Step::ptr step, try { /* FIXME: referring builds may have conflicting timeouts. */ buildRemote(destStore, machine, step, build->maxSilentTime, build->buildTimeout, result); + } catch (NoTokens & e) { + result.stepStatus = bsNarSizeLimitExceeded; } catch (Error & e) { - result.status = BuildResult::MiscFailure; + result.stepStatus = bsAborted; result.errorMsg = e.msg(); } - if (result.success()) + if (result.stepStatus == bsSuccess) res = getBuildOutput(destStore, ref(result.accessor), step->drv); } @@ -159,7 +159,7 @@ State::StepResult State::doBuildStep(nix::ref destStore, Step::ptr step, /* The step had a hopefully temporary failure (e.g. network issue). Retry a number of times. */ - if (result.canRetry()) { + if (result.canRetry) { printMsg(lvlError, format("possibly transient failure building ‘%1%’ on ‘%2%’: %3%") % step->drvPath % machine->sshName % result.errorMsg); bool retry; @@ -178,7 +178,7 @@ State::StepResult State::doBuildStep(nix::ref destStore, Step::ptr step, } } - if (result.success()) { + if (result.stepStatus == bsSuccess) { /* Register success in the database for all Build objects that have this step as the top-level step. Since the queue @@ -225,7 +225,7 @@ State::StepResult State::doBuildStep(nix::ref destStore, Step::ptr step, build->id, stepNr, machine->sshName, bsSuccess); for (auto & b : direct) - markSucceededBuild(txn, b, res, build != b || result.status != BuildResult::Built, + markSucceededBuild(txn, b, res, build != b || result.isCached, result.startTime, result.stopTime); txn.commit(); @@ -309,38 +309,27 @@ State::StepResult State::doBuildStep(nix::ref destStore, Step::ptr step, pqxx::work txn(*conn); - BuildStatus buildStatus = - result.status == BuildResult::TimedOut ? bsTimedOut : - result.status == BuildResult::LogLimitExceeded ? bsLogLimitExceeded : - result.canRetry() ? bsAborted : - bsFailed; - /* For standard failures, we don't care about the error message. */ - if (result.status == BuildResult::PermanentFailure || - result.status == BuildResult::TransientFailure || - result.status == BuildResult::CachedFailure || - result.status == BuildResult::TimedOut || - result.status == BuildResult::LogLimitExceeded) + if (result.stepStatus != bsAborted) result.errorMsg = ""; - /* Create failed build steps for every build that depends - on this. For cached failures, only create a step for - builds that don't have this step as top-level - (otherwise the user won't be able to see what caused - the build to fail). */ + /* Create failed build steps for every build that + depends on this, except when this step is cached + and is the top-level of that build (since then it's + redundant with the build's isCachedBuild field). */ for (auto & build2 : indirect) { - if ((cachedFailure && build2->drvPath == step->drvPath) || - (!cachedFailure && build == build2) || + if ((result.stepStatus == bsCachedFailure && build2->drvPath == step->drvPath) || + (result.stepStatus != bsCachedFailure && build == build2) || build2->finishedInDB) continue; createBuildStep(txn, 0, build2, step, machine->sshName, - buildStatus, result.errorMsg, build == build2 ? 0 : build->id); + result.stepStatus, result.errorMsg, build == build2 ? 0 : build->id); } - if (!cachedFailure) + if (result.stepStatus != bsCachedFailure) finishBuildStep(txn, result.startTime, result.stopTime, result.overhead, - build->id, stepNr, machine->sshName, buildStatus, result.errorMsg); + build->id, stepNr, machine->sshName, result.stepStatus, result.errorMsg); /* Mark all builds that depend on this derivation as failed. */ for (auto & build2 : indirect) { @@ -349,16 +338,16 @@ State::StepResult State::doBuildStep(nix::ref destStore, Step::ptr step, txn.parameterized ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1 and finished = 0") (build2->id) - ((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus)) + ((int) (build2->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus())) (result.startTime) (result.stopTime) - (cachedFailure ? 1 : 0).exec(); + (result.stepStatus == bsCachedFailure ? 1 : 0).exec(); nrBuildsDone++; } /* Remember failed paths in the database so that they won't be built again. */ - if (!cachedFailure && result.status == BuildResult::PermanentFailure) + if (result.stepStatus != bsCachedFailure && result.canCache) for (auto & path : step->drv.outputPaths()) txn.parameterized("insert into FailedPaths values ($1)")(path).exec(); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index d391b237..c9377ab7 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -41,6 +41,11 @@ State::State() } } + { + std::string s = hydraConfig["max-output-size"]; + if (s != "") string2Int(s, maxOutputSize); + } + logDir = canonPath(hydraData + "/build-logs"); } diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh index 49a51924..048c91f4 100644 --- a/src/hydra-queue-runner/state.hh +++ b/src/hydra-queue-runner/state.hh @@ -33,20 +33,27 @@ typedef enum { bsCachedFailure = 8, // steps only bsUnsupported = 9, bsLogLimitExceeded = 10, + bsNarSizeLimitExceeded = 11, bsBusy = 100, // not stored } BuildStatus; -struct RemoteResult : nix::BuildResult +struct RemoteResult { + BuildStatus stepStatus = bsAborted; + bool canRetry = false; // for bsAborted + bool isCached = false; // for bsSucceed + bool canCache = false; // for bsFailed + std::string errorMsg; // for bsAborted + time_t startTime = 0, stopTime = 0; unsigned int overhead = 0; nix::Path logFile; std::shared_ptr accessor; - bool canRetry() + BuildStatus buildStatus() { - return status == TransientFailure || status == MiscFailure; + return stepStatus == bsCachedFailure ? bsFailed : stepStatus; } }; @@ -350,6 +357,8 @@ private: tokens are available. */ nix::TokenServer memoryTokens; + size_t maxOutputSize = 2ULL << 30; + public: State(); diff --git a/src/root/build.tt b/src/root/build.tt index 20ff4abe..8c233e3e 100644 --- a/src/root/build.tt +++ b/src/root/build.tt @@ -65,6 +65,8 @@ FOR step IN steps; IF step.busy; busy = 1; END; END; Unsupported system type [% ELSIF step.status == 10 %] Log limit exceeded + [% ELSIF step.status == 11 %] + Output limit exceeded [% ELSIF step.errormsg %] Failed: [% HTML.escape(step.errormsg) %] [% ELSE %] diff --git a/src/root/common.tt b/src/root/common.tt index 0f629cf7..40708567 100644 --- a/src/root/common.tt +++ b/src/root/common.tt @@ -207,6 +207,8 @@ BLOCK renderBuildStatusIcon; Timed out [% ELSIF buildstatus == 10 %] Log limit exceeded + [% ELSIF buildstatus == 11 %] + Output size limit exceeded [% ELSE %] Failed [% END; @@ -236,6 +238,8 @@ BLOCK renderStatus; Unsupported system type [% ELSIF buildstatus == 10 %] Log limit exceeded + [% ELSIF buildstatus == 11 %] + Output limit exceeded [% ELSE %] Aborted (Hydra failure; see below) diff --git a/src/sql/hydra.sql b/src/sql/hydra.sql index 2511a960..d53ba11f 100644 --- a/src/sql/hydra.sql +++ b/src/sql/hydra.sql @@ -192,6 +192,7 @@ create table Builds ( -- 8 = cached failure [steps only; builds use isCachedBuild] -- 9 = unsupported system type -- 10 = log limit exceeded + -- 11 = NAR size limit exceeded buildStatus integer, size bigint,