From 4151be7e69957d22af712dd5410b5ad8aa3a2289 Mon Sep 17 00:00:00 2001
From: Eelco Dolstra <eelco.dolstra@logicblox.com>
Date: Wed, 9 Mar 2016 16:59:38 +0100
Subject: [PATCH] Make the output size limit configurable

The maximum output size per build step (as the sum of the NARs of each
output) can be set via hydra.conf, e.g.

  max-output-size = 1000000000

The default is 2 GiB.

Also refactored the build error / status handling a bit.
---
 src/hydra-queue-runner/build-remote.cc       | 72 ++++++++++++++++++--
 src/hydra-queue-runner/builder.cc            | 55 ++++++---------
 src/hydra-queue-runner/hydra-queue-runner.cc |  5 ++
 src/hydra-queue-runner/state.hh              | 15 +++-
 src/root/build.tt                            |  2 +
 src/root/common.tt                           |  4 ++
 src/sql/hydra.sql                            |  1 +
 7 files changed, 111 insertions(+), 43 deletions(-)
diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc
index 3c9a8f79..f3576a0c 100644
--- a/src/hydra-queue-runner/build-remote.cc
+++ b/src/hydra-queue-runner/build-remote.cc
@@ -261,21 +261,74 @@ void State::buildRemote(ref<Store> destStore,
     if (sendDerivation) {
         if (res) {
             result.errorMsg = (format("%1% on ‘%2%’") % readString(from) % machine->sshName).str();
-            if (res == 100) result.status = BuildResult::PermanentFailure;
-            else if (res == 101) result.status = BuildResult::TimedOut;
-            else result.status = BuildResult::MiscFailure;
+            if (res == 100) {
+                result.stepStatus = bsFailed;
+                result.canCache = true;
+            }
+            else if (res == 101) {
+                result.stepStatus = bsTimedOut;
+            }
+            else {
+                result.stepStatus = bsAborted;
+                result.canRetry = true;
+            }
             return;
         }
-        result.status = BuildResult::Built;
+        result.stepStatus = bsSuccess;
     } else {
-        result.status = (BuildResult::Status) res;
         result.errorMsg = readString(from);
-        if (!result.success()) return;
+        switch ((BuildResult::Status) res) {
+            case BuildResult::Built:
+                result.stepStatus = bsSuccess;
+                break;
+            case BuildResult::Substituted:
+            case BuildResult::AlreadyValid:
+                result.stepStatus = bsSuccess;
+                result.isCached = true;
+                break;
+            case BuildResult::PermanentFailure:
+                result.stepStatus = bsFailed;
+                result.canCache = true;
+                result.errorMsg = "";
+                break;
+            case BuildResult::InputRejected:
+            case BuildResult::OutputRejected:
+                result.stepStatus = bsFailed;
+                result.canCache = true;
+                break;
+            case BuildResult::TransientFailure:
+                result.stepStatus = bsFailed;
+                result.canRetry = true;
+                result.errorMsg = "";
+                break;
+            case BuildResult::CachedFailure: // cached on the build machine
+                result.stepStatus = bsCachedFailure;
+                result.canCache = true;
+                result.errorMsg = "";
+                break;
+            case BuildResult::TimedOut:
+                result.stepStatus = bsTimedOut;
+                result.errorMsg = "";
+                break;
+            case BuildResult::MiscFailure:
+                result.stepStatus = bsAborted;
+                result.canRetry = true;
+                break;
+            case BuildResult::LogLimitExceeded:
+                result.stepStatus = bsLogLimitExceeded;
+                break;
+            default:
+                result.stepStatus = bsAborted;
+                break;
+        }
+        if (result.stepStatus != bsSuccess) return;
     }
 
+    result.errorMsg = "";
+
     /* If the path was substituted or already valid, then we didn't
        get a build log. */
-    if (result.status == BuildResult::Substituted || result.status == BuildResult::AlreadyValid) {
+    if (result.isCached) {
         printMsg(lvlInfo, format("outputs of ‘%1%’ substituted or already valid on ‘%2%’") % step->drvPath % machine->sshName);
         unlink(result.logFile.c_str());
         result.logFile = "";
@@ -303,6 +356,11 @@ void State::buildRemote(ref<Store> destStore,
             totalNarSize += readLongLong(from);
         }
 
+        if (totalNarSize > maxOutputSize) {
+            result.stepStatus = bsNarSizeLimitExceeded;
+            return;
+        }
+
         printMsg(lvlDebug, format("copying outputs of ‘%s’ from ‘%s’ (%d bytes)")
             % step->drvPath % machine->sshName % totalNarSize);
 
diff --git a/src/hydra-queue-runner/builder.cc b/src/hydra-queue-runner/builder.cc
index a0d076b7..70302b87 100644
--- a/src/hydra-queue-runner/builder.cc
+++ b/src/hydra-queue-runner/builder.cc
@@ -105,10 +105,8 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
 
     /* If any of the outputs have previously failed, then don't bother
        building again. */
-    bool cachedFailure = checkCachedFailure(step, *conn);
-
-    if (cachedFailure)
-        result.status = BuildResult::CachedFailure;
+    if (checkCachedFailure(step, *conn))
+        result.stepStatus = bsCachedFailure;
     else {
 
         /* Create a build step record indicating that we started
@@ -124,12 +122,14 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
         try {
             /* FIXME: referring builds may have conflicting timeouts. */
             buildRemote(destStore, machine, step, build->maxSilentTime, build->buildTimeout, result);
+        } catch (NoTokens & e) {
+            result.stepStatus = bsNarSizeLimitExceeded;
         } catch (Error & e) {
-            result.status = BuildResult::MiscFailure;
+            result.stepStatus = bsAborted;
             result.errorMsg = e.msg();
         }
 
-        if (result.success())
+        if (result.stepStatus == bsSuccess)
             res = getBuildOutput(destStore, ref<FSAccessor>(result.accessor), step->drv);
     }
 
@@ -159,7 +159,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
 
     /* The step had a hopefully temporary failure (e.g. network
        issue). Retry a number of times. */
-    if (result.canRetry()) {
+    if (result.canRetry) {
         printMsg(lvlError, format("possibly transient failure building ‘%1%’ on ‘%2%’: %3%")
             % step->drvPath % machine->sshName % result.errorMsg);
         bool retry;
@@ -178,7 +178,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
         }
     }
 
-    if (result.success()) {
+    if (result.stepStatus == bsSuccess) {
 
         /* Register success in the database for all Build objects that
            have this step as the top-level step. Since the queue
@@ -225,7 +225,7 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
                     build->id, stepNr, machine->sshName, bsSuccess);
 
                 for (auto & b : direct)
-                    markSucceededBuild(txn, b, res, build != b || result.status != BuildResult::Built,
+                    markSucceededBuild(txn, b, res, build != b || result.isCached,
                         result.startTime, result.stopTime);
 
                 txn.commit();
@@ -309,38 +309,27 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
 
                 pqxx::work txn(*conn);
 
-                BuildStatus buildStatus =
-                    result.status == BuildResult::TimedOut ? bsTimedOut :
-                    result.status == BuildResult::LogLimitExceeded ? bsLogLimitExceeded :
-                    result.canRetry() ? bsAborted :
-                    bsFailed;
-
                 /* For standard failures, we don't care about the error
                    message. */
-                if (result.status == BuildResult::PermanentFailure ||
-                    result.status == BuildResult::TransientFailure ||
-                    result.status == BuildResult::CachedFailure ||
-                    result.status == BuildResult::TimedOut ||
-                    result.status == BuildResult::LogLimitExceeded)
+                if (result.stepStatus != bsAborted)
                     result.errorMsg = "";
 
-                /* Create failed build steps for every build that depends
-                   on this. For cached failures, only create a step for
-                   builds that don't have this step as top-level
-                   (otherwise the user won't be able to see what caused
-                   the build to fail). */
+                /* Create failed build steps for every build that
+                   depends on this, except when this step is cached
+                   and is the top-level of that build (since then it's
+                   redundant with the build's isCachedBuild field). */
                 for (auto & build2 : indirect) {
-                    if ((cachedFailure && build2->drvPath == step->drvPath) ||
-                        (!cachedFailure && build == build2) ||
+                    if ((result.stepStatus == bsCachedFailure && build2->drvPath == step->drvPath) ||
+                        (result.stepStatus != bsCachedFailure && build == build2) ||
                         build2->finishedInDB)
                         continue;
                     createBuildStep(txn, 0, build2, step, machine->sshName,
-                        buildStatus, result.errorMsg, build == build2 ? 0 : build->id);
+                        result.stepStatus, result.errorMsg, build == build2 ? 0 : build->id);
                 }
 
-                if (!cachedFailure)
+                if (result.stepStatus != bsCachedFailure)
                     finishBuildStep(txn, result.startTime, result.stopTime, result.overhead,
-                        build->id, stepNr, machine->sshName, buildStatus, result.errorMsg);
+                        build->id, stepNr, machine->sshName, result.stepStatus, result.errorMsg);
 
                 /* Mark all builds that depend on this derivation as failed. */
                 for (auto & build2 : indirect) {
@@ -349,16 +338,16 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
                     txn.parameterized
                         ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1 and finished = 0")
                         (build2->id)
-                        ((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus))
+                        ((int) (build2->drvPath != step->drvPath && result.buildStatus() == bsFailed ? bsDepFailed : result.buildStatus()))
                         (result.startTime)
                         (result.stopTime)
-                        (cachedFailure ? 1 : 0).exec();
+                        (result.stepStatus == bsCachedFailure ? 1 : 0).exec();
                     nrBuildsDone++;
                 }
 
                 /* Remember failed paths in the database so that they
                    won't be built again. */
-                if (!cachedFailure && result.status == BuildResult::PermanentFailure)
+                if (result.stepStatus != bsCachedFailure && result.canCache)
                     for (auto & path : step->drv.outputPaths())
                         txn.parameterized("insert into FailedPaths values ($1)")(path).exec();
 
diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc
index d391b237..c9377ab7 100644
--- a/src/hydra-queue-runner/hydra-queue-runner.cc
+++ b/src/hydra-queue-runner/hydra-queue-runner.cc
@@ -41,6 +41,11 @@ State::State()
         }
     }
 
+    {
+        std::string s = hydraConfig["max-output-size"];
+        if (s != "") string2Int(s, maxOutputSize);
+    }
+
     logDir = canonPath(hydraData + "/build-logs");
 }
 
diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh
index 49a51924..048c91f4 100644
--- a/src/hydra-queue-runner/state.hh
+++ b/src/hydra-queue-runner/state.hh
@@ -33,20 +33,27 @@ typedef enum {
     bsCachedFailure = 8, // steps only
     bsUnsupported = 9,
     bsLogLimitExceeded = 10,
+    bsNarSizeLimitExceeded = 11,
     bsBusy = 100, // not stored
 } BuildStatus;
 
 
-struct RemoteResult : nix::BuildResult
+struct RemoteResult
 {
+    BuildStatus stepStatus = bsAborted;
+    bool canRetry = false; // for bsAborted
+    bool isCached = false; // for bsSucceed
+    bool canCache = false; // for bsFailed
+    std::string errorMsg; // for bsAborted
+
     time_t startTime = 0, stopTime = 0;
     unsigned int overhead = 0;
     nix::Path logFile;
     std::shared_ptr<nix::FSAccessor> accessor;
 
-    bool canRetry()
+    BuildStatus buildStatus()
     {
-        return status == TransientFailure || status == MiscFailure;
+        return stepStatus == bsCachedFailure ? bsFailed : stepStatus;
     }
 };
 
@@ -350,6 +357,8 @@ private:
        tokens are available. */
     nix::TokenServer memoryTokens;
 
+    size_t maxOutputSize = 2ULL << 30;
+
 public:
     State();
 
diff --git a/src/root/build.tt b/src/root/build.tt
index 20ff4abe..8c233e3e 100644
--- a/src/root/build.tt
+++ b/src/root/build.tt
@@ -65,6 +65,8 @@ FOR step IN steps; IF step.busy; busy = 1; END; END;
                 <span class="error">Unsupported system type</span>
               [% ELSIF step.status == 10 %]
                 <span class="error">Log limit exceeded</span>
+              [% ELSIF step.status == 11 %]
+                <span class="error">Output limit exceeded</span>
               [% ELSIF step.errormsg %]
                 <span class="error">Failed: [% HTML.escape(step.errormsg) %]</span>
               [% ELSE %]
diff --git a/src/root/common.tt b/src/root/common.tt
index 0f629cf7..40708567 100644
--- a/src/root/common.tt
+++ b/src/root/common.tt
@@ -207,6 +207,8 @@ BLOCK renderBuildStatusIcon;
       <img src="[% c.uri_for("/static/images/warning_${size}.png") %]" alt="Timed out" class="build-status" />
     [% ELSIF buildstatus == 10 %]
       <img src="[% c.uri_for("/static/images/warning_${size}.png") %]" alt="Log limit exceeded" class="build-status" />
+    [% ELSIF buildstatus == 11 %]
+      <img src="[% c.uri_for("/static/images/warning_${size}.png") %]" alt="Output size limit exceeded" class="build-status" />
     [% ELSE %]
       <img src="[% c.uri_for("/static/images/error_${size}.png") %]" alt="Failed" class="build-status" />
     [% END;
@@ -236,6 +238,8 @@ BLOCK renderStatus;
       <span class="error">Unsupported system type</span>
     [% ELSIF buildstatus == 10 %]
       <span class="error">Log limit exceeded</span>
+    [% ELSIF buildstatus == 11 %]
+      <span class="error">Output limit exceeded</span>
     [% ELSE %]
       <span class="error">Aborted</span>
       (Hydra failure; see <a href="#nix-error">below</a>)
diff --git a/src/sql/hydra.sql b/src/sql/hydra.sql
index 2511a960..d53ba11f 100644
--- a/src/sql/hydra.sql
+++ b/src/sql/hydra.sql
@@ -192,6 +192,7 @@ create table Builds (
     --   8 = cached failure [steps only; builds use isCachedBuild]
     --   9 = unsupported system type
     --  10 = log limit exceeded
+    --  11 = NAR size limit exceeded
     buildStatus   integer,
 
     size          bigint,