Automatically retry aborted builds

Aborted builds are now put back on the runnable queue and retried
after a certain time interval (currently 60 seconds for the first
retry, then tripled on each subsequent retry).
Eelco Dolstra 2015-06-17 11:45:20 +02:00
parent e02654b3a0
commit b91a616520
4 changed files with 145 additions and 61 deletions
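For reference, a minimal sketch of the retry schedule this introduces. It reuses the constants and formula added in the diff below (retryInterval = 60, retryBackoff = 3.0, maxTries = 5) but is not part of the commit itself:

// Sketch only: the retry delay grows as retryInterval * retryBackoff^(tries - 1),
// i.e. 60 s, 180 s, 540 s, 1620 s, after which the step is not retried again.
#include <cmath>
#include <cstdio>

int main()
{
    const int maxTries = 5;         // total attempts before giving up
    const int retryInterval = 60;   // seconds before the first retry
    const float retryBackoff = 3.0; // delay multiplier per retry

    for (unsigned int tries = 1; tries < maxTries; ++tries) {
        int delta = retryInterval * powf(retryBackoff, tries - 1);
        printf("retry %u after %d s\n", tries, delta);
    }
    return 0;
}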

View file

@@ -84,7 +84,7 @@ static void copyClosureTo(std::shared_ptr<StoreAPI> store,
for (auto i = sorted.rbegin(); i != sorted.rend(); ++i)
if (present.find(*i) == present.end()) missing.push_back(*i);
printMsg(lvlError, format("sending %1% missing paths") % missing.size());
printMsg(lvlDebug, format("sending %1% missing paths") % missing.size());
writeInt(cmdImportPaths, to);
exportPaths(*store, missing, false, to);
@@ -128,23 +128,28 @@ void buildRemote(std::shared_ptr<StoreAPI> store,
FdSink to(child.to);
/* Handshake. */
writeInt(SERVE_MAGIC_1, to);
writeInt(SERVE_PROTOCOL_VERSION, to);
to.flush();
try {
writeInt(SERVE_MAGIC_1, to);
writeInt(SERVE_PROTOCOL_VERSION, to);
to.flush();
unsigned int magic = readInt(from);
if (magic != SERVE_MAGIC_2)
throw Error(format("protocol mismatch with nix-store --serve on %1%") % sshName);
unsigned int version = readInt(from);
if (GET_PROTOCOL_MAJOR(version) != 0x200)
throw Error(format("unsupported nix-store --serve protocol version on %1%") % sshName);
unsigned int magic = readInt(from);
if (magic != SERVE_MAGIC_2)
throw Error(format("protocol mismatch with nix-store --serve on %1%") % sshName);
unsigned int version = readInt(from);
if (GET_PROTOCOL_MAJOR(version) != 0x200)
throw Error(format("unsupported nix-store --serve protocol version on %1%") % sshName);
} catch (EndOfFile & e) {
child.pid.wait(true);
throw Error(format("cannot connect to %1%: %2%") % sshName % chomp(readFile(logFile)));
}
/* Copy the input closure. */
printMsg(lvlError, format("sending closure of %1% to %2%") % drvPath % sshName);
printMsg(lvlDebug, format("sending closure of %1% to %2%") % drvPath % sshName);
copyClosureTo(store, from, to, PathSet({drvPath}));
/* Do the build. */
printMsg(lvlError, format("building %1% on %2%") % drvPath % sshName);
printMsg(lvlDebug, format("building %1% on %2%") % drvPath % sshName);
writeInt(cmdBuildPaths, to);
writeStrings(PathSet({drvPath}), to);
writeInt(3600, to); // == maxSilentTime, FIXME
@@ -162,7 +167,7 @@ void buildRemote(std::shared_ptr<StoreAPI> store,
}
/* Copy the output paths. */
printMsg(lvlError, format("copying outputs of %1% from %2%") % drvPath % sshName);
printMsg(lvlDebug, format("copying outputs of %1% from %2%") % drvPath % sshName);
PathSet outputs;
for (auto & output : drv.outputs)
outputs.insert(output.second.path);

View file

@@ -4,6 +4,8 @@
#include <map>
#include <memory>
#include <thread>
#include <cmath>
#include <chrono>
#include <pqxx/pqxx>
@@ -20,6 +22,14 @@
using namespace nix;
const int maxTries = 5;
const int retryInterval = 60; // seconds
const float retryBackoff = 3.0;
typedef std::chrono::time_point<std::chrono::system_clock> system_time;
template <class C, class V>
bool has(const C & c, const V & v)
{
@@ -100,6 +110,12 @@ struct Step
/* Builds that have this step as the top-level derivation. */
std::vector<Build::wptr> builds;
/* Number of times we've tried this step. */
unsigned int tries = 0;
/* Point in time after which the step can be retried. */
system_time after;
};
Sync<State> state;
@@ -108,10 +124,7 @@ struct Step
Step() : destroyed(false) { }
~Step()
{
printMsg(lvlDebug, format("destroying step %1%") % drvPath);
}
~Step() { }
};
@@ -198,7 +211,10 @@ private:
Sync<Machines> machines;
/* Various stats. */
std::atomic<int> nrQueueWakeups;
std::atomic<unsigned int> nrRetries;
std::atomic<unsigned int> maxNrRetries;
std::atomic<unsigned int> nrQueueWakeups;
std::atomic<unsigned int> nrDispatcherWakeups;
public:
State();
@@ -246,7 +262,9 @@ public:
void builder(Step::ptr step, MachineReservation::ptr reservation);
void doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
/* Perform the given build step. Return true if the step is to be
retried. */
bool doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
Machine::ptr machine);
void markSucceededBuild(pqxx::work & txn, Build::ptr build,
@@ -262,7 +280,7 @@ public:
State::State()
{
nrQueueWakeups = 0;
nrRetries = maxNrRetries = nrQueueWakeups = nrDispatcherWakeups = 0;
hydraData = getEnv("HYDRA_DATA");
if (hydraData == "") throw Error("$HYDRA_DATA must be set");
@@ -799,6 +817,8 @@ void State::dispatcher()
while (true) {
printMsg(lvlDebug, "dispatcher woken up");
auto sleepUntil = system_time::max();
{
auto runnable_(runnable.lock());
printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size());
@@ -806,6 +826,8 @@ void State::dispatcher()
/* FIXME: we're holding the runnable lock too long
here. This could be more efficient. */
system_time now = std::chrono::system_clock::now();
for (auto i = runnable_->begin(); i != runnable_->end(); ) {
auto step = i->lock();
@@ -815,6 +837,18 @@
continue;
}
/* Skip previously failed steps that aren't ready to
be retried. */
{
auto step_(step->state.lock());
if (step_->tries > 0 && step_->after > now) {
if (step_->after < sleepUntil)
sleepUntil = step_->after;
++i;
continue;
}
}
auto reservation = findMachine(step);
if (!reservation) {
printMsg(lvlDebug, format("cannot execute step %1% right now") % step->drvPath);
@@ -833,7 +867,10 @@
is added, or because a build finishes). */
{
std::unique_lock<std::mutex> lock(dispatcherMutex);
dispatcherWakeup.wait(lock);
printMsg(lvlDebug, format("dispatcher sleeping for %1%s") %
std::chrono::duration_cast<std::chrono::seconds>(sleepUntil - std::chrono::system_clock::now()).count());
dispatcherWakeup.wait_until(lock, sleepUntil);
nrDispatcherWakeups++;
}
}
@@ -871,22 +908,40 @@ MachineReservation::ptr State::findMachine(Step::ptr step)
void State::builder(Step::ptr step, MachineReservation::ptr reservation)
{
bool retry = true;
try {
auto store = openStore(); // FIXME: pool
doBuildStep(store, step, reservation->machine);
retry = doBuildStep(store, step, reservation->machine);
} catch (std::exception & e) {
printMsg(lvlError, format("error building %1%: %2%") % step->drvPath % e.what());
// FIXME: put step back in runnable and retry
printMsg(lvlError, format("uncaught exception building %1% on %2%: %3%")
% step->drvPath % reservation->machine->sshName % e.what());
}
/* Release the machine and wake up the dispatcher. */
assert(reservation.unique());
reservation = 0;
wakeDispatcher();
/* If there was a temporary failure, retry the step after an
exponentially increasing interval. */
if (retry) {
{
auto step_(step->state.lock());
step_->tries++;
nrRetries++;
if (step_->tries > maxNrRetries) maxNrRetries = step_->tries; // yeah yeah, not atomic
int delta = retryInterval * powf(retryBackoff, step_->tries - 1);
printMsg(lvlInfo, format("will retry %1% after %2%s") % step->drvPath % delta);
step_->after = std::chrono::system_clock::now() + std::chrono::seconds(delta);
}
makeRunnable(step);
}
}
void State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
Machine::ptr machine)
{
/* There can be any number of builds in the database that depend
@@ -903,14 +958,16 @@ void State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
if (dependents.empty()) {
/* Apparently all builds that depend on this derivation
are gone (e.g. cancelled). So don't bother. (This is
are gone (e.g. cancelled). So don't bother. This is
very unlikely to happen, because normally Steps are
only kept alive by being reachable from a
Build). FIXME: what if a new Build gets a reference to
this step? */
Build. However, it's possible that a new Build just
created a reference to this step. So to handle that
possibility, we retry this step (putting it back in
the runnable queue). If there are really no strong
pointers to the step, it will be deleted. */
printMsg(lvlInfo, format("cancelling build step %1%") % step->drvPath);
destroyStep(step, false);
return;
return true;
}
for (auto build2 : dependents)
@@ -930,8 +987,8 @@ void State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
result.startTime = time(0);
/* If any of the outputs have previously failed, then don't
retry. */
/* If any of the outputs have previously failed, then don't bother
building again. */
bool cachedFailure = checkCachedFailure(step, *conn);
if (cachedFailure)
@@ -952,8 +1009,8 @@ void State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
} catch (Error & e) {
result.status = RemoteResult::rrMiscFailure;
result.errorMsg = e.msg();
printMsg(lvlError, format("ERROR: %1%") % e.msg());
abort(); // FIXME
printMsg(lvlError, format("irregular failure building %1% on %2%: %3%")
% step->drvPath % machine->sshName % e.msg());
}
if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv);
@@ -963,13 +1020,19 @@ void State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
if (!result.stopTime) result.stopTime = time(0);
bool retry = false;
if (result.status == RemoteResult::rrMiscFailure) {
auto step_(step->state.lock());
retry = step_->tries + 1 < maxTries;
}
/* Remove this step. After this, incoming builds that depend on
drvPath will either see that the output paths exist, or will
create a new build step for drvPath. The latter is fine - it
won't conflict with this one, because we're removing it. In any
case, the set of dependent builds for step can't increase
anymore because step is no longer visible to createStep(). */
{
if (!retry) {
auto steps_(steps.lock());
steps_->erase(step->drvPath);
}
@@ -1002,34 +1065,44 @@ void State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
} else {
/* Failure case. */
BuildStatus buildStatus =
result.status == RemoteResult::rrPermanentFailure ? bsFailed : bsAborted;
BuildStepStatus buildStepStatus =
result.status == RemoteResult::rrPermanentFailure ? bssFailed : bssAborted;
/* For regular failures, we don't care about the error
message. */
if (result.status != RemoteResult::rrMiscFailure) result.errorMsg = "";
if (buildStatus != bsAborted) result.errorMsg = "";
if (!cachedFailure) {
if (!cachedFailure && !retry) {
/* Create failed build steps for every build that depends
on this. */
for (auto build2 : dependents) {
if (build == build2) continue;
createBuildStep(txn, 0, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id);
createBuildStep(txn, 0, build2, step, machine->sshName,
buildStepStatus, result.errorMsg, build->id);
}
finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg);
}
if (!cachedFailure)
finishBuildStep(txn, result.startTime, result.stopTime, build->id,
stepNr, machine->sshName, buildStepStatus, result.errorMsg);
/* Mark all builds that depend on this derivation as failed. */
for (auto build2 : dependents) {
printMsg(lvlError, format("marking build %1% as failed") % build2->id);
txn.parameterized
("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")
(build2->id)
((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed))
(result.startTime)
(result.stopTime)
(cachedFailure ? 1 : 0).exec();
build2->finishedInDB = true; // FIXME: txn might fail
}
if (!retry)
for (auto build2 : dependents) {
printMsg(lvlError, format("marking build %1% as failed") % build2->id);
txn.parameterized
("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")
(build2->id)
((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus))
(result.startTime)
(result.stopTime)
(cachedFailure ? 1 : 0).exec();
build2->finishedInDB = true; // FIXME: txn might fail
}
/* Remember failed paths in the database so that they
won't be built again. */
@@ -1045,16 +1118,20 @@ void State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
is the top-level derivation. In case of failure, destroy all
dependent Build objects. Any Steps not referenced by other
Builds will be destroyed as well. */
for (auto build2 : dependents)
if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) {
auto builds_(builds.lock());
builds_->erase(build2->id);
}
if (!retry)
for (auto build2 : dependents)
if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) {
auto builds_(builds.lock());
builds_->erase(build2->id);
}
/* Remove the step from the graph. In case of success, make
dependent build steps runnable if they have no other
dependencies. */
destroyStep(step, result.status == RemoteResult::rrSuccess);
if (!retry)
destroyStep(step, result.status == RemoteResult::rrSuccess);
return retry;
}
@@ -1122,7 +1199,11 @@ void State::dumpStatus()
if (i->lock()) ++i; else i = runnable_->erase(i);
printMsg(lvlError, format("%1% runnable build steps") % runnable_->size());
}
printMsg(lvlError, format("%1% times woken up to check the queue") % nrQueueWakeups);
printMsg(lvlError, format("%1% build step retries") % nrRetries);
printMsg(lvlError, format("%1% most retries for any build step") % maxNrRetries);
printMsg(lvlError, format("%1% queue wakeups") % nrQueueWakeups);
printMsg(lvlError, format("%1% dispatcher wakeups") % nrDispatcherWakeups);
printMsg(lvlError, format("%1% database connections") % dbPool.count());
{
auto machines_(machines.lock());
for (auto & m : *machines_) {
@@ -1145,9 +1226,6 @@ void State::run()
std::thread(&State::dispatcher, this).detach();
queueMonitorThread.join();
//printMsg(lvlInfo, "exiting...");
//printMsg(lvlInfo, format("psql connections = %1%") % dbPool.count());
}

View file

@@ -51,7 +51,7 @@
[% ELSIF step.status == 0 %]
Succeeded
[% ELSIF step.status == 4 %]
<span class="error">Aborted</span>
<span class="error"><strong>Aborted</strong>[% IF step.errormsg %]: [% HTML.escape(step.errormsg); END %]</span>
[% ELSIF step.status == 7 %]
<span class="error">Timed out</span>
[% ELSIF step.status == 8 %]

View file

@@ -372,6 +372,7 @@ create table CachedCVSInputs (
);
-- FIXME: remove
create table SystemTypes (
system text primary key not null,
maxConcurrent integer not null default 2