Split hydra-queue-runner.cc more

2015-07-21 15:14:17 +02:00 · 2015-07-21 15:14:17 +02:00 · 7e026d35f7
parent 6ddcd37df1
commit 7e026d35f7
6 changed files with 921 additions and 904 deletions
--- a/src/hydra-queue-runner/Makefile.am
+++ b/src/hydra-queue-runner/Makefile.am
@ -1,6 +1,7 @@
 bin_PROGRAMS = hydra-queue-runner
-hydra_queue_runner_SOURCES = hydra-queue-runner.cc build-result.cc build-remote.cc \
+hydra_queue_runner_SOURCES = hydra-queue-runner.cc queue-monitor.cc dispatcher.cc \
 builder.cc build-result.cc build-remote.cc  \
 build-result.hh counter.hh pool.hh sync.hh token-server.hh state.hh db.hh
 hydra_queue_runner_LDADD = $(NIX_LIBS) -lpqxx
--- a/src/hydra-queue-runner/builder.cc
+++ b/src/hydra-queue-runner/builder.cc
@ -0,0 +1,378 @@
 #include <cmath>
 #include "state.hh"
 #include "build-result.hh"
 using namespace nix;
 /* Entry point of a builder thread: perform ‘step’ on ‘machine’ via
    doBuildStep(), then give up the slot held by ‘reservation’ and
    wake the dispatcher. If doBuildStep() asked for a retry (or threw
    an exception), the step is put back on the runnable queue with an
    exponentially growing delay. */
 void State::builder(Step::ptr step, Machine::ptr machine, std::shared_ptr<MaintainCount> reservation)
 {
    /* Stays true unless doBuildStep() returns normally with ‘false’,
       so an uncaught exception also results in a retry. */
    bool needRetry = true;

    MaintainCount activeStepGuard(nrActiveSteps);

    try {
        auto localStore = openStore(); // FIXME: pool
        needRetry = doBuildStep(localStore, step, machine);
    } catch (std::exception & ex) {
        printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")
            % step->drvPath % machine->sshName % ex.what());
    }

    /* Hand the machine slot back and let the dispatcher know. We are
       expected to be the sole owner of the reservation here. */
    assert(reservation.unique());
    reservation.reset();
    wakeDispatcher();

    if (!needRetry) return;

    /* Temporary failure: schedule another attempt after
       retryInterval * retryBackoff^(tries - 1) seconds. */
    {
        auto stepState(step->state.lock());
        stepState->tries++;
        nrRetries++;
        if (stepState->tries > maxNrRetries) maxNrRetries = stepState->tries; // yeah yeah, not atomic
        int delaySecs = retryInterval * powf(retryBackoff, stepState->tries - 1);
        printMsg(lvlInfo, format("will retry ‘%1%’ after %2%s") % step->drvPath % delaySecs);
        stepState->after = std::chrono::system_clock::now() + std::chrono::seconds(delaySecs);
    }

    makeRunnable(step);
 }
 /* Perform build step ‘step’ on ‘machine’ and record the outcome in
    the database.

    Returns true if the caller should retry the step later (no
    dependent builds were found, or the failure looks transient and
    the retry budget is not exhausted), and false once the step has
    been fully processed (success or definitive failure).

    May terminate the process via exit() when the build selected by
    ‘buildOne’ is involved (testing hack, see the end of the function). */
 bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
    Machine::ptr machine)
 {
    {
        auto step_(step->state.lock());
        assert(step_->created);
        assert(!step->finished);
    }
    /* There can be any number of builds in the database that depend
       on this derivation. Arbitrarily pick one (though preferring a
       build of which this is the top-level derivation) for the
       purpose of creating build steps. We could create a build step
       record for every build, but that could be very expensive
       (e.g. a stdenv derivation can be a dependency of tens of
       thousands of builds), so we don't. */
    Build::ptr build;
    {
        std::set<Build::ptr> dependents;
        std::set<Step::ptr> steps;
        getDependents(step, dependents, steps);
        if (dependents.empty()) {
            /* Apparently all builds that depend on this derivation
               are gone (e.g. cancelled). So don't bother. This is
               very unlikely to happen, because normally Steps are
               only kept alive by being reachable from a
               Build. However, it's possible that a new Build just
               created a reference to this step. So to handle that
               possibility, we retry this step (putting it back in
               the runnable queue). If there are really no strong
               pointers to the step, it will be deleted. */
            printMsg(lvlInfo, format("maybe cancelling build step ‘%1%’") % step->drvPath);
            return true;
        }
        for (auto build2 : dependents)
            if (build2->drvPath == step->drvPath) { build = build2; break; }
        if (!build) build = *dependents.begin();
        printMsg(lvlInfo, format("performing step ‘%1%’ on ‘%2%’ (needed by build %3% and %4% others)")
            % step->drvPath % machine->sshName % build->id % (dependents.size() - 1));
    }
    /* Set when the chosen build is the one named by ‘buildOne’; the
       process then exits once this step has been handled. */
    bool quit = build->id == buildOne;
    auto conn(dbPool.get());
    RemoteResult result;
    BuildOutput res;
    /* Remains 0 for cached failures, for which no build step record
       is created below. */
    int stepNr = 0;
    time_t stepStartTime = result.startTime = time(0);
    /* If any of the outputs have previously failed, then don't bother
       building again. */
    bool cachedFailure = checkCachedFailure(step, *conn);
    if (cachedFailure)
        result.status = BuildResult::CachedFailure;
    else {
        /* Create a build step record indicating that we started
           building. Also, mark the selected build as busy. */
        {
            pqxx::work txn(*conn);
            stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bssBusy);
            txn.parameterized("update Builds set busy = 1 where id = $1")(build->id).exec();
            txn.commit();
        }
        /* Do the build. */
        try {
            /* FIXME: referring builds may have conflicting timeouts. */
            buildRemote(store, machine, step, build->maxSilentTime, build->buildTimeout, result);
        } catch (Error & e) {
            /* An Error from buildRemote() is recorded as a
               miscellaneous failure rather than propagated. */
            result.status = BuildResult::MiscFailure;
            result.errorMsg = e.msg();
        }
        if (result.success()) res = getBuildOutput(store, step->drv);
    }
    time_t stepStopTime = time(0);
    /* Fall back to the local stop time if ‘result’ has none. */
    if (!result.stopTime) result.stopTime = stepStopTime;
    /* Asynchronously compress the log. */
    if (result.logFile != "") {
        {
            auto logCompressorQueue_(logCompressorQueue.lock());
            logCompressorQueue_->push(result.logFile);
        }
        logCompressorWakeup.notify_one();
    }
    /* The step had a hopefully temporary failure (e.g. network
       issue). Retry a number of times. */
    if (result.canRetry()) {
        printMsg(lvlError, format("possibly transient failure building ‘%1%’ on ‘%2%’: %3%")
            % step->drvPath % machine->sshName % result.errorMsg);
        bool retry;
        {
            /* Only ask for a retry while tries + 1 is below maxTries;
               otherwise fall through and record the failure below. */
            auto step_(step->state.lock());
            retry = step_->tries + 1 < maxTries;
        }
        if (retry) {
            /* Record this attempt as aborted before handing the step
               back to the caller for a retry. */
            pqxx::work txn(*conn);
            finishBuildStep(txn, result.startTime, result.stopTime, build->id,
                stepNr, machine->sshName, bssAborted, result.errorMsg);
            txn.commit();
            if (quit) exit(1);
            return true;
        }
    }
    if (result.success()) {
        /* Register success in the database for all Build objects that
           have this step as the top-level step. Since the queue
           monitor thread may be creating new referring Builds
           concurrently, and updating the database may fail, we do
           this in a loop, marking all known builds, repeating until
           there are no unmarked builds.
        */
        std::vector<BuildID> buildIDs;
        while (true) {
            /* Get the builds that have this one as the top-level. */
            std::vector<Build::ptr> direct;
            {
                auto steps_(steps.lock());
                auto step_(step->state.lock());
                for (auto & b_ : step_->builds) {
                    auto b = b_.lock();
                    if (b && !b->finishedInDB) direct.push_back(b);
                }
                /* If there are no builds left to update in the DB,
                   then we're done (except for calling
                   finishBuildStep()). Delete the step from
                   ‘steps’. Since we've been holding the ‘steps’ lock,
                   no new referrers can have been added in the
                   meantime or be added afterwards. */
                if (direct.empty()) {
                    printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath);
                    steps_->erase(step->drvPath);
                }
            }
            /* Update the database. */
            {
                pqxx::work txn(*conn);
                finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssSuccess);
                for (auto & b : direct)
                    markSucceededBuild(txn, b, res, build != b || result.status != BuildResult::Built,
                        result.startTime, result.stopTime);
                txn.commit();
            }
            if (direct.empty()) break;
            /* Remove the direct dependencies from ‘builds’. This will
               cause them to be destroyed. */
            for (auto & b : direct) {
                auto builds_(builds.lock());
                b->finishedInDB = true;
                builds_->erase(b->id);
                buildIDs.push_back(b->id);
            }
        }
        /* Send notification about the builds that have this step as
           the top-level. */
        for (auto id : buildIDs) {
            {
                auto notificationSenderQueue_(notificationSenderQueue.lock());
                notificationSenderQueue_->push(NotificationItem(id, std::vector<BuildID>()));
            }
            notificationSenderWakeup.notify_one();
        }
        /* Wake up any dependent steps that have no other
           dependencies. */
        {
            auto step_(step->state.lock());
            for (auto & rdepWeak : step_->rdeps) {
                auto rdep = rdepWeak.lock();
                if (!rdep) continue;
                bool runnable = false;
                {
                    auto rdep_(rdep->state.lock());
                    rdep_->deps.erase(step);
                    /* Note: if the step has not finished
                       initialisation yet, it will be made runnable in
                       createStep(), if appropriate. */
                    if (rdep_->deps.empty() && rdep_->created) runnable = true;
                }
                /* makeRunnable() is called after the rdep's state
                   lock has been released. */
                if (runnable) makeRunnable(rdep);
            }
        }
    } else {
        /* Register failure in the database for all Build objects that
           directly or indirectly depend on this step. */
        std::vector<BuildID> dependentIDs;
        while (true) {
            /* Get the builds and steps that depend on this step. */
            std::set<Build::ptr> indirect;
            {
                auto steps_(steps.lock());
                std::set<Step::ptr> steps;
                getDependents(step, indirect, steps);
                /* If there are no builds left, delete all referring
                   steps from ‘steps’. As for the success case, we can
                   be certain no new referrers can be added. */
                if (indirect.empty()) {
                    for (auto & s : steps) {
                        printMsg(lvlDebug, format("finishing build step ‘%1%’") % s->drvPath);
                        steps_->erase(s->drvPath);
                    }
                    break;
                }
            }
            /* Update the database. */
            {
                pqxx::work txn(*conn);
                /* Map the remote result onto the build and build step
                   status values: timeouts first, then retryable
                   failures (aborted), everything else is a failure. */
                BuildStatus buildStatus =
                    result.status == BuildResult::TimedOut ? bsTimedOut :
                    result.canRetry() ? bsAborted :
                    bsFailed;
                BuildStepStatus buildStepStatus =
                    result.status == BuildResult::TimedOut ? bssTimedOut :
                    result.canRetry() ? bssAborted :
                    bssFailed;
                /* For standard failures, we don't care about the error
                   message. */
                if (result.status == BuildResult::PermanentFailure ||
                    result.status == BuildResult::TransientFailure ||
                    result.status == BuildResult::CachedFailure ||
                    result.status == BuildResult::TimedOut)
                    result.errorMsg = "";
                /* Create failed build steps for every build that depends
                   on this. For cached failures, only create a step for
                   builds that don't have this step as top-level
                   (otherwise the user won't be able to see what caused
                   the build to fail). */
                for (auto & build2 : indirect) {
                    if ((cachedFailure && build2->drvPath == step->drvPath) ||
                        (!cachedFailure && build == build2) ||
                        build2->finishedInDB)
                        continue;
                    createBuildStep(txn, 0, build2, step, machine->sshName,
                        buildStepStatus, result.errorMsg, build == build2 ? 0 : build->id);
                }
                /* The busy step record only exists when we actually
                   attempted the build (see above). */
                if (!cachedFailure)
                    finishBuildStep(txn, result.startTime, result.stopTime, build->id,
                        stepNr, machine->sshName, buildStepStatus, result.errorMsg);
                /* Mark all builds that depend on this derivation as failed. */
                for (auto & build2 : indirect) {
                    if (build2->finishedInDB) continue;
                    printMsg(lvlError, format("marking build %1% as failed") % build2->id);
                    /* Builds whose top-level derivation is not this
                       step get bsDepFailed instead of bsFailed. */
                    txn.parameterized
                        ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1 and finished = 0")
                        (build2->id)
                        ((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus))
                        (result.startTime)
                        (result.stopTime)
                        (cachedFailure ? 1 : 0).exec();
                    nrBuildsDone++;
                }
                /* Remember failed paths in the database so that they
                   won't be built again. */
                if (!cachedFailure && result.status == BuildResult::PermanentFailure)
                    for (auto & path : outputPaths(step->drv))
                        txn.parameterized("insert into FailedPaths values ($1)")(path).exec();
                txn.commit();
            }
            /* Remove the indirect dependencies from ‘builds’. This
               will cause them to be destroyed. */
            for (auto & b : indirect) {
                auto builds_(builds.lock());
                b->finishedInDB = true;
                builds_->erase(b->id);
                dependentIDs.push_back(b->id);
                if (buildOne == b->id) quit = true;
            }
        }
        /* Send notification about this build and its dependents. */
        {
            auto notificationSenderQueue_(notificationSenderQueue.lock());
            notificationSenderQueue_->push(NotificationItem(build->id, dependentIDs));
        }
        notificationSenderWakeup.notify_one();
    }
    // FIXME: keep stats about aborted steps?
    /* Update the global and per-machine statistics. The "step time"
       is measured locally in this function; the "build time" comes
       from the start/stop times stored in ‘result’. */
    nrStepsDone++;
    totalStepTime += stepStopTime - stepStartTime;
    totalStepBuildTime += result.stopTime - result.startTime;
    machine->state->nrStepsDone++;
    machine->state->totalStepTime += stepStopTime - stepStartTime;
    machine->state->totalStepBuildTime += result.stopTime - result.startTime;
    if (quit) exit(0); // testing hack
    return false;
 }
--- a/src/hydra-queue-runner/dispatcher.cc
+++ b/src/hydra-queue-runner/dispatcher.cc
@ -0,0 +1,155 @@
 #include <algorithm>
 #include <thread>
 #include "state.hh"
 using namespace nix;
 /* Append ‘step’ to the ‘runnable’ queue and wake the dispatcher.
    The step is asserted to be created, not finished and without
    remaining dependencies. */
 void State::makeRunnable(Step::ptr step)
 {
    printMsg(lvlChatty, format("step ‘%1%’ is now runnable") % step->drvPath);

    /* Sanity checks, done while holding the step's state lock. */
    {
        auto stepState(step->state.lock());
        assert(stepState->created);
        assert(!step->finished);
        assert(stepState->deps.empty());
    }

    /* The queue lock is released again before the dispatcher is
       woken up. */
    {
        auto queue(runnable.lock());
        queue->push_back(step);
    }

    wakeDispatcher();
 }
 /* Main loop of the dispatcher. Each round it repeatedly pairs a
    machine that has a free slot with a runnable step it supports,
    starting a detached builder thread for every pair, until no more
    pairs can be found. It then sleeps on ‘dispatcherWakeup’ until it
    is notified or until the earliest retry time of a postponed step
    is reached. */
 void State::dispatcher()
 {
    while (true) {
        printMsg(lvlDebug, "dispatcher woken up");
        /* Earliest point in time at which a postponed step becomes
           eligible again; stays at max() if there is none. */
        auto sleepUntil = system_time::max();
        bool keepGoing;
        do {
            /* Copy the currentJobs field of each machine. This is
               necessary to ensure that the sort comparator below is
               an ordering. std::sort() can segfault if it isn't. */
            struct MachineInfo
            {
                Machine::ptr machine;
                unsigned int currentJobs;
            };
            std::vector<MachineInfo> machinesSorted;
            {
                auto machines_(machines.lock());
                for (auto & m : *machines_)
                    machinesSorted.push_back({m.second, m.second->state->currentJobs});
            }
            /* Sort the machines by a combination of speed factor and
               available slots. Prioritise the available machines as
               follows:
               - First by load divided by speed factor, rounded to the
                 nearest integer.  This causes fast machines to be
                 preferred over slow machines with similar loads.
               - Then by speed factor.
               - Finally by load. */
            sort(machinesSorted.begin(), machinesSorted.end(),
                [](const MachineInfo & a, const MachineInfo & b) -> bool
                {
                    float ta = roundf(a.currentJobs / a.machine->speedFactor);
                    float tb = roundf(b.currentJobs / b.machine->speedFactor);
                    return
                        ta != tb ? ta < tb :
                        a.machine->speedFactor != b.machine->speedFactor ? a.machine->speedFactor > b.machine->speedFactor :
                        a.currentJobs > b.currentJobs;
                });
            /* Find a machine with a free slot and find a step to run
               on it. Once we find such a pair, we restart the outer
               loop because the machine sorting will have changed. */
            keepGoing = false;
            system_time now = std::chrono::system_clock::now();
            for (auto & mi : machinesSorted) {
                // FIXME: can we lose a wakeup if a builder exits concurrently?
                /* Note: this checks the live counter, not the copy
                   taken for sorting. */
                if (mi.machine->state->currentJobs >= mi.machine->maxJobs) continue;
                auto runnable_(runnable.lock());
                //printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size());
                /* FIXME: we're holding the runnable lock too long
                   here. This could be more efficient. */
                for (auto i = runnable_->begin(); i != runnable_->end(); ) {
                    auto step = i->lock();
                    /* Delete dead steps. */
                    if (!step) {
                        i = runnable_->erase(i);
                        continue;
                    }
                    /* Can this machine do this step? */
                    if (!mi.machine->supportsStep(step)) {
                        ++i;
                        continue;
                    }
                    /* Skip previously failed steps that aren't ready
                       to be retried. */
                    {
                        auto step_(step->state.lock());
                        if (step_->tries > 0 && step_->after > now) {
                            if (step_->after < sleepUntil)
                                sleepUntil = step_->after;
                            ++i;
                            continue;
                        }
                    }
                    /* Make a slot reservation and start a thread to
                       do the build. */
                    auto reservation = std::make_shared<MaintainCount>(mi.machine->state->currentJobs);
                    i = runnable_->erase(i);
                    auto builderThread = std::thread(&State::builder, this, step, mi.machine, reservation);
                    builderThread.detach(); // FIXME?
                    keepGoing = true;
                    break;
                }
                if (keepGoing) break;
            }
        } while (keepGoing);
        /* Sleep until we're woken up (either because a runnable build
           is added, or because a build finishes). */
        {
            std::unique_lock<std::mutex> lock(dispatcherMutex);
            printMsg(lvlDebug, format("dispatcher sleeping for %1%s") %
                std::chrono::duration_cast<std::chrono::seconds>(sleepUntil - std::chrono::system_clock::now()).count());
            dispatcherWakeup.wait_until(lock, sleepUntil);
            nrDispatcherWakeups++;
        }
    }
    /* NOTE(review): the loop above has no exit, so this statement
       looks unreachable. */
    printMsg(lvlError, "dispatcher exits");
 }
 /* Notify the dispatcher thread waiting on ‘dispatcherWakeup’. */
 void State::wakeDispatcher()
 {
    /* Take and immediately drop the dispatcher mutex as a barrier
       before notifying. Presumably this keeps a dispatcher that is
       about to wait from missing the notification — TODO confirm. */
    {
        std::lock_guard<std::mutex> barrier(dispatcherMutex);
    }

    dispatcherWakeup.notify_one();
 }
--- a/src/hydra-queue-runner/hydra-queue-runner.cc
+++ b/src/hydra-queue-runner/hydra-queue-runner.cc
@ -1,14 +1,12 @@
 #include <iostream>
 #include <thread>
 #include <cmath>
 #include <algorithm>
 #include <sys/types.h>
 #include <sys/stat.h>
 #include <fcntl.h>
 #include "build-result.hh"
 #include "state.hh"
 #include "build-result.hh"
 #include "shared.hh"
 #include "globals.hh"
@ -17,20 +15,6 @@
 using namespace nix;
 // FIXME: Make configurable.
 /* Bound on the number of attempts per step: doBuildStep() only asks
    for another retry while tries + 1 < maxTries. */
 const unsigned int maxTries = 5;
 /* Base delay before a step is retried; builder() multiplies it by
    retryBackoff^(tries - 1). */
 const unsigned int retryInterval = 60; // seconds
 /* Base of the exponential growth of the retry delay. */
 const float retryBackoff = 3.0;
 /* Not referenced in the code visible here; presumably limits the
    number of concurrent closure copies — TODO confirm. */
 const unsigned int maxParallelCopyClosure = 4;
 /* Tell whether container 'c' holds an element (or key) matching 'v',
    as determined by the container's own find() member. */
 template <class C, class V>
 bool has(const C & c, const V & v)
 {
    auto pos = c.find(v);
    if (pos == c.end())
        return false;
    return true;
 }
 State::State()
 {
    hydraData = getEnv("HYDRA_DATA");
@ -186,371 +170,6 @@ void State::finishBuildStep(pqxx::work & txn, time_t startTime, time_t stopTime,
 }
 /* Run queueMonitorLoop() forever. Whenever it fails with a
    std::exception, log the error, wait ten seconds and start it
    again. */
 void State::queueMonitor()
 {
    for (;;) {
        try {
            queueMonitorLoop();
        }
        catch (std::exception & ex) {
            printMsg(lvlError, format("queue monitor: %1%") % ex.what());
            /* Probably a DB problem, so don't retry right away. */
            sleep(10);
        }
    }
 }
 void State::queueMonitorLoop()
 {
    auto conn(dbPool.get());
    receiver buildsAdded(*conn, "builds_added");
    receiver buildsRestarted(*conn, "builds_restarted");
    receiver buildsCancelled(*conn, "builds_cancelled");
    receiver buildsDeleted(*conn, "builds_deleted");
    auto store = openStore(); // FIXME: pool
    unsigned int lastBuildId = 0;
    while (true) {
        getQueuedBuilds(*conn, store, lastBuildId);
        /* Sleep until we get notification from the database about an
           event. */
        conn->await_notification();
        nrQueueWakeups++;
        if (buildsAdded.get())
            printMsg(lvlTalkative, "got notification: new builds added to the queue");
        if (buildsRestarted.get()) {
            printMsg(lvlTalkative, "got notification: builds restarted");
            lastBuildId = 0; // check all builds
        }
        if (buildsCancelled.get() || buildsDeleted.get()) {
            printMsg(lvlTalkative, "got notification: builds cancelled");
            removeCancelledBuilds(*conn);
        }
    }
 }
 /* Load unfinished builds with an ID greater than ‘lastBuildId’ from
    the database, create the corresponding Build and Step objects, and
    make the steps without dependencies runnable.

    ‘lastBuildId’ is an in/out parameter: it is raised to the highest
    build ID that was read. Builds that cannot be built (garbage
    collected derivation, cached failure, unsupported system type) or
    whose outputs are already valid are marked as finished in the
    database right away. */
 void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store, unsigned int & lastBuildId)
 {
    printMsg(lvlInfo, format("checking the queue for builds > %1%...") % lastBuildId);
    /* Grab the queued builds from the database, but don't process
       them yet (since we don't want a long-running transaction). */
    /* Keyed by derivation path so that createBuild() below can look
       up builds by the drvPath of a step. */
    std::multimap<Path, Build::ptr> newBuilds;
    {
        pqxx::work txn(conn);
        auto res = txn.parameterized("select id, project, jobset, job, drvPath, maxsilent, timeout from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec();
        for (auto const & row : res) {
            auto builds_(builds.lock());
            BuildID id = row["id"].as<BuildID>();
            /* When ‘buildOne’ is set, ignore every other build (these
               do not advance lastBuildId either). */
            if (buildOne && id != buildOne) continue;
            if (id > lastBuildId) lastBuildId = id;
            /* Skip builds we already know about. */
            if (has(*builds_, id)) continue;
            auto build = std::make_shared<Build>();
            build->id = id;
            build->drvPath = row["drvPath"].as<string>();
            build->fullJobName = row["project"].as<string>() + ":" + row["jobset"].as<string>() + ":" + row["job"].as<string>();
            build->maxSilentTime = row["maxsilent"].as<int>();
            build->buildTimeout = row["timeout"].as<int>();
            newBuilds.emplace(std::make_pair(build->drvPath, build));
        }
    }
    std::set<Step::ptr> newRunnable;
    /* Number of builds loaded by the current top-level createBuild()
       call, including recursive ones; reset in the loop at the end. */
    unsigned int nrAdded;
    /* Declared as a std::function so that the lambda can call itself
       recursively. */
    std::function<void(Build::ptr)> createBuild;
    createBuild = [&](Build::ptr build) {
        printMsg(lvlTalkative, format("loading build %1% (%2%)") % build->id % build->fullJobName);
        nrAdded++;
        if (!store->isValidPath(build->drvPath)) {
            /* Derivation has been GC'ed prematurely. */
            printMsg(lvlError, format("aborting GC'ed build %1%") % build->id);
            if (!build->finishedInDB) {
                pqxx::work txn(conn);
                txn.parameterized
                    ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1 and finished = 0")
                    (build->id)
                    ((int) bsAborted)
                    (time(0))
                    ("derivation was garbage-collected prior to build").exec();
                txn.commit();
                build->finishedInDB = true;
                nrBuildsDone++;
            }
            return;
        }
        std::set<Step::ptr> newSteps;
        std::set<Path> finishedDrvs; // FIXME: re-use?
        Step::ptr step = createStep(store, build->drvPath, build, 0, finishedDrvs, newSteps, newRunnable);
        /* Some of the new steps may be the top level of builds that
           we haven't processed yet. So do them now. This ensures that
           if build A depends on build B with top-level step X, then X
           will be "accounted" to B in doBuildStep(). */
        for (auto & r : newSteps) {
            /* There may be several new builds for the same
               derivation, hence the inner loop. Each one is removed
               from ‘newBuilds’ before it is processed. */
            while (true) {
                auto i = newBuilds.find(r->drvPath);
                if (i == newBuilds.end()) break;
                Build::ptr b = i->second;
                newBuilds.erase(i);
                createBuild(b);
            }
        }
        /* If we didn't get a step, it means the step's outputs are
           all valid. So we mark this as a finished, cached build. */
        if (!step) {
            Derivation drv = readDerivation(build->drvPath);
            BuildOutput res = getBuildOutput(store, drv);
            pqxx::work txn(conn);
            time_t now = time(0);
            markSucceededBuild(txn, build, res, true, now, now);
            txn.commit();
            build->finishedInDB = true;
            return;
        }
        /* If any step has an unsupported system type or has a
           previously failed output path, then fail the build right
           away. */
        bool badStep = false;
        for (auto & r : newSteps) {
            /* bsSuccess is used here to mean "no problem found with
               this step so far". */
            BuildStatus buildStatus = bsSuccess;
            BuildStepStatus buildStepStatus = bssFailed;
            if (checkCachedFailure(r, conn)) {
                printMsg(lvlError, format("marking build %1% as cached failure") % build->id);
                buildStatus = step == r ? bsFailed : bsDepFailed;
                buildStepStatus = bssFailed;
            }
            if (buildStatus == bsSuccess) {
                bool supported = false;
                {
                    auto machines_(machines.lock()); // FIXME: use shared_mutex
                    for (auto & m : *machines_)
                        if (m.second->supportsStep(r)) { supported = true; break; }
                }
                if (!supported) {
                    printMsg(lvlError, format("aborting unsupported build %1%") % build->id);
                    buildStatus = bsUnsupported;
                    buildStepStatus = bssUnsupported;
                }
            }
            if (buildStatus != bsSuccess) {
                time_t now = time(0);
                if (!build->finishedInDB) {
                    pqxx::work txn(conn);
                    createBuildStep(txn, 0, build, r, "", buildStepStatus);
                    /* isCachedBuild is set for cached failures but
                       not for unsupported builds. */
                    txn.parameterized
                        ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = $4 where id = $1 and finished = 0")
                        (build->id)
                        ((int) buildStatus)
                        (now)
                        (buildStatus != bsUnsupported ? 1 : 0).exec();
                    txn.commit();
                    build->finishedInDB = true;
                    nrBuildsDone++;
                }
                badStep = true;
                break;
            }
        }
        if (badStep) return;
        /* Note: if we exit this scope prior to this, the build and
           all newly created steps are destroyed. */
        {
            auto builds_(builds.lock());
            if (!build->finishedInDB) // FIXME: can this happen?
                (*builds_)[build->id] = build;
            build->toplevel = step;
        }
        printMsg(lvlChatty, format("added build %1% (top-level step %2%, %3% new steps)")
            % build->id % step->drvPath % newSteps.size());
    };
    /* Now instantiate build steps for each new build. The builder
       threads can start building the runnable build steps right away,
       even while we're still processing other new builds. */
    while (!newBuilds.empty()) {
        auto build = newBuilds.begin()->second;
        newBuilds.erase(newBuilds.begin());
        newRunnable.clear();
        nrAdded = 0;
        try {
            createBuild(build);
        } catch (Error & e) {
            /* Add context to the error and let it propagate. */
            e.addPrefix(format("while loading build %1%: ") % build->id);
            throw;
        }
        /* Add the new runnable build steps to ‘runnable’ and wake up
           the builder threads. */
        printMsg(lvlChatty, format("got %1% new runnable steps from %2% new builds") % newRunnable.size() % nrAdded);
        for (auto & r : newRunnable)
            makeRunnable(r);
        nrBuildsRead += nrAdded;
    }
 }
 /* Drop from ‘builds’ every build that the database no longer lists
    as unfinished (e.g. because it was cancelled or deleted). */
 void State::removeCancelledBuilds(Connection & conn)
 {
    /* IDs of all builds that are still queued according to the
       database. */
    std::set<BuildID> unfinishedIds;
    {
        pqxx::work txn(conn);
        auto rows = txn.exec("select id from Builds where finished = 0");
        for (auto const & row : rows) {
            unfinishedIds.insert(row["id"].as<BuildID>());
        }
    }

    auto builds_(builds.lock());

    auto it = builds_->begin();
    while (it != builds_->end()) {
        /* Still queued: keep it and move on. */
        if (unfinishedIds.count(it->first) != 0) {
            ++it;
            continue;
        }

        printMsg(lvlInfo, format("discarding cancelled build %1%") % it->first);
        it = builds_->erase(it);
        // FIXME: ideally we would interrupt active build steps here.
    }
 }
 /* Look up or create the Step for derivation ‘drvPath’ and,
    recursively, the steps for its input derivations.

    ‘referringBuild’ and ‘referringStep’ (either may be null) are
    recorded in the step's ‘builds’ and ‘rdeps’ lists respectively.
    ‘finishedDrvs’ collects derivations whose outputs were all found
    valid, ‘newSteps’ receives the newly created steps that need
    building, and ‘newRunnable’ receives those of them that have no
    dependencies left.

    Returns a null pointer if the derivation is in ‘finishedDrvs’ or
    all of its outputs are valid; otherwise returns the (existing or
    newly created) step. */
 Step::ptr State::createStep(std::shared_ptr<StoreAPI> store, const Path & drvPath,
    Build::ptr referringBuild, Step::ptr referringStep, std::set<Path> & finishedDrvs,
    std::set<Step::ptr> & newSteps, std::set<Step::ptr> & newRunnable)
 {
    if (finishedDrvs.find(drvPath) != finishedDrvs.end()) return 0;
    /* Check if the requested step already exists. If not, create a
       new step. In any case, make the step reachable from
       referringBuild or referringStep. This is done atomically (with
       ‘steps’ locked), to ensure that this step can never become
       reachable from a new build after doBuildStep has removed it
       from ‘steps’. */
    Step::ptr step;
    bool isNew = false;
    {
        auto steps_(steps.lock());
        /* See if the step already exists in ‘steps’ and is not
           stale. */
        auto prev = steps_->find(drvPath);
        if (prev != steps_->end()) {
            step = prev->second.lock();
            /* Since ‘step’ is a strong pointer, the referred Step
               object won't be deleted after this. */
            if (!step) steps_->erase(drvPath); // remove stale entry
        }
        /* If it doesn't exist, create it. */
        if (!step) {
            step = std::make_shared<Step>();
            step->drvPath = drvPath;
            isNew = true;
        }
        auto step_(step->state.lock());
        /* A freshly made step must not be marked created yet, and a
           step found in ‘steps’ must already be. */
        assert(step_->created != isNew);
        if (referringBuild)
            step_->builds.push_back(referringBuild);
        if (referringStep)
            step_->rdeps.push_back(referringStep);
        (*steps_)[drvPath] = step;
    }
    /* An existing step is returned as is; it is not added to
       ‘newSteps’ or ‘newRunnable’. */
    if (!isNew) return step;
    printMsg(lvlDebug, format("considering derivation ‘%1%’") % drvPath);
    /* Initialize the step. Note that the step may be visible in
       ‘steps’ before this point, but that doesn't matter because
       it's not runnable yet, and other threads won't make it
       runnable while step->created == false. */
    step->drv = readDerivation(drvPath);
    {
        auto i = step->drv.env.find("requiredSystemFeatures");
        if (i != step->drv.env.end())
            step->requiredSystemFeatures = tokenizeString<std::set<std::string>>(i->second);
    }
    /* preferLocalBuild is only honoured when the derivation's
       platform is one of ‘localPlatforms’. */
    auto attr = step->drv.env.find("preferLocalBuild");
    step->preferLocalBuild =
        attr != step->drv.env.end() && attr->second == "1"
        && has(localPlatforms, step->drv.platform);
    /* Are all outputs valid? */
    bool valid = true;
    for (auto & i : step->drv.outputs) {
        if (!store->isValidPath(i.second.path)) {
            valid = false;
            break;
        }
    }
    // FIXME: check whether all outputs are in the binary cache.
    if (valid) {
        finishedDrvs.insert(drvPath);
        return 0;
    }
    /* No, we need to build. */
    printMsg(lvlDebug, format("creating build step ‘%1%’") % drvPath);
    newSteps.insert(step);
    /* Create steps for the dependencies. */
    for (auto & i : step->drv.inputDrvs) {
        /* A null result means the input needs no building, so it is
           not recorded as a dependency. */
        auto dep = createStep(store, i.first, 0, step, finishedDrvs, newSteps, newRunnable);
        if (dep) {
            auto step_(step->state.lock());
            step_->deps.insert(dep);
        }
    }
    /* If the step has no (remaining) dependencies, make it
       runnable. */
    {
        auto step_(step->state.lock());
        assert(!step_->created);
        step_->created = true;
        if (step_->deps.empty())
            newRunnable.insert(step);
    }
    return step;
 }
 /* Get the steps and unfinished builds that depend on the given step. */
 void getDependents(Step::ptr step, std::set<Build::ptr> & builds, std::set<Step::ptr> & steps)
 {
@ -585,527 +204,6 @@ void getDependents(Step::ptr step, std::set<Build::ptr> & builds, std::set<Step:
 }
 /* Put ‘step’ on the global ‘runnable’ queue and wake up the
    dispatcher. The step must already be fully created, must not be
    finished, and must have no dependencies left. */
 void State::makeRunnable(Step::ptr step)
 {
    printMsg(lvlChatty, format("step ‘%1%’ is now runnable") % step->drvPath);
    /* Check the step's invariants while holding its state lock. */
    {
        auto stepState = step->state.lock();
        assert(stepState->created);
        assert(!step->finished);
        assert(stepState->deps.empty());
    }
    /* Enqueue the step; the queue lock is released again before the
       dispatcher is woken up. */
    {
        auto queue = runnable.lock();
        queue->push_back(step);
    }
    wakeDispatcher();
 }
 void State::dispatcher()
 {
    while (true) {
        printMsg(lvlDebug, "dispatcher woken up");
        auto sleepUntil = system_time::max();
        bool keepGoing;
        do {
            /* Copy the currentJobs field of each machine. This is
               necessary to ensure that the sort comparator below is
               an ordering. std::sort() can segfault if it isn't. */
            struct MachineInfo
            {
                Machine::ptr machine;
                unsigned int currentJobs;
            };
            std::vector<MachineInfo> machinesSorted;
            {
                auto machines_(machines.lock());
                for (auto & m : *machines_)
                    machinesSorted.push_back({m.second, m.second->state->currentJobs});
            }
            /* Sort the machines by a combination of speed factor and
               available slots. Prioritise the available machines as
               follows:
               - First by load divided by speed factor, rounded to the
                 nearest integer.  This causes fast machines to be
                 preferred over slow machines with similar loads.
               - Then by speed factor.
               - Finally by load. */
            sort(machinesSorted.begin(), machinesSorted.end(),
                [](const MachineInfo & a, const MachineInfo & b) -> bool
                {
                    float ta = roundf(a.currentJobs / a.machine->speedFactor);
                    float tb = roundf(b.currentJobs / b.machine->speedFactor);
                    return
                        ta != tb ? ta < tb :
                        a.machine->speedFactor != b.machine->speedFactor ? a.machine->speedFactor > b.machine->speedFactor :
                        a.currentJobs > b.currentJobs;
                });
            /* Find a machine with a free slot and find a step to run
               on it. Once we find such a pair, we restart the outer
               loop because the machine sorting will have changed. */
            keepGoing = false;
            system_time now = std::chrono::system_clock::now();
            for (auto & mi : machinesSorted) {
                // FIXME: can we lose a wakeup if a builder exits concurrently?
                if (mi.machine->state->currentJobs >= mi.machine->maxJobs) continue;
                auto runnable_(runnable.lock());
                //printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size());
                /* FIXME: we're holding the runnable lock too long
                   here. This could be more efficient. */
                for (auto i = runnable_->begin(); i != runnable_->end(); ) {
                    auto step = i->lock();
                    /* Delete dead steps. */
                    if (!step) {
                        i = runnable_->erase(i);
                        continue;
                    }
                    /* Can this machine do this step? */
                    if (!mi.machine->supportsStep(step)) {
                        ++i;
                        continue;
                    }
                    /* Skip previously failed steps that aren't ready
                       to be retried. */
                    {
                        auto step_(step->state.lock());
                        if (step_->tries > 0 && step_->after > now) {
                            if (step_->after < sleepUntil)
                                sleepUntil = step_->after;
                            ++i;
                            continue;
                        }
                    }
                    /* Make a slot reservation and start a thread to
                       do the build. */
                    auto reservation = std::make_shared<MaintainCount>(mi.machine->state->currentJobs);
                    i = runnable_->erase(i);
                    auto builderThread = std::thread(&State::builder, this, step, mi.machine, reservation);
                    builderThread.detach(); // FIXME?
                    keepGoing = true;
                    break;
                }
                if (keepGoing) break;
            }
        } while (keepGoing);
        /* Sleep until we're woken up (either because a runnable build
           is added, or because a build finishes). */
        {
            std::unique_lock<std::mutex> lock(dispatcherMutex);
            printMsg(lvlDebug, format("dispatcher sleeping for %1%s") %
                std::chrono::duration_cast<std::chrono::seconds>(sleepUntil - std::chrono::system_clock::now()).count());
            dispatcherWakeup.wait_until(lock, sleepUntil);
            nrDispatcherWakeups++;
        }
    }
    printMsg(lvlError, "dispatcher exits");
 }
 /* Wake up the dispatcher thread. */
 void State::wakeDispatcher()
 {
    /* Acquire and immediately release the dispatcher mutex before
       notifying; this serves as a barrier with respect to a
       dispatcher that currently holds the mutex. */
    {
        std::lock_guard<std::mutex> barrier(dispatcherMutex);
    }
    dispatcherWakeup.notify_one();
 }
 /* Body of a builder thread: perform ‘step’ on ‘machine’, give up
    the slot reservation for the machine, and put the step back on
    the runnable queue with an exponentially growing delay if it has
    to be retried. */
 void State::builder(Step::ptr step, Machine::ptr machine, std::shared_ptr<MaintainCount> reservation)
 {
    MaintainCount activeStep(nrActiveSteps);
    /* Stays true when doBuildStep() throws, so that such steps are
       retried as well. */
    bool retry = true;
    try {
        auto store = openStore(); // FIXME: pool
        retry = doBuildStep(store, step, machine);
    } catch (std::exception & e) {
        printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%")
            % step->drvPath % machine->sshName % e.what());
    }
    /* We must be the only holder of the reservation; dropping it
       releases the machine. Then let the dispatcher know. */
    assert(reservation.use_count() == 1);
    reservation.reset();
    wakeDispatcher();
    if (!retry) return;
    /* Temporary failure: bump the try counter, compute the time at
       which the step may run again, and requeue it. */
    {
        auto step_ = step->state.lock();
        step_->tries++;
        nrRetries++;
        if (step_->tries > maxNrRetries) maxNrRetries = step_->tries; // yeah yeah, not atomic
        int delta = retryInterval * powf(retryBackoff, step_->tries - 1);
        printMsg(lvlInfo, format("will retry ‘%1%’ after %2%s") % step->drvPath % delta);
        step_->after = std::chrono::system_clock::now() + std::chrono::seconds(delta);
    }
    makeRunnable(step);
 }
 /* Perform build step ‘step’ on ‘machine’ and record the outcome in
    the database. One of the builds depending on the step is picked
    to account the step to. On success, the builds that have this
    step as their top-level step are marked as succeeded and reverse
    dependencies without remaining dependencies are made runnable. On
    failure, all builds that directly or indirectly depend on the
    step are marked as failed.
    Returns true if the caller should retry the step later (no
    dependent build was found, or the failure is retryable and the
    try limit has not been reached yet), and false otherwise. If the
    selected build or one of the failed builds is ‘buildOne’, the
    process exits instead of returning. */
 bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
    Machine::ptr machine)
 {
    {
        auto step_(step->state.lock());
        assert(step_->created);
        assert(!step->finished);
    }
    /* There can be any number of builds in the database that depend
       on this derivation. Arbitrarily pick one (though preferring a
       build of which this is the top-level derivation) for the
       purpose of creating build steps. We could create a build step
       record for every build, but that could be very expensive
       (e.g. a stdenv derivation can be a dependency of tens of
       thousands of builds), so we don't. */
    Build::ptr build;
    {
        std::set<Build::ptr> dependents;
        std::set<Step::ptr> steps;
        getDependents(step, dependents, steps);
        if (dependents.empty()) {
            /* Apparently all builds that depend on this derivation
               are gone (e.g. cancelled). So don't bother. This is
               very unlikely to happen, because normally Steps are
               only kept alive by being reachable from a
               Build. However, it's possible that a new Build just
               created a reference to this step. So to handle that
               possibility, we retry this step (putting it back in
               the runnable queue). If there are really no strong
               pointers to the step, it will be deleted. */
            printMsg(lvlInfo, format("maybe cancelling build step ‘%1%’") % step->drvPath);
            return true;
        }
        for (auto build2 : dependents)
            if (build2->drvPath == step->drvPath) { build = build2; break; }
        if (!build) build = *dependents.begin();
        printMsg(lvlInfo, format("performing step ‘%1%’ on ‘%2%’ (needed by build %3% and %4% others)")
            % step->drvPath % machine->sshName % build->id % (dependents.size() - 1));
    }
    /* If the selected build is ‘buildOne’, the process exits once
       this step has been handled (see the exit() calls below). */
    bool quit = build->id == buildOne;
    auto conn(dbPool.get());
    RemoteResult result;
    BuildOutput res;
    /* Only assigned when a build step record is created below, i.e.
       it stays 0 for cached failures. */
    int stepNr = 0;
    time_t stepStartTime = result.startTime = time(0);
    /* If any of the outputs have previously failed, then don't bother
       building again. */
    bool cachedFailure = checkCachedFailure(step, *conn);
    if (cachedFailure)
        result.status = BuildResult::CachedFailure;
    else {
        /* Create a build step record indicating that we started
           building. Also, mark the selected build as busy. */
        {
            pqxx::work txn(*conn);
            stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bssBusy);
            txn.parameterized("update Builds set busy = 1 where id = $1")(build->id).exec();
            txn.commit();
        }
        /* Do the build. */
        try {
            /* FIXME: referring builds may have conflicting timeouts. */
            buildRemote(store, machine, step, build->maxSilentTime, build->buildTimeout, result);
        } catch (Error & e) {
            result.status = BuildResult::MiscFailure;
            result.errorMsg = e.msg();
        }
        if (result.success()) res = getBuildOutput(store, step->drv);
    }
    /* Use the current time as the stop time if ‘result’ does not
       carry one yet. */
    time_t stepStopTime = time(0);
    if (!result.stopTime) result.stopTime = stepStopTime;
    /* Asynchronously compress the log. */
    if (result.logFile != "") {
        {
            auto logCompressorQueue_(logCompressorQueue.lock());
            logCompressorQueue_->push(result.logFile);
        }
        logCompressorWakeup.notify_one();
    }
    /* The step had a hopefully temporary failure (e.g. network
       issue). Retry a number of times. */
    if (result.canRetry()) {
        printMsg(lvlError, format("possibly transient failure building ‘%1%’ on ‘%2%’: %3%")
            % step->drvPath % machine->sshName % result.errorMsg);
        bool retry;
        {
            auto step_(step->state.lock());
            retry = step_->tries + 1 < maxTries;
        }
        /* If the try limit has not been reached, record the step as
           aborted and let the caller requeue it. Otherwise fall
           through to the failure handling below. */
        if (retry) {
            pqxx::work txn(*conn);
            finishBuildStep(txn, result.startTime, result.stopTime, build->id,
                stepNr, machine->sshName, bssAborted, result.errorMsg);
            txn.commit();
            if (quit) exit(1);
            return true;
        }
    }
    if (result.success()) {
        /* Register success in the database for all Build objects that
           have this step as the top-level step. Since the queue
           monitor thread may be creating new referring Builds
           concurrently, and updating the database may fail, we do
           this in a loop, marking all known builds, repeating until
           there are no unmarked builds.
        */
        std::vector<BuildID> buildIDs;
        while (true) {
            /* Get the builds that have this one as the top-level. */
            std::vector<Build::ptr> direct;
            {
                auto steps_(steps.lock());
                auto step_(step->state.lock());
                for (auto & b_ : step_->builds) {
                    auto b = b_.lock();
                    if (b && !b->finishedInDB) direct.push_back(b);
                }
                /* If there are no builds left to update in the DB,
                   then we're done (except for calling
                   finishBuildStep()). Delete the step from
                   ‘steps’. Since we've been holding the ‘steps’ lock,
                   no new referrers can have been added in the
                   meantime or be added afterwards. */
                if (direct.empty()) {
                    printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath);
                    steps_->erase(step->drvPath);
                }
            }
            /* Update the database. */
            {
                pqxx::work txn(*conn);
                finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssSuccess);
                /* The fourth argument is the isCachedBuild flag: a
                   build counts as cached unless it is the build this
                   step was accounted to and the result status is
                   BuildResult::Built. */
                for (auto & b : direct)
                    markSucceededBuild(txn, b, res, build != b || result.status != BuildResult::Built,
                        result.startTime, result.stopTime);
                txn.commit();
            }
            if (direct.empty()) break;
            /* Remove the direct dependencies from ‘builds’. This will
               cause them to be destroyed. */
            for (auto & b : direct) {
                auto builds_(builds.lock());
                b->finishedInDB = true;
                builds_->erase(b->id);
                buildIDs.push_back(b->id);
            }
        }
        /* Send notification about the builds that have this step as
           the top-level. */
        for (auto id : buildIDs) {
            {
                auto notificationSenderQueue_(notificationSenderQueue.lock());
                notificationSenderQueue_->push(NotificationItem(id, std::vector<BuildID>()));
            }
            notificationSenderWakeup.notify_one();
        }
        /* Wake up any dependent steps that have no other
           dependencies. */
        {
            auto step_(step->state.lock());
            for (auto & rdepWeak : step_->rdeps) {
                auto rdep = rdepWeak.lock();
                if (!rdep) continue;
                bool runnable = false;
                {
                    auto rdep_(rdep->state.lock());
                    rdep_->deps.erase(step);
                    /* Note: if the step has not finished
                       initialisation yet, it will be made runnable in
                       createStep(), if appropriate. */
                    if (rdep_->deps.empty() && rdep_->created) runnable = true;
                }
                if (runnable) makeRunnable(rdep);
            }
        }
    } else {
        /* Register failure in the database for all Build objects that
           directly or indirectly depend on this step. */
        std::vector<BuildID> dependentIDs;
        while (true) {
            /* Get the builds and steps that depend on this step. */
            std::set<Build::ptr> indirect;
            {
                auto steps_(steps.lock());
                std::set<Step::ptr> steps;
                getDependents(step, indirect, steps);
                /* If there are no builds left, delete all referring
                   steps from ‘steps’. As for the success case, we can
                   be certain no new referrers can be added. */
                if (indirect.empty()) {
                    for (auto & s : steps) {
                        printMsg(lvlDebug, format("finishing build step ‘%1%’") % s->drvPath);
                        steps_->erase(s->drvPath);
                    }
                    break;
                }
            }
            /* Update the database. */
            {
                pqxx::work txn(*conn);
                BuildStatus buildStatus =
                    result.status == BuildResult::TimedOut ? bsTimedOut :
                    result.canRetry() ? bsAborted :
                    bsFailed;
                BuildStepStatus buildStepStatus =
                    result.status == BuildResult::TimedOut ? bssTimedOut :
                    result.canRetry() ? bssAborted :
                    bssFailed;
                /* For standard failures, we don't care about the error
                   message. */
                if (result.status == BuildResult::PermanentFailure ||
                    result.status == BuildResult::TransientFailure ||
                    result.status == BuildResult::CachedFailure ||
                    result.status == BuildResult::TimedOut)
                    result.errorMsg = "";
                /* Create failed build steps for every build that depends
                   on this. For cached failures, only create a step for
                   builds that don't have this step as top-level
                   (otherwise the user won't be able to see what caused
                   the build to fail). */
                for (auto & build2 : indirect) {
                    if ((cachedFailure && build2->drvPath == step->drvPath) ||
                        (!cachedFailure && build == build2) ||
                        build2->finishedInDB)
                        continue;
                    createBuildStep(txn, 0, build2, step, machine->sshName,
                        buildStepStatus, result.errorMsg, build == build2 ? 0 : build->id);
                }
                if (!cachedFailure)
                    finishBuildStep(txn, result.startTime, result.stopTime, build->id,
                        stepNr, machine->sshName, buildStepStatus, result.errorMsg);
                /* Mark all builds that depend on this derivation as failed. */
                for (auto & build2 : indirect) {
                    if (build2->finishedInDB) continue;
                    printMsg(lvlError, format("marking build %1% as failed") % build2->id);
                    /* A build whose top-level derivation is not this
                       step is recorded as bsDepFailed rather than
                       bsFailed. */
                    txn.parameterized
                        ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1 and finished = 0")
                        (build2->id)
                        ((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus))
                        (result.startTime)
                        (result.stopTime)
                        (cachedFailure ? 1 : 0).exec();
                    nrBuildsDone++;
                }
                /* Remember failed paths in the database so that they
                   won't be built again. */
                if (!cachedFailure && result.status == BuildResult::PermanentFailure)
                    for (auto & path : outputPaths(step->drv))
                        txn.parameterized("insert into FailedPaths values ($1)")(path).exec();
                txn.commit();
            }
            /* Remove the indirect dependencies from ‘builds’. This
               will cause them to be destroyed. */
            for (auto & b : indirect) {
                auto builds_(builds.lock());
                b->finishedInDB = true;
                builds_->erase(b->id);
                dependentIDs.push_back(b->id);
                if (buildOne == b->id) quit = true;
            }
        }
        /* Send notification about this build and its dependents. */
        {
            auto notificationSenderQueue_(notificationSenderQueue.lock());
            notificationSenderQueue_->push(NotificationItem(build->id, dependentIDs));
        }
        notificationSenderWakeup.notify_one();
    }
    // FIXME: keep stats about aborted steps?
    /* Update the global and per-machine statistics: wall-clock time
       spent in this function (stepStopTime - stepStartTime) and the
       build time as recorded in ‘result’. */
    nrStepsDone++;
    totalStepTime += stepStopTime - stepStartTime;
    totalStepBuildTime += result.stopTime - result.startTime;
    machine->state->nrStepsDone++;
    machine->state->totalStepTime += stepStopTime - stepStartTime;
    machine->state->totalStepBuildTime += result.stopTime - result.startTime;
    if (quit) exit(0); // testing hack
    return false;
 }
 void State::markSucceededBuild(pqxx::work & txn, Build::ptr build,
    const BuildOutput & res, bool isCachedBuild, time_t startTime, time_t stopTime)
 {
--- a/src/hydra-queue-runner/queue-monitor.cc
+++ b/src/hydra-queue-runner/queue-monitor.cc
@ -0,0 +1,369 @@
 #include "state.hh"
 #include "build-result.hh"
 using namespace nix;
 /* Run the queue monitor forever. Whenever queueMonitorLoop() bails
    out with an exception, log it, wait ten seconds and start over. */
 void State::queueMonitor()
 {
    for (;;)
        try {
            queueMonitorLoop();
        } catch (std::exception & e) {
            printMsg(lvlError, format("queue monitor: %1%") % e.what());
            /* Most likely a database problem, so back off for a
               while instead of retrying immediately. */
            sleep(10);
        }
 }
 void State::queueMonitorLoop()
 {
    auto conn(dbPool.get());
    receiver buildsAdded(*conn, "builds_added");
    receiver buildsRestarted(*conn, "builds_restarted");
    receiver buildsCancelled(*conn, "builds_cancelled");
    receiver buildsDeleted(*conn, "builds_deleted");
    auto store = openStore(); // FIXME: pool
    unsigned int lastBuildId = 0;
    while (true) {
        getQueuedBuilds(*conn, store, lastBuildId);
        /* Sleep until we get notification from the database about an
           event. */
        conn->await_notification();
        nrQueueWakeups++;
        if (buildsAdded.get())
            printMsg(lvlTalkative, "got notification: new builds added to the queue");
        if (buildsRestarted.get()) {
            printMsg(lvlTalkative, "got notification: builds restarted");
            lastBuildId = 0; // check all builds
        }
        if (buildsCancelled.get() || buildsDeleted.get()) {
            printMsg(lvlTalkative, "got notification: builds cancelled");
            removeCancelledBuilds(*conn);
        }
    }
 }
 /* Read the unfinished builds with an id greater than ‘lastBuildId’
    from the database, create Build objects and build steps for them,
    and make the resulting runnable steps available via
    makeRunnable(). ‘lastBuildId’ is raised to the highest build id
    considered. Builds whose derivation is no longer valid, whose
    top-level step is not needed (createStep() returned no step), or
    that contain a cached-failed or unsupported step are finished in
    the database right away instead of being added to ‘builds’. */
 void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store, unsigned int & lastBuildId)
 {
    printMsg(lvlInfo, format("checking the queue for builds > %1%...") % lastBuildId);
    /* Grab the queued builds from the database, but don't process
       them yet (since we don't want a long-running transaction). */
    std::multimap<Path, Build::ptr> newBuilds;
    {
        pqxx::work txn(conn);
        auto res = txn.parameterized("select id, project, jobset, job, drvPath, maxsilent, timeout from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec();
        for (auto const & row : res) {
            auto builds_(builds.lock());
            BuildID id = row["id"].as<BuildID>();
            /* If ‘buildOne’ is set, ignore every other build. */
            if (buildOne && id != buildOne) continue;
            if (id > lastBuildId) lastBuildId = id;
            /* Skip builds that are already present in ‘builds’. */
            if (has(*builds_, id)) continue;
            auto build = std::make_shared<Build>();
            build->id = id;
            build->drvPath = row["drvPath"].as<string>();
            build->fullJobName = row["project"].as<string>() + ":" + row["jobset"].as<string>() + ":" + row["job"].as<string>();
            build->maxSilentTime = row["maxsilent"].as<int>();
            build->buildTimeout = row["timeout"].as<int>();
            newBuilds.emplace(std::make_pair(build->drvPath, build));
        }
    }
    std::set<Step::ptr> newRunnable;
    /* Number of builds loaded by one top-level createBuild() call,
       including builds loaded recursively; reset in the loop at the
       end of this function. */
    unsigned int nrAdded;
    /* Declared as a std::function first so that the lambda can call
       itself recursively. */
    std::function<void(Build::ptr)> createBuild;
    createBuild = [&](Build::ptr build) {
        printMsg(lvlTalkative, format("loading build %1% (%2%)") % build->id % build->fullJobName);
        nrAdded++;
        if (!store->isValidPath(build->drvPath)) {
            /* Derivation has been GC'ed prematurely. */
            printMsg(lvlError, format("aborting GC'ed build %1%") % build->id);
            if (!build->finishedInDB) {
                pqxx::work txn(conn);
                txn.parameterized
                    ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1 and finished = 0")
                    (build->id)
                    ((int) bsAborted)
                    (time(0))
                    ("derivation was garbage-collected prior to build").exec();
                txn.commit();
                build->finishedInDB = true;
                nrBuildsDone++;
            }
            return;
        }
        std::set<Step::ptr> newSteps;
        std::set<Path> finishedDrvs; // FIXME: re-use?
        Step::ptr step = createStep(store, build->drvPath, build, 0, finishedDrvs, newSteps, newRunnable);
        /* Some of the new steps may be the top level of builds that
           we haven't processed yet. So do them now. This ensures that
           if build A depends on build B with top-level step X, then X
           will be "accounted" to B in doBuildStep(). */
        for (auto & r : newSteps) {
            while (true) {
                auto i = newBuilds.find(r->drvPath);
                if (i == newBuilds.end()) break;
                Build::ptr b = i->second;
                /* Remove the build from the pending map before
                   recursing, so it is not loaded a second time. */
                newBuilds.erase(i);
                createBuild(b);
            }
        }
        /* If we didn't get a step, it means the step's outputs are
           all valid. So we mark this as a finished, cached build. */
        if (!step) {
            Derivation drv = readDerivation(build->drvPath);
            BuildOutput res = getBuildOutput(store, drv);
            pqxx::work txn(conn);
            time_t now = time(0);
            markSucceededBuild(txn, build, res, true, now, now);
            txn.commit();
            build->finishedInDB = true;
            return;
        }
        /* If any step has an unsupported system type or has a
           previously failed output path, then fail the build right
           away. */
        bool badStep = false;
        for (auto & r : newSteps) {
            /* bsSuccess is used here to mean "no problem found with
               this step so far". */
            BuildStatus buildStatus = bsSuccess;
            BuildStepStatus buildStepStatus = bssFailed;
            if (checkCachedFailure(r, conn)) {
                printMsg(lvlError, format("marking build %1% as cached failure") % build->id);
                buildStatus = step == r ? bsFailed : bsDepFailed;
                buildStepStatus = bssFailed;
            }
            if (buildStatus == bsSuccess) {
                /* Check whether at least one machine can perform
                   this step. */
                bool supported = false;
                {
                    auto machines_(machines.lock()); // FIXME: use shared_mutex
                    for (auto & m : *machines_)
                        if (m.second->supportsStep(r)) { supported = true; break; }
                }
                if (!supported) {
                    printMsg(lvlError, format("aborting unsupported build %1%") % build->id);
                    buildStatus = bsUnsupported;
                    buildStepStatus = bssUnsupported;
                }
            }
            if (buildStatus != bsSuccess) {
                time_t now = time(0);
                if (!build->finishedInDB) {
                    pqxx::work txn(conn);
                    createBuildStep(txn, 0, build, r, "", buildStepStatus);
                    /* isCachedBuild ($4) is set for cached failures
                       but not for unsupported builds. */
                    txn.parameterized
                        ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = $4 where id = $1 and finished = 0")
                        (build->id)
                        ((int) buildStatus)
                        (now)
                        (buildStatus != bsUnsupported ? 1 : 0).exec();
                    txn.commit();
                    build->finishedInDB = true;
                    nrBuildsDone++;
                }
                badStep = true;
                break;
            }
        }
        if (badStep) return;
        /* Note: if we exit this scope prior to this, the build and
           all newly created steps are destroyed. */
        {
            auto builds_(builds.lock());
            if (!build->finishedInDB) // FIXME: can this happen?
                (*builds_)[build->id] = build;
            build->toplevel = step;
        }
        printMsg(lvlChatty, format("added build %1% (top-level step %2%, %3% new steps)")
            % build->id % step->drvPath % newSteps.size());
    };
    /* Now instantiate build steps for each new build. The builder
       threads can start building the runnable build steps right away,
       even while we're still processing other new builds. */
    while (!newBuilds.empty()) {
        auto build = newBuilds.begin()->second;
        newBuilds.erase(newBuilds.begin());
        newRunnable.clear();
        nrAdded = 0;
        try {
            createBuild(build);
        } catch (Error & e) {
            e.addPrefix(format("while loading build %1%: ") % build->id);
            throw;
        }
        /* Add the new runnable build steps to ‘runnable’ and wake up
           the builder threads. */
        printMsg(lvlChatty, format("got %1% new runnable steps from %2% new builds") % newRunnable.size() % nrAdded);
        for (auto & r : newRunnable)
            makeRunnable(r);
        nrBuildsRead += nrAdded;
    }
 }
 /* Drop every build from ‘builds’ that the database no longer lists
    as unfinished. */
 void State::removeCancelledBuilds(Connection & conn)
 {
    /* Collect the ids of all builds that are still unfinished
       according to the database. */
    std::set<BuildID> stillQueued;
    {
        pqxx::work txn(conn);
        auto rows = txn.exec("select id from Builds where finished = 0");
        for (auto const & row : rows)
            stillQueued.insert(row["id"].as<BuildID>());
    }
    /* Erase the in-memory builds whose id is not among them. */
    auto builds_ = builds.lock();
    auto i = builds_->begin();
    while (i != builds_->end()) {
        if (stillQueued.count(i->first)) {
            ++i;
            continue;
        }
        printMsg(lvlInfo, format("discarding cancelled build %1%") % i->first);
        i = builds_->erase(i);
        // FIXME: ideally we would interrupt active build steps here.
    }
 }
 /* Return the Step for derivation ‘drvPath’, creating it (and,
    recursively, steps for its input derivations) if it does not
    exist yet. The step is made reachable from ‘referringBuild’
    and/or ‘referringStep’, whichever is set. Returns 0 if the
    derivation does not need to be built because all its outputs are
    valid; such derivations are remembered in ‘finishedDrvs’. Steps
    created by this call are added to ‘newSteps’, and those among
    them that have no dependencies to ‘newRunnable’. */
 Step::ptr State::createStep(std::shared_ptr<StoreAPI> store, const Path & drvPath,
    Build::ptr referringBuild, Step::ptr referringStep, std::set<Path> & finishedDrvs,
    std::set<Step::ptr> & newSteps, std::set<Step::ptr> & newRunnable)
 {
    if (finishedDrvs.count(drvPath)) return 0;
    /* Look up the step, creating it if necessary, and register the
       referrer, all while ‘steps’ is locked. Doing this atomically
       guarantees that the step cannot become reachable from a new
       build after doBuildStep has removed it from ‘steps’. */
    Step::ptr step;
    bool isNew = false;
    {
        auto steps_ = steps.lock();
        /* Is there a live entry for this derivation already? */
        auto existing = steps_->find(drvPath);
        if (existing != steps_->end()) {
            /* Obtaining a strong pointer keeps the Step object
               alive from here on; a null result means the entry is
               stale. */
            step = existing->second.lock();
            if (!step) steps_->erase(drvPath);
        }
        /* No usable step found, so make a fresh one. */
        if (!step) {
            step = std::make_shared<Step>();
            step->drvPath = drvPath;
            isNew = true;
        }
        auto step_ = step->state.lock();
        assert(step_->created != isNew);
        if (referringBuild)
            step_->builds.push_back(referringBuild);
        if (referringStep)
            step_->rdeps.push_back(referringStep);
        (*steps_)[drvPath] = step;
    }
    if (!isNew) return step;
    printMsg(lvlDebug, format("considering derivation ‘%1%’") % drvPath);
    /* Fill in the new step. It may already be visible through
       ‘steps’ at this point, but that is harmless: it is not
       runnable yet, and other threads will not make it runnable as
       long as step->created is false. */
    step->drv = readDerivation(drvPath);
    {
        auto features = step->drv.env.find("requiredSystemFeatures");
        if (features != step->drv.env.end())
            step->requiredSystemFeatures = tokenizeString<std::set<std::string>>(features->second);
    }
    auto preferLocal = step->drv.env.find("preferLocalBuild");
    step->preferLocalBuild =
        preferLocal != step->drv.env.end() && preferLocal->second == "1"
        && has(localPlatforms, step->drv.platform);
    /* If every output is valid already, there is nothing to build. */
    bool allOutputsValid = true;
    for (auto & output : step->drv.outputs)
        if (!store->isValidPath(output.second.path)) {
            allOutputsValid = false;
            break;
        }
    // FIXME: check whether all outputs are in the binary cache.
    if (allOutputsValid) {
        finishedDrvs.insert(drvPath);
        return 0;
    }
    /* At least one output is missing, so this step has to be built. */
    printMsg(lvlDebug, format("creating build step ‘%1%’") % drvPath);
    newSteps.insert(step);
    /* Recurse into the input derivations; those that need building
       become dependencies of this step. */
    for (auto & input : step->drv.inputDrvs) {
        auto dep = createStep(store, input.first, 0, step, finishedDrvs, newSteps, newRunnable);
        if (!dep) continue;
        auto step_ = step->state.lock();
        step_->deps.insert(dep);
    }
    /* Mark the step as fully created, and as runnable if it has no
       (remaining) dependencies. */
    {
        auto step_ = step->state.lock();
        assert(!step_->created);
        step_->created = true;
        if (step_->deps.empty())
            newRunnable.insert(step);
    }
    return step;
 }
--- a/src/hydra-queue-runner/state.hh
+++ b/src/hydra-queue-runner/state.hh
@ -118,6 +118,9 @@ struct Step
 };
 void getDependents(Step::ptr step, std::set<Build::ptr> & builds, std::set<Step::ptr> & steps);
 struct Machine
 {
    typedef std::shared_ptr<Machine> ptr;
@ -159,6 +162,12 @@ class State
 {
 private:
    // FIXME: Make configurable.
    const unsigned int maxTries = 5;
    const unsigned int retryInterval = 60; // seconds
    const float retryBackoff = 3.0;
    const unsigned int maxParallelCopyClosure = 4;
    nix::Path hydraData, logDir;
    nix::StringSet localPlatforms;
@ -306,3 +315,10 @@ public:
    void run(BuildID buildOne = 0);
 };
 /* Return true iff looking up ‘v’ in container ‘c’ with c.find()
    yields an element. */
 template <class C, class V>
 bool has(const C & c, const V & v)
 {
    const auto hit = c.find(v);
    return hit != c.end();
 }