Fix race between the queue monitor and the builder threads

2015-06-18 16:30:28 +02:00 · 2015-06-18 16:30:28 +02:00 · 948473c909
parent 9c03b11ca8
commit 948473c909
1 changed files with 239 additions and 198 deletions
--- a/src/hydra-queue-runner/hydra-queue-runner.cc
+++ b/src/hydra-queue-runner/hydra-queue-runner.cc
@ -120,9 +120,7 @@ struct Build

    std::shared_ptr<Step> toplevel;

-    bool finishedInDB;
-
-    Build() : finishedInDB(false) { }
+    std::atomic_bool finishedInDB{false};

    ~Build()
    {
@ -158,13 +156,15 @@ struct Step
        system_time after;
    };

+    std::atomic_bool created{false}; // debugging
+    std::atomic_bool finished{false}; // debugging
+
    Sync<State> state;

-    std::atomic_bool destroyed;
-
-    Step() : destroyed(false) { }
-
-    ~Step() { }
+    ~Step()
+    {
+        printMsg(lvlError, format("destroying step %1%") % drvPath);
+    }
 };


@ -280,13 +280,9 @@ public:
    void removeCancelledBuilds(Connection & conn);

    Step::ptr createStep(std::shared_ptr<StoreAPI> store, const Path & drvPath,
+        Build::ptr referringBuild, Step::ptr referringStep,
        std::set<Step::ptr> & newSteps, std::set<Step::ptr> & newRunnable);

-    void destroyStep(Step::ptr step, bool proceed);
-
-    /* Get the builds that depend on the given step. */
-    std::set<Build::ptr> getDependentBuilds(Step::ptr step);
-
    void makeRunnable(Step::ptr step);

    /* The thread that selects and starts runnable builds. */
@ -525,6 +521,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store,
            /* Derivation has been GC'ed prematurely. */
            printMsg(lvlError, format("aborting GC'ed build %1%") % build->id);
            pqxx::work txn(conn);
+            assert(!build->finishedInDB);
            txn.parameterized
                ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1")
                (build->id)
@ -538,7 +535,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store,
        }

        std::set<Step::ptr> newSteps;
-        Step::ptr step = createStep(store, build->drvPath, newSteps, newRunnable);
+        Step::ptr step = createStep(store, build->drvPath, build, 0, newSteps, newRunnable);

        /* Some of the new steps may be the top level of builds that
           we haven't processed yet. So do them now. This ensures that
@ -560,13 +557,13 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store,
            Derivation drv = readDerivation(build->drvPath);
            BuildResult res = getBuildResult(store, drv);

-            printMsg(lvlInfo, format("marking build %1% as cached successful") % build->id);
-
            pqxx::work txn(conn);
            time_t now = time(0);
            markSucceededBuild(txn, build, res, true, now, now);
            txn.commit();

+            build->finishedInDB = true;
+
            return;
        }

@ -603,6 +600,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store,
                time_t now = time(0);
                pqxx::work txn(conn);
                createBuildStep(txn, 0, build, r, "", buildStepStatus);
+                assert(!build->finishedInDB);
                txn.parameterized
                    ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = $4 where id = $1")
                    (build->id)
@ -624,20 +622,12 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr<StoreAPI> store,

        {
            auto builds_(builds.lock());
-            auto step_(step->state.lock());
            (*builds_)[build->id] = build;
-            step_->builds.push_back(build);
            build->toplevel = step;
        }

        printMsg(lvlChatty, format("added build %1% (top-level step %2%, %3% new steps)")
            % build->id % step->drvPath % newSteps.size());
-
-        /* Prior to this, the build is not visible to
-           getDependentBuilds().  Now it is, so the build can be
-           failed if a dependency fails. (It can't succeed right away
-           because its top-level is not runnable yet). */
-
    };

    /* Now instantiate build steps for each new build. The builder
@ -687,32 +677,65 @@ void State::removeCancelledBuilds(Connection & conn)


 Step::ptr State::createStep(std::shared_ptr<StoreAPI> store, const Path & drvPath,
+    Build::ptr referringBuild, Step::ptr referringStep,
    std::set<Step::ptr> & newSteps, std::set<Step::ptr> & newRunnable)
 {
-    /* Check if the requested step already exists. */
+    /* Check if the requested step already exists. If not, create a
+       new step. In any case, make the step reachable from
+       referringBuild or referringStep. This is done atomically (with
+       ‘steps’ locked), to ensure that this step can never become
+       reachable from a new build after doBuildStep has removed it
+       from ‘steps’. */
+    Step::ptr step;
+    bool isNew = false;
    {
        auto steps_(steps.lock());
+
+        /* See if the step already exists in ‘steps’ and is not
+           stale. */
        auto prev = steps_->find(drvPath);
        if (prev != steps_->end()) {
-            auto step = prev->second.lock();
+            step = prev->second.lock();
            /* Since ‘step’ is a strong pointer, the referred Step
               object won't be deleted after this. */
-            if (step) return step;
-            steps_->erase(drvPath); // remove stale entry
+            if (!step) steps_->erase(drvPath); // remove stale entry
        }
+
+        /* If it doesn't exist, create it. */
+        if (!step) {
+            step = std::make_shared<Step>();
+            step->drvPath = drvPath;
+            isNew = true;
+        }
+
+        auto step_(step->state.lock());
+
+        if (referringBuild)
+            step_->builds.push_back(referringBuild);
+
+        if (referringStep)
+            step_->rdeps.push_back(referringStep);
+
+        (*steps_)[drvPath] = step;
    }

    printMsg(lvlDebug, format("considering derivation ‘%1%’") % drvPath);

-    auto step = std::make_shared<Step>();
-    step->drvPath = drvPath;
+    if (!isNew) {
+        assert(step->created);
+        return step;
+    }
+
+    /* Initialize the step. Note that the step may be visible in
+       ‘steps’ before this point, but that doesn't matter because
+       it's not runnable yet, and other threads won't make it
+       runnable while step->created == false. */
    step->drv = readDerivation(drvPath);
    {
        auto i = step->drv.env.find("requiredSystemFeatures");
        if (i != step->drv.env.end())
            step->requiredSystemFeatures = tokenizeString<std::set<std::string>>(i->second);
    }
-    newSteps.insert(step);

    /* Are all outputs valid? */
    bool valid = true;
@ -728,94 +751,39 @@ Step::ptr State::createStep(std::shared_ptr<StoreAPI> store, const Path & drvPat

    /* No, we need to build. */
    printMsg(lvlDebug, format("creating build step ‘%1%’") % drvPath);
+    newSteps.insert(step);

    /* Create steps for the dependencies. */
-    bool hasDeps = false;
    for (auto & i : step->drv.inputDrvs) {
-        Step::ptr dep = createStep(store, i.first, newSteps, newRunnable);
+        auto dep = createStep(store, i.first, 0, step, newSteps, newRunnable);
        if (dep) {
-            hasDeps = true;
            auto step_(step->state.lock());
-            auto dep_(dep->state.lock());
            step_->deps.insert(dep);
-            dep_->rdeps.push_back(step);
        }
    }

+    /* If the step has no (remaining) dependencies, make it
+       runnable. */
    {
-        auto steps_(steps.lock());
-        assert(steps_->find(drvPath) == steps_->end());
-        (*steps_)[drvPath] = step;
+        auto step_(step->state.lock());
+        assert(!step->created);
+        step->created = true;
+        if (step_->deps.empty())
+            newRunnable.insert(step);
    }

-    if (!hasDeps) newRunnable.insert(step);
-
    return step;
 }


-void State::destroyStep(Step::ptr step, bool proceed)
+/* Get the steps and unfinished builds that depend on the given step. */
+void getDependents(Step::ptr step, std::set<Build::ptr> & builds, std::set<Step::ptr> & steps)
 {
-    if (step->destroyed) return;
-    step->destroyed = true;
-
-    printMsg(lvlDebug, format("destroying build step ‘%1%’") % step->drvPath);
-
-    nrStepsDone++;
-
-    {
-        auto steps_(steps.lock());
-        steps_->erase(step->drvPath);
-    }
-
-    std::vector<Step::wptr> rdeps;
-
-    {
-        auto step_(step->state.lock());
-        rdeps = step_->rdeps;
-
-        /* Sanity checks. */
-        for (auto & build_ : step_->builds) {
-            auto build = build_.lock();
-            if (!build) continue;
-            assert(build->drvPath == step->drvPath);
-            assert(build->finishedInDB);
-        }
-    }
-
-    for (auto & rdep_ : rdeps) {
-        auto rdep = rdep_.lock();
-        if (!rdep) continue;
-        bool runnable = false;
-        {
-            auto rdep_(rdep->state.lock());
-            assert(has(rdep_->deps, step));
-            rdep_->deps.erase(step);
-            if (rdep_->deps.empty()) runnable = true;
-        }
-        if (proceed) {
-            /* If this rdep has no other dependencies, then we can now
-               build it. */
-            if (runnable)
-                makeRunnable(rdep);
-        } else
-            /* If ‘step’ failed or was cancelled, then delete all
-               dependent steps as well. */
-            destroyStep(rdep, false);
-    }
-}
-
-
-std::set<Build::ptr> State::getDependentBuilds(Step::ptr step)
-{
-    std::set<Step::ptr> done;
-    std::set<Build::ptr> res;
-
    std::function<void(Step::ptr)> visit;

    visit = [&](Step::ptr step) {
-        if (has(done, step)) return;
-        done.insert(step);
+        if (has(steps, step)) return;
+        steps.insert(step);

        std::vector<Step::wptr> rdeps;

@ -824,7 +792,7 @@ std::set<Build::ptr> State::getDependentBuilds(Step::ptr step)

            for (auto & build : step_->builds) {
                auto build_ = build.lock();
-                if (build_) res.insert(build_);
+                if (build_ && !build_->finishedInDB) builds.insert(build_);
            }

            /* Make a copy of rdeps so that we don't hold the lock for
@ -839,8 +807,6 @@ std::set<Build::ptr> State::getDependentBuilds(Step::ptr step)
    };

    visit(step);
-
-    return res;
 }


@ -850,6 +816,8 @@ void State::makeRunnable(Step::ptr step)

    {
        auto step_(step->state.lock());
+        assert(step->created);
+        assert(!step->finished);
        assert(step_->deps.empty());
    }

@ -913,7 +881,7 @@ void State::dispatcher()
                if (machine->currentJobs >= machine->maxJobs) continue;

                auto runnable_(runnable.lock());
-                printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size());
+                //printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size());

                /* FIXME: we're holding the runnable lock too long
                   here. This could be more efficient. */
@ -1024,6 +992,12 @@ void State::builder(Step::ptr step, MachineReservation::ptr reservation)
 bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
    Machine::ptr machine)
 {
+    {
+        auto step_(step->state.lock());
+        assert(step->created);
+        assert(!step->finished);
+    }
+
    /* There can be any number of builds in the database that depend
       on this derivation. Arbitrarily pick one (though preferring a
       build of which this is the top-level derivation) for the
@ -1034,7 +1008,9 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
    Build::ptr build;

    {
-        auto dependents = getDependentBuilds(step);
+        std::set<Build::ptr> dependents;
+        std::set<Step::ptr> steps;
+        getDependents(step, dependents, steps);

        if (dependents.empty()) {
            /* Apparently all builds that depend on this derivation
@ -1117,24 +1093,38 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
        }
    }

-    /* Remove this step. After this, incoming builds that depend on
-       drvPath will either see that the output paths exist, or will
-       create a new build step for drvPath. The latter is fine - it
-       won't conflict with this one, because we're removing it. In any
-       case, the set of dependent builds for ‘step’ can't increase
-       anymore because ‘step’ is no longer visible to createStep(). */
-    auto steps_(steps.lock());
-    steps_->erase(step->drvPath);
+    if (result.status == RemoteResult::rrSuccess) {

-    /* Get the final set of dependent builds. */
-    auto dependents = getDependentBuilds(step);
+        /* Register success in the database for all Build objects that
+           have this step as the top-level step. Since the queue
+           monitor thread may be creating new referring Builds
+           concurrently, and updating the database may fail, we do
+           this in a loop, marking all known builds, repeating until
+           there are no unmarked builds.
+        */
+        while (true) {

-    std::set<Build::ptr> direct;
+            /* Get the builds that have this one as the top-level. */
+            std::vector<Build::ptr> direct;
            {
+                auto steps_(steps.lock());
                auto step_(step->state.lock());
-        for (auto & build : step_->builds) {
-            auto build_ = build.lock();
-            if (build_) direct.insert(build_);
+
+                for (auto & b_ : step_->builds) {
+                    auto b = b_.lock();
+                    if (b && !b->finishedInDB) direct.push_back(b);
+                }
+
+                /* If there are no builds left to update in the DB,
+                   then we're done. Delete the step from
+                   ‘steps’. Since we've been holding the ‘steps’ lock,
+                   no new referrers can have been added in the
+                   meantime or be added afterwards. */
+                if (direct.empty()) {
+                    printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath);
+                    nrStepsDone++;
+                    steps_->erase(step->drvPath);
+                    break;
                }
            }

@ -1142,18 +1132,73 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
            {
                pqxx::work txn(*conn);

-        if (result.status == RemoteResult::rrSuccess) {
-
                finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssSuccess);

-            /* Mark all builds of which this derivation is the top
-               level as succeeded. */
-            for (auto build2 : direct)
-                markSucceededBuild(txn, build2, res, build != build2,
+                for (auto & b : direct)
+                    markSucceededBuild(txn, b, res, build != b,
                        result.startTime, result.stopTime);

+                txn.commit();
+            }
+
+            /* Remove the direct dependencies from ‘builds’. This will
+               cause them to be destroyed. */
+            for (auto & b : direct) {
+                auto builds_(builds.lock());
+                b->finishedInDB = true;
+                builds_->erase(b->id);
+            }
+        }
+
+        /* Wake up any dependent steps that have no other
+           dependencies. */
+        {
+            auto step_(step->state.lock());
+            for (auto & rdepWeak : step_->rdeps) {
+                auto rdep = rdepWeak.lock();
+                if (!rdep) continue;
+
+                bool runnable = false;
+                {
+                    auto rdep_(rdep->state.lock());
+                    rdep_->deps.erase(step);
+                    if (rdep_->deps.empty()) runnable = true;
+                }
+
+                if (runnable) makeRunnable(rdep);
+            }
+        }
+
    } else {
-            /* Failure case. */
+
+        /* Register failure in the database for all Build objects that
+           directly or indirectly depend on this step. */
+
+        while (true) {
+
+            /* Get the builds and steps that depend on this step. */
+            std::set<Build::ptr> indirect;
+            {
+                auto steps_(steps.lock());
+                std::set<Step::ptr> steps;
+                getDependents(step, indirect, steps);
+
+                /* If there are no builds left, delete all referring
+                   steps from ‘steps’. As for the success case, we can
+                   be certain no new referrers can be added. */
+                if (indirect.empty()) {
+                    for (auto & s : steps) {
+                        printMsg(lvlDebug, format("finishing build step ‘%1%’") % s->drvPath); // log the step being erased, not the failed one
+                        nrStepsDone++;
+                        steps_->erase(s->drvPath);
+                    }
+                    break;
+                }
+            }
+
+            /* Update the database. */
+            {
+                pqxx::work txn(*conn);

                BuildStatus buildStatus =
                    result.status == RemoteResult::rrPermanentFailure ? bsFailed :
@ -1173,7 +1218,7 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
                   builds that don't have this step as top-level
                   (otherwise the user won't be able to see what caused
                   the build to fail). */
-            for (auto build2 : dependents) {
+                for (auto & build2 : indirect) {
                    if (build == build2) continue;
                    if (cachedFailure && build2->drvPath == step->drvPath) continue;
                    createBuildStep(txn, 0, build2, step, machine->sshName,
@ -1185,8 +1230,9 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
                        stepNr, machine->sshName, buildStepStatus, result.errorMsg);

                /* Mark all builds that depend on this derivation as failed. */
-            for (auto build2 : dependents) {
+                for (auto & build2 : indirect) {
                    printMsg(lvlError, format("marking build %1% as failed") % build2->id);
+                    assert(!build2->finishedInDB); // check the build being marked here; ‘build’ may already be finished on a later loop pass
                    txn.parameterized
                        ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1")
                        (build2->id)
@ -1194,7 +1240,6 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
                        (result.startTime)
                        (result.stopTime)
                        (cachedFailure ? 1 : 0).exec();
-                build2->finishedInDB = true; // FIXME: txn might fail
                    nrBuildsDone++;
                }

@ -1203,25 +1248,20 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
                if (!cachedFailure && result.status == RemoteResult::rrPermanentFailure)
                    for (auto & path : outputPaths(step->drv))
                        txn.parameterized("insert into FailedPaths values ($1)")(path).exec();
-        }

                txn.commit();
            }

-    /* In case of success, destroy all Build objects of which ‘step’
-       is the top-level derivation. In case of failure, destroy all
-       dependent Build objects. Any Steps not referenced by other
-       Builds will be destroyed as well. */
-    for (auto build2 : dependents)
-        if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) {
+            /* Remove the indirect dependencies from ‘builds’. This
+               will cause them to be destroyed. */
+            for (auto & b : indirect) {
                auto builds_(builds.lock());
-            builds_->erase(build2->id);
+                b->finishedInDB = true;
+                builds_->erase(b->id);
+            }
        }

-    /* Remove the step from the graph. In case of success, make
-       dependent build steps runnable if they have no other
-       dependencies. */
-    destroyStep(step, result.status == RemoteResult::rrSuccess);
+    }

    return false;
 }
@ -1232,6 +1272,8 @@ void State::markSucceededBuild(pqxx::work & txn, Build::ptr build,
 {
    printMsg(lvlInfo, format("marking build %1% as succeeded") % build->id);

+    assert(!build->finishedInDB);
+
    txn.parameterized
        ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, size = $5, closureSize = $6, releaseName = $7, isCachedBuild = $8 where id = $1")
        (build->id)
@ -1259,7 +1301,6 @@ void State::markSucceededBuild(pqxx::work & txn, Build::ptr build,
            (product.defaultPath).exec();
    }

-    build->finishedInDB = true; // FIXME: txn might fail
    nrBuildsDone++;
 }