hydra-queue-runner: More accurate memory accounting

We now take into account the memory necessary for compressing the NAR
being exported to the binary cache, plus xz compression overhead.

Also, we now release the memory tokens for the NAR accessor *after*
releasing the NAR accessor. Previously the memory for the NAR accessor
might still be in use while another thread does an allocation, causing
the maximum to be exceeded temporarily.

Also, use notify_all instead of notify_one to wake up memory token
waiters. This is not very nice, but since not every waiter is requesting
the same number of tokens, some of them might be able to proceed even
when others cannot.
This commit is contained in:
Eelco Dolstra 2016-11-16 17:46:00 +01:00
parent cb5e438a08
commit b4d32a3085
No known key found for this signature in database
GPG key ID: 8170B4726D7198DE
4 changed files with 40 additions and 10 deletions

View file

@ -373,10 +373,13 @@ void State::buildRemote(ref<Store> destStore,
% step->drvPath % machine->sshName % totalNarSize);
/* Block until we have the required amount of memory
available. FIXME: only need this for binary cache
destination stores. */
available, which is twice the NAR size (namely the
uncompressed and worst-case compressed NAR), plus 150
MB for xz compression overhead. (The xz manpage claims
~94 MiB, but that's not what I'm seeing.) */
auto resStart = std::chrono::steady_clock::now();
auto memoryReservation(memoryTokens.get(totalNarSize));
size_t compressionCost = totalNarSize + 150 * 1024 * 1024;
result.tokens = std::make_unique<nix::TokenServer::Token>(memoryTokens.get(totalNarSize + compressionCost));
auto resStop = std::chrono::steady_clock::now();
auto resMs = std::chrono::duration_cast<std::chrono::milliseconds>(resStop - resStart).count();
@ -390,6 +393,11 @@ void State::buildRemote(ref<Store> destStore,
to.flush();
destStore->importPaths(from, result.accessor, true);
/* Release the tokens pertaining to NAR
compression. After this we only have the uncompressed
NAR in memory. */
result.tokens->give_back(compressionCost);
auto now2 = std::chrono::steady_clock::now();
result.overhead += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();

View file

@ -179,6 +179,9 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore,
if (result.stepStatus == bsSuccess)
res = getBuildOutput(destStore, ref<FSAccessor>(result.accessor), step->drv);
result.accessor = 0;
result.tokens = 0;
}
time_t stepStopTime = time(0);

View file

@ -50,6 +50,7 @@ struct RemoteResult
time_t startTime = 0, stopTime = 0;
unsigned int overhead = 0;
nix::Path logFile;
std::unique_ptr<nix::TokenServer::Token> tokens;
std::shared_ptr<nix::FSAccessor> accessor;
BuildStatus buildStatus()

View file

@ -40,6 +40,7 @@ public:
{
if (tokens >= ts->maxTokens)
throw NoTokens(format("requesting more tokens (%d) than exist (%d)") % tokens % ts->maxTokens);
debug("acquiring %d tokens", tokens);
auto inUse(ts->inUse.lock());
while (*inUse + tokens > ts->maxTokens)
if (timeout) {
@ -54,21 +55,38 @@ public:
public:
Token(Token && t) : ts(t.ts) { t.ts = 0; }
/* Move constructor: transfer the reservation (server pointer, token
   count, and acquired flag) to the new object. The moved-from token is
   neutralized (ts = 0, acquired = false) so its destructor will not
   release the tokens a second time. */
Token(Token && t) : ts(t.ts), tokens(t.tokens), acquired(t.acquired)
{
t.ts = 0;
t.acquired = false;
}
Token(const Token & l) = delete;
~Token()
{
if (!ts || !acquired) return;
{
auto inUse(ts->inUse.lock());
assert(*inUse >= tokens);
*inUse -= tokens;
}
ts->wakeup.notify_one();
give_back(tokens);
}
bool operator ()() { return acquired; }
/* Return `t` units of this token's reservation to the server without
   destroying the token. In the queue runner this releases the
   compression-overhead portion of the memory reservation once only the
   uncompressed NAR remains in memory. */
void give_back(size_t t)
{
debug("returning %d tokens", t);
/* Returning zero tokens is a no-op; skip locking and wakeup. */
if (!t) return;
assert(acquired);
/* Cannot give back more than this token still holds. */
assert(t <= tokens);
{
/* Decrement the server-wide in-use count and our own remaining
   count under the lock; the scope ends before notifying. */
auto inUse(ts->inUse.lock());
assert(*inUse >= t);
*inUse -= t;
tokens -= t;
}
/* Wake every waiter: they request differing amounts, so any one of
   them might now fit within maxTokens. */
// FIXME: inefficient. Should wake up waiters that can
// proceed now.
ts->wakeup.notify_all();
}
};
Token get(size_t tokens = 1, unsigned int timeout = 0)