From b4d32a30855e1350170afd3fd2c35c9ee9f70128 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 16 Nov 2016 17:46:00 +0100 Subject: [PATCH] hydra-queue-runner: More accurate memory accounting We now take into account the memory necessary for compressing the NAR being exported to the binary cache, plus xz compression overhead. Also, we now release the memory tokens for the NAR accessor *after* releasing the NAR accessor. Previously the memory for the NAR accessor might still be in use while another thread does an allocation, causing the maximum to be exceeded temporarily. Also, use notify_all instead of notify_one to wake up memory token waiters. This is not very nice, but not every waiter is requesting the same number of tokens, so some might be able to proceed. --- src/hydra-queue-runner/build-remote.cc | 14 ++++++++--- src/hydra-queue-runner/builder.cc | 3 +++ src/hydra-queue-runner/state.hh | 1 + src/hydra-queue-runner/token-server.hh | 32 ++++++++++++++++++++------ 4 files changed, 40 insertions(+), 10 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 65452be3..9466f009 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -373,10 +373,13 @@ void State::buildRemote(ref destStore, % step->drvPath % machine->sshName % totalNarSize); /* Block until we have the required amount of memory - available. FIXME: only need this for binary cache - destination stores. */ + available, which is twice the NAR size (namely the + uncompressed and worst-case compressed NAR), plus 150 + MB for xz compression overhead. (The xz manpage claims + ~94 MiB, but that's not was I'm seeing.) */ auto resStart = std::chrono::steady_clock::now(); - auto memoryReservation(memoryTokens.get(totalNarSize)); + size_t compressionCost = totalNarSize + 150 * 1024 * 1024; + result.tokens = std::make_unique(memoryTokens.get(totalNarSize + compressionCost)); auto resStop = std::chrono::steady_clock::now(); auto resMs = std::chrono::duration_cast(resStop - resStart).count(); @@ -390,6 +393,11 @@ void State::buildRemote(ref destStore, to.flush(); destStore->importPaths(from, result.accessor, true); + /* Release the tokens pertaining to NAR + compression. After this we only have the uncompressed + NAR in memory. */ + result.tokens->give_back(compressionCost); + auto now2 = std::chrono::steady_clock::now(); result.overhead += std::chrono::duration_cast(now2 - now1).count(); diff --git a/src/hydra-queue-runner/builder.cc b/src/hydra-queue-runner/builder.cc index f943ea6e..4e8f6cf3 100644 --- a/src/hydra-queue-runner/builder.cc +++ b/src/hydra-queue-runner/builder.cc @@ -179,6 +179,9 @@ State::StepResult State::doBuildStep(nix::ref destStore, if (result.stepStatus == bsSuccess) res = getBuildOutput(destStore, ref(result.accessor), step->drv); + + result.accessor = 0; + result.tokens = 0; } time_t stepStopTime = time(0); diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh index b02491f2..347bbbb7 100644 --- a/src/hydra-queue-runner/state.hh +++ b/src/hydra-queue-runner/state.hh @@ -50,6 +50,7 @@ struct RemoteResult time_t startTime = 0, stopTime = 0; unsigned int overhead = 0; nix::Path logFile; + std::unique_ptr tokens; std::shared_ptr accessor; BuildStatus buildStatus() diff --git a/src/hydra-queue-runner/token-server.hh b/src/hydra-queue-runner/token-server.hh index 6c0cf716..7c6310ef 100644 --- a/src/hydra-queue-runner/token-server.hh +++ b/src/hydra-queue-runner/token-server.hh @@ -40,6 +40,7 @@ public: { if (tokens >= ts->maxTokens) throw NoTokens(format("requesting more tokens (%d) than exist (%d)") % tokens % ts->maxTokens); + debug("acquiring %d tokens", tokens); auto inUse(ts->inUse.lock()); while (*inUse + tokens > ts->maxTokens) if (timeout) { @@ -54,21 +55,38 @@ public: public: - Token(Token && t) : ts(t.ts) { t.ts = 0; } + Token(Token && t) : ts(t.ts), tokens(t.tokens), acquired(t.acquired) + { + t.ts = 0; + t.acquired = false; + } Token(const Token & l) = delete; ~Token() { if (!ts || !acquired) return; - { - auto inUse(ts->inUse.lock()); - assert(*inUse >= tokens); - *inUse -= tokens; - } - ts->wakeup.notify_one(); + give_back(tokens); } bool operator ()() { return acquired; } + + void give_back(size_t t) + { + debug("returning %d tokens", t); + if (!t) return; + assert(acquired); + assert(t <= tokens); + { + auto inUse(ts->inUse.lock()); + assert(*inUse >= t); + *inUse -= t; + tokens -= t; + } + // FIXME: inefficient. Should wake up waiters that can + // proceed now. + ts->wakeup.notify_all(); + } + }; Token get(size_t tokens = 1, unsigned int timeout = 0)