From 35b7c4f82b69c9317fcdb4ec4b5a82fdab19f757 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 7 Jul 2015 14:04:36 +0200 Subject: [PATCH] Allow only 1 thread to send a closure to a given machine at the same time This prevents a race where multiple threads see that machine X is missing path P, and start sending it concurrently. Nix handles this correctly, but it's still wasteful (especially for the case where P == GHC). A more refined scheme would be to have per machine, per path locks. --- src/hydra-queue-runner/build-remote.cc | 18 +++--------------- src/hydra-queue-runner/hydra-queue-runner.cc | 1 - src/hydra-queue-runner/state.hh | 9 ++++----- 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index c87d040b..48970c3d 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -70,7 +70,7 @@ static void openConnection(const string & sshName, const string & sshKey, static void copyClosureTo(std::shared_ptr store, FdSource & from, FdSink & to, const PathSet & paths, - TokenServer & copyClosureTokenServer, counter & bytesSent, + counter & bytesSent, bool useSubstitutes = false) { PathSet closure; @@ -99,19 +99,6 @@ static void copyClosureTo(std::shared_ptr store, for (auto i = sorted.rbegin(); i != sorted.rend(); ++i) if (present.find(*i) == present.end()) missing.push_back(*i); - /* Ensure that only a limited number of threads can copy closures - at the same time. However, proceed anyway after a timeout to - prevent starvation by a handful of really huge closures. */ - time_t start = time(0); - int timeout = 60 * (10 + rand() % 5); - auto token(copyClosureTokenServer.get(timeout)); - time_t stop = time(0); - - if (token()) - printMsg(lvlDebug, format("got copy closure token after %1%s") % (stop - start)); - else - printMsg(lvlDebug, format("did not get copy closure token after %1%s") % (stop - start)); - printMsg(lvlDebug, format("sending %1% missing paths") % missing.size()); for (auto & p : missing) @@ -194,7 +181,8 @@ void State::buildRemote(std::shared_ptr store, if (machine->sshName != "localhost") { printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % step->drvPath % machine->sshName); MaintainCount mc(nrStepsCopyingTo); - copyClosureTo(store, from, to, inputs, copyClosureTokenServer, bytesSent); + std::lock_guard sendLock(machine->state->sendLock); + copyClosureTo(store, from, to, inputs, bytesSent); } autoDelete.cancel(); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 27e18212..892e8103 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -32,7 +32,6 @@ bool has(const C & c, const V & v) State::State() - : copyClosureTokenServer{maxParallelCopyClosure} { hydraData = getEnv("HYDRA_DATA"); if (hydraData == "") throw Error("$HYDRA_DATA must be set"); diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh index 525db44e..b439a6b8 100644 --- a/src/hydra-queue-runner/state.hh +++ b/src/hydra-queue-runner/state.hh @@ -12,7 +12,6 @@ #include "pathlocks.hh" #include "pool.hh" #include "sync.hh" -#include "token-server.hh" #include "store-api.hh" #include "derivations.hh" @@ -136,6 +135,10 @@ struct Machine counter nrStepsDone{0}; counter totalStepTime{0}; // total time for steps, including closure copying counter totalStepBuildTime{0}; // total build time for steps + + /* Mutex to prevent multiple threads from sending data to the + same machine (which would be inefficient). */ + std::mutex sendLock; }; State::ptr state; @@ -191,10 +194,6 @@ private: nix::Path machinesFile; struct stat machinesFileStat; - /* Token server limiting the number of threads copying closures in - parallel to prevent excessive I/O load. */ - TokenServer copyClosureTokenServer; - /* Various stats. */ time_t startedAt; counter nrBuildsRead{0};