Rate-limit the number of threads copying closures at the same time

Having a hundred threads doing I/O at the same time is bad on magnetic
disks because of the excessive disk seeks. So allow only 4 threads to
copy closures in parallel.
This commit is contained in:
Eelco Dolstra 2015-06-23 01:49:14 +02:00
parent a317d24b29
commit 4db7c51b5c
5 changed files with 106 additions and 4 deletions

View file

@ -60,6 +60,7 @@ static void openConnection(const string & sshName, const string & sshKey,
static void copyClosureTo(std::shared_ptr<StoreAPI> store,
FdSource & from, FdSink & to, const PathSet & paths,
TokenServer & copyClosureTokenServer,
bool useSubstitutes = false)
{
PathSet closure;
@ -88,6 +89,19 @@ static void copyClosureTo(std::shared_ptr<StoreAPI> store,
for (auto i = sorted.rbegin(); i != sorted.rend(); ++i)
if (present.find(*i) == present.end()) missing.push_back(*i);
/* Ensure that only a limited number of threads can copy closures
at the same time. However, proceed anyway after a timeout to
prevent starvation by a handful of really huge closures. */
time_t start = time(0);
int timeout = 60 * (10 + rand() % 5);
auto token(copyClosureTokenServer.get(timeout));
time_t stop = time(0);
if (token())
printMsg(lvlDebug, format("got copy closure token after %1%s") % (stop - start));
else
printMsg(lvlDebug, format("did not get copy closure token after %1%s") % (stop - start));
printMsg(lvlDebug, format("sending %1% missing paths") % missing.size());
writeInt(cmdImportPaths, to);
@ -114,6 +128,7 @@ void buildRemote(std::shared_ptr<StoreAPI> store,
const string & sshName, const string & sshKey,
const Path & drvPath, const Derivation & drv,
const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout,
TokenServer & copyClosureTokenServer,
RemoteResult & result, counter & nrStepsBuilding)
{
string base = baseNameOf(drvPath);
@ -163,7 +178,7 @@ void buildRemote(std::shared_ptr<StoreAPI> store,
/* Copy the input closure. */
printMsg(lvlDebug, format("sending closure of %1% to %2%") % drvPath % sshName);
copyClosureTo(store, from, to, inputs);
copyClosureTo(store, from, to, inputs, copyClosureTokenServer);
autoDelete.cancel();

View file

@ -4,6 +4,7 @@
#include "derivations.hh"
#include "counter.hh"
#include "token-server.hh"
struct RemoteResult
{
@ -22,4 +23,5 @@ void buildRemote(std::shared_ptr<nix::StoreAPI> store,
const std::string & sshName, const std::string & sshKey,
const nix::Path & drvPath, const nix::Derivation & drv,
const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout,
TokenServer & copyClosureTokenServer,
RemoteResult & result, counter & nrStepsBuilding);

View file

@ -20,6 +20,7 @@
#include "sync.hh"
#include "pool.hh"
#include "counter.hh"
#include "token-server.hh"
#include "store-api.hh"
#include "derivations.hh"
@ -31,9 +32,11 @@
using namespace nix;
const int maxTries = 5;
const int retryInterval = 60; // seconds
// FIXME: Make configurable.
const unsigned int maxTries = 5;
const unsigned int retryInterval = 60; // seconds
const float retryBackoff = 3.0;
const unsigned int maxParallelCopyClosure = 4;
typedef std::chrono::time_point<std::chrono::system_clock> system_time;
@ -243,6 +246,10 @@ private:
typedef std::list<Machine::ptr> Machines;
Sync<Machines> machines;
/* Token server limiting the number of threads copying closures in
parallel to prevent excessive I/O load. */
TokenServer copyClosureTokenServer{maxParallelCopyClosure};
/* Various stats. */
time_t startedAt;
counter nrBuildsRead{0};
@ -1100,7 +1107,8 @@ bool State::doBuildStep(std::shared_ptr<StoreAPI> store, Step::ptr step,
try {
/* FIXME: referring builds may have conflicting timeouts. */
buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv,
logDir, build->maxSilentTime, build->buildTimeout, result, nrStepsBuilding);
logDir, build->maxSilentTime, build->buildTimeout, copyClosureTokenServer,
result, nrStepsBuilding);
} catch (Error & e) {
result.status = RemoteResult::rrMiscFailure;
result.errorMsg = e.msg();

View file

@ -2,6 +2,7 @@
#include <mutex>
#include <condition_variable>
#include <cassert>
/* This template class ensures synchronized access to a value of type
T. It is used as follows:
@ -50,6 +51,15 @@ public:
assert(s);
cv.wait(s->mutex);
}
/* Wait on `cv` while holding this lock, for at most `duration`,
   re-checking `pred` on every wakeup (which also absorbs spurious
   wakeups). Returns the final value of pred(): false means the
   timeout elapsed with the predicate still unsatisfied. The lock
   must be held on entry (asserted); condition_variable_any releases
   and re-acquires the underlying mutex around the wait. */
template<class Rep, class Period, class Predicate>
bool wait_for(std::condition_variable_any & cv,
const std::chrono::duration<Rep, Period> & duration,
Predicate pred)
{
assert(s);
/* condition_variable_any accepts any BasicLockable, so the raw
   mutex (not a unique_lock) can be passed directly. */
return cv.wait_for(s->mutex, duration, pred);
}
};
Lock lock() { return Lock(this); }

View file

@ -0,0 +1,67 @@
#pragma once

#include <atomic>
#include <cassert>
#include <chrono>
#include <condition_variable>

#include "sync.hh"

/* This class hands out tokens. There are only maxTokens tokens
   available. Calling get() will return a Token object, representing
   ownership of a token. If no token is available, get() will sleep
   until another thread returns a token, or (if a non-zero timeout in
   seconds is given) until the timeout expires — in which case the
   returned Token reports false from operator(). */

class TokenServer
{
    /* Maximum number of tokens that may be outstanding at once. */
    unsigned int maxTokens;

    /* Number of tokens currently handed out, protected by Sync. */
    Sync<unsigned int> curTokens{0};

    /* Signalled whenever a token is returned. */
    std::condition_variable_any wakeup;

public:
    TokenServer(unsigned int maxTokens) : maxTokens(maxTokens) { }

    /* RAII ownership of one token: acquired in the constructor,
       returned to the server in the destructor. Movable but not
       copyable. */
    class Token
    {
        friend TokenServer;

        TokenServer * ts;

        /* Whether this object actually holds a token; stays false if
           the acquisition timed out. */
        bool acquired = false;

        Token(TokenServer * ts, unsigned int timeout) : ts(ts)
        {
            auto curTokens(ts->curTokens.lock());
            while (*curTokens >= ts->maxTokens)
                if (timeout) {
                    /* Bounded wait: give up (leaving acquired ==
                       false) if no token frees up within the
                       timeout. */
                    if (!curTokens.wait_for(ts->wakeup, std::chrono::seconds(timeout),
                            [&]() { return *curTokens < ts->maxTokens; }))
                        return;
                } else
                    curTokens.wait(ts->wakeup);
            (*curTokens)++;
            acquired = true;
        }

    public:

        Token(Token && t) : ts(t.ts), acquired(t.acquired)
        {
            /* Transfer ownership: the moved-from token must neither
               report ownership nor return the token on destruction.
               (Previously `acquired` was not copied, so a moved
               token was lost forever, permanently shrinking the
               pool.) */
            t.ts = 0;
            t.acquired = false;
        }
        Token(const Token & l) = delete;

        ~Token()
        {
            if (!ts || !acquired) return;
            {
                auto curTokens(ts->curTokens.lock());
                assert(*curTokens);
                (*curTokens)--;
            }
            /* Notify outside the lock to avoid waking a thread that
               would immediately block on the mutex. */
            ts->wakeup.notify_one();
        }

        /* True iff this object holds a token. */
        bool operator ()() { return acquired; }
    };

    /* Acquire a token, waiting at most `timeout` seconds (0 = wait
       forever). Callers must check the result with operator() when a
       timeout is given. */
    Token get(unsigned int timeout = 0)
    {
        return Token(this, timeout);
    }
};