diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 722e2f0a..d5a75d69 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -160,10 +160,33 @@ void State::buildRemote(std::shared_ptr store, sendDerivation = false; } catch (EndOfFile & e) { child.pid.wait(true); + + { + /* Disable this machine until a certain period of time has + passed. This period increases on every consecutive + failure. However, don't count failures that occurred + soon after the last one (to take into account steps + started in parallel). */ + auto info(machine->state->connectInfo.lock()); + auto now = std::chrono::system_clock::now(); + if (info->consecutiveFailures == 0 || info->lastFailure < now - std::chrono::seconds(30)) { + info->consecutiveFailures = std::min(info->consecutiveFailures + 1, (unsigned int) 4); + info->lastFailure = now; + int delta = retryInterval * powf(retryBackoff, info->consecutiveFailures - 1) + (rand() % 30); + printMsg(lvlInfo, format("will disable machine ‘%1%’ for %2%s") % machine->sshName % delta); + info->disabledUntil = now + std::chrono::seconds(delta); + } + } + string s = chomp(readFile(result.logFile)); throw Error(format("cannot connect to ‘%1%’: %2%") % machine->sshName % s); } + { + auto info(machine->state->connectInfo.lock()); + info->consecutiveFailures = 0; + } + /* Gather the inputs. If the remote side is Nix <= 1.9, we have to copy the entire closure of ‘drvPath’, as well the required outputs of the input derivations. On Nix > 1.9, we only need to diff --git a/src/hydra-queue-runner/builder.cc b/src/hydra-queue-runner/builder.cc index 540a61d6..56922719 100644 --- a/src/hydra-queue-runner/builder.cc +++ b/src/hydra-queue-runner/builder.cc @@ -33,7 +33,7 @@ void State::builder(Step::ptr step, Machine::ptr machine, std::shared_ptrtries++; nrRetries++; if (step_->tries > maxNrRetries) maxNrRetries = step_->tries; // yeah yeah, not atomic - int delta = retryInterval * powf(retryBackoff, step_->tries - 1); + int delta = retryInterval * powf(retryBackoff, step_->tries - 1) + (rand() % 10); printMsg(lvlInfo, format("will retry ‘%1%’ after %2%s") % step->drvPath % delta); step_->after = std::chrono::system_clock::now() + std::chrono::seconds(delta); } diff --git a/src/hydra-queue-runner/dispatcher.cc b/src/hydra-queue-runner/dispatcher.cc index bd4db86d..0c401e87 100644 --- a/src/hydra-queue-runner/dispatcher.cc +++ b/src/hydra-queue-runner/dispatcher.cc @@ -36,9 +36,12 @@ void State::dispatcher() bool keepGoing; do { + system_time now = std::chrono::system_clock::now(); + /* Copy the currentJobs field of each machine. This is necessary to ensure that the sort comparator below is - an ordering. std::sort() can segfault if it isn't. */ + an ordering. std::sort() can segfault if it isn't. Also + filter out temporarily disabled machines. */ struct MachineInfo { Machine::ptr machine; @@ -47,8 +50,15 @@ void State::dispatcher() std::vector machinesSorted; { auto machines_(machines.lock()); - for (auto & m : *machines_) + for (auto & m : *machines_) { + auto info(m.second->state->connectInfo.lock()); + if (info->consecutiveFailures && info->disabledUntil > now) { + if (info->disabledUntil < sleepUntil) + sleepUntil = info->disabledUntil; + continue; + } machinesSorted.push_back({m.second, m.second->state->currentJobs}); + } } /* Sort the machines by a combination of speed factor and @@ -77,7 +87,6 @@ void State::dispatcher() on it. Once we find such a pair, we restart the outer loop because the machine sorting will have changed. */ keepGoing = false; - system_time now = std::chrono::system_clock::now(); for (auto & mi : machinesSorted) { // FIXME: can we lose a wakeup if a builder exits concurrently? diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh index fa352c6e..761a100b 100644 --- a/src/hydra-queue-runner/state.hh +++ b/src/hydra-queue-runner/state.hh @@ -137,6 +137,13 @@ struct Machine counter totalStepTime{0}; // total time for steps, including closure copying counter totalStepBuildTime{0}; // total build time for steps + struct ConnectInfo + { + system_time lastFailure, disabledUntil; + unsigned int consecutiveFailures; + }; + Sync connectInfo; + /* Mutex to prevent multiple threads from sending data to the same machine (which would be inefficient). */ std::mutex sendLock;