Temporarily disable machines on any exception, not just connection failures

This commit is contained in:
Eelco Dolstra 2016-03-22 16:54:40 +01:00
parent 0aecd65e59
commit ddc9f3cc6a

View file

@ -130,6 +130,8 @@ void State::buildRemote(ref<Store> destStore,
nix::Path tmpDir = createTempDir(); nix::Path tmpDir = createTempDir();
AutoDelete tmpDirDel(tmpDir, true); AutoDelete tmpDirDel(tmpDir, true);
try {
Child child; Child child;
openConnection(machine, tmpDir, logFD, child); openConnection(machine, tmpDir, logFD, child);
@ -162,24 +164,6 @@ void State::buildRemote(ref<Store> destStore,
} catch (EndOfFile & e) { } catch (EndOfFile & e) {
child.pid.wait(true); child.pid.wait(true);
{
/* Disable this machine until a certain period of time has
passed. This period increases on every consecutive
failure. However, don't count failures that occurred
soon after the last one (to take into account steps
started in parallel). */
auto info(machine->state->connectInfo.lock());
auto now = std::chrono::system_clock::now();
if (info->consecutiveFailures == 0 || info->lastFailure < now - std::chrono::seconds(30)) {
info->consecutiveFailures = std::min(info->consecutiveFailures + 1, (unsigned int) 4);
info->lastFailure = now;
int delta = retryInterval * powf(retryBackoff, info->consecutiveFailures - 1) + (rand() % 30);
printMsg(lvlInfo, format("will disable machine %1% for %2%s") % machine->sshName % delta);
info->disabledUntil = now + std::chrono::seconds(delta);
}
}
string s = chomp(readFile(result.logFile)); string s = chomp(readFile(result.logFile));
throw Error(format("cannot connect to %1%: %2%") % machine->sshName % s); throw Error(format("cannot connect to %1%: %2%") % machine->sshName % s);
} }
@ -390,4 +374,22 @@ void State::buildRemote(ref<Store> destStore,
/* Shut down the connection. */ /* Shut down the connection. */
child.to.close(); child.to.close();
child.pid.wait(true); child.pid.wait(true);
} catch (Error & e) {
/* Disable this machine until a certain period of time has
passed. This period increases on every consecutive
failure. However, don't count failures that occurred soon
after the last one (to take into account steps started in
parallel). */
auto info(machine->state->connectInfo.lock());
auto now = std::chrono::system_clock::now();
if (info->consecutiveFailures == 0 || info->lastFailure < now - std::chrono::seconds(30)) {
info->consecutiveFailures = std::min(info->consecutiveFailures + 1, (unsigned int) 4);
info->lastFailure = now;
int delta = retryInterval * powf(retryBackoff, info->consecutiveFailures - 1) + (rand() % 30);
printMsg(lvlInfo, format("will disable machine %1% for %2%s") % machine->sshName % delta);
info->disabledUntil = now + std::chrono::seconds(delta);
}
throw;
}
} }