forked from lix-project/hydra
Temporarily disable machines on any exception, not just connection failures
This commit is contained in:
parent
0aecd65e59
commit
ddc9f3cc6a
|
@ -130,6 +130,8 @@ void State::buildRemote(ref<Store> destStore,
|
|||
nix::Path tmpDir = createTempDir();
|
||||
AutoDelete tmpDirDel(tmpDir, true);
|
||||
|
||||
try {
|
||||
|
||||
Child child;
|
||||
openConnection(machine, tmpDir, logFD, child);
|
||||
|
||||
|
@ -162,24 +164,6 @@ void State::buildRemote(ref<Store> destStore,
|
|||
|
||||
} catch (EndOfFile & e) {
|
||||
child.pid.wait(true);
|
||||
|
||||
{
|
||||
/* Disable this machine until a certain period of time has
|
||||
passed. This period increases on every consecutive
|
||||
failure. However, don't count failures that occurred
|
||||
soon after the last one (to take into account steps
|
||||
started in parallel). */
|
||||
auto info(machine->state->connectInfo.lock());
|
||||
auto now = std::chrono::system_clock::now();
|
||||
if (info->consecutiveFailures == 0 || info->lastFailure < now - std::chrono::seconds(30)) {
|
||||
info->consecutiveFailures = std::min(info->consecutiveFailures + 1, (unsigned int) 4);
|
||||
info->lastFailure = now;
|
||||
int delta = retryInterval * powf(retryBackoff, info->consecutiveFailures - 1) + (rand() % 30);
|
||||
printMsg(lvlInfo, format("will disable machine ‘%1%’ for %2%s") % machine->sshName % delta);
|
||||
info->disabledUntil = now + std::chrono::seconds(delta);
|
||||
}
|
||||
}
|
||||
|
||||
string s = chomp(readFile(result.logFile));
|
||||
throw Error(format("cannot connect to ‘%1%’: %2%") % machine->sshName % s);
|
||||
}
|
||||
|
@ -390,4 +374,22 @@ void State::buildRemote(ref<Store> destStore,
|
|||
/* Shut down the connection. */
|
||||
child.to.close();
|
||||
child.pid.wait(true);
|
||||
|
||||
} catch (Error & e) {
|
||||
/* Disable this machine until a certain period of time has
|
||||
passed. This period increases on every consecutive
|
||||
failure. However, don't count failures that occurred soon
|
||||
after the last one (to take into account steps started in
|
||||
parallel). */
|
||||
auto info(machine->state->connectInfo.lock());
|
||||
auto now = std::chrono::system_clock::now();
|
||||
if (info->consecutiveFailures == 0 || info->lastFailure < now - std::chrono::seconds(30)) {
|
||||
info->consecutiveFailures = std::min(info->consecutiveFailures + 1, (unsigned int) 4);
|
||||
info->lastFailure = now;
|
||||
int delta = retryInterval * powf(retryBackoff, info->consecutiveFailures - 1) + (rand() % 30);
|
||||
printMsg(lvlInfo, format("will disable machine ‘%1%’ for %2%s") % machine->sshName % delta);
|
||||
info->disabledUntil = now + std::chrono::seconds(delta);
|
||||
}
|
||||
throw;
|
||||
}
|
||||
}
|
||||
|
|
Loading…
Reference in a new issue