forked from lix-project/hydra
Temporarily disable machines on any exception, not just connection failures
This commit is contained in:
parent
0aecd65e59
commit
ddc9f3cc6a
|
@@ -130,6 +130,8 @@ void State::buildRemote(ref<Store> destStore,
|
||||||
nix::Path tmpDir = createTempDir();
|
nix::Path tmpDir = createTempDir();
|
||||||
AutoDelete tmpDirDel(tmpDir, true);
|
AutoDelete tmpDirDel(tmpDir, true);
|
||||||
|
|
||||||
|
try {
|
||||||
|
|
||||||
Child child;
|
Child child;
|
||||||
openConnection(machine, tmpDir, logFD, child);
|
openConnection(machine, tmpDir, logFD, child);
|
||||||
|
|
||||||
|
@@ -162,24 +164,6 @@ void State::buildRemote(ref<Store> destStore,
|
||||||
|
|
||||||
} catch (EndOfFile & e) {
|
} catch (EndOfFile & e) {
|
||||||
child.pid.wait(true);
|
child.pid.wait(true);
|
||||||
|
|
||||||
{
|
|
||||||
/* Disable this machine until a certain period of time has
|
|
||||||
passed. This period increases on every consecutive
|
|
||||||
failure. However, don't count failures that occurred
|
|
||||||
soon after the last one (to take into account steps
|
|
||||||
started in parallel). */
|
|
||||||
auto info(machine->state->connectInfo.lock());
|
|
||||||
auto now = std::chrono::system_clock::now();
|
|
||||||
if (info->consecutiveFailures == 0 || info->lastFailure < now - std::chrono::seconds(30)) {
|
|
||||||
info->consecutiveFailures = std::min(info->consecutiveFailures + 1, (unsigned int) 4);
|
|
||||||
info->lastFailure = now;
|
|
||||||
int delta = retryInterval * powf(retryBackoff, info->consecutiveFailures - 1) + (rand() % 30);
|
|
||||||
printMsg(lvlInfo, format("will disable machine ‘%1%’ for %2%s") % machine->sshName % delta);
|
|
||||||
info->disabledUntil = now + std::chrono::seconds(delta);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
string s = chomp(readFile(result.logFile));
|
string s = chomp(readFile(result.logFile));
|
||||||
throw Error(format("cannot connect to ‘%1%’: %2%") % machine->sshName % s);
|
throw Error(format("cannot connect to ‘%1%’: %2%") % machine->sshName % s);
|
||||||
}
|
}
|
||||||
|
@@ -390,4 +374,22 @@ void State::buildRemote(ref<Store> destStore,
|
||||||
/* Shut down the connection. */
|
/* Shut down the connection. */
|
||||||
child.to.close();
|
child.to.close();
|
||||||
child.pid.wait(true);
|
child.pid.wait(true);
|
||||||
|
|
||||||
|
} catch (Error & e) {
|
||||||
|
/* Disable this machine until a certain period of time has
|
||||||
|
passed. This period increases on every consecutive
|
||||||
|
failure. However, don't count failures that occurred soon
|
||||||
|
after the last one (to take into account steps started in
|
||||||
|
parallel). */
|
||||||
|
auto info(machine->state->connectInfo.lock());
|
||||||
|
auto now = std::chrono::system_clock::now();
|
||||||
|
if (info->consecutiveFailures == 0 || info->lastFailure < now - std::chrono::seconds(30)) {
|
||||||
|
info->consecutiveFailures = std::min(info->consecutiveFailures + 1, (unsigned int) 4);
|
||||||
|
info->lastFailure = now;
|
||||||
|
int delta = retryInterval * powf(retryBackoff, info->consecutiveFailures - 1) + (rand() % 30);
|
||||||
|
printMsg(lvlInfo, format("will disable machine ‘%1%’ for %2%s") % machine->sshName % delta);
|
||||||
|
info->disabledUntil = now + std::chrono::seconds(delta);
|
||||||
|
}
|
||||||
|
throw;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue