0
0
Fork 0
forked from lix-project/hydra

Step cancellation: Don't use pthread_cancel()

This was a bad idea because pthread_cancel() is unsalvageable broken
in C++. Destructors are not allowed to throw exceptions (especially in
C++11), but pthread_cancel() can cause a __cxxabiv1::__forced_unwind
exception inside any destructor that invokes a cancellation
point. (This exception can be caught but *must* be rethrown.) So let's
just kill the builder process instead.
This commit is contained in:
Eelco Dolstra 2016-11-07 19:34:35 +01:00
parent 95aa1f0590
commit 7863d2e1da
4 changed files with 57 additions and 33 deletions

View file

@ -117,7 +117,7 @@ static void copyClosureTo(ref<Store> destStore,
void State::buildRemote(ref<Store> destStore,
Machine::ptr machine, Step::ptr step,
unsigned int maxSilentTime, unsigned int buildTimeout,
RemoteResult & result)
RemoteResult & result, std::shared_ptr<ActiveStep> activeStep)
{
assert(BuildResult::TimedOut == 8);
@ -138,6 +138,24 @@ void State::buildRemote(ref<Store> destStore,
Child child;
openConnection(machine, tmpDir, logFD.get(), child);
{
auto activeStepState(activeStep->state_.lock());
if (activeStepState->cancelled) throw Error("step cancelled");
activeStepState->pid = child.pid;
}
Finally clearPid([&]() {
auto activeStepState(activeStep->state_.lock());
activeStepState->pid = -1;
/* FIXME: there is a slight race here with step
cancellation in State::processQueueChange(), which
could call kill() on this pid after we've done waitpid()
on it. With pid wrap-around, there is a tiny
possibility that we end up killing another
process. Meh. */
});
FdSource from(child.from.get());
FdSink to(child.to.get());

View file

@ -15,11 +15,9 @@ void State::builder(MachineReservation::ptr reservation)
auto activeStep = std::make_shared<ActiveStep>();
activeStep->step = reservation->step;
activeStep->threadId = pthread_self();
activeSteps_.lock()->insert(activeStep);
Finally removeActiveStep([&]() {
activeStep->threadId = -1;
activeSteps_.lock()->erase(activeStep);
});
@ -27,7 +25,7 @@ void State::builder(MachineReservation::ptr reservation)
try {
auto destStore = getDestStore();
res = doBuildStep(destStore, step, reservation->machine);
res = doBuildStep(destStore, reservation, activeStep);
} catch (std::exception & e) {
printMsg(lvlError, format("uncaught exception building %1% on %2%: %3%")
% step->drvPath % reservation->machine->sshName % e.what());
@ -56,9 +54,13 @@ void State::builder(MachineReservation::ptr reservation)
}
State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
Machine::ptr machine)
State::StepResult State::doBuildStep(nix::ref<Store> destStore,
MachineReservation::ptr reservation,
std::shared_ptr<ActiveStep> activeStep)
{
auto & step(reservation->step);
auto & machine(reservation->machine);
{
auto step_(step->state.lock());
assert(step_->created);
@ -156,26 +158,19 @@ State::StepResult State::doBuildStep(nix::ref<Store> destStore, Step::ptr step,
/* Do the build. */
try {
/* FIXME: referring builds may have conflicting timeouts. */
buildRemote(destStore, machine, step, maxSilentTime, buildTimeout, result);
buildRemote(destStore, machine, step, maxSilentTime, buildTimeout, result, activeStep);
} catch (NoTokens & e) {
result.stepStatus = bsNarSizeLimitExceeded;
} catch (Error & e) {
result.stepStatus = bsAborted;
result.errorMsg = e.msg();
result.canRetry = true;
} catch (__cxxabiv1::__forced_unwind & e) {
/* The queue monitor thread cancelled this step. */
try {
if (activeStep->state_.lock()->cancelled) {
printInfo("marking step %d of build %d as cancelled", stepNr, buildId);
pqxx::work txn(*conn);
finishBuildStep(txn, result.startTime, time(0), result.overhead, buildId,
stepNr, machine->sshName, bsCancelled, "");
txn.commit();
stepFinished = true;
} catch (...) {
ignoreException();
result.stepStatus = bsCancelled;
result.canRetry = false;
} else {
result.stepStatus = bsAborted;
result.errorMsg = e.msg();
result.canRetry = true;
}
throw;
}
if (result.stepStatus == bsSuccess)

View file

@ -337,20 +337,23 @@ void State::processQueueChange(Connection & conn)
{
auto activeSteps(activeSteps_.lock());
for (auto & activeStep : *activeSteps) {
auto threadId = activeStep->threadId; // FIXME: use Sync or atomic?
if (threadId == 0) continue;
std::set<Build::ptr> dependents;
std::set<Step::ptr> steps;
getDependents(activeStep->step, dependents, steps);
if (!dependents.empty()) continue;
printInfo("cancelling thread for build step %s", activeStep->step->drvPath);
int err = pthread_cancel(threadId);
if (err)
printError("error cancelling thread for build step %s: %s",
activeStep->step->drvPath, strerror(err));
{
auto activeStepState(activeStep->state_.lock());
if (activeStepState->cancelled) continue;
activeStepState->cancelled = true;
if (activeStepState->pid != -1) {
printInfo("killing builder process %d of build step %s",
activeStepState->pid, activeStep->step->drvPath);
if (kill(activeStepState->pid, SIGINT) == -1)
printError("error killing build step %s: %s",
activeStep->step->drvPath, strerror(errno));
}
}
}
}
}

View file

@ -377,7 +377,14 @@ private:
struct ActiveStep
{
Step::ptr step;
pthread_t threadId;
struct State
{
pid_t pid = -1;
bool cancelled = false;
};
nix::Sync<State> state_;
};
nix::Sync<std::set<std::shared_ptr<ActiveStep>>> activeSteps_;
@ -476,12 +483,13 @@ private:
retried. */
enum StepResult { sDone, sRetry, sMaybeCancelled };
StepResult doBuildStep(nix::ref<nix::Store> destStore,
Step::ptr step, Machine::ptr machine);
MachineReservation::ptr reservation,
std::shared_ptr<ActiveStep> activeStep);
void buildRemote(nix::ref<nix::Store> destStore,
Machine::ptr machine, Step::ptr step,
unsigned int maxSilentTime, unsigned int buildTimeout,
RemoteResult & result);
RemoteResult & result, std::shared_ptr<ActiveStep> activeStep);
void markSucceededBuild(pqxx::work & txn, Build::ptr build,
const BuildOutput & res, bool isCachedBuild, time_t startTime, time_t stopTime);