Queue monitor: Bail out earlier if a step has failed previously

Currently, the hydra.nixos.org queue contains 1000s of Darwin builds
that all depend on a stdenv-darwin that previously failed. However,
before, first createStep() would construct a dependency graph for each
build, then getQueuedBuilds() would discover that one of the steps had
failed previously and discard all those steps. Since the graph
construction involves a lot of uncached calls to isValidPath(), this
took several seconds per build.

Now createStep() detects the previous failure right away and bails
out.
This commit is contained in:
Eelco Dolstra 2016-04-15 14:28:00 +02:00
parent 2f0f7406d4
commit 177bf25d64

View file

@ -63,6 +63,12 @@ void State::queueMonitorLoop()
} }
struct PreviousFailure : public std::exception {
Step::ptr step;
PreviousFailure(Step::ptr step) : step(step) { }
};
bool State::getQueuedBuilds(Connection & conn, ref<Store> localStore, bool State::getQueuedBuilds(Connection & conn, ref<Store> localStore,
ref<Store> destStore, unsigned int & lastBuildId) ref<Store> destStore, unsigned int & lastBuildId)
{ {
@ -139,8 +145,55 @@ bool State::getQueuedBuilds(Connection & conn, ref<Store> localStore,
} }
std::set<Step::ptr> newSteps; std::set<Step::ptr> newSteps;
Step::ptr step = createStep(destStore, conn, build, build->drvPath, Step::ptr step;
/* Create steps for this derivation and its dependencies. */
try {
step = createStep(destStore, conn, build, build->drvPath,
build, 0, finishedDrvs, newSteps, newRunnable); build, 0, finishedDrvs, newSteps, newRunnable);
} catch (PreviousFailure & ex) {
/* Some step previously failed, so mark the build as
failed right away. */
printMsg(lvlError, format("marking build %d as cached failure due to %s") % build->id % ex.step->drvPath);
if (!build->finishedInDB) {
auto mc = startDbUpdate();
pqxx::work txn(conn);
/* Find the previous build step record, first by
derivation path, then by output path. */
BuildID propagatedFrom = 0;
auto res = txn.parameterized
("select max(build) from BuildSteps where drvPath = $1 and startTime != 0 and stopTime != 0 and status = 1")
(ex.step->drvPath).exec();
if (!res[0][0].is_null()) propagatedFrom = res[0][0].as<BuildID>();
if (!propagatedFrom) {
for (auto & output : ex.step->drv.outputs) {
auto res = txn.parameterized
("select max(s.build) from BuildSteps s join BuildStepOutputs o on s.build = o.build where path = $1 and startTime != 0 and stopTime != 0 and status = 1")
(output.second.path).exec();
if (!res[0][0].is_null()) {
propagatedFrom = res[0][0].as<BuildID>();
break;
}
}
}
createBuildStep(txn, 0, build, ex.step, "", bsCachedFailure, "", propagatedFrom);
txn.parameterized
("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = 1 where id = $1 and finished = 0")
(build->id)
((int) (ex.step->drvPath == build->drvPath ? bsFailed : bsDepFailed))
(time(0)).exec();
txn.commit();
build->finishedInDB = true;
nrBuildsDone++;
}
return;
}
/* Some of the new steps may be the top level of builds that /* Some of the new steps may be the top level of builds that
we haven't processed yet. So do them now. This ensures that we haven't processed yet. So do them now. This ensures that
@ -174,53 +227,6 @@ bool State::getQueuedBuilds(Connection & conn, ref<Store> localStore,
return; return;
} }
/* If any step has a previously failed output path, then fail
the build right away. */
bool badStep = false;
for (auto & r : newSteps)
if (checkCachedFailure(r, conn)) {
printMsg(lvlError, format("marking build %1% as cached failure") % build->id);
if (!build->finishedInDB) {
auto mc = startDbUpdate();
pqxx::work txn(conn);
/* Find the previous build step record, first by
derivation path, then by output path. */
BuildID propagatedFrom = 0;
auto res = txn.parameterized
("select max(build) from BuildSteps where drvPath = $1 and startTime != 0 and stopTime != 0 and status = 1")
(r->drvPath).exec();
if (!res[0][0].is_null()) propagatedFrom = res[0][0].as<BuildID>();
if (!propagatedFrom) {
for (auto & output : r->drv.outputs) {
auto res = txn.parameterized
("select max(s.build) from BuildSteps s join BuildStepOutputs o on s.build = o.build where path = $1 and startTime != 0 and stopTime != 0 and status = 1")
(output.second.path).exec();
if (!res[0][0].is_null()) {
propagatedFrom = res[0][0].as<BuildID>();
break;
}
}
}
createBuildStep(txn, 0, build, r, "", bsCachedFailure, "", propagatedFrom);
txn.parameterized
("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = 1 where id = $1 and finished = 0")
(build->id)
((int) (step == r ? bsFailed : bsDepFailed))
(time(0)).exec();
txn.commit();
build->finishedInDB = true;
nrBuildsDone++;
}
badStep = true;
break;
}
if (badStep) return;
/* Note: if we exit this scope prior to this, the build and /* Note: if we exit this scope prior to this, the build and
all newly created steps are destroyed. */ all newly created steps are destroyed. */
@ -401,6 +407,10 @@ Step::ptr State::createStep(ref<Store> destStore,
} }
} }
/* If this derivation failed previously, give up. */
if (checkCachedFailure(step, conn))
throw PreviousFailure{step};
/* Are all outputs valid? */ /* Are all outputs valid? */
bool valid = true; bool valid = true;
PathSet outputs = step->drv.outputPaths(); PathSet outputs = step->drv.outputPaths();
@ -453,7 +463,6 @@ Step::ptr State::createStep(ref<Store> destStore,
/* No, we need to build. */ /* No, we need to build. */
printMsg(lvlDebug, format("creating build step %1%") % drvPath); printMsg(lvlDebug, format("creating build step %1%") % drvPath);
newSteps.insert(step);
/* Create steps for the dependencies. */ /* Create steps for the dependencies. */
for (auto & i : step->drv.inputDrvs) { for (auto & i : step->drv.inputDrvs) {
@ -474,6 +483,8 @@ Step::ptr State::createStep(ref<Store> destStore,
newRunnable.insert(step); newRunnable.insert(step);
} }
newSteps.insert(step);
return step; return step;
} }