From 5370be9f52ada02b99ada6e92bb188b25de618f1 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 21 Jul 2015 01:45:00 +0200 Subject: [PATCH] hydra-queue-runner: Use cmdBuildDerivation See https://github.com/NixOS/nix/commit/1511aa9f488ba0762c2da0bf8ab61b5fde47305d and https://github.com/NixOS/nix/commit/eda2f36c2ac847e02e871c327e7633693d92cd8d. --- src/hydra-queue-runner/build-remote.cc | 75 +++++++++++++------- src/hydra-queue-runner/build-result.cc | 4 +- src/hydra-queue-runner/build-result.hh | 4 +- src/hydra-queue-runner/hydra-queue-runner.cc | 42 ++++++----- src/hydra-queue-runner/state.hh | 18 +++-- 5 files changed, 85 insertions(+), 58 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 0d299cc2..722e2f0a 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -81,10 +81,7 @@ static void copyClosureTo(std::shared_ptr store, enabled. This prevents a race where the remote host garbage-collect paths that are already there. Optionally, ask the remote host to substitute missing paths. */ - writeInt(cmdQueryValidPaths, to); - writeInt(1, to); // == lock paths - writeInt(useSubstitutes, to); - writeStrings(closure, to); + to << cmdQueryValidPaths << 1 << useSubstitutes << closure; to.flush(); /* Get back the set of paths that are already valid on the remote @@ -104,7 +101,7 @@ static void copyClosureTo(std::shared_ptr store, for (auto & p : missing) bytesSent += store->queryPathInfo(p).narSize; - writeInt(cmdImportPaths, to); + to << cmdImportPaths; exportPaths(*store, missing, false, to); to.flush(); @@ -116,9 +113,7 @@ static void copyClosureTo(std::shared_ptr store, static void copyClosureFrom(std::shared_ptr store, FdSource & from, FdSink & to, const PathSet & paths, counter & bytesReceived) { - writeInt(cmdExportPaths, to); - writeInt(0, to); // == don't sign - writeStrings(paths, to); + to << cmdExportPaths << 0 << paths; to.flush(); store->importPaths(false, from); @@ -150,9 +145,9 @@ void State::buildRemote(std::shared_ptr store, FdSink to(child.to); /* Handshake. */ + bool sendDerivation = true; try { - writeInt(SERVE_MAGIC_1, to); - writeInt(SERVE_PROTOCOL_VERSION, to); + to << SERVE_MAGIC_1 << SERVE_PROTOCOL_VERSION; to.flush(); unsigned int magic = readInt(from); @@ -161,19 +156,33 @@ void State::buildRemote(std::shared_ptr store, unsigned int version = readInt(from); if (GET_PROTOCOL_MAJOR(version) != 0x200) throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % machine->sshName); + if (GET_PROTOCOL_MINOR(version) >= 1) + sendDerivation = false; } catch (EndOfFile & e) { child.pid.wait(true); string s = chomp(readFile(result.logFile)); throw Error(format("cannot connect to ‘%1%’: %2%") % machine->sshName % s); } - /* Gather the inputs. */ - PathSet inputs({step->drvPath}); + /* Gather the inputs. If the remote side is Nix <= 1.9, we have to + copy the entire closure of ‘drvPath’, as well the required + outputs of the input derivations. On Nix > 1.9, we only need to + copy the immediate sources of the derivation and the required + outputs of the input derivations. */ + PathSet inputs; + + if (sendDerivation) + inputs.insert(step->drvPath); + else + for (auto & p : step->drv.inputSrcs) + inputs.insert(p); + for (auto & input : step->drv.inputDrvs) { Derivation drv2 = readDerivation(input.first); for (auto & name : input.second) { auto i = drv2.outputs.find(name); - if (i != drv2.outputs.end()) inputs.insert(i->second.path); + if (i == drv2.outputs.end()) continue; + inputs.insert(i->second.path); } } @@ -191,12 +200,14 @@ void State::buildRemote(std::shared_ptr store, /* Do the build. */ printMsg(lvlDebug, format("building ‘%1%’ on ‘%2%’") % step->drvPath % machine->sshName); - writeInt(cmdBuildPaths, to); - writeStrings(PathSet({step->drvPath}), to); - writeInt(maxSilentTime, to); - writeInt(buildTimeout, to); - // FIXME: send maxLogSize. + + if (sendDerivation) + to << cmdBuildPaths << PathSet({step->drvPath}) << maxSilentTime << buildTimeout; + else + to << cmdBuildDerivation << step->drvPath << step->drv << maxSilentTime << buildTimeout; + // FIXME: send maxLogSize. to.flush(); + result.startTime = time(0); int res; { @@ -204,12 +215,27 @@ void State::buildRemote(std::shared_ptr store, res = readInt(from); } result.stopTime = time(0); - if (res) { - result.errorMsg = (format("%1% on ‘%2%’") % readString(from) % machine->sshName).str(); - if (res == 100) result.status = RemoteResult::rrPermanentFailure; - else if (res == 101) result.status = RemoteResult::rrTimedOut; - else result.status = RemoteResult::rrMiscFailure; - return; + + if (sendDerivation) { + if (res) { + result.errorMsg = (format("%1% on ‘%2%’") % readString(from) % machine->sshName).str(); + if (res == 100) result.status = BuildResult::PermanentFailure; + else if (res == 101) result.status = BuildResult::TimedOut; + else result.status = BuildResult::MiscFailure; + return; + } + result.status = BuildResult::Built; + } else { + result.status = (BuildResult::Status) res; + result.errorMsg = readString(from); + if (!result.success()) return; + } + + /* If the path was substituted or already valid, then we didn't + get a build log. */ + if (result.status == BuildResult::Substituted || result.status == BuildResult::AlreadyValid) { + unlink(result.logFile.c_str()); + result.logFile = ""; } /* Copy the output paths. */ @@ -226,5 +252,4 @@ void State::buildRemote(std::shared_ptr store, child.to.close(); child.pid.wait(true); - result.status = RemoteResult::rrSuccess; } diff --git a/src/hydra-queue-runner/build-result.cc b/src/hydra-queue-runner/build-result.cc index 339ba887..c2c2c8af 100644 --- a/src/hydra-queue-runner/build-result.cc +++ b/src/hydra-queue-runner/build-result.cc @@ -7,9 +7,9 @@ using namespace nix; -BuildResult getBuildResult(std::shared_ptr store, const Derivation & drv) +BuildOutput getBuildOutput(std::shared_ptr store, const Derivation & drv) { - BuildResult res; + BuildOutput res; /* Compute the closure size. */ PathSet outputs; diff --git a/src/hydra-queue-runner/build-result.hh b/src/hydra-queue-runner/build-result.hh index c8965345..65fbf948 100644 --- a/src/hydra-queue-runner/build-result.hh +++ b/src/hydra-queue-runner/build-result.hh @@ -15,7 +15,7 @@ struct BuildProduct BuildProduct() { } }; -struct BuildResult +struct BuildOutput { /* Whether this build has failed with output, i.e., the build finished with exit code 0 but produced a file @@ -29,4 +29,4 @@ struct BuildResult std::list products; }; -BuildResult getBuildResult(std::shared_ptr store, const nix::Derivation & drv); +BuildOutput getBuildOutput(std::shared_ptr store, const nix::Derivation & drv); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 44d488ef..21b1f65b 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -314,7 +314,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, all valid. So we mark this as a finished, cached build. */ if (!step) { Derivation drv = readDerivation(build->drvPath); - BuildResult res = getBuildResult(store, drv); + BuildOutput res = getBuildOutput(store, drv); pqxx::work txn(conn); time_t now = time(0); @@ -822,7 +822,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, auto conn(dbPool.get()); RemoteResult result; - BuildResult res; + BuildOutput res; int stepNr = 0; time_t stepStartTime = result.startTime = time(0); @@ -832,7 +832,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, bool cachedFailure = checkCachedFailure(step, *conn); if (cachedFailure) - result.status = RemoteResult::rrPermanentFailure; + result.status = BuildResult::CachedFailure; else { /* Create a build step record indicating that we started @@ -849,11 +849,11 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, /* FIXME: referring builds may have conflicting timeouts. */ buildRemote(store, machine, step, build->maxSilentTime, build->buildTimeout, result); } catch (Error & e) { - result.status = RemoteResult::rrMiscFailure; + result.status = BuildResult::MiscFailure; result.errorMsg = e.msg(); } - if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv); + if (result.success()) res = getBuildOutput(store, step->drv); } time_t stepStopTime = time(0); @@ -870,8 +870,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, /* The step had a hopefully temporary failure (e.g. network issue). Retry a number of times. */ - if (result.status == RemoteResult::rrMiscFailure) { - printMsg(lvlError, format("irregular failure building ‘%1%’ on ‘%2%’: %3%") + if (result.canRetry()) { + printMsg(lvlError, format("possibly transient failure building ‘%1%’ on ‘%2%’: %3%") % step->drvPath % machine->sshName % result.errorMsg); bool retry; { @@ -888,7 +888,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, } } - if (result.status == RemoteResult::rrSuccess) { + if (result.success()) { /* Register success in the database for all Build objects that have this step as the top-level step. Since the queue @@ -932,7 +932,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssSuccess); for (auto & b : direct) - markSucceededBuild(txn, b, res, build != b, + markSucceededBuild(txn, b, res, build != b || result.status != BuildResult::Built, result.startTime, result.stopTime); txn.commit(); @@ -1015,17 +1015,21 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, pqxx::work txn(*conn); BuildStatus buildStatus = - result.status == RemoteResult::rrPermanentFailure ? bsFailed : - result.status == RemoteResult::rrTimedOut ? bsTimedOut : - bsAborted; + result.status == BuildResult::TimedOut ? bsTimedOut : + result.canRetry() ? bsAborted : + bsFailed; BuildStepStatus buildStepStatus = - result.status == RemoteResult::rrPermanentFailure ? bssFailed : - result.status == RemoteResult::rrTimedOut ? bssTimedOut : - bssAborted; + result.status == BuildResult::TimedOut ? bssTimedOut : + result.canRetry() ? bssAborted : + bssFailed; - /* For regular failures, we don't care about the error + /* For standard failures, we don't care about the error message. */ - if (buildStatus != bsAborted) result.errorMsg = ""; + if (result.status == BuildResult::PermanentFailure || + result.status == BuildResult::TransientFailure || + result.status == BuildResult::CachedFailure || + result.status == BuildResult::TimedOut) + result.errorMsg = ""; /* Create failed build steps for every build that depends on this. For cached failures, only create a step for @@ -1061,7 +1065,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, /* Remember failed paths in the database so that they won't be built again. */ - if (!cachedFailure && result.status == RemoteResult::rrPermanentFailure) + if (!cachedFailure && result.status == BuildResult::PermanentFailure) for (auto & path : outputPaths(step->drv)) txn.parameterized("insert into FailedPaths values ($1)")(path).exec(); @@ -1103,7 +1107,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, - const BuildResult & res, bool isCachedBuild, time_t startTime, time_t stopTime) + const BuildOutput & res, bool isCachedBuild, time_t startTime, time_t stopTime) { printMsg(lvlInfo, format("marking build %1% as succeeded") % build->id); diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh index 33103c13..1c1a91b4 100644 --- a/src/hydra-queue-runner/state.hh +++ b/src/hydra-queue-runner/state.hh @@ -43,22 +43,20 @@ typedef enum { } BuildStepStatus; -struct RemoteResult +struct RemoteResult : nix::BuildResult { - enum { - rrSuccess = 0, - rrPermanentFailure = 1, - rrTimedOut = 2, - rrMiscFailure = 3 - } status = rrMiscFailure; - std::string errorMsg; time_t startTime = 0, stopTime = 0; nix::Path logFile; + + bool canRetry() + { + return status == TransientFailure || status == MiscFailure; + } }; struct Step; -struct BuildResult; +struct BuildOutput; struct Build @@ -283,7 +281,7 @@ private: RemoteResult & result); void markSucceededBuild(pqxx::work & txn, Build::ptr build, - const BuildResult & res, bool isCachedBuild, time_t startTime, time_t stopTime); + const BuildOutput & res, bool isCachedBuild, time_t startTime, time_t stopTime); bool checkCachedFailure(Step::ptr step, Connection & conn);