From dc446c39800967e532b459cc2a494f024f15f643 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 28 May 2015 17:39:29 +0200 Subject: [PATCH 001/158] Start of single-process hydra-queue-runner --- configure.ac | 1 + release.nix | 2 +- src/Makefile.am | 2 +- src/hydra-queue-runner/Makefile.am | 6 + src/hydra-queue-runner/build-result.cc | 112 ++++ src/hydra-queue-runner/build-result.hh | 25 + src/hydra-queue-runner/hydra-queue-runner.cc | 515 +++++++++++++++++++ src/lib/Hydra/Helper/AddBuilds.pm | 130 +---- src/root/build.tt | 10 +- src/sql/hydra.sql | 4 + 10 files changed, 676 insertions(+), 131 deletions(-) create mode 100644 src/hydra-queue-runner/Makefile.am create mode 100644 src/hydra-queue-runner/build-result.cc create mode 100644 src/hydra-queue-runner/build-result.hh create mode 100644 src/hydra-queue-runner/hydra-queue-runner.cc diff --git a/configure.ac b/configure.ac index af0dd194..971b502d 100644 --- a/configure.ac +++ b/configure.ac @@ -73,6 +73,7 @@ AC_CONFIG_FILES([ doc/manual/Makefile src/Makefile src/hydra-eval-jobs/Makefile + src/hydra-queue-runner/Makefile src/sql/Makefile src/xsl/Makefile src/ttf/Makefile diff --git a/release.nix b/release.nix index 62e850ba..3d273370 100644 --- a/release.nix +++ b/release.nix @@ -129,7 +129,7 @@ in rec { src = tarball; buildInputs = - [ makeWrapper libtool unzip nukeReferences pkgconfig sqlite + [ makeWrapper libtool unzip nukeReferences pkgconfig sqlite libpqxx gitAndTools.topGit mercurial darcs subversion bazaar openssl bzip2 guile # optional, for Guile + Guix support perlDeps perl diff --git a/src/Makefile.am b/src/Makefile.am index d91a1daa..a1936113 100644 --- a/src/Makefile.am +++ b/src/Makefile.am @@ -1,3 +1,3 @@ -SUBDIRS = hydra-eval-jobs sql script lib root xsl ttf +SUBDIRS = hydra-eval-jobs hydra-queue-runner sql script lib root xsl ttf BOOTCLEAN_SUBDIRS = $(SUBDIRS) DIST_SUBDIRS = $(SUBDIRS) diff --git a/src/hydra-queue-runner/Makefile.am b/src/hydra-queue-runner/Makefile.am new file mode 100644 
index 00000000..2525c936 --- /dev/null +++ b/src/hydra-queue-runner/Makefile.am @@ -0,0 +1,6 @@ +bin_PROGRAMS = hydra-queue-runner + +hydra_queue_runner_SOURCES = hydra-queue-runner.cc build-result.cc +hydra_queue_runner_LDADD = $(NIX_LIBS) -lpqxx + +AM_CXXFLAGS = $(NIX_CFLAGS) -Wall diff --git a/src/hydra-queue-runner/build-result.cc b/src/hydra-queue-runner/build-result.cc new file mode 100644 index 00000000..de9903b9 --- /dev/null +++ b/src/hydra-queue-runner/build-result.cc @@ -0,0 +1,112 @@ +#include "build-result.hh" +#include "store-api.hh" +#include "misc.hh" +#include "util.hh" + +using namespace nix; + + +BuildResult getBuildResult(const Derivation & drv) +{ + BuildResult res; + + /* Compute the closure size. */ + PathSet outputs; + for (auto & output : drv.outputs) + outputs.insert(output.second.path); + PathSet closure; + for (auto & output : outputs) + computeFSClosure(*store, output, closure); + for (auto & path : closure) { + auto info = store->queryPathInfo(path); + res.closureSize += info.narSize; + if (outputs.find(path) != outputs.end()) res.size += info.narSize; + } + + /* Get build products. */ + bool explicitProducts = false; + + for (auto & output : outputs) { + Path productsFile = output + "/nix-support/hydra-build-products"; + if (!pathExists(productsFile)) continue; + explicitProducts = true; + + /* For security, resolve symlinks. 
*/ + productsFile = canonPath(productsFile, true); + if (!isInStore(productsFile)) continue; + + // FIXME: handle I/O errors + + auto contents = readFile(productsFile); + auto lines = tokenizeString(contents, "\n"); + + for (auto & line : lines) { + BuildProduct product; + + auto words = tokenizeString(line); + if (words.size() < 3) continue; + product.type = words.front(); words.pop_front(); + product.subtype = words.front(); words.pop_front(); + if (string(words.front(), 0, 1) == "\"") { + // FIXME: + throw Error("FIXME"); + } else { + product.path = words.front(); words.pop_front(); + } + product.defaultPath = words.empty() ? "" : words.front(); + + /* Ensure that the path exists and points into the + Nix store. */ + if (product.path == "" || product.path[0] != '/') continue; + product.path = canonPath(product.path, true); + if (!isInStore(product.path) || !pathExists(product.path)) continue; + + /* FIXME: check that the path is in the input closure + of the build? */ + + product.name = product.path == output ? "" : baseNameOf(product.path); + + struct stat st; + if (stat(product.path.c_str(), &st)) + throw SysError(format("getting status of ‘%1%’") % product.path); + + if (S_ISREG(st.st_mode)) { + product.isRegular = true; + product.fileSize = st.st_size; + product.sha1hash = hashFile(htSHA1, product.path); + product.sha256hash = hashFile(htSHA256, product.path); + } + + res.products.push_back(product); + } + } + + /* If no build products were explicitly declared, then add all + outputs as a product of type "nix-build". */ + if (!explicitProducts) { + for (auto & output : drv.outputs) { + BuildProduct product; + product.path = output.second.path; + product.type = "nix-build"; + product.subtype = output.first == "out" ? 
"" : output.first; + product.name = storePathToName(product.path); + + struct stat st; + if (stat(product.path.c_str(), &st)) + throw SysError(format("getting status of ‘%1%’") % product.path); + if (S_ISDIR(st.st_mode)) + res.products.push_back(product); + } + } + + /* Get the release name from $output/nix-support/hydra-release-name. */ + for (auto & output : outputs) { + Path p = output + "/nix-support/hydra-release-name"; + if (!pathExists(p)) continue; + // FIXME: handle I/O error + res.releaseName = trim(readFile(p)); + // FIXME: validate release name + } + + return res; +} diff --git a/src/hydra-queue-runner/build-result.hh b/src/hydra-queue-runner/build-result.hh new file mode 100644 index 00000000..f8a93b3a --- /dev/null +++ b/src/hydra-queue-runner/build-result.hh @@ -0,0 +1,25 @@ +#pragma once + +#include "hash.hh" +#include "derivations.hh" + +struct BuildProduct +{ + nix::Path path, defaultPath; + std::string type, subtype, name; + bool isRegular = false; + nix::Hash sha1hash, sha256hash; + off_t fileSize = 0; + BuildProduct() { } +}; + +struct BuildResult +{ + std::string releaseName; + + unsigned long long closureSize = 0, size = 0; + + std::list products; +}; + +BuildResult getBuildResult(const nix::Derivation & drv); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc new file mode 100644 index 00000000..6d2cd094 --- /dev/null +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -0,0 +1,515 @@ +#include +#include +#include +#include + +#include "build-result.hh" +#include "store-api.hh" +#include "derivations.hh" +#include "shared.hh" +#include "globals.hh" + +using namespace nix; + + +typedef enum { + bsSuccess = 0, + bsFailed = 1, + bsDepFailed = 2, + bsAborted = 3, + bsFailedWithOutput = 6, +} BuildStatus; + + +typedef enum { + bssSuccess = 0, + bssFailed = 1, + bssAborted = 4, + bssBusy = 100, // not stored +} BuildStepStatus; + + +struct Connection : pqxx::connection +{ + Connection() : 
pqxx::connection("dbname=hydra") { }; +}; + + +typedef unsigned int BuildID; + + +struct Build +{ + typedef std::shared_ptr ptr; + typedef std::weak_ptr wptr; + + BuildID id; + Path drvPath; + std::map outputs; + + bool finishedInDB; + + Build() : finishedInDB(false) { } +}; + + +struct Step +{ + typedef std::shared_ptr ptr; + typedef std::weak_ptr wptr; + Path drvPath; + Derivation drv; + + /* The build steps on which this step depends. */ + std::set deps; + + /* The build steps that depend on this step. */ + std::vector rdeps; + + /* Builds that have this step as the top-level derivation. */ + std::vector builds; +}; + + +class State +{ +private: + /* The queued builds. */ + std::map builds; + + /* All active or pending build steps (i.e. dependencies of the + queued builds). */ + std::map steps; + + /* Build steps that have no unbuilt dependencies. */ + std::set runnable; + +public: + State(); + + ~State(); + + void markActiveBuildStepsAsAborted(pqxx::connection & conn, time_t stopTime); + + int createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step, + BuildStepStatus status, const std::string & errorMsg = "", BuildID propagatedFrom = 0); + + void finishBuildStep(pqxx::work & txn, time_t stopTime, BuildID buildId, int stepNr, + BuildStepStatus status, const string & errorMsg = "", BuildID propagatedFrom = 0); + + void updateBuild(pqxx::work & txn, Build::ptr build, BuildStatus status); + + void getQueuedBuilds(pqxx::connection & conn); + + Step::ptr createStep(const Path & drvPath); + + void destroyStep(Step::ptr step, bool proceed); + + /* Get the builds that depend on the given step. 
*/ + std::set getDependentBuilds(Step::ptr step); + + void doBuildSteps(); + + void doBuildStep(Step::ptr step); + + void markSucceededBuild(pqxx::work & txn, Build::ptr build, + const BuildResult & res, bool isCachedBuild, time_t startTime, time_t stopTime); +}; + + +State::State() +{ +} + + +State::~State() +{ + try { + Connection conn; + printMsg(lvlError, "clearing active build steps..."); + markActiveBuildStepsAsAborted(conn, time(0)); + } catch (...) { + ignoreException(); + } +} + + +void State::markActiveBuildStepsAsAborted(pqxx::connection & conn, time_t stopTime) +{ + pqxx::work txn(conn); + auto stm = txn.parameterized + ("update BuildSteps set busy = 0, status = $1, stopTime = $2 where busy = 1") + ((int) bssAborted); + if (stopTime) stm(stopTime); else stm(); + stm.exec(); + txn.commit(); +} + + +int State::createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step, + BuildStepStatus status, const std::string & errorMsg, BuildID propagatedFrom) +{ + auto res = txn.parameterized("select max(stepnr) from BuildSteps where build = $1")(build->id).exec(); + int stepNr = res[0][0].is_null() ? 1 : res[0][0].as() + 1; + + auto stm = txn.parameterized + ("insert into BuildSteps (build, stepnr, type, drvPath, busy, startTime, system, status, propagatedFrom, errorMsg, stopTime) values ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)") + (build->id)(stepNr)(0)(step->drvPath)(status == bssBusy ? 
1 : 0)(startTime)(step->drv.platform); + if (status == bssBusy) stm(); else stm((int) status); + if (propagatedFrom) stm(propagatedFrom); else stm(); + if (errorMsg != "") stm(errorMsg); else stm(); + if (status == bssBusy) stm(); else stm(startTime); + stm.exec(); + + for (auto & output : step->drv.outputs) + txn.parameterized + ("insert into BuildStepOutputs (build, stepnr, name, path) values ($1, $2, $3, $4)") + (build->id)(stepNr)(output.first)(output.second.path).exec(); + + return stepNr; +} + + +void State::finishBuildStep(pqxx::work & txn, time_t stopTime, BuildID buildId, int stepNr, + BuildStepStatus status, const std::string & errorMsg, BuildID propagatedFrom) +{ + auto stm = txn.parameterized + ("update BuildSteps set busy = 0, status = $1, propagatedFrom = $4, errorMsg = $5, stopTime = $6 where build = $2 and stepnr = $3") + ((int) status)(buildId)(stepNr); + if (propagatedFrom) stm(propagatedFrom); else stm(); + if (errorMsg != "") stm(errorMsg); else stm(); + if (stopTime) stm(stopTime); else stm(); + stm.exec(); +} + + +void State::getQueuedBuilds(pqxx::connection & conn) +{ + pqxx::work txn(conn); + + // FIXME: query only builds with ID higher than the previous + // highest. + auto res = txn.exec("select * from Builds where finished = 0"); + + // FIXME: don't process inside a txn. + for (auto const & row : res) { + BuildID id = row["id"].as(); + if (builds.find(id) != builds.end()) continue; + + Build::ptr build(new Build); + build->id = id; + build->drvPath = row["drvPath"].as(); + + printMsg(lvlInfo, format("loading build %1% (%2%:%3%:%4%)") % id % row["project"] % row["jobset"] % row["job"]); + + if (!store->isValidPath(build->drvPath)) { + /* Derivation has been GC'ed prematurely. 
*/ + Connection conn; + pqxx::work txn(conn); + txn.parameterized + ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1") + (build->id) + ((int) bsAborted) + (time(0)) + ("derivation was garbage-collected prior to build").exec(); + txn.commit(); + continue; + } + + Step::ptr step = createStep(build->drvPath); + if (!step) { + Derivation drv = readDerivation(build->drvPath); + BuildResult res = getBuildResult(drv); + + Connection conn; + pqxx::work txn(conn); + time_t now = time(0); + markSucceededBuild(txn, build, res, true, now, now); + txn.commit(); + + continue; + } + + step->builds.push_back(build); + + builds[id] = build; + } +} + + +Step::ptr State::createStep(const Path & drvPath) +{ + auto prev = steps.find(drvPath); + if (prev != steps.end()) return prev->second; + + printMsg(lvlInfo, format("considering derivation ‘%1%’") % drvPath); + + Step::ptr step(new Step); + step->drvPath = drvPath; + step->drv = readDerivation(drvPath); + + /* Are all outputs valid? */ + bool valid = true; + for (auto & i : step->drv.outputs) { + if (!store->isValidPath(i.second.path)) { + valid = false; + break; + } + } + + // FIXME: check whether all outputs are in the binary cache. + if (valid) return 0; + + /* No, we need to build. */ + printMsg(lvlInfo, format("creating build step ‘%1%’") % drvPath); + + /* Create steps for the dependencies. 
*/ + for (auto & i : step->drv.inputDrvs) { + Step::ptr dep = createStep(i.first); + if (dep) { + step->deps.insert(dep); + dep->rdeps.push_back(step); + } + } + + steps[drvPath] = step; + + if (step->deps.empty()) runnable.insert(step); + + return step; +} + + +void State::destroyStep(Step::ptr step, bool proceed) +{ + steps.erase(step->drvPath); + + for (auto & rdep_ : step->rdeps) { + auto rdep = rdep_.lock(); + if (!rdep) continue; + assert(rdep->deps.find(step) != rdep->deps.end()); + rdep->deps.erase(step); + if (proceed) { + /* If this rdep has no other dependencies, then we can now + build it. */ + if (rdep->deps.empty()) + runnable.insert(rdep); + } else + /* If ‘step’ failed, then delete all dependent steps as + well. */ + destroyStep(rdep, false); + } + + for (auto & build_ : step->builds) { + auto build = build_.lock(); + if (!build) continue; + assert(build->drvPath == step->drvPath); + assert(build->finishedInDB); + } +} + + +std::set State::getDependentBuilds(Step::ptr step) +{ + std::set done; + std::set res; + + std::function visit; + + visit = [&](Step::ptr step) { + if (done.find(step) != done.end()) return; + done.insert(step); + + for (auto & build : step->builds) { + auto build2 = build.lock(); + if (build2) res.insert(build2); + } + + for (auto & rdep : step->rdeps) { + auto rdep2 = rdep.lock(); + if (rdep2) visit(rdep2); + } + }; + + visit(step); + + return res; +} + + +void State::doBuildSteps() +{ + while (!runnable.empty()) { + printMsg(lvlInfo, format("%1% runnable steps") % runnable.size()); + Step::ptr step = *runnable.begin(); + runnable.erase(step); + doBuildStep(step); + } +} + + +void State::doBuildStep(Step::ptr step) +{ + assert(step->deps.empty()); + + /* There can be any number of builds in the database that depend + on this derivation. Arbitrarily pick one (though preferring + those build of which this is the top-level derivation) for the + purpose of creating build steps. 
We could create a build step + record for every build, but that could be very expensive + (e.g. a stdenv derivation can be a dependency of tens of + thousands of builds), so we don't. */ + Build::ptr build; + + auto builds = getDependentBuilds(step); + + if (builds.empty()) { + /* Apparently all builds that depend on this derivation are + gone (e.g. cancelled). So don't bother. */ + printMsg(lvlInfo, format("cancelling build step ‘%1%’") % step->drvPath); + destroyStep(step, true); + return; + } + + for (auto build2 : builds) + if (build2->drvPath == step->drvPath) { build = build2; break; } + + if (!build) build = *builds.begin(); + + printMsg(lvlInfo, format("performing build step ‘%1%’ (needed by %2% builds)") % step->drvPath % builds.size()); + + /* Create a build step record indicating that we started + building. */ + Connection conn; + time_t startTime = time(0); + int stepNr; + { + pqxx::work txn(conn); + stepNr = createBuildStep(txn, startTime, build, step, bssBusy); + txn.commit(); + } + + bool success = false; + std::string errorMsg; + try { + store->buildPaths(PathSet({step->drvPath})); + success = true; + } catch (Error & e) { + errorMsg = e.msg(); + } + + time_t stopTime = time(0); + + BuildResult res; + if (success) res = getBuildResult(step->drv); + + // FIXME: handle failed-with-output + + // FIXME: handle new builds having been added in the meantime. + + { + pqxx::work txn(conn); + + if (success) { + + finishBuildStep(txn, stopTime, build->id, stepNr, bssSuccess); + + /* Mark all builds of which this derivation is the top + level as succeeded. */ + for (auto build2_ : step->builds) { + auto build2 = build2_.lock(); + if (!build2) continue; + markSucceededBuild(txn, build2, res, false, startTime, stopTime); + } + + } else { + /* Create failed build steps for every build that depends + on this. 
*/ + finishBuildStep(txn, stopTime, build->id, stepNr, bssFailed, errorMsg); + + for (auto build2 : builds) { + if (build == build2) continue; + createBuildStep(txn, stopTime, build2, step, bssFailed, errorMsg, build->id); + } + + /* Mark all builds that depend on this derivation as failed. */ + for (auto build2 : builds) { + txn.parameterized + ("update Builds set finished = 1, isCachedBuild = 0, buildStatus = $2, startTime = $3, stopTime = $4 where id = $1") + (build2->id) + ((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed)) + (startTime) + (stopTime).exec(); + build2->finishedInDB = true; // FIXME: txn might fail + } + } + + txn.commit(); + + } + + /* Remove the build step from the graph. */ + destroyStep(step, success); +} + + +void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, + const BuildResult & res, bool isCachedBuild, time_t startTime, time_t stopTime) +{ + auto stm = txn.parameterized + ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, size = $5, closureSize = $6, releaseName = $7, isCachedBuild = $8 where id = $1") + (build->id) + ((int) bsSuccess) + (startTime) + (stopTime) + (res.size) + (res.closureSize); + if (res.releaseName != "") stm(res.releaseName); else stm(); + stm(isCachedBuild ? 
1 : 0); + stm.exec(); + + unsigned int productNr = 1; + for (auto & product : res.products) { + auto stm = txn.parameterized + ("insert into BuildProducts (build, productnr, type, subtype, fileSize, sha1hash, sha256hash, path, name, defaultPath) values ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)") + (build->id) + (productNr++) + (product.type) + (product.subtype); + if (product.isRegular) stm(product.fileSize); else stm(); + if (product.isRegular) stm(printHash(product.sha1hash)); else stm(); + if (product.isRegular) stm(printHash(product.sha256hash)); else stm(); + stm + (product.path) + (product.name) + (product.defaultPath).exec(); + } + + build->finishedInDB = true; // FIXME: txn might fail +} + + +int main(int argc, char * * argv) +{ + return handleExceptions(argv[0], [&]() { + initNix(); + + settings.buildVerbosity = lvlVomit; + settings.useSubstitutes = false; + + store = openStore(); + + /* FIXME: need some locking to prevent multiple instances of + hydra-queue-runner. */ + + Connection conn; + + State state; + + state.markActiveBuildStepsAsAborted(conn, 0); + + state.getQueuedBuilds(conn); + + state.doBuildSteps(); + }); +} diff --git a/src/lib/Hydra/Helper/AddBuilds.pm b/src/lib/Hydra/Helper/AddBuilds.pm index 52b0b030..3fcd6cc0 100644 --- a/src/lib/Hydra/Helper/AddBuilds.pm +++ b/src/lib/Hydra/Helper/AddBuilds.pm @@ -22,20 +22,10 @@ use Hydra::Helper::CatalystUtils; our @ISA = qw(Exporter); our @EXPORT = qw( fetchInput evalJobs checkBuild inputsToArgs - getReleaseName addBuildProducts restartBuild - getPrevJobsetEval + restartBuild getPrevJobsetEval ); -sub getReleaseName { - my ($outPath) = @_; - return undef unless -f "$outPath/nix-support/hydra-release-name"; - my $releaseName = read_file("$outPath/nix-support/hydra-release-name"); - chomp $releaseName; - return $releaseName; -} - - sub parseJobName { # Parse a job specification of the form `:: # [attrs]'. The project, jobset and attrs may be omitted. 
The @@ -355,80 +345,6 @@ sub evalJobs { } -sub addBuildProducts { - my ($db, $build) = @_; - - my $productnr = 1; - my $explicitProducts = 0; - my $storeDir = $Nix::Config::storeDir . "/"; - - foreach my $output ($build->buildoutputs->all) { - my $outPath = $output->path; - if (-e "$outPath/nix-support/hydra-build-products") { - $explicitProducts = 1; - - open LIST, "$outPath/nix-support/hydra-build-products" or die; - while () { - /^([\w\-]+)\s+([\w\-]+)\s+("[^"]*"|\S+)(\s+(\S+))?$/ or next; - my $type = $1; - my $subtype = $2 eq "none" ? "" : $2; - my $path = substr($3, 0, 1) eq "\"" ? substr($3, 1, -1) : $3; - my $defaultPath = $5; - - # Ensure that the path exists and points into the Nix store. - next unless File::Spec->file_name_is_absolute($path); - $path = pathIsInsidePrefix($path, $Nix::Config::storeDir); - next unless defined $path; - next unless -e $path; - - # FIXME: check that the path is in the input closure - # of the build? - - my $fileSize, my $sha1, my $sha256; - - if (-f $path) { - my $st = stat($path) or die "cannot stat $path: $!"; - $fileSize = $st->size; - $sha1 = hashFile("sha1", 0, $path); - $sha256 = hashFile("sha256", 0, $path); - } - - my $name = $path eq $outPath ? "" : basename $path; - - $db->resultset('BuildProducts')->create( - { build => $build->id - , productnr => $productnr++ - , type => $type - , subtype => $subtype - , path => $path - , filesize => $fileSize - , sha1hash => $sha1 - , sha256hash => $sha256 - , name => $name - , defaultpath => $defaultPath - }); - } - close LIST; - } - } - - return if $explicitProducts; - - foreach my $output ($build->buildoutputs->all) { - my $outPath = $output->path; - next unless -d $outPath; - $db->resultset('BuildProducts')->create( - { build => $build->id - , productnr => $productnr++ - , type => "nix-build" - , subtype => $output->name eq "out" ? 
"" : $output->name - , path => $outPath - , name => $build->nixname - }); - } -} - - # Return the most recent evaluation of the given jobset (that # optionally had new builds), or undefined if no such evaluation # exists. @@ -501,40 +417,6 @@ sub checkBuild { my $time = time(); - # Are the outputs already in the Nix store? Then add a cached - # build. - my %extraFlags; - my $allValid = 1; - my $buildStatus; - my $releaseName; - foreach my $name (@outputNames) { - my $path = $buildInfo->{outputs}->{$name}; - if (isValidPath($path)) { - if (-f "$path/nix-support/failed") { - $buildStatus = 6; - } else { - $buildStatus //= 0; - } - $releaseName //= getReleaseName($path); - } else { - $allValid = 0; - last; - } - } - - if ($allValid) { - %extraFlags = - ( finished => 1 - , iscachedbuild => 1 - , buildstatus => $buildStatus - , starttime => $time - , stoptime => $time - , releasename => $releaseName - ); - } else { - %extraFlags = ( finished => 0 ); - } - # Add the build to the database. $build = $job->builds->create( { timestamp => $time @@ -550,10 +432,10 @@ sub checkBuild { , nixexprinput => $jobset->nixexprinput , nixexprpath => $jobset->nixexprpath , priority => $buildInfo->{schedulingPriority} + , finished => 0 , busy => 0 , locker => "" , iscurrent => 1 - , %extraFlags }); $build->buildoutputs->create({ name => $_, path => $buildInfo->{outputs}->{$_} }) @@ -562,13 +444,7 @@ sub checkBuild { $buildMap->{$build->id} = { id => $build->id, jobName => $jobName, new => 1, drvPath => $drvPath }; $$jobOutPathMap{$jobName . "\t" . 
$firstOutputPath} = $build->id; - if ($build->iscachedbuild) { - #print STDERR " marked as cached build ", $build->id, "\n"; - addBuildProducts($db, $build); - notifyBuildFinished($plugins, $build, []); - } else { - print STDERR "added build ${\$build->id} (${\$jobset->project->name}:${\$jobset->name}:$jobName)\n"; - } + print STDERR "added build ${\$build->id} (${\$jobset->project->name}:${\$jobset->name}:$jobName)\n"; }); return $build; diff --git a/src/root/build.tt b/src/root/build.tt index 2ddf5fd7..3923715a 100644 --- a/src/root/build.tt +++ b/src/root/build.tt @@ -33,7 +33,11 @@ [% IF step.busy == 0; - INCLUDE renderDuration duration = step.stoptime - step.starttime; + IF step.stoptime; + INCLUDE renderDuration duration = step.stoptime - step.starttime; + ELSE; + %]?[% + END; ELSIF build.finished; INCLUDE renderDuration duration = build.stoptime - step.starttime; ELSE; @@ -52,8 +56,10 @@ Timed out [% ELSIF step.status == 8 %] Cached failure - [% ELSE %] + [% ELSIF step.errormsg %] Failed: [% HTML.escape(step.errormsg) %] + [% ELSE %] + Failed [% END %] [%%] [%+ IF has_log; INCLUDE renderLogLinks url=log inRow=1; END %] [%+ IF step.propagatedfrom; %](propagated from [% INCLUDE renderBuildIdLink id=step.propagatedfrom.get_column('id') %])[% END %] diff --git a/src/sql/hydra.sql b/src/sql/hydra.sql index 436cb823..4def1825 100644 --- a/src/sql/hydra.sql +++ b/src/sql/hydra.sql @@ -159,11 +159,13 @@ create table Builds ( -- Information about scheduled builds. priority integer not null default 0, + -- FIXME: remove (obsolete with the new queue runner) busy integer not null default 0, -- true means someone is building this job now locker text, -- !!! hostname/pid of the process building this job? logfile text, -- if busy, the path of the logfile + -- FIXME: remove startTime? startTime integer, -- if busy/finished, time we started stopTime integer, -- if finished, time we finished @@ -207,6 +209,8 @@ create table BuildOutputs ( ); +-- TODO: normalize this. 
Currently there can be multiple BuildSteps +-- for a single step. create table BuildSteps ( build integer not null, stepnr integer not null, From 604fdb908f33323ea1fdf59db03b4e5aabb4afe0 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 28 May 2015 19:06:17 +0200 Subject: [PATCH 002/158] Pass null values to libpqxx properly --- src/hydra-queue-runner/hydra-queue-runner.cc | 51 +++++++++----------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 6d2cd094..95a1c8ad 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -137,11 +137,10 @@ State::~State() void State::markActiveBuildStepsAsAborted(pqxx::connection & conn, time_t stopTime) { pqxx::work txn(conn); - auto stm = txn.parameterized + txn.parameterized ("update BuildSteps set busy = 0, status = $1, stopTime = $2 where busy = 1") - ((int) bssAborted); - if (stopTime) stm(stopTime); else stm(); - stm.exec(); + ((int) bssAborted) + (stopTime, stopTime != 0).exec(); txn.commit(); } @@ -152,14 +151,13 @@ int State::createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, auto res = txn.parameterized("select max(stepnr) from BuildSteps where build = $1")(build->id).exec(); int stepNr = res[0][0].is_null() ? 1 : res[0][0].as() + 1; - auto stm = txn.parameterized + txn.parameterized ("insert into BuildSteps (build, stepnr, type, drvPath, busy, startTime, system, status, propagatedFrom, errorMsg, stopTime) values ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)") - (build->id)(stepNr)(0)(step->drvPath)(status == bssBusy ? 
1 : 0)(startTime)(step->drv.platform); - if (status == bssBusy) stm(); else stm((int) status); - if (propagatedFrom) stm(propagatedFrom); else stm(); - if (errorMsg != "") stm(errorMsg); else stm(); - if (status == bssBusy) stm(); else stm(startTime); - stm.exec(); + (build->id)(stepNr)(0)(step->drvPath)(status == bssBusy ? 1 : 0)(startTime)(step->drv.platform) + ((int) status, status != bssBusy) + (propagatedFrom, propagatedFrom != 0) + (errorMsg, errorMsg != "") + (startTime, status != bssBusy).exec(); for (auto & output : step->drv.outputs) txn.parameterized @@ -173,13 +171,12 @@ int State::createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, void State::finishBuildStep(pqxx::work & txn, time_t stopTime, BuildID buildId, int stepNr, BuildStepStatus status, const std::string & errorMsg, BuildID propagatedFrom) { - auto stm = txn.parameterized + txn.parameterized ("update BuildSteps set busy = 0, status = $1, propagatedFrom = $4, errorMsg = $5, stopTime = $6 where build = $2 and stepnr = $3") - ((int) status)(buildId)(stepNr); - if (propagatedFrom) stm(propagatedFrom); else stm(); - if (errorMsg != "") stm(errorMsg); else stm(); - if (stopTime) stm(stopTime); else stm(); - stm.exec(); + ((int) status)(buildId)(stepNr) + (propagatedFrom, propagatedFrom != 0) + (errorMsg, errorMsg != "") + (stopTime, stopTime != 0).exec(); } @@ -456,30 +453,28 @@ void State::doBuildStep(Step::ptr step) void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, const BuildResult & res, bool isCachedBuild, time_t startTime, time_t stopTime) { - auto stm = txn.parameterized + txn.parameterized ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, size = $5, closureSize = $6, releaseName = $7, isCachedBuild = $8 where id = $1") (build->id) ((int) bsSuccess) (startTime) (stopTime) (res.size) - (res.closureSize); - if (res.releaseName != "") stm(res.releaseName); else stm(); - stm(isCachedBuild ? 
1 : 0); - stm.exec(); + (res.closureSize) + (res.releaseName, res.releaseName != "") + (isCachedBuild ? 1 : 0).exec(); unsigned int productNr = 1; for (auto & product : res.products) { - auto stm = txn.parameterized + txn.parameterized ("insert into BuildProducts (build, productnr, type, subtype, fileSize, sha1hash, sha256hash, path, name, defaultPath) values ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10)") (build->id) (productNr++) (product.type) - (product.subtype); - if (product.isRegular) stm(product.fileSize); else stm(); - if (product.isRegular) stm(printHash(product.sha1hash)); else stm(); - if (product.isRegular) stm(printHash(product.sha256hash)); else stm(); - stm + (product.subtype) + (product.fileSize, product.isRegular) + (printHash(product.sha1hash), product.isRegular) + (printHash(product.sha256hash), product.isRegular) (product.path) (product.name) (product.defaultPath).exec(); From 8640e30787157d607c7f8be387583041ad93221b Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 29 May 2015 01:31:12 +0200 Subject: [PATCH 003/158] Very basic multi-threaded queue runner --- src/hydra-queue-runner/build-result.cc | 2 +- src/hydra-queue-runner/build-result.hh | 4 +- src/hydra-queue-runner/hydra-queue-runner.cc | 177 +++++++++++++++---- 3 files changed, 150 insertions(+), 33 deletions(-) diff --git a/src/hydra-queue-runner/build-result.cc b/src/hydra-queue-runner/build-result.cc index de9903b9..e7a2fda3 100644 --- a/src/hydra-queue-runner/build-result.cc +++ b/src/hydra-queue-runner/build-result.cc @@ -6,7 +6,7 @@ using namespace nix; -BuildResult getBuildResult(const Derivation & drv) +BuildResult getBuildResult(std::shared_ptr store, const Derivation & drv) { BuildResult res; diff --git a/src/hydra-queue-runner/build-result.hh b/src/hydra-queue-runner/build-result.hh index f8a93b3a..bbe6fd7a 100644 --- a/src/hydra-queue-runner/build-result.hh +++ b/src/hydra-queue-runner/build-result.hh @@ -1,5 +1,7 @@ #pragma once +#include + #include "hash.hh" #include 
"derivations.hh" @@ -22,4 +24,4 @@ struct BuildResult std::list products; }; -BuildResult getBuildResult(const nix::Derivation & drv); +BuildResult getBuildResult(std::shared_ptr store, const nix::Derivation & drv); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 95a1c8ad..d2edf32c 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1,6 +1,10 @@ +#include +#include #include -#include #include +#include +#include + #include #include "build-result.hh" @@ -12,6 +16,40 @@ using namespace nix; +std::mutex exitRequestMutex; +std::condition_variable exitRequest; +bool exitRequested(false); + +static std::atomic_int _int(0); + +void sigintHandler(int signo) +{ + _int = 1; +} + + +void signalThread() +{ + struct sigaction act; + act.sa_handler = sigintHandler; + sigemptyset(&act.sa_mask); + act.sa_flags = 0; + if (sigaction(SIGINT, &act, 0)) + throw SysError("installing handler for SIGINT"); + + while (true) { + sleep(1000000); + if (_int) break; + } + + { + std::lock_guard lock(exitRequestMutex); + exitRequested = true; + } + exitRequest.notify_all(); +} + + typedef enum { bsSuccess = 0, bsFailed = 1, @@ -74,6 +112,9 @@ struct Step class State { private: + + std::thread queueMonitorThread; + /* The queued builds. */ std::map builds; @@ -84,6 +125,9 @@ private: /* Build steps that have no unbuilt dependencies. 
*/ std::set runnable; + std::mutex runnableMutex; + std::condition_variable runnableCV; + public: State(); @@ -99,21 +143,27 @@ public: void updateBuild(pqxx::work & txn, Build::ptr build, BuildStatus status); - void getQueuedBuilds(pqxx::connection & conn); + void queueMonitorThreadEntry(); - Step::ptr createStep(const Path & drvPath); + void getQueuedBuilds(std::shared_ptr store, pqxx::connection & conn); + + Step::ptr createStep(std::shared_ptr store, const Path & drvPath); void destroyStep(Step::ptr step, bool proceed); /* Get the builds that depend on the given step. */ std::set getDependentBuilds(Step::ptr step); - void doBuildSteps(); + void makeRunnable(Step::ptr step); - void doBuildStep(Step::ptr step); + void builderThreadEntry(int slot); + + void doBuildStep(std::shared_ptr store, Step::ptr step); void markSucceededBuild(pqxx::work & txn, Build::ptr build, const BuildResult & res, bool isCachedBuild, time_t startTime, time_t stopTime); + + void run(); }; @@ -180,8 +230,30 @@ void State::finishBuildStep(pqxx::work & txn, time_t stopTime, BuildID buildId, } -void State::getQueuedBuilds(pqxx::connection & conn) +void State::queueMonitorThreadEntry() { + auto store = openStore(); // FIXME: pool + + Connection conn; + + while (true) { + getQueuedBuilds(store, conn); + + { + std::unique_lock lock(exitRequestMutex); + exitRequest.wait_for(lock, std::chrono::seconds(5)); + if (exitRequested) break; + } + } + + printMsg(lvlError, "queue monitor exits"); +} + + +void State::getQueuedBuilds(std::shared_ptr store, pqxx::connection & conn) +{ + printMsg(lvlError, "checking the queue..."); + pqxx::work txn(conn); // FIXME: query only builds with ID higher than the previous @@ -213,10 +285,10 @@ void State::getQueuedBuilds(pqxx::connection & conn) continue; } - Step::ptr step = createStep(build->drvPath); + Step::ptr step = createStep(store, build->drvPath); if (!step) { Derivation drv = readDerivation(build->drvPath); - BuildResult res = getBuildResult(drv); + 
BuildResult res = getBuildResult(store, drv); Connection conn; pqxx::work txn(conn); @@ -234,7 +306,7 @@ void State::getQueuedBuilds(pqxx::connection & conn) } -Step::ptr State::createStep(const Path & drvPath) +Step::ptr State::createStep(std::shared_ptr store, const Path & drvPath) { auto prev = steps.find(drvPath); if (prev != steps.end()) return prev->second; @@ -262,7 +334,7 @@ Step::ptr State::createStep(const Path & drvPath) /* Create steps for the dependencies. */ for (auto & i : step->drv.inputDrvs) { - Step::ptr dep = createStep(i.first); + Step::ptr dep = createStep(store, i.first); if (dep) { step->deps.insert(dep); dep->rdeps.push_back(step); @@ -271,7 +343,7 @@ Step::ptr State::createStep(const Path & drvPath) steps[drvPath] = step; - if (step->deps.empty()) runnable.insert(step); + if (step->deps.empty()) makeRunnable(step); return step; } @@ -290,7 +362,7 @@ void State::destroyStep(Step::ptr step, bool proceed) /* If this rdep has no other dependencies, then we can now build it. */ if (rdep->deps.empty()) - runnable.insert(rdep); + makeRunnable(rdep); } else /* If ‘step’ failed, then delete all dependent steps as well. 
*/ @@ -334,18 +406,43 @@ std::set State::getDependentBuilds(Step::ptr step) } -void State::doBuildSteps() +void State::makeRunnable(Step::ptr step) { - while (!runnable.empty()) { - printMsg(lvlInfo, format("%1% runnable steps") % runnable.size()); - Step::ptr step = *runnable.begin(); - runnable.erase(step); - doBuildStep(step); + assert(step->deps.empty()); + + { + std::lock_guard lock(runnableMutex); + runnable.insert(step); } + + runnableCV.notify_one(); } -void State::doBuildStep(Step::ptr step) +void State::builderThreadEntry(int slot) +{ + auto store = openStore(); // FIXME: pool + + while (true) { + Step::ptr step; + { + std::unique_lock lock(runnableMutex); + while (runnable.empty()) + runnableCV.wait(lock); + step = *runnable.begin(); + runnable.erase(step); + } + + printMsg(lvlError, format("slot %1%: got build step ‘%2%’") % slot % step->drvPath); + + doBuildStep(store, step); + } + + printMsg(lvlError, "builder thread exits"); +} + + +void State::doBuildStep(std::shared_ptr store, Step::ptr step) { assert(step->deps.empty()); @@ -398,7 +495,7 @@ void State::doBuildStep(Step::ptr step) time_t stopTime = time(0); BuildResult res; - if (success) res = getBuildResult(step->drv); + if (success) res = getBuildResult(store, step->drv); // FIXME: handle failed-with-output @@ -484,27 +581,45 @@ void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, } +void State::run() +{ + { + Connection conn; + markActiveBuildStepsAsAborted(conn, 0); + } + + queueMonitorThread = std::thread(&State::queueMonitorThreadEntry, this); + + sleep(1); + + for (int n = 0; n < 4; n++) + std::thread(&State::builderThreadEntry, this, n).detach(); + + queueMonitorThread.join(); +} + + int main(int argc, char * * argv) { return handleExceptions(argv[0], [&]() { initNix(); + std::thread(signalThread).detach(); + + /* Ignore signals. This is inherited by the other threads. 
*/ + sigset_t set; + sigemptyset(&set); + sigaddset(&set, SIGHUP); + sigaddset(&set, SIGINT); + sigaddset(&set, SIGTERM); + sigprocmask(SIG_BLOCK, &set, NULL); + settings.buildVerbosity = lvlVomit; settings.useSubstitutes = false; - store = openStore(); - /* FIXME: need some locking to prevent multiple instances of hydra-queue-runner. */ - - Connection conn; - State state; - - state.markActiveBuildStepsAsAborted(conn, 0); - - state.getQueuedBuilds(conn); - - state.doBuildSteps(); + state.run(); }); } From e7788219401945789265a603604ffb0f05ca9e2b Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 29 May 2015 17:14:20 +0200 Subject: [PATCH 004/158] Make concurrency more robust --- src/hydra-queue-runner/hydra-queue-runner.cc | 366 ++++++++++++++----- src/hydra-queue-runner/sync.hh | 53 +++ 2 files changed, 321 insertions(+), 98 deletions(-) create mode 100644 src/hydra-queue-runner/sync.hh diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index d2edf32c..6b9d4aae 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -8,6 +8,8 @@ #include #include "build-result.hh" +#include "sync.hh" + #include "store-api.hh" #include "derivations.hh" #include "shared.hh" @@ -16,6 +18,13 @@ using namespace nix; +template +bool has(const C & c, const V & v) +{ + return c.find(v) != c.end(); +} + + std::mutex exitRequestMutex; std::condition_variable exitRequest; bool exitRequested(false); @@ -76,6 +85,9 @@ struct Connection : pqxx::connection typedef unsigned int BuildID; +struct Step; + + struct Build { typedef std::shared_ptr ptr; @@ -84,10 +96,18 @@ struct Build BuildID id; Path drvPath; std::map outputs; + std::string fullJobName; + + std::shared_ptr toplevel; bool finishedInDB; Build() : finishedInDB(false) { } + + ~Build() + { + printMsg(lvlError, format("destroying build %1%") % id); + } }; @@ -95,17 +115,28 @@ struct Step { typedef std::shared_ptr ptr; typedef 
std::weak_ptr wptr; + Path drvPath; Derivation drv; - /* The build steps on which this step depends. */ - std::set deps; + struct State + { + /* The build steps on which this step depends. */ + std::set deps; - /* The build steps that depend on this step. */ - std::vector rdeps; + /* The build steps that depend on this step. */ + std::vector rdeps; - /* Builds that have this step as the top-level derivation. */ - std::vector builds; + /* Builds that have this step as the top-level derivation. */ + std::vector builds; + }; + + Sync state; + + ~Step() + { + printMsg(lvlError, format("destroying step %1%") % drvPath); + } }; @@ -116,17 +147,21 @@ private: std::thread queueMonitorThread; /* The queued builds. */ - std::map builds; + typedef std::map Builds; + Sync builds; /* All active or pending build steps (i.e. dependencies of the - queued builds). */ - std::map steps; + queued builds). Note that these are weak pointers. Steps are + kept alive by being reachable from Builds or by being in + progress. */ + typedef std::map Steps; + Sync steps; /* Build steps that have no unbuilt dependencies. 
*/ - std::set runnable; + typedef std::list Runnable; + Sync runnable; - std::mutex runnableMutex; - std::condition_variable runnableCV; + std::condition_variable_any runnableCV; public: State(); @@ -147,7 +182,8 @@ public: void getQueuedBuilds(std::shared_ptr store, pqxx::connection & conn); - Step::ptr createStep(std::shared_ptr store, const Path & drvPath); + Step::ptr createStep(std::shared_ptr store, const Path & drvPath, + std::set & newRunnable); void destroyStep(Step::ptr step, bool proceed); @@ -254,26 +290,55 @@ void State::getQueuedBuilds(std::shared_ptr store, pqxx::connection & { printMsg(lvlError, "checking the queue..."); - pqxx::work txn(conn); +#if 0 + { + auto runnable_(runnable.lock()); + auto builds_(builds.lock()); + auto steps_(steps.lock()); + printMsg(lvlError, format("%1% builds, %2% steps, %3% runnable steps") + % builds_->size() + % steps_->size() + % runnable_->size()); + } +#endif - // FIXME: query only builds with ID higher than the previous - // highest. - auto res = txn.exec("select * from Builds where finished = 0"); + /* Grab the queued builds from the database, but don't process + them yet (since we don't want a long-running transaction). */ + std::list newBuilds; // FIXME: use queue - // FIXME: don't process inside a txn. - for (auto const & row : res) { - BuildID id = row["id"].as(); - if (builds.find(id) != builds.end()) continue; + { + pqxx::work txn(conn); - Build::ptr build(new Build); - build->id = id; - build->drvPath = row["drvPath"].as(); + // FIXME: query only builds with ID higher than the previous + // highest. 
+ auto res = txn.exec("select * from Builds where finished = 0 order by id"); - printMsg(lvlInfo, format("loading build %1% (%2%:%3%:%4%)") % id % row["project"] % row["jobset"] % row["job"]); + auto builds_(builds.lock()); + + for (auto const & row : res) { + BuildID id = row["id"].as(); + if (has(*builds_, id)) continue; + + auto build = std::make_shared(); + build->id = id; + build->drvPath = row["drvPath"].as(); + build->fullJobName = row["project"].as() + ":" + row["jobset"].as() + ":" + row["job"].as(); + + newBuilds.push_back(build); + } + } + + /* Now instantiate build steps for each new build. The builder + threads can start building the runnable build steps right away, + even while we're still processing other new builds. */ + for (auto & build : newBuilds) { + // FIXME: remove build from newBuilds to ensure quick destruction + // FIXME: exception handling + + printMsg(lvlInfo, format("loading build %1% (%2%)") % build->id % build->fullJobName); if (!store->isValidPath(build->drvPath)) { /* Derivation has been GC'ed prematurely. */ - Connection conn; pqxx::work txn(conn); txn.parameterized ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1") @@ -285,12 +350,15 @@ void State::getQueuedBuilds(std::shared_ptr store, pqxx::connection & continue; } - Step::ptr step = createStep(store, build->drvPath); + std::set newRunnable; + Step::ptr step = createStep(store, build->drvPath, newRunnable); + + /* If we didn't get a step, it means the step's outputs are + all valid. So we mark this as a finished, cached build. 
*/ if (!step) { Derivation drv = readDerivation(build->drvPath); BuildResult res = getBuildResult(store, drv); - Connection conn; pqxx::work txn(conn); time_t now = time(0); markSucceededBuild(txn, build, res, true, now, now); @@ -299,21 +367,49 @@ void State::getQueuedBuilds(std::shared_ptr store, pqxx::connection & continue; } - step->builds.push_back(build); + /* Note: if we exit this scope prior to this, the build and + all newly created steps are destroyed. */ - builds[id] = build; + { + auto builds_(builds.lock()); + auto step_(step->state.lock()); + (*builds_)[build->id] = build; + step_->builds.push_back(build); + build->toplevel = step; + } + + /* Prior to this, the build is not visible to + getDependentBuilds(). Now it is, so the build can be + failed if a dependency fails. (It can't succeed right away + because its top-level is not runnable yet). */ + + /* Add the new runnable build steps to ‘runnable’ and wake up + the builder threads. */ + for (auto & r : newRunnable) + makeRunnable(r); } } -Step::ptr State::createStep(std::shared_ptr store, const Path & drvPath) +Step::ptr State::createStep(std::shared_ptr store, const Path & drvPath, + std::set & newRunnable) { - auto prev = steps.find(drvPath); - if (prev != steps.end()) return prev->second; + /* Check if the requested step already exists. */ + { + auto steps_(steps.lock()); + auto prev = steps_->find(drvPath); + if (prev != steps_->end()) { + auto step = prev->second.lock(); + /* Since ‘step’ is a strong pointer, the referred Step + object won't be deleted after this. 
*/ + if (step) return step; + steps_->erase(drvPath); // remove stale entry + } + } printMsg(lvlInfo, format("considering derivation ‘%1%’") % drvPath); - Step::ptr step(new Step); + auto step = std::make_shared(); step->drvPath = drvPath; step->drv = readDerivation(drvPath); @@ -333,17 +429,25 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat printMsg(lvlInfo, format("creating build step ‘%1%’") % drvPath); /* Create steps for the dependencies. */ + bool hasDeps = false; for (auto & i : step->drv.inputDrvs) { - Step::ptr dep = createStep(store, i.first); + Step::ptr dep = createStep(store, i.first, newRunnable); if (dep) { - step->deps.insert(dep); - dep->rdeps.push_back(step); + hasDeps = true; + auto step_(step->state.lock()); + auto dep_(dep->state.lock()); + step_->deps.insert(dep); + dep_->rdeps.push_back(step); } } - steps[drvPath] = step; + { + auto steps_(steps.lock()); + assert(steps_->find(drvPath) == steps_->end()); + (*steps_)[drvPath] = step; + } - if (step->deps.empty()) makeRunnable(step); + if (!hasDeps) newRunnable.insert(step); return step; } @@ -351,30 +455,48 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat void State::destroyStep(Step::ptr step, bool proceed) { - steps.erase(step->drvPath); + printMsg(lvlInfo, format("destroying build step ‘%1%’") % step->drvPath); - for (auto & rdep_ : step->rdeps) { + { + auto steps_(steps.lock()); + steps_->erase(step->drvPath); + } + + std::vector rdeps; + + { + auto step_(step->state.lock()); + rdeps = step_->rdeps; + + /* Sanity checks. 
*/ + for (auto & build_ : step_->builds) { + auto build = build_.lock(); + if (!build) continue; + assert(build->drvPath == step->drvPath); + assert(build->finishedInDB); + } + } + + for (auto & rdep_ : rdeps) { auto rdep = rdep_.lock(); if (!rdep) continue; - assert(rdep->deps.find(step) != rdep->deps.end()); - rdep->deps.erase(step); + bool runnable = false; + { + auto rdep_(rdep->state.lock()); + assert(has(rdep_->deps, step)); + rdep_->deps.erase(step); + if (rdep_->deps.empty()) runnable = true; + } if (proceed) { /* If this rdep has no other dependencies, then we can now build it. */ - if (rdep->deps.empty()) + if (runnable) makeRunnable(rdep); } else - /* If ‘step’ failed, then delete all dependent steps as - well. */ + /* If ‘step’ failed or was cancelled, then delete all + dependent steps as well. */ destroyStep(rdep, false); } - - for (auto & build_ : step->builds) { - auto build = build_.lock(); - if (!build) continue; - assert(build->drvPath == step->drvPath); - assert(build->finishedInDB); - } } @@ -386,17 +508,27 @@ std::set State::getDependentBuilds(Step::ptr step) std::function visit; visit = [&](Step::ptr step) { - if (done.find(step) != done.end()) return; + if (has(done, step)) return; done.insert(step); - for (auto & build : step->builds) { - auto build2 = build.lock(); - if (build2) res.insert(build2); + std::vector rdeps; + + { + auto step_(step->state.lock()); + + for (auto & build : step_->builds) { + auto build_ = build.lock(); + if (build_) res.insert(build_); + } + + /* Make a copy of rdeps so that we don't hold the lock for + very long. 
*/ + rdeps = step_->rdeps; } - for (auto & rdep : step->rdeps) { - auto rdep2 = rdep.lock(); - if (rdep2) visit(rdep2); + for (auto & rdep : rdeps) { + auto rdep_ = rdep.lock(); + if (rdep_) visit(rdep_); } }; @@ -408,11 +540,14 @@ std::set State::getDependentBuilds(Step::ptr step) void State::makeRunnable(Step::ptr step) { - assert(step->deps.empty()); + { + auto step_(step->state.lock()); + assert(step_->deps.empty()); + } { - std::lock_guard lock(runnableMutex); - runnable.insert(step); + auto runnable_(runnable.lock()); + runnable_->push_back(step); } runnableCV.notify_one(); @@ -424,17 +559,20 @@ void State::builderThreadEntry(int slot) auto store = openStore(); // FIXME: pool while (true) { + /* Sleep until a runnable build step becomes available. */ Step::ptr step; { - std::unique_lock lock(runnableMutex); - while (runnable.empty()) - runnableCV.wait(lock); - step = *runnable.begin(); - runnable.erase(step); + auto runnable_(runnable.lock()); + while (runnable_->empty()) + runnable_.wait(runnableCV); + auto weak = *runnable_->begin(); + runnable_->pop_front(); + step = weak.lock(); + if (!step) continue; } + /* Build it. */ printMsg(lvlError, format("slot %1%: got build step ‘%2%’") % slot % step->drvPath); - doBuildStep(store, step); } @@ -444,34 +582,38 @@ void State::builderThreadEntry(int slot) void State::doBuildStep(std::shared_ptr store, Step::ptr step) { - assert(step->deps.empty()); - /* There can be any number of builds in the database that depend - on this derivation. Arbitrarily pick one (though preferring - those build of which this is the top-level derivation) for the + on this derivation. Arbitrarily pick one (though preferring a + build of which this is the top-level derivation) for the purpose of creating build steps. We could create a build step record for every build, but that could be very expensive (e.g. a stdenv derivation can be a dependency of tens of thousands of builds), so we don't. 
*/ Build::ptr build; - auto builds = getDependentBuilds(step); + { + auto dependents = getDependentBuilds(step); - if (builds.empty()) { - /* Apparently all builds that depend on this derivation are - gone (e.g. cancelled). So don't bother. */ - printMsg(lvlInfo, format("cancelling build step ‘%1%’") % step->drvPath); - destroyStep(step, true); - return; + if (dependents.empty()) { + /* Apparently all builds that depend on this derivation + are gone (e.g. cancelled). So don't bother. (This is + very unlikely to happen, because normally Steps are + only kept alive by being reachable from a + Build). FIXME: what if a new Build gets a reference to + this step? */ + printMsg(lvlInfo, format("cancelling build step ‘%1%’") % step->drvPath); + destroyStep(step, false); + return; + } + + for (auto build2 : dependents) + if (build2->drvPath == step->drvPath) { build = build2; break; } + + if (!build) build = *dependents.begin(); + + printMsg(lvlInfo, format("performing build step ‘%1%’ (needed by %2% builds)") % step->drvPath % dependents.size()); } - for (auto build2 : builds) - if (build2->drvPath == step->drvPath) { build = build2; break; } - - if (!build) build = *builds.begin(); - - printMsg(lvlInfo, format("performing build step ‘%1%’ (needed by %2% builds)") % step->drvPath % builds.size()); - /* Create a build step record indicating that we started building. */ Connection conn; @@ -499,8 +641,30 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step) // FIXME: handle failed-with-output - // FIXME: handle new builds having been added in the meantime. + /* Remove this step. After this, incoming builds that depend on + drvPath will either see that the output paths exist, or will + create a new build step for drvPath. The latter is fine - it + won't conflict with this one, because we're removing it. In any + case, the set of dependent builds for ‘step’ can't increase + anymore because ‘step’ is no longer visible to createStep(). 
*/ + { + auto steps_(steps.lock()); + steps_->erase(step->drvPath); + } + /* Get the final set of dependent builds. */ + auto dependents = getDependentBuilds(step); + + std::set direct; + { + auto step_(step->state.lock()); + for (auto & build : step_->builds) { + auto build_ = build.lock(); + if (build_) direct.insert(build_); + } + } + + /* Update the database. */ { pqxx::work txn(conn); @@ -510,24 +674,21 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step) /* Mark all builds of which this derivation is the top level as succeeded. */ - for (auto build2_ : step->builds) { - auto build2 = build2_.lock(); - if (!build2) continue; + for (auto build2 : direct) markSucceededBuild(txn, build2, res, false, startTime, stopTime); - } } else { /* Create failed build steps for every build that depends on this. */ finishBuildStep(txn, stopTime, build->id, stepNr, bssFailed, errorMsg); - for (auto build2 : builds) { + for (auto build2 : dependents) { if (build == build2) continue; createBuildStep(txn, stopTime, build2, step, bssFailed, errorMsg, build->id); } /* Mark all builds that depend on this derivation as failed. */ - for (auto build2 : builds) { + for (auto build2 : dependents) { txn.parameterized ("update Builds set finished = 1, isCachedBuild = 0, buildStatus = $2, startTime = $3, stopTime = $4 where id = $1") (build2->id) @@ -539,10 +700,21 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step) } txn.commit(); - } - /* Remove the build step from the graph. */ + /* In case of success, destroy all Build objects of which ‘step’ + is the top-level derivation. In case of failure, destroy all + dependent Build objects. Any Steps not referenced by other + Builds will be destroyed as well. */ + for (auto build2 : dependents) + if (build2->toplevel == step || !success) { + auto builds_(builds.lock()); + builds_->erase(build2->id); + } + + /* Remove the step from the graph. 
In case of success, make + dependent build steps runnable if they have no other + dependencies. */ destroyStep(step, success); } @@ -590,8 +762,6 @@ void State::run() queueMonitorThread = std::thread(&State::queueMonitorThreadEntry, this); - sleep(1); - for (int n = 0; n < 4; n++) std::thread(&State::builderThreadEntry, this, n).detach(); diff --git a/src/hydra-queue-runner/sync.hh b/src/hydra-queue-runner/sync.hh new file mode 100644 index 00000000..6f5f9e6a --- /dev/null +++ b/src/hydra-queue-runner/sync.hh @@ -0,0 +1,53 @@ +#pragma once + +#include +#include + +/* This template class ensures synchronized access to a value of type + T. It is used as follows: + + struct Data { int x; ... }; + + Sync data; + + { + auto data_(data.lock()); + data_->x = 123; + } + + Here, "data" is automatically unlocked when "data_" goes out of + scope. +*/ + +template +class Sync +{ +private: + std::mutex mutex; + T data; + +public: + + class Lock + { + private: + Sync * s; + friend Sync; + Lock(Sync * s) : s(s) { s->mutex.lock(); } + public: + Lock(Lock && l) : s(l.s) { l.s = 0; } + Lock(const Lock & l) = delete; + ~Lock() { if (s) s->mutex.unlock(); } + T * operator -> () { return &s->data; } + T & operator * () { return s->data; } + + /* FIXME: performance impact of condition_variable_any? */ + void wait(std::condition_variable_any & cv) + { + assert(s); + cv.wait(s->mutex); + } + }; + + Lock lock() { return Lock(this); } +}; From 214b95706c5b8380443c3d26e81248b9b99c0d93 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 29 May 2015 20:02:15 +0200 Subject: [PATCH 005/158] On SIGINT, shut down the builder threads Note that they don't get interrupted at the moment (so on SIGINT, any running builds will need to finish first). 
--- src/hydra-queue-runner/hydra-queue-runner.cc | 45 ++++++++++++++------ 1 file changed, 32 insertions(+), 13 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 6b9d4aae..2eee9463 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -27,7 +27,7 @@ bool has(const C & c, const V & v) std::mutex exitRequestMutex; std::condition_variable exitRequest; -bool exitRequested(false); +std::atomic exitRequested(false); static std::atomic_int _int(0); @@ -145,6 +145,8 @@ class State private: std::thread queueMonitorThread; + std::mutex queueMonitorMutex; + std::condition_variable queueMonitorWakeup; /* The queued builds. */ typedef std::map Builds; @@ -161,7 +163,7 @@ private: typedef std::list Runnable; Sync runnable; - std::condition_variable_any runnableCV; + std::condition_variable_any runnableWakeup; public: State(); @@ -178,7 +180,7 @@ public: void updateBuild(pqxx::work & txn, Build::ptr build, BuildStatus status); - void queueMonitorThreadEntry(); + void queueMonitor(); void getQueuedBuilds(std::shared_ptr store, pqxx::connection & conn); @@ -266,19 +268,18 @@ void State::finishBuildStep(pqxx::work & txn, time_t stopTime, BuildID buildId, } -void State::queueMonitorThreadEntry() +void State::queueMonitor() { auto store = openStore(); // FIXME: pool Connection conn; - while (true) { + while (!exitRequested) { getQueuedBuilds(store, conn); { - std::unique_lock lock(exitRequestMutex); - exitRequest.wait_for(lock, std::chrono::seconds(5)); - if (exitRequested) break; + std::unique_lock lock(queueMonitorMutex); + queueMonitorWakeup.wait_for(lock, std::chrono::seconds(5)); } } @@ -550,7 +551,7 @@ void State::makeRunnable(Step::ptr step) runnable_->push_back(step); } - runnableCV.notify_one(); + runnableWakeup.notify_one(); } @@ -563,8 +564,9 @@ void State::builderThreadEntry(int slot) Step::ptr step; { auto runnable_(runnable.lock()); - while 
(runnable_->empty()) - runnable_.wait(runnableCV); + while (runnable_->empty() && !exitRequested) + runnable_.wait(runnableWakeup); + if (exitRequested) break; auto weak = *runnable_->begin(); runnable_->pop_front(); step = weak.lock(); @@ -760,12 +762,29 @@ void State::run() markActiveBuildStepsAsAborted(conn, 0); } - queueMonitorThread = std::thread(&State::queueMonitorThreadEntry, this); + queueMonitorThread = std::thread(&State::queueMonitor, this); + std::vector builderThreads; for (int n = 0; n < 4; n++) - std::thread(&State::builderThreadEntry, this, n).detach(); + builderThreads.push_back(std::thread(&State::builderThreadEntry, this, n)); + /* Wait for SIGINT. */ + { + std::unique_lock lock(exitRequestMutex); + while (!exitRequested) + exitRequest.wait(lock); + } + + printMsg(lvlError, "exiting..."); + + /* Shut down the various threads. */ + { std::lock_guard lock(queueMonitorMutex); } // barrier + queueMonitorWakeup.notify_all(); queueMonitorThread.join(); + + { auto runnable_(runnable.lock()); } // barrier + runnableWakeup.notify_all(); + for (auto & thread : builderThreads) thread.join(); } From 3a6cb2f2707d20ee05d1dfd53592c4df926aa6fa Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 29 May 2015 20:55:13 +0200 Subject: [PATCH 006/158] Implement a database connection pool --- src/hydra-queue-runner/hydra-queue-runner.cc | 47 ++++++----- src/hydra-queue-runner/pool.hh | 85 ++++++++++++++++++++ 2 files changed, 111 insertions(+), 21 deletions(-) create mode 100644 src/hydra-queue-runner/pool.hh diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 2eee9463..ada15e67 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -9,6 +9,7 @@ #include "build-result.hh" #include "sync.hh" +#include "pool.hh" #include "store-api.hh" #include "derivations.hh" @@ -145,8 +146,10 @@ class State private: std::thread queueMonitorThread; - std::mutex 
queueMonitorMutex; + + /* CV for waking up the queue. */ std::condition_variable queueMonitorWakeup; + std::mutex queueMonitorMutex; /* The queued builds. */ typedef std::map Builds; @@ -165,12 +168,15 @@ private: std::condition_variable_any runnableWakeup; + /* PostgreSQL connection pool. */ + Pool dbPool; + public: State(); ~State(); - void markActiveBuildStepsAsAborted(pqxx::connection & conn, time_t stopTime); + void markActiveBuildStepsAsAborted(time_t stopTime); int createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step, BuildStepStatus status, const std::string & errorMsg = "", BuildID propagatedFrom = 0); @@ -182,7 +188,7 @@ public: void queueMonitor(); - void getQueuedBuilds(std::shared_ptr store, pqxx::connection & conn); + void getQueuedBuilds(std::shared_ptr store); Step::ptr createStep(std::shared_ptr store, const Path & drvPath, std::set & newRunnable); @@ -213,18 +219,18 @@ State::State() State::~State() { try { - Connection conn; printMsg(lvlError, "clearing active build steps..."); - markActiveBuildStepsAsAborted(conn, time(0)); + markActiveBuildStepsAsAborted(time(0)); } catch (...) 
{ ignoreException(); } } -void State::markActiveBuildStepsAsAborted(pqxx::connection & conn, time_t stopTime) +void State::markActiveBuildStepsAsAborted(time_t stopTime) { - pqxx::work txn(conn); + auto conn(dbPool.get()); + pqxx::work txn(*conn); txn.parameterized ("update BuildSteps set busy = 0, status = $1, stopTime = $2 where busy = 1") ((int) bssAborted) @@ -272,10 +278,8 @@ void State::queueMonitor() { auto store = openStore(); // FIXME: pool - Connection conn; - while (!exitRequested) { - getQueuedBuilds(store, conn); + getQueuedBuilds(store); { std::unique_lock lock(queueMonitorMutex); @@ -287,10 +291,12 @@ void State::queueMonitor() } -void State::getQueuedBuilds(std::shared_ptr store, pqxx::connection & conn) +void State::getQueuedBuilds(std::shared_ptr store) { printMsg(lvlError, "checking the queue..."); + auto conn(dbPool.get()); + #if 0 { auto runnable_(runnable.lock()); @@ -308,7 +314,7 @@ void State::getQueuedBuilds(std::shared_ptr store, pqxx::connection & std::list newBuilds; // FIXME: use queue { - pqxx::work txn(conn); + pqxx::work txn(*conn); // FIXME: query only builds with ID higher than the previous // highest. @@ -340,7 +346,7 @@ void State::getQueuedBuilds(std::shared_ptr store, pqxx::connection & if (!store->isValidPath(build->drvPath)) { /* Derivation has been GC'ed prematurely. */ - pqxx::work txn(conn); + pqxx::work txn(*conn); txn.parameterized ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1") (build->id) @@ -360,7 +366,7 @@ void State::getQueuedBuilds(std::shared_ptr store, pqxx::connection & Derivation drv = readDerivation(build->drvPath); BuildResult res = getBuildResult(store, drv); - pqxx::work txn(conn); + pqxx::work txn(*conn); time_t now = time(0); markSucceededBuild(txn, build, res, true, now, now); txn.commit(); @@ -618,11 +624,11 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step) /* Create a build step record indicating that we started building. 
*/ - Connection conn; + auto conn(dbPool.get()); time_t startTime = time(0); int stepNr; { - pqxx::work txn(conn); + pqxx::work txn(*conn); stepNr = createBuildStep(txn, startTime, build, step, bssBusy); txn.commit(); } @@ -668,7 +674,7 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step) /* Update the database. */ { - pqxx::work txn(conn); + pqxx::work txn(*conn); if (success) { @@ -757,10 +763,7 @@ void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, void State::run() { - { - Connection conn; - markActiveBuildStepsAsAborted(conn, 0); - } + markActiveBuildStepsAsAborted(0); queueMonitorThread = std::thread(&State::queueMonitor, this); @@ -785,6 +788,8 @@ void State::run() { auto runnable_(runnable.lock()); } // barrier runnableWakeup.notify_all(); for (auto & thread : builderThreads) thread.join(); + + printMsg(lvlError, format("psql connections = %1%") % dbPool.count()); } diff --git a/src/hydra-queue-runner/pool.hh b/src/hydra-queue-runner/pool.hh new file mode 100644 index 00000000..0a58ebe0 --- /dev/null +++ b/src/hydra-queue-runner/pool.hh @@ -0,0 +1,85 @@ +#pragma once + +#include +#include + +#include "sync.hh" + +/* This template class implements a simple pool manager of resources + of some type R, such as database connections. It is used as + follows: + + class Connection { ... }; + + Pool pool; + + { + auto conn(pool.get()); + conn->exec("select ..."); + } + + Here, the Connection object referenced by ‘conn’ is automatically + returned to the pool when ‘conn’ goes out of scope. 
+*/ + +template +class Pool +{ +private: + struct State + { + unsigned int count = 0; + std::list> idle; + }; + + Sync state; + +public: + + class Handle + { + private: + Pool & pool; + std::shared_ptr r; + + friend Pool; + + Handle(Pool & pool, std::shared_ptr r) : pool(pool), r(r) { } + + public: + Handle(Handle && h) : pool(h.pool), r(h.r) { h.r.reset(); } + + Handle(const Handle & l) = delete; + + ~Handle() + { + auto state_(pool.state.lock()); + if (r) state_->idle.push_back(r); + } + + R * operator -> () { return r; } + R & operator * () { return *r; } + }; + + Handle get() + { + { + auto state_(state.lock()); + if (!state_->idle.empty()) { + auto p = state_->idle.back(); + state_->idle.pop_back(); + return Handle(*this, p); + } + state_->count++; + } + /* Note: we don't hold the lock while creating a new instance, + because creation might take a long time. */ + return Handle(*this, std::make_shared()); + } + + unsigned int count() + { + auto state_(state.lock()); + return state_->count; + } +}; From 8b12ac1f6d6496d4904a36e90b08bac981e33482 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 9 Jun 2015 14:21:21 +0200 Subject: [PATCH 007/158] Basic remote building This removes the need for Nix's build-remote.pl. Build logs are now written to $HYDRA_DATA/build-logs because hydra-queue-runner doesn't have write permission to /nix/var/log. 
--- src/hydra-queue-runner/Makefile.am | 2 +- src/hydra-queue-runner/build-remote.cc | 170 +++++++++++ src/hydra-queue-runner/build-remote.hh | 21 ++ src/hydra-queue-runner/hydra-queue-runner.cc | 300 +++++++++++++++---- src/lib/Hydra/Helper/Nix.pm | 5 +- 5 files changed, 445 insertions(+), 53 deletions(-) create mode 100644 src/hydra-queue-runner/build-remote.cc create mode 100644 src/hydra-queue-runner/build-remote.hh diff --git a/src/hydra-queue-runner/Makefile.am b/src/hydra-queue-runner/Makefile.am index 2525c936..00aa254d 100644 --- a/src/hydra-queue-runner/Makefile.am +++ b/src/hydra-queue-runner/Makefile.am @@ -1,6 +1,6 @@ bin_PROGRAMS = hydra-queue-runner -hydra_queue_runner_SOURCES = hydra-queue-runner.cc build-result.cc +hydra_queue_runner_SOURCES = hydra-queue-runner.cc build-result.cc build-remote.cc hydra_queue_runner_LDADD = $(NIX_LIBS) -lpqxx AM_CXXFLAGS = $(NIX_CFLAGS) -Wall diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc new file mode 100644 index 00000000..cbce20f3 --- /dev/null +++ b/src/hydra-queue-runner/build-remote.cc @@ -0,0 +1,170 @@ +#include + +#include +#include +#include + +#include "build-remote.hh" + +#include "util.hh" +#include "misc.hh" +#include "serve-protocol.hh" +#include "worker-protocol.hh" + +using namespace nix; + + +struct Child +{ + Pid pid; + AutoCloseFD to, from; +}; + + +static void openConnection(const string & sshName, const string & sshKey, + int stderrFD, Child & child) +{ + Pipe to, from; + to.create(); + from.create(); + + child.pid = startProcess([&]() { + + if (dup2(to.readSide, STDIN_FILENO) == -1) + throw SysError("cannot dup input pipe to stdin"); + + if (dup2(from.writeSide, STDOUT_FILENO) == -1) + throw SysError("cannot dup output pipe to stdout"); + + if (dup2(stderrFD, STDERR_FILENO) == -1) + throw SysError("cannot dup stderr"); + + Strings argv({"ssh", "-x", "-a", sshName, "--", "nix-store", "--serve", "--write"}); + + execvp("ssh", (char * *) 
stringsToCharPtrs(argv).data()); // FIXME: remove cast + + throw SysError("cannot start ssh"); + }); + + to.readSide.close(); + from.writeSide.close(); + + child.to = to.writeSide.borrow(); + child.from = from.readSide.borrow(); +} + + +static void copyClosureTo(std::shared_ptr store, + FdSource & from, FdSink & to, const PathSet & paths, + bool useSubstitutes = false) +{ + PathSet closure; + for (auto & path : paths) + computeFSClosure(*store, path, closure); + + Paths sorted = topoSortPaths(*store, closure); + + /* Send the "query valid paths" command with the "lock" option + enabled. This prevents a race where the remote host + garbage-collect paths that are already there. Optionally, ask + the remote host to substitute missing paths. */ + writeInt(cmdQueryValidPaths, to); + writeInt(1, to); // == lock paths + writeInt(useSubstitutes, to); + writeStrings(sorted, to); + to.flush(); + + /* Get back the set of paths that are already valid on the remote + host. */ + auto present = readStorePaths(from); + + PathSet missing; + std::set_difference(closure.begin(), closure.end(), present.begin(), present.end(), + std::inserter(missing, missing.end())); + + printMsg(lvlError, format("sending %1% missing paths") % missing.size()); + if (missing.empty()) return; + + throw Error("NOT IMPL 1"); +} + + +static void copyClosureFrom(std::shared_ptr store, + FdSource & from, FdSink & to, const PathSet & paths) +{ + writeInt(cmdExportPaths, to); + writeInt(0, to); // == don't sign + writeStrings(paths, to); + to.flush(); + store->importPaths(false, from); +} + + +void buildRemote(std::shared_ptr store, + const string & sshName, const string & sshKey, + const Path & drvPath, const Derivation & drv, + const nix::Path & logDir, RemoteResult & result) +{ + string base = baseNameOf(drvPath); + Path logFile = logDir + "/" + string(base, 0, 2) + "/" + string(base, 2); + + createDirs(dirOf(logFile)); + + AutoCloseFD logFD(open(logFile.c_str(), O_CREAT | O_TRUNC | O_WRONLY, 0666)); + if 
(logFD == -1) throw SysError(format("creating log file ‘%1%’") % logFile); + + Child child; + openConnection(sshName, sshKey, logFD, child); + + logFD.close(); + + FdSource from(child.from); + FdSink to(child.to); + + /* Handshake. */ + writeInt(SERVE_MAGIC_1, to); + writeInt(SERVE_PROTOCOL_VERSION, to); + to.flush(); + + unsigned int magic = readInt(from); + if (magic != SERVE_MAGIC_2) + throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % sshName); + unsigned int version = readInt(from); + if (GET_PROTOCOL_MAJOR(version) != 0x200) + throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % sshName); + + /* Copy the input closure. */ + printMsg(lvlError, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName); + copyClosureTo(store, from, to, PathSet({drvPath})); + + /* Do the build. */ + printMsg(lvlError, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName); + writeInt(cmdBuildPaths, to); + writeStrings(PathSet({drvPath}), to); + writeInt(3600, to); // == maxSilentTime, FIXME + writeInt(7200, to); // == buildTimeout, FIXME + to.flush(); + result.startTime = time(0); + int res = readInt(from); + result.stopTime = time(0); + if (res) { + result.errorMsg = (format("%1% on ‘%2%’") % readString(from) % sshName).str(); + if (res == 100) result.status = RemoteResult::rrPermanentFailure; + else if (res == 101) result.status = RemoteResult::rrTimedOut; + else result.status = RemoteResult::rrMiscFailure; + return; + } + + /* Copy the output paths. */ + printMsg(lvlError, format("copying outputs of ‘%1%’ from ‘%2%’") % drvPath % sshName); + PathSet outputs; + for (auto & output : drv.outputs) + outputs.insert(output.second.path); + copyClosureFrom(store, from, to, outputs); + + /* Shut down the connection. 
*/ + child.to.close(); + child.pid.wait(true); + + result.status = RemoteResult::rrSuccess; +} diff --git a/src/hydra-queue-runner/build-remote.hh b/src/hydra-queue-runner/build-remote.hh new file mode 100644 index 00000000..6406bc58 --- /dev/null +++ b/src/hydra-queue-runner/build-remote.hh @@ -0,0 +1,21 @@ +#pragma once + +#include "store-api.hh" +#include "derivations.hh" + +struct RemoteResult +{ + enum { + rrSuccess = 0, + rrPermanentFailure = 1, + rrTimedOut = 2, + rrMiscFailure = 3 + } status = rrMiscFailure; + std::string errorMsg; + time_t startTime = 0, stopTime = 0; +}; + +void buildRemote(std::shared_ptr store, + const std::string & sshName, const std::string & sshKey, + const nix::Path & drvPath, const nix::Derivation & drv, + const nix::Path & logDir, RemoteResult & result); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index ada15e67..5fe9be2b 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -8,6 +8,7 @@ #include #include "build-result.hh" +#include "build-remote.hh" #include "sync.hh" #include "pool.hh" @@ -134,6 +135,10 @@ struct Step Sync state; + std::atomic_bool destroyed; + + Step() : destroyed(false) { } + ~Step() { printMsg(lvlError, format("destroying step %1%") % drvPath); @@ -141,11 +146,49 @@ struct Step }; +struct Machine +{ + typedef std::shared_ptr ptr; + + std::string sshName, sshKey; + std::set systemTypes, supportedFeatures, mandatoryFeatures; + unsigned int maxJobs = 1; + float speedFactor = 1.0; + + Sync currentJobs; + + Machine() + { + auto currentJobs_(currentJobs.lock()); + *currentJobs_ = 0; + } +}; + + +/* A RAII helper that manages the currentJobs field of Machine + objects. 
*/ +struct MachineReservation +{ + typedef std::shared_ptr ptr; + Machine::ptr machine; + MachineReservation(Machine::ptr machine) : machine(machine) + { + auto currentJobs_(machine->currentJobs.lock()); + (*currentJobs_)++; + } + ~MachineReservation() + { + auto currentJobs_(machine->currentJobs.lock()); + if (*currentJobs_ > 0) (*currentJobs_)--; + } +}; + + class State { private: - std::thread queueMonitorThread; + Path hydraData, logDir; /* CV for waking up the queue. */ std::condition_variable queueMonitorWakeup; @@ -168,20 +211,35 @@ private: std::condition_variable_any runnableWakeup; + /* CV for waking up the dispatcher. */ + std::condition_variable dispatcherWakeup; + std::mutex dispatcherMutex; + /* PostgreSQL connection pool. */ Pool dbPool; + /* The build machines. */ + typedef std::list Machines; + Sync machines; + + /* The currently active builder threads. FIXME: We could re-use + these, but since they're fairly long-running, it's probably not + worth it. */ + // std::vector builderThreads; + public: State(); ~State(); + void loadMachines(); + void markActiveBuildStepsAsAborted(time_t stopTime); int createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step, BuildStepStatus status, const std::string & errorMsg = "", BuildID propagatedFrom = 0); - void finishBuildStep(pqxx::work & txn, time_t stopTime, BuildID buildId, int stepNr, + void finishBuildStep(pqxx::work & txn, time_t startTime, time_t stopTime, BuildID buildId, int stepNr, BuildStepStatus status, const string & errorMsg = "", BuildID propagatedFrom = 0); void updateBuild(pqxx::work & txn, Build::ptr build, BuildStatus status); @@ -200,9 +258,17 @@ public: void makeRunnable(Step::ptr step); - void builderThreadEntry(int slot); + /* The thread that selects and starts runnable builds. 
*/ + void dispatcher(); - void doBuildStep(std::shared_ptr store, Step::ptr step); + void wakeDispatcher(); + + MachineReservation::ptr findMachine(Step::ptr step); + + void builder(Step::ptr step, MachineReservation::ptr reservation); + + void doBuildStep(std::shared_ptr store, Step::ptr step, + Machine::ptr machine); void markSucceededBuild(pqxx::work & txn, Build::ptr build, const BuildResult & res, bool isCachedBuild, time_t startTime, time_t stopTime); @@ -213,6 +279,10 @@ public: State::State() { + hydraData = getEnv("HYDRA_DATA"); + if (hydraData == "") throw Error("$HYDRA_DATA must be set"); + + logDir = canonPath(hydraData + "/build-logs"); } @@ -227,6 +297,49 @@ State::~State() } +void State::loadMachines() +{ + Path machinesFile = getEnv("NIX_REMOTE_SYSTEMS", "/etc/nix/machines"); + + Machines newMachines; + + if (pathExists(machinesFile)) { + + for (auto line : tokenizeString(readFile(machinesFile), "\n")) { + line = trim(string(line, 0, line.find('#'))); + auto tokens = tokenizeString>(line); + if (tokens.size() < 3) continue; + tokens.resize(7); + + auto machine = std::make_shared(); + machine->sshName = tokens[0]; + machine->systemTypes = tokenizeString(tokens[1], ","); + machine->sshKey = tokens[2]; + if (tokens[3] != "") + string2Int(tokens[3], machine->maxJobs); + else + machine->maxJobs = 1; + machine->speedFactor = atof(tokens[4].c_str()); + machine->supportedFeatures = tokenizeString(tokens[5], ","); + machine->mandatoryFeatures = tokenizeString(tokens[6], ","); + newMachines.push_back(machine); + } + + } else { + auto machine = std::make_shared(); + machine->sshName = "localhost"; + machine->systemTypes = StringSet({settings.thisSystem}); + if (settings.thisSystem == "x86_64-linux") + machine->systemTypes.insert("i686-linux"); + machine->maxJobs = settings.maxBuildJobs; + newMachines.push_back(machine); + } + + auto machines_(machines.lock()); + *machines_ = newMachines; +} + + void State::markActiveBuildStepsAsAborted(time_t stopTime) { auto 
conn(dbPool.get()); @@ -262,15 +375,17 @@ int State::createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, } -void State::finishBuildStep(pqxx::work & txn, time_t stopTime, BuildID buildId, int stepNr, +void State::finishBuildStep(pqxx::work & txn, time_t startTime, time_t stopTime, BuildID buildId, int stepNr, BuildStepStatus status, const std::string & errorMsg, BuildID propagatedFrom) { + assert(startTime); + assert(stopTime); txn.parameterized - ("update BuildSteps set busy = 0, status = $1, propagatedFrom = $4, errorMsg = $5, stopTime = $6 where build = $2 and stepnr = $3") + ("update BuildSteps set busy = 0, status = $1, propagatedFrom = $4, errorMsg = $5, startTime = $6, stopTime = $7 where build = $2 and stepnr = $3") ((int) status)(buildId)(stepNr) (propagatedFrom, propagatedFrom != 0) (errorMsg, errorMsg != "") - (stopTime, stopTime != 0).exec(); + (startTime)(stopTime).exec(); } @@ -346,6 +461,7 @@ void State::getQueuedBuilds(std::shared_ptr store) if (!store->isValidPath(build->drvPath)) { /* Derivation has been GC'ed prematurely. */ + printMsg(lvlInfo, format("aborting GC'ed build %1%") % build->id); pqxx::work txn(*conn); txn.parameterized ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1") @@ -366,6 +482,8 @@ void State::getQueuedBuilds(std::shared_ptr store) Derivation drv = readDerivation(build->drvPath); BuildResult res = getBuildResult(store, drv); + printMsg(lvlInfo, format("cached build %1%") % build->id); + pqxx::work txn(*conn); time_t now = time(0); markSucceededBuild(txn, build, res, true, now, now); @@ -385,6 +503,9 @@ void State::getQueuedBuilds(std::shared_ptr store) build->toplevel = step; } + printMsg(lvlInfo, format("added build %1% (top-level step %2%, %3% new runnable steps)") + % build->id % step->drvPath % newRunnable.size()); + /* Prior to this, the build is not visible to getDependentBuilds(). Now it is, so the build can be failed if a dependency fails. 
(It can't succeed right away @@ -462,6 +583,9 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat void State::destroyStep(Step::ptr step, bool proceed) { + if (step->destroyed) return; + step->destroyed = true; + printMsg(lvlInfo, format("destroying build step ‘%1%’") % step->drvPath); { @@ -547,6 +671,8 @@ std::set State::getDependentBuilds(Step::ptr step) void State::makeRunnable(Step::ptr step) { + printMsg(lvlInfo, format("step ‘%1%’ is now runnable") % step->drvPath); + { auto step_(step->state.lock()); assert(step_->deps.empty()); @@ -557,38 +683,109 @@ void State::makeRunnable(Step::ptr step) runnable_->push_back(step); } - runnableWakeup.notify_one(); + wakeDispatcher(); } -void State::builderThreadEntry(int slot) +void State::dispatcher() { - auto store = openStore(); // FIXME: pool + while (!exitRequested) { + printMsg(lvlError, "dispatcher woken up"); - while (true) { - /* Sleep until a runnable build step becomes available. */ - Step::ptr step; { auto runnable_(runnable.lock()); - while (runnable_->empty() && !exitRequested) - runnable_.wait(runnableWakeup); - if (exitRequested) break; - auto weak = *runnable_->begin(); - runnable_->pop_front(); - step = weak.lock(); - if (!step) continue; + printMsg(lvlError, format("%1% runnable builds") % runnable_->size()); + + /* FIXME: we're holding the runnable lock too long + here. This could be more efficient. */ + + for (auto i = runnable_->begin(); i != runnable_->end(); ) { + auto step = i->lock(); + + /* Delete dead steps. 
*/ + if (!step) { + i = runnable_->erase(i); + continue; + } + + auto reservation = findMachine(step); + if (!reservation) { + printMsg(lvlError, format("cannot execute step ‘%1%’ right now") % step->drvPath); + ++i; + continue; + } + + printMsg(lvlInfo, format("WOOHOO: starting step ‘%1%’ on machine ‘%2%’") + % step->drvPath % reservation->machine->sshName); + i = runnable_->erase(i); + + auto builderThread = std::thread(&State::builder, this, step, reservation); + builderThread.detach(); // FIXME? + } } - /* Build it. */ - printMsg(lvlError, format("slot %1%: got build step ‘%2%’") % slot % step->drvPath); - doBuildStep(store, step); + /* Sleep until we're woken up (either because a runnable build + is added, or because a build finishes). */ + { + std::unique_lock lock(dispatcherMutex); + dispatcherWakeup.wait(lock); + } } - printMsg(lvlError, "builder thread exits"); + printMsg(lvlError, "dispatcher exits"); } -void State::doBuildStep(std::shared_ptr store, Step::ptr step) +void State::wakeDispatcher() +{ + { std::lock_guard lock(dispatcherMutex); } // barrier + dispatcherWakeup.notify_all(); +} + + +MachineReservation::ptr State::findMachine(Step::ptr step) +{ + auto machines_(machines.lock()); + + for (auto & machine : *machines_) { + if (!has(machine->systemTypes, step->drv.platform)) continue; + // FIXME: check features + { + auto currentJobs_(machine->currentJobs.lock()); + if (*currentJobs_ >= machine->maxJobs) continue; + } + return std::make_shared(machine); + } + + /* FIXME: distinguish between permanent failures (a matching + machine doesn't exist) and temporary failures (a matching + machine is not available). 
*/ + + return 0; +} + + +void State::builder(Step::ptr step, MachineReservation::ptr reservation) +{ + try { + auto store = openStore(); // FIXME: pool + doBuildStep(store, step, reservation->machine); + } catch (std::exception & e) { + printMsg(lvlError, format("build thread for ‘%1%’: %2%") % step->drvPath % e.what()); + // FIXME: put step back in runnable and retry + } + + /* Release the machine and wake up the dispatcher. */ + assert(reservation.unique()); + reservation = 0; + wakeDispatcher(); + + printMsg(lvlError, "builder exits"); +} + + +void State::doBuildStep(std::shared_ptr store, Step::ptr step, + Machine::ptr machine) { /* There can be any number of builds in the database that depend on this derivation. Arbitrarily pick one (though preferring a @@ -625,27 +822,28 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step) /* Create a build step record indicating that we started building. */ auto conn(dbPool.get()); - time_t startTime = time(0); + RemoteResult result; + result.startTime = time(0); int stepNr; { pqxx::work txn(*conn); - stepNr = createBuildStep(txn, startTime, build, step, bssBusy); + stepNr = createBuildStep(txn, result.startTime, build, step, bssBusy); txn.commit(); } - bool success = false; - std::string errorMsg; try { - store->buildPaths(PathSet({step->drvPath})); - success = true; + buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, result); } catch (Error & e) { - errorMsg = e.msg(); + result.status = RemoteResult::rrMiscFailure; + result.errorMsg = e.msg(); + printMsg(lvlError, format("ERROR: %1%") % e.msg()); + abort(); } - time_t stopTime = time(0); + if (!result.stopTime) result.stopTime = time(0); BuildResult res; - if (success) res = getBuildResult(store, step->drv); + if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv); // FIXME: handle failed-with-output @@ -676,33 +874,34 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step) { 
pqxx::work txn(*conn); - if (success) { + if (result.status == RemoteResult::rrSuccess) { - finishBuildStep(txn, stopTime, build->id, stepNr, bssSuccess); + finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, bssSuccess); /* Mark all builds of which this derivation is the top level as succeeded. */ for (auto build2 : direct) - markSucceededBuild(txn, build2, res, false, startTime, stopTime); + markSucceededBuild(txn, build2, res, false, result.startTime, result.stopTime); } else { /* Create failed build steps for every build that depends on this. */ - finishBuildStep(txn, stopTime, build->id, stepNr, bssFailed, errorMsg); + finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, bssFailed, result.errorMsg); for (auto build2 : dependents) { if (build == build2) continue; - createBuildStep(txn, stopTime, build2, step, bssFailed, errorMsg, build->id); + createBuildStep(txn, result.stopTime, build2, step, bssFailed, result.errorMsg, build->id); } /* Mark all builds that depend on this derivation as failed. */ for (auto build2 : dependents) { + printMsg(lvlError, format("marking build %1% as failed") % build2->id); txn.parameterized ("update Builds set finished = 1, isCachedBuild = 0, buildStatus = $2, startTime = $3, stopTime = $4 where id = $1") (build2->id) ((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed)) - (startTime) - (stopTime).exec(); + (result.startTime) + (result.stopTime).exec(); build2->finishedInDB = true; // FIXME: txn might fail } } @@ -715,7 +914,7 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step) dependent Build objects. Any Steps not referenced by other Builds will be destroyed as well. 
*/ for (auto build2 : dependents) - if (build2->toplevel == step || !success) { + if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) { auto builds_(builds.lock()); builds_->erase(build2->id); } @@ -723,13 +922,15 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step) /* Remove the step from the graph. In case of success, make dependent build steps runnable if they have no other dependencies. */ - destroyStep(step, success); + destroyStep(step, result.status == RemoteResult::rrSuccess); } void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, const BuildResult & res, bool isCachedBuild, time_t startTime, time_t stopTime) { + printMsg(lvlError, format("marking build %1% as succeeded") % build->id); + txn.parameterized ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, size = $5, closureSize = $6, releaseName = $7, isCachedBuild = $8 where id = $1") (build->id) @@ -765,11 +966,11 @@ void State::run() { markActiveBuildStepsAsAborted(0); - queueMonitorThread = std::thread(&State::queueMonitor, this); + loadMachines(); - std::vector builderThreads; - for (int n = 0; n < 4; n++) - builderThreads.push_back(std::thread(&State::builderThreadEntry, this, n)); + auto queueMonitorThread = std::thread(&State::queueMonitor, this); + + auto dispatcherThread = std::thread(&State::dispatcher, this); /* Wait for SIGINT. 
*/ { @@ -785,9 +986,8 @@ void State::run() queueMonitorWakeup.notify_all(); queueMonitorThread.join(); - { auto runnable_(runnable.lock()); } // barrier - runnableWakeup.notify_all(); - for (auto & thread : builderThreads) thread.join(); + wakeDispatcher(); + dispatcherThread.join(); printMsg(lvlError, format("psql connections = %1%") % dbPool.count()); } diff --git a/src/lib/Hydra/Helper/Nix.pm b/src/lib/Hydra/Helper/Nix.pm index 16d498fb..087e66cb 100644 --- a/src/lib/Hydra/Helper/Nix.pm +++ b/src/lib/Hydra/Helper/Nix.pm @@ -133,8 +133,9 @@ sub getDrvLogPath { my $base = basename $drvPath; my $bucketed = substr($base, 0, 2) . "/" . substr($base, 2); my $fn = ($ENV{NIX_LOG_DIR} || "/nix/var/log/nix") . "/drvs/"; - for ($fn . $bucketed . ".bz2", $fn . $bucketed, $fn . $base . ".bz2", $fn . $base) { - return $_ if (-f $_); + my $fn2 = Hydra::Model::DB::getHydraPath . "/build-logs/"; + for ($fn2 . $bucketed, $fn . $bucketed . ".bz2", $fn . $bucketed, $fn . $base . ".bz2", $fn . $base) { + return $_ if -f $_; } return undef; } From ca1fbdd0589e0b8ca54d3f72086393e28edd6e45 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 9 Jun 2015 14:31:14 +0200 Subject: [PATCH 008/158] Mark builds as busy --- src/hydra-queue-runner/hydra-queue-runner.cc | 22 ++++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 5fe9be2b..dd1920a8 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -234,7 +234,7 @@ public: void loadMachines(); - void markActiveBuildStepsAsAborted(time_t stopTime); + void clearBusy(time_t stopTime); int createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step, BuildStepStatus status, const std::string & errorMsg = "", BuildID propagatedFrom = 0); @@ -289,8 +289,8 @@ State::State() State::~State() { try { - printMsg(lvlError, "clearing active build 
steps..."); - markActiveBuildStepsAsAborted(time(0)); + printMsg(lvlError, "clearing active builds / build steps..."); + clearBusy(time(0)); } catch (...) { ignoreException(); } @@ -340,7 +340,7 @@ void State::loadMachines() } -void State::markActiveBuildStepsAsAborted(time_t stopTime) +void State::clearBusy(time_t stopTime) { auto conn(dbPool.get()); pqxx::work txn(*conn); @@ -348,6 +348,7 @@ void State::markActiveBuildStepsAsAborted(time_t stopTime) ("update BuildSteps set busy = 0, status = $1, stopTime = $2 where busy = 1") ((int) bssAborted) (stopTime, stopTime != 0).exec(); + txn.exec("update Builds set busy = 0 where finished = 0 and busy = 1"); txn.commit(); } @@ -464,7 +465,7 @@ void State::getQueuedBuilds(std::shared_ptr store) printMsg(lvlInfo, format("aborting GC'ed build %1%") % build->id); pqxx::work txn(*conn); txn.parameterized - ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1") + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1") (build->id) ((int) bsAborted) (time(0)) @@ -820,7 +821,7 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, } /* Create a build step record indicating that we started - building. */ + building. Also, mark the selected build as busy. 
*/ auto conn(dbPool.get()); RemoteResult result; result.startTime = time(0); @@ -828,6 +829,9 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, { pqxx::work txn(*conn); stepNr = createBuildStep(txn, result.startTime, build, step, bssBusy); + + txn.parameterized("update Builds set busy = 1 where id = $1")(build->id).exec(); + txn.commit(); } @@ -897,7 +901,7 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, for (auto build2 : dependents) { printMsg(lvlError, format("marking build %1% as failed") % build2->id); txn.parameterized - ("update Builds set finished = 1, isCachedBuild = 0, buildStatus = $2, startTime = $3, stopTime = $4 where id = $1") + ("update Builds set finished = 1, busy = 0, isCachedBuild = 0, buildStatus = $2, startTime = $3, stopTime = $4 where id = $1") (build2->id) ((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed)) (result.startTime) @@ -932,7 +936,7 @@ void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, printMsg(lvlError, format("marking build %1% as succeeded") % build->id); txn.parameterized - ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $4, size = $5, closureSize = $6, releaseName = $7, isCachedBuild = $8 where id = $1") + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, size = $5, closureSize = $6, releaseName = $7, isCachedBuild = $8 where id = $1") (build->id) ((int) bsSuccess) (startTime) @@ -964,7 +968,7 @@ void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, void State::run() { - markActiveBuildStepsAsAborted(0); + clearBusy(0); loadMachines(); From 08633508da12e9633c6ea1603273d20cc979da79 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 9 Jun 2015 14:42:02 +0200 Subject: [PATCH 009/158] Fix colspan --- src/root/machine-status.tt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/root/machine-status.tt b/src/root/machine-status.tt index 9ba9dea3..a1331378 
100644 --- a/src/root/machine-status.tt +++ b/src/root/machine-status.tt @@ -42,9 +42,9 @@ [% END %] [% IF idle == 1 %] [% IF m.value.idle %] - Idle for [% INCLUDE renderDuration duration = curTime - m.value.idle %] + Idle for [% INCLUDE renderDuration duration = curTime - m.value.idle %] [% ELSE %] - Never used + Never used [% END %] [% END %] From 61d406052250f5db53954bc17eca47b2acc70e8d Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 9 Jun 2015 14:57:49 +0200 Subject: [PATCH 010/158] Record the machine used for a build step --- src/hydra-queue-runner/hydra-queue-runner.cc | 30 +++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index dd1920a8..1c3ab547 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -237,10 +237,12 @@ public: void clearBusy(time_t stopTime); int createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step, - BuildStepStatus status, const std::string & errorMsg = "", BuildID propagatedFrom = 0); + const std::string & machine, BuildStepStatus status, const std::string & errorMsg = "", + BuildID propagatedFrom = 0); void finishBuildStep(pqxx::work & txn, time_t startTime, time_t stopTime, BuildID buildId, int stepNr, - BuildStepStatus status, const string & errorMsg = "", BuildID propagatedFrom = 0); + const std::string & machine, BuildStepStatus status, const string & errorMsg = "", + BuildID propagatedFrom = 0); void updateBuild(pqxx::work & txn, Build::ptr build, BuildStatus status); @@ -354,18 +356,19 @@ void State::clearBusy(time_t stopTime) int State::createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step, - BuildStepStatus status, const std::string & errorMsg, BuildID propagatedFrom) + const std::string & machine, BuildStepStatus status, const std::string & errorMsg, BuildID propagatedFrom) { 
auto res = txn.parameterized("select max(stepnr) from BuildSteps where build = $1")(build->id).exec(); int stepNr = res[0][0].is_null() ? 1 : res[0][0].as() + 1; txn.parameterized - ("insert into BuildSteps (build, stepnr, type, drvPath, busy, startTime, system, status, propagatedFrom, errorMsg, stopTime) values ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11)") + ("insert into BuildSteps (build, stepnr, type, drvPath, busy, startTime, system, status, propagatedFrom, errorMsg, stopTime, machine) values ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)") (build->id)(stepNr)(0)(step->drvPath)(status == bssBusy ? 1 : 0)(startTime)(step->drv.platform) ((int) status, status != bssBusy) (propagatedFrom, propagatedFrom != 0) (errorMsg, errorMsg != "") - (startTime, status != bssBusy).exec(); + (startTime, status != bssBusy) + (machine, machine != "").exec(); for (auto & output : step->drv.outputs) txn.parameterized @@ -377,16 +380,17 @@ int State::createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, void State::finishBuildStep(pqxx::work & txn, time_t startTime, time_t stopTime, BuildID buildId, int stepNr, - BuildStepStatus status, const std::string & errorMsg, BuildID propagatedFrom) + const std::string & machine, BuildStepStatus status, const std::string & errorMsg, BuildID propagatedFrom) { assert(startTime); assert(stopTime); txn.parameterized - ("update BuildSteps set busy = 0, status = $1, propagatedFrom = $4, errorMsg = $5, startTime = $6, stopTime = $7 where build = $2 and stepnr = $3") + ("update BuildSteps set busy = 0, status = $1, propagatedFrom = $4, errorMsg = $5, startTime = $6, stopTime = $7, machine = $8 where build = $2 and stepnr = $3") ((int) status)(buildId)(stepNr) (propagatedFrom, propagatedFrom != 0) (errorMsg, errorMsg != "") - (startTime)(stopTime).exec(); + (startTime)(stopTime) + (machine, machine != "").exec(); } @@ -828,10 +832,8 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, int stepNr; { pqxx::work 
txn(*conn); - stepNr = createBuildStep(txn, result.startTime, build, step, bssBusy); - + stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bssBusy); txn.parameterized("update Builds set busy = 1 where id = $1")(build->id).exec(); - txn.commit(); } @@ -880,7 +882,7 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, if (result.status == RemoteResult::rrSuccess) { - finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, bssSuccess); + finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssSuccess); /* Mark all builds of which this derivation is the top level as succeeded. */ @@ -890,11 +892,11 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, } else { /* Create failed build steps for every build that depends on this. */ - finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, bssFailed, result.errorMsg); + finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg); for (auto build2 : dependents) { if (build == build2) continue; - createBuildStep(txn, result.stopTime, build2, step, bssFailed, result.errorMsg, build->id); + createBuildStep(txn, result.stopTime, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id); } /* Mark all builds that depend on this derivation as failed. */ From c93aa925631484dca9a0fadf494596f35af04105 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 9 Jun 2015 15:03:20 +0200 Subject: [PATCH 011/158] Create BuildSteps race-free If multiple threads create a step for the same build, they could get the same "max(stepnr)" and allocate conflicting new step numbers. So lock the BuildSteps table while doing this. We could use a different isolation level, but this is easier. 
--- src/hydra-queue-runner/hydra-queue-runner.cc | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 1c3ab547..4f1fc48c 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -358,6 +358,10 @@ void State::clearBusy(time_t stopTime) int State::createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step, const std::string & machine, BuildStepStatus status, const std::string & errorMsg, BuildID propagatedFrom) { + /* Acquire an exclusive lock on BuildSteps to ensure that we don't + race with other threads creating a step of the same build. */ + txn.exec("lock table BuildSteps in exclusive mode"); + auto res = txn.parameterized("select max(stepnr) from BuildSteps where build = $1")(build->id).exec(); int stepNr = res[0][0].is_null() ? 1 : res[0][0].as() + 1; @@ -892,13 +896,13 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, } else { /* Create failed build steps for every build that depends on this. */ - finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg); - for (auto build2 : dependents) { if (build == build2) continue; createBuildStep(txn, result.stopTime, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id); } + finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg); + /* Mark all builds that depend on this derivation as failed. 
*/ for (auto build2 : dependents) { printMsg(lvlError, format("marking build %1% as failed") % build2->id); From 7dd1f0097ea6ac92b5191a38f4eb1251a95ffdd9 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 9 Jun 2015 16:03:41 +0200 Subject: [PATCH 012/158] Finish copyClosure --- src/hydra-queue-runner/build-remote.cc | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index cbce20f3..67d14991 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -62,8 +62,6 @@ static void copyClosureTo(std::shared_ptr store, for (auto & path : paths) computeFSClosure(*store, path, closure); - Paths sorted = topoSortPaths(*store, closure); - /* Send the "query valid paths" command with the "lock" option enabled. This prevents a race where the remote host garbage-collect paths that are already there. Optionally, ask @@ -71,21 +69,29 @@ static void copyClosureTo(std::shared_ptr store, writeInt(cmdQueryValidPaths, to); writeInt(1, to); // == lock paths writeInt(useSubstitutes, to); - writeStrings(sorted, to); + writeStrings(closure, to); to.flush(); /* Get back the set of paths that are already valid on the remote host. 
*/ auto present = readStorePaths(from); - PathSet missing; - std::set_difference(closure.begin(), closure.end(), present.begin(), present.end(), - std::inserter(missing, missing.end())); + if (present.size() == closure.size()) return; + + Paths sorted = topoSortPaths(*store, closure); + + Paths missing; + for (auto i = sorted.rbegin(); i != sorted.rend(); ++i) + if (present.find(*i) == present.end()) missing.push_back(*i); printMsg(lvlError, format("sending %1% missing paths") % missing.size()); - if (missing.empty()) return; - throw Error("NOT IMPL 1"); + writeInt(cmdImportPaths, to); + exportPaths(*store, missing, false, to); + to.flush(); + + if (readInt(from) != 1) + throw Error("remote machine failed to import closure"); } From c68036f8b000f19df891ccbee5c4d19d90699e4d Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 10 Jun 2015 14:57:07 +0200 Subject: [PATCH 013/158] Pass ssh key --- src/hydra-queue-runner/build-remote.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 67d14991..f3d153e4 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -39,7 +39,7 @@ static void openConnection(const string & sshName, const string & sshKey, if (dup2(stderrFD, STDERR_FILENO) == -1) throw SysError("cannot dup stderr"); - Strings argv({"ssh", "-x", "-a", sshName, "--", "nix-store", "--serve", "--write"}); + Strings argv({"ssh", sshName, "-i", sshKey, "-x", "-a", "--", "nix-store", "--serve", "--write"}); execvp("ssh", (char * *) stringsToCharPtrs(argv).data()); // FIXME: remove cast From 6d738a31bf7b0c01191433c625229bb6c0a56efc Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 10 Jun 2015 14:57:16 +0200 Subject: [PATCH 014/158] Keep track of failed paths in the Hydra database I.e. don't use Nix's failed paths feature anymore. Easier to keep everything in one place. 
--- src/hydra-queue-runner/hydra-queue-runner.cc | 92 ++++++++++++++------ src/lib/Hydra/Controller/Admin.pm | 2 +- src/lib/Hydra/Helper/Nix.pm | 5 +- src/lib/Hydra/Schema/FailedPaths.pm | 65 ++++++++++++++ src/sql/hydra.sql | 16 ++++ 5 files changed, 149 insertions(+), 31 deletions(-) create mode 100644 src/lib/Hydra/Schema/FailedPaths.pm diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 4f1fc48c..844b19ee 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -780,7 +780,7 @@ void State::builder(Step::ptr step, MachineReservation::ptr reservation) auto store = openStore(); // FIXME: pool doBuildStep(store, step, reservation->machine); } catch (std::exception & e) { - printMsg(lvlError, format("build thread for ‘%1%’: %2%") % step->drvPath % e.what()); + printMsg(lvlError, format("error building ‘%1%’: %2%") % step->drvPath % e.what()); // FIXME: put step back in runnable and retry } @@ -828,35 +828,55 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, printMsg(lvlInfo, format("performing build step ‘%1%’ (needed by %2% builds)") % step->drvPath % dependents.size()); } - /* Create a build step record indicating that we started - building. Also, mark the selected build as busy. */ auto conn(dbPool.get()); + RemoteResult result; + BuildResult res; + int stepNr = 0; + result.startTime = time(0); - int stepNr; + + /* If any of the outputs have previously failed, then don't + retry. 
*/ + bool cachedFailure = false; { pqxx::work txn(*conn); - stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bssBusy); - txn.parameterized("update Builds set busy = 1 where id = $1")(build->id).exec(); - txn.commit(); + for (auto & path : outputPaths(step->drv)) + if (!txn.parameterized("select 1 from FailedPaths where path = $1")(path).exec().empty()) { + cachedFailure = true; + break; + } } - try { - buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, result); - } catch (Error & e) { - result.status = RemoteResult::rrMiscFailure; - result.errorMsg = e.msg(); - printMsg(lvlError, format("ERROR: %1%") % e.msg()); - abort(); + if (cachedFailure) + result.status = RemoteResult::rrPermanentFailure; + else { + + /* Create a build step record indicating that we started + building. Also, mark the selected build as busy. */ + { + pqxx::work txn(*conn); + stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bssBusy); + txn.parameterized("update Builds set busy = 1 where id = $1")(build->id).exec(); + txn.commit(); + } + + try { + buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, result); + } catch (Error & e) { + result.status = RemoteResult::rrMiscFailure; + result.errorMsg = e.msg(); + printMsg(lvlError, format("ERROR: %1%") % e.msg()); + abort(); + } + + if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv); + + // FIXME: handle failed-with-output } if (!result.stopTime) result.stopTime = time(0); - BuildResult res; - if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv); - - // FIXME: handle failed-with-output - /* Remove this step. After this, incoming builds that depend on drvPath will either see that the output paths exist, or will create a new build step for drvPath. 
The latter is fine - it @@ -894,26 +914,42 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, markSucceededBuild(txn, build2, res, false, result.startTime, result.stopTime); } else { - /* Create failed build steps for every build that depends - on this. */ - for (auto build2 : dependents) { - if (build == build2) continue; - createBuildStep(txn, result.stopTime, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id); - } + /* Failure case. */ - finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg); + /* For regular failures, we don't care about the error + message. */ + if (result.status != RemoteResult::rrMiscFailure) result.errorMsg = ""; + + if (!cachedFailure) { + + /* Create failed build steps for every build that depends + on this. */ + for (auto build2 : dependents) { + if (build == build2) continue; + createBuildStep(txn, result.stopTime, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id); + } + + finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg); + } /* Mark all builds that depend on this derivation as failed. */ for (auto build2 : dependents) { printMsg(lvlError, format("marking build %1% as failed") % build2->id); txn.parameterized - ("update Builds set finished = 1, busy = 0, isCachedBuild = 0, buildStatus = $2, startTime = $3, stopTime = $4 where id = $1") + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1") (build2->id) ((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed)) (result.startTime) - (result.stopTime).exec(); + (result.stopTime) + (cachedFailure ? 1 : 0).exec(); build2->finishedInDB = true; // FIXME: txn might fail } + + /* Remember failed paths in the database so that they + won't be built again. 
*/ + if (!cachedFailure && result.status == RemoteResult::rrPermanentFailure) + for (auto & path : outputPaths(step->drv)) + txn.parameterized("insert into FailedPaths values ($1)")(path).exec(); } txn.commit(); diff --git a/src/lib/Hydra/Controller/Admin.pm b/src/lib/Hydra/Controller/Admin.pm index 95666f07..a09a5edb 100644 --- a/src/lib/Hydra/Controller/Admin.pm +++ b/src/lib/Hydra/Controller/Admin.pm @@ -45,7 +45,7 @@ sub clear_queue_non_current : Chained('admin') PathPart('clear-queue-non-current sub clearfailedcache : Chained('admin') PathPart('clear-failed-cache') Args(0) { my ($self, $c) = @_; - my $r = `nix-store --clear-failed-paths '*'`; + $c->model('DB::FailedPaths')->delete; $c->res->redirect($c->request->referer // "/"); } diff --git a/src/lib/Hydra/Helper/Nix.pm b/src/lib/Hydra/Helper/Nix.pm index 087e66cb..7a68490f 100644 --- a/src/lib/Hydra/Helper/Nix.pm +++ b/src/lib/Hydra/Helper/Nix.pm @@ -465,9 +465,10 @@ sub restartBuilds($$) { # !!! Should do this in a trigger. $db->resultset('JobsetEvals')->search({ build => \@buildIds }, { join => 'buildIds' })->update({ nrsucceeded => undef }); - # Clear Nix's negative failure cache. + # Clear the failed paths cache. # FIXME: Add this to the API. - system("nix-store", "--clear-failed-paths", @paths); + # FIXME: clear the dependencies? 
+ $db->resultset('FailedPaths')->search({ path => [ @paths ]})->delete; }); return scalar(@buildIds); diff --git a/src/lib/Hydra/Schema/FailedPaths.pm b/src/lib/Hydra/Schema/FailedPaths.pm new file mode 100644 index 00000000..082b989d --- /dev/null +++ b/src/lib/Hydra/Schema/FailedPaths.pm @@ -0,0 +1,65 @@ +use utf8; +package Hydra::Schema::FailedPaths; + +# Created by DBIx::Class::Schema::Loader +# DO NOT MODIFY THE FIRST PART OF THIS FILE + +=head1 NAME + +Hydra::Schema::FailedPaths + +=cut + +use strict; +use warnings; + +use base 'DBIx::Class::Core'; + +=head1 COMPONENTS LOADED + +=over 4 + +=item * L + +=back + +=cut + +__PACKAGE__->load_components("+Hydra::Component::ToJSON"); + +=head1 TABLE: C + +=cut + +__PACKAGE__->table("FailedPaths"); + +=head1 ACCESSORS + +=head2 path + + data_type: 'text' + is_nullable: 0 + +=cut + +__PACKAGE__->add_columns("path", { data_type => "text", is_nullable => 0 }); + +=head1 PRIMARY KEY + +=over 4 + +=item * L + +=back + +=cut + +__PACKAGE__->set_primary_key("path"); + + +# Created by DBIx::Class::Schema::Loader v0.07033 @ 2015-06-10 14:48:16 +# DO NOT MODIFY THIS OR ANYTHING ABOVE! md5sum:WFgjfjH+szE6Ntcicmaflw + + +# You can replace this text with custom code or comments, and it will be preserved on regeneration +1; diff --git a/src/sql/hydra.sql b/src/sql/hydra.sql index 4def1825..83178c25 100644 --- a/src/sql/hydra.sql +++ b/src/sql/hydra.sql @@ -511,6 +511,22 @@ create table StarredJobs ( ); +-- The output paths that have permanently failed. +create table FailedPaths ( + path text primary key not null +); + +#ifdef POSTGRESQL + +-- Needed because Postgres doesn't have "ignore duplicate" or upsert +-- yet. +create rule IdempotentInsert as on insert to FailedPaths + where exists (select 1 from FailedPaths where path = new.path) + do instead nothing; + +#endif + + -- Cache of the number of finished builds. 
create table NrBuilds ( what text primary key not null, From a4fb93c1196726155997f1d9abb0cee8e3614a66 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 10 Jun 2015 15:36:21 +0200 Subject: [PATCH 015/158] Lock builds for a shorter amount of time --- src/hydra-queue-runner/hydra-queue-runner.cc | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 844b19ee..788fe975 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -421,18 +421,6 @@ void State::getQueuedBuilds(std::shared_ptr store) auto conn(dbPool.get()); -#if 0 - { - auto runnable_(runnable.lock()); - auto builds_(builds.lock()); - auto steps_(steps.lock()); - printMsg(lvlError, format("%1% builds, %2% steps, %3% runnable steps") - % builds_->size() - % steps_->size() - % runnable_->size()); - } -#endif - /* Grab the queued builds from the database, but don't process them yet (since we don't want a long-running transaction). */ std::list newBuilds; // FIXME: use queue @@ -444,9 +432,8 @@ void State::getQueuedBuilds(std::shared_ptr store) // highest. auto res = txn.exec("select * from Builds where finished = 0 order by id"); - auto builds_(builds.lock()); - for (auto const & row : res) { + auto builds_(builds.lock()); BuildID id = row["id"].as(); if (has(*builds_, id)) continue; From d72a88b5629c85740ea81e2011b8d06cd8ee0228 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 10 Jun 2015 15:55:46 +0200 Subject: [PATCH 016/158] Don't try to handle SIGINT It just makes things unnecessarily complicated. We can just exit without cleaning anything up, since the only thing to do is unmark builds and build steps as busy. But we can do that by having systemd call "hydra-queue-runner --unlock" from ExecStopPost. 
--- src/hydra-queue-runner/hydra-queue-runner.cc | 68 ++------------------ 1 file changed, 7 insertions(+), 61 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 788fe975..9e5553b6 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -27,40 +27,6 @@ bool has(const C & c, const V & v) } -std::mutex exitRequestMutex; -std::condition_variable exitRequest; -std::atomic exitRequested(false); - -static std::atomic_int _int(0); - -void sigintHandler(int signo) -{ - _int = 1; -} - - -void signalThread() -{ - struct sigaction act; - act.sa_handler = sigintHandler; - sigemptyset(&act.sa_mask); - act.sa_flags = 0; - if (sigaction(SIGINT, &act, 0)) - throw SysError("installing handler for SIGINT"); - - while (true) { - sleep(1000000); - if (_int) break; - } - - { - std::lock_guard lock(exitRequestMutex); - exitRequested = true; - } - exitRequest.notify_all(); -} - - typedef enum { bsSuccess = 0, bsFailed = 1, @@ -402,7 +368,7 @@ void State::queueMonitor() { auto store = openStore(); // FIXME: pool - while (!exitRequested) { + while (true) { getQueuedBuilds(store); { @@ -685,7 +651,7 @@ void State::makeRunnable(Step::ptr step) void State::dispatcher() { - while (!exitRequested) { + while (true) { printMsg(lvlError, "dispatcher woken up"); { @@ -1003,25 +969,11 @@ void State::run() auto queueMonitorThread = std::thread(&State::queueMonitor, this); - auto dispatcherThread = std::thread(&State::dispatcher, this); + std::thread(&State::dispatcher, this).detach(); - /* Wait for SIGINT. */ - { - std::unique_lock lock(exitRequestMutex); - while (!exitRequested) - exitRequest.wait(lock); - } - - printMsg(lvlError, "exiting..."); - - /* Shut down the various threads. 
*/ - { std::lock_guard lock(queueMonitorMutex); } // barrier - queueMonitorWakeup.notify_all(); queueMonitorThread.join(); - wakeDispatcher(); - dispatcherThread.join(); - + printMsg(lvlError, "exiting..."); printMsg(lvlError, format("psql connections = %1%") % dbPool.count()); } @@ -1031,15 +983,9 @@ int main(int argc, char * * argv) return handleExceptions(argv[0], [&]() { initNix(); - std::thread(signalThread).detach(); - - /* Ignore signals. This is inherited by the other threads. */ - sigset_t set; - sigemptyset(&set); - sigaddset(&set, SIGHUP); - sigaddset(&set, SIGINT); - sigaddset(&set, SIGTERM); - sigprocmask(SIG_BLOCK, &set, NULL); + signal(SIGINT, SIG_DFL); + signal(SIGTERM, SIG_DFL); + signal(SIGHUP, SIG_DFL); settings.buildVerbosity = lvlVomit; settings.useSubstitutes = false; From c08883966c55c8063725ec374de560c0242e64bd Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 11 Jun 2015 17:38:55 +0200 Subject: [PATCH 017/158] Use PostgreSQL notifications for queue events Hydra-queue-runner now no longer polls the queue periodically, but instead sleeps until it receives a notification from PostgreSQL about a change to the queue (build added, build cancelled or build restarted). Also, for the "build added" case, we now only check for builds with an ID greater than the previous greatest ID. This is much more efficient if the queue is large. --- src/hydra-queue-runner/hydra-queue-runner.cc | 56 +++++++++++++++----- src/hydra-queue-runner/pool.hh | 2 +- src/lib/Hydra/Helper/Nix.pm | 2 + src/script/hydra-evaluator | 2 + 4 files changed, 48 insertions(+), 14 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 9e5553b6..4e0b9c53 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -156,10 +156,6 @@ private: Path hydraData, logDir; - /* CV for waking up the queue. 
*/ - std::condition_variable queueMonitorWakeup; - std::mutex queueMonitorMutex; - /* The queued builds. */ typedef std::map Builds; Sync builds; @@ -214,7 +210,7 @@ public: void queueMonitor(); - void getQueuedBuilds(std::shared_ptr store); + void getQueuedBuilds(std::shared_ptr store, unsigned int & lastBuildId); Step::ptr createStep(std::shared_ptr store, const Path & drvPath, std::set & newRunnable); @@ -366,14 +362,47 @@ void State::finishBuildStep(pqxx::work & txn, time_t startTime, time_t stopTime, void State::queueMonitor() { + auto conn(dbPool.get()); + + struct receiver : public pqxx::notification_receiver + { + bool status = false; + receiver(pqxx::connection_base & c, const std::string & channel) + : pqxx::notification_receiver(c, channel) { } + void operator() (const string & payload, int pid) override + { + status = true; + }; + bool get() { + bool b = status; + status = false; + return b; + } + }; + + receiver buildsAdded(*conn, "builds_added"); + receiver buildsRestarted(*conn, "builds_restarted"); + receiver buildsCancelled(*conn, "builds_cancelled"); + auto store = openStore(); // FIXME: pool - while (true) { - getQueuedBuilds(store); + unsigned int lastBuildId = 0; - { - std::unique_lock lock(queueMonitorMutex); - queueMonitorWakeup.wait_for(lock, std::chrono::seconds(5)); + while (true) { + getQueuedBuilds(store, lastBuildId); + + /* Sleep until we get notification from the database about an + event. 
*/ + conn->await_notification(); + + if (buildsAdded.get()) + printMsg(lvlError, "got notification: new builds added to the queue"); + if (buildsRestarted.get()) { + printMsg(lvlError, "got notification: builds restarted"); + lastBuildId = 0; // check all builds + } + if (buildsCancelled.get()) { + printMsg(lvlError, "got notification: builds cancelled"); } } @@ -381,9 +410,9 @@ void State::queueMonitor() } -void State::getQueuedBuilds(std::shared_ptr store) +void State::getQueuedBuilds(std::shared_ptr store, unsigned int & lastBuildId) { - printMsg(lvlError, "checking the queue..."); + printMsg(lvlError, format("checking the queue for builds > %1%...") % lastBuildId); auto conn(dbPool.get()); @@ -396,11 +425,12 @@ void State::getQueuedBuilds(std::shared_ptr store) // FIXME: query only builds with ID higher than the previous // highest. - auto res = txn.exec("select * from Builds where finished = 0 order by id"); + auto res = txn.parameterized("select * from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec(); for (auto const & row : res) { auto builds_(builds.lock()); BuildID id = row["id"].as(); + if (id > lastBuildId) lastBuildId = id; if (has(*builds_, id)) continue; auto build = std::make_shared(); diff --git a/src/hydra-queue-runner/pool.hh b/src/hydra-queue-runner/pool.hh index 0a58ebe0..a1cd3977 100644 --- a/src/hydra-queue-runner/pool.hh +++ b/src/hydra-queue-runner/pool.hh @@ -57,7 +57,7 @@ public: if (r) state_->idle.push_back(r); } - R * operator -> () { return r; } + R * operator -> () { return r.get(); } R & operator * () { return *r; } }; diff --git a/src/lib/Hydra/Helper/Nix.pm b/src/lib/Hydra/Helper/Nix.pm index 7a68490f..9523ddaa 100644 --- a/src/lib/Hydra/Helper/Nix.pm +++ b/src/lib/Hydra/Helper/Nix.pm @@ -469,6 +469,8 @@ sub restartBuilds($$) { # FIXME: Add this to the API. # FIXME: clear the dependencies? 
$db->resultset('FailedPaths')->search({ path => [ @paths ]})->delete; + + $db->storage->dbh->do("notify builds_restarted"); }); return scalar(@buildIds); diff --git a/src/script/hydra-evaluator b/src/script/hydra-evaluator index 9d463122..bcdb948e 100755 --- a/src/script/hydra-evaluator +++ b/src/script/hydra-evaluator @@ -246,6 +246,8 @@ sub checkJobsetWrapped { $jobset->update({ enabled => 0 }) if $jobset->enabled == 2; $jobset->update({ lastcheckedtime => time }); + + $db->storage->dbh->do("notify builds_added"); }); # Store the error messages for jobs that failed to evaluate. From c974fb893b32735b7e30c7244a30e41893f62800 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 11 Jun 2015 18:07:45 +0200 Subject: [PATCH 018/158] Support cancelling builds --- src/hydra-queue-runner/hydra-queue-runner.cc | 43 +++++++++++++++----- src/lib/Hydra/Helper/Nix.pm | 1 + 2 files changed, 34 insertions(+), 10 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 4e0b9c53..94d36e44 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -210,7 +210,9 @@ public: void queueMonitor(); - void getQueuedBuilds(std::shared_ptr store, unsigned int & lastBuildId); + void getQueuedBuilds(Connection & conn, std::shared_ptr store, unsigned int & lastBuildId); + + void removeCancelledBuilds(Connection & conn); Step::ptr createStep(std::shared_ptr store, const Path & drvPath, std::set & newRunnable); @@ -389,7 +391,7 @@ void State::queueMonitor() unsigned int lastBuildId = 0; while (true) { - getQueuedBuilds(store, lastBuildId); + getQueuedBuilds(*conn, store, lastBuildId); /* Sleep until we get notification from the database about an event. 
*/ @@ -403,6 +405,7 @@ void State::queueMonitor() } if (buildsCancelled.get()) { printMsg(lvlError, "got notification: builds cancelled"); + removeCancelledBuilds(*conn); } } @@ -410,21 +413,17 @@ void State::queueMonitor() } -void State::getQueuedBuilds(std::shared_ptr store, unsigned int & lastBuildId) +void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, unsigned int & lastBuildId) { printMsg(lvlError, format("checking the queue for builds > %1%...") % lastBuildId); - auto conn(dbPool.get()); - /* Grab the queued builds from the database, but don't process them yet (since we don't want a long-running transaction). */ std::list newBuilds; // FIXME: use queue { - pqxx::work txn(*conn); + pqxx::work txn(conn); - // FIXME: query only builds with ID higher than the previous - // highest. auto res = txn.parameterized("select * from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec(); for (auto const & row : res) { @@ -454,7 +453,7 @@ void State::getQueuedBuilds(std::shared_ptr store, unsigned int & last if (!store->isValidPath(build->drvPath)) { /* Derivation has been GC'ed prematurely. */ printMsg(lvlInfo, format("aborting GC'ed build %1%") % build->id); - pqxx::work txn(*conn); + pqxx::work txn(conn); txn.parameterized ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1") (build->id) @@ -476,7 +475,7 @@ void State::getQueuedBuilds(std::shared_ptr store, unsigned int & last printMsg(lvlInfo, format("cached build %1%") % build->id); - pqxx::work txn(*conn); + pqxx::work txn(conn); time_t now = time(0); markSucceededBuild(txn, build, res, true, now, now); txn.commit(); @@ -511,6 +510,30 @@ void State::getQueuedBuilds(std::shared_ptr store, unsigned int & last } +void State::removeCancelledBuilds(Connection & conn) +{ + /* Get the current set of queued builds. 
*/ + std::set currentIds; + { + pqxx::work txn(conn); + auto res = txn.exec("select id from Builds where finished = 0"); + for (auto const & row : res) + currentIds.insert(row["id"].as()); + } + + auto builds_(builds.lock()); + + for (auto i = builds_->begin(); i != builds_->end(); ) { + if (currentIds.find(i->first) == currentIds.end()) { + printMsg(lvlInfo, format("discarding cancelled build %1%") % i->first); + i = builds_->erase(i); + // FIXME: ideally we would interrupt active build steps here. + } else + ++i; + } +} + + Step::ptr State::createStep(std::shared_ptr store, const Path & drvPath, std::set & newRunnable) { diff --git a/src/lib/Hydra/Helper/Nix.pm b/src/lib/Hydra/Helper/Nix.pm index 9523ddaa..c54c8f10 100644 --- a/src/lib/Hydra/Helper/Nix.pm +++ b/src/lib/Hydra/Helper/Nix.pm @@ -433,6 +433,7 @@ sub cancelBuilds($$) { , starttime => $time , stoptime => $time }); + $db->storage->dbh->do("notify builds_cancelled"); return $n; }); } From f9cd5adae8b3269e1e843ca7c04ec72c0f68f5c9 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 11 Jun 2015 18:09:50 +0200 Subject: [PATCH 019/158] Queue monitor: Get only the fields we need --- src/hydra-queue-runner/hydra-queue-runner.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 94d36e44..0acc2084 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -424,7 +424,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, { pqxx::work txn(conn); - auto res = txn.parameterized("select * from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec(); + auto res = txn.parameterized("select id, project, jobset, job, drvPath from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec(); for (auto const & row : res) { auto builds_(builds.lock()); From bf87d3a6ed7bb68300e607f989d5f057333bac9d Mon Sep 17 
00:00:00 2001 From: Eelco Dolstra Date: Mon, 15 Jun 2015 14:51:38 +0200 Subject: [PATCH 020/158] Use stable Nix --- release.nix | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/release.nix b/release.nix index 3d273370..e757dfe9 100644 --- a/release.nix +++ b/release.nix @@ -37,7 +37,7 @@ in rec { version = builtins.readFile ./version; buildInputs = - [ perl libxslt dblatex tetex nukeReferences pkgconfig nixUnstable git openssl ]; + [ perl libxslt dblatex tetex nukeReferences pkgconfig nix git openssl ]; versionSuffix = if officialRelease then "" else "pre${toString hydraSrc.revCount}-${hydraSrc.gitTag}"; @@ -71,7 +71,7 @@ in rec { let - nix = nixUnstable; + #nix = nixUnstable; perlDeps = buildEnv { name = "hydra-perl-deps"; From 541fbd62cc5d095f86bbcb65f2e48def0a1d0eae Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 15 Jun 2015 14:51:49 +0200 Subject: [PATCH 021/158] Immediately abort builds that require an unsupported system type --- src/hydra-queue-runner/hydra-queue-runner.cc | 38 ++++++++++++++++++-- 1 file changed, 35 insertions(+), 3 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 0acc2084..c1bfe8ae 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -128,6 +128,13 @@ struct Machine auto currentJobs_(currentJobs.lock()); *currentJobs_ = 0; } + + bool supportsStep(Step::ptr step) + { + if (systemTypes.find(step->drv.platform) == systemTypes.end()) return false; + // FIXME: check features + return true; + } }; @@ -452,7 +459,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, if (!store->isValidPath(build->drvPath)) { /* Derivation has been GC'ed prematurely. 
*/ - printMsg(lvlInfo, format("aborting GC'ed build %1%") % build->id); + printMsg(lvlError, format("aborting GC'ed build %1%") % build->id); pqxx::work txn(conn); txn.parameterized ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1") @@ -483,6 +490,32 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, continue; } + /* If any step has an unsupported system type, then fail the + build. */ + bool allSupported = true; + for (auto & r : newRunnable) { + bool supported = false; + { + auto machines_(machines.lock()); // FIXME: use shared_mutex + for (auto & m : *machines_) + if (m->supportsStep(r)) { supported = true; break; } + } + if (!supported) { allSupported = false; break; } + } + + if (!allSupported) { + printMsg(lvlError, format("aborting unsupported build %1%") % build->id); + pqxx::work txn(conn); + txn.parameterized + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1") + (build->id) + ((int) bsAborted) + (time(0)) + ("unsupported system type").exec(); + txn.commit(); + continue; + } + /* Note: if we exit this scope prior to this, the build and all newly created steps are destroyed. 
*/ @@ -763,8 +796,7 @@ MachineReservation::ptr State::findMachine(Step::ptr step) auto machines_(machines.lock()); for (auto & machine : *machines_) { - if (!has(machine->systemTypes, step->drv.platform)) continue; - // FIXME: check features + if (!machine->supportsStep(step)) continue; { auto currentJobs_(machine->currentJobs.lock()); if (*currentJobs_ >= machine->maxJobs) continue; From 5019fceb20532f6a5554e242179c0e83638d570b Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 15 Jun 2015 15:07:04 +0200 Subject: [PATCH 022/158] Add a error type for "unsupported system type" --- src/hydra-queue-runner/hydra-queue-runner.cc | 32 +++++++++++--------- src/root/build.tt | 2 ++ src/root/common.tt | 4 ++- src/sql/hydra.sql | 2 ++ 4 files changed, 25 insertions(+), 15 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index c1bfe8ae..e6677512 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -33,6 +33,7 @@ typedef enum { bsDepFailed = 2, bsAborted = 3, bsFailedWithOutput = 6, + bsUnsupported = 9, } BuildStatus; @@ -40,6 +41,7 @@ typedef enum { bssSuccess = 0, bssFailed = 1, bssAborted = 4, + bssUnsupported = 9, bssBusy = 100, // not stored } BuildStepStatus; @@ -343,7 +345,7 @@ int State::createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, (propagatedFrom, propagatedFrom != 0) (errorMsg, errorMsg != "") (startTime, status != bssBusy) - (machine, machine != "").exec(); + (machine).exec(); for (auto & output : step->drv.outputs) txn.parameterized @@ -500,21 +502,23 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, for (auto & m : *machines_) if (m->supportsStep(r)) { supported = true; break; } } - if (!supported) { allSupported = false; break; } + if (!supported) { + allSupported = false; + printMsg(lvlError, format("aborting unsupported build %1%") % build->id); + pqxx::work txn(conn); + time_t now = 
time(0); + txn.parameterized + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3 where id = $1") + (build->id) + ((int) bsUnsupported) + (now).exec(); + createBuildStep(txn, now, build, r, "", bssUnsupported); + txn.commit(); + break; + } } - if (!allSupported) { - printMsg(lvlError, format("aborting unsupported build %1%") % build->id); - pqxx::work txn(conn); - txn.parameterized - ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1") - (build->id) - ((int) bsAborted) - (time(0)) - ("unsupported system type").exec(); - txn.commit(); - continue; - } + if (!allSupported) continue; /* Note: if we exit this scope prior to this, the build and all newly created steps are destroyed. */ diff --git a/src/root/build.tt b/src/root/build.tt index 3923715a..dc689cdd 100644 --- a/src/root/build.tt +++ b/src/root/build.tt @@ -56,6 +56,8 @@ Timed out [% ELSIF step.status == 8 %] Cached failure + [% ELSIF step.status == 9 %] + Unsupported system type [% ELSIF step.errormsg %] Failed: [% HTML.escape(step.errormsg) %] [% ELSE %] diff --git a/src/root/common.tt b/src/root/common.tt index 8ebdbd47..f4425772 100644 --- a/src/root/common.tt +++ b/src/root/common.tt @@ -198,7 +198,7 @@ BLOCK renderBuildStatusIcon; Failed [% ELSIF buildstatus == 2 || buildstatus == 5 %] Dependency failed - [% ELSIF buildstatus == 3 %] + [% ELSIF buildstatus == 3 || buildstatus == 9 %] Aborted [% ELSIF buildstatus == 4 %] Cancelled @@ -229,6 +229,8 @@ BLOCK renderStatus; Cancelled by user [% ELSIF buildstatus == 6 %] Build failed (with result) + [% ELSIF buildstatus == 9 %] + Unsupported system type [% ELSE %] Aborted (Hydra failure; see below) diff --git a/src/sql/hydra.sql b/src/sql/hydra.sql index 83178c25..cfeea893 100644 --- a/src/sql/hydra.sql +++ b/src/sql/hydra.sql @@ -180,6 +180,7 @@ create table Builds ( -- 4 = build cancelled (removed from queue; never built) -- 5 = build not done 
because a dependency failed previously (obsolete) -- 6 = failure with output + -- 9 = unsupported system type buildStatus integer, errorMsg text, -- error message in case of a Nix failure @@ -227,6 +228,7 @@ create table BuildSteps ( -- 4 = aborted -- 7 = timed out -- 8 = cached failure + -- 9 = unsupported system type status integer, errorMsg text, From c00bf7cd1aee6821734a755f1110e545c218fcc1 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 15 Jun 2015 15:13:03 +0200 Subject: [PATCH 023/158] Check non-runnable steps for unsupported system type --- src/hydra-queue-runner/hydra-queue-runner.cc | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index e6677512..123f9e19 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -224,7 +224,7 @@ public: void removeCancelledBuilds(Connection & conn); Step::ptr createStep(std::shared_ptr store, const Path & drvPath, - std::set & newRunnable); + std::set & newSteps, std::set & newRunnable); void destroyStep(Step::ptr step, bool proceed); @@ -473,8 +473,8 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, continue; } - std::set newRunnable; - Step::ptr step = createStep(store, build->drvPath, newRunnable); + std::set newSteps, newRunnable; + Step::ptr step = createStep(store, build->drvPath, newSteps, newRunnable); /* If we didn't get a step, it means the step's outputs are all valid. So we mark this as a finished, cached build. */ @@ -495,7 +495,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, /* If any step has an unsupported system type, then fail the build. 
*/ bool allSupported = true; - for (auto & r : newRunnable) { + for (auto & r : newSteps) { bool supported = false; { auto machines_(machines.lock()); // FIXME: use shared_mutex @@ -531,8 +531,8 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, build->toplevel = step; } - printMsg(lvlInfo, format("added build %1% (top-level step %2%, %3% new runnable steps)") - % build->id % step->drvPath % newRunnable.size()); + printMsg(lvlInfo, format("added build %1% (top-level step %2%, %3% new steps, %4% new runnable steps)") + % build->id % step->drvPath % newSteps.size() % newRunnable.size()); /* Prior to this, the build is not visible to getDependentBuilds(). Now it is, so the build can be @@ -572,7 +572,7 @@ void State::removeCancelledBuilds(Connection & conn) Step::ptr State::createStep(std::shared_ptr store, const Path & drvPath, - std::set & newRunnable) + std::set & newSteps, std::set & newRunnable) { /* Check if the requested step already exists. */ { @@ -592,6 +592,7 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat auto step = std::make_shared(); step->drvPath = drvPath; step->drv = readDerivation(drvPath); + newSteps.insert(step); /* Are all outputs valid? */ bool valid = true; @@ -611,7 +612,7 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat /* Create steps for the dependencies. 
*/ bool hasDeps = false; for (auto & i : step->drv.inputDrvs) { - Step::ptr dep = createStep(store, i.first, newRunnable); + Step::ptr dep = createStep(store, i.first, newSteps, newRunnable); if (dep) { hasDeps = true; auto step_(step->state.lock()); From 21aaa0596b25bb5b49a671ead37b8c0850fe7b43 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 15 Jun 2015 15:31:42 +0200 Subject: [PATCH 024/158] Fail builds with previously failed steps early --- src/hydra-queue-runner/hydra-queue-runner.cc | 59 +++++++++++++------- 1 file changed, 40 insertions(+), 19 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 123f9e19..632b11c0 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -248,6 +248,8 @@ public: void markSucceededBuild(pqxx::work & txn, Build::ptr build, const BuildResult & res, bool isCachedBuild, time_t startTime, time_t stopTime); + bool checkCachedFailure(Step::ptr step, Connection & conn); + void run(); }; @@ -492,33 +494,50 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, continue; } - /* If any step has an unsupported system type, then fail the - build. */ - bool allSupported = true; + /* If any step has an unsupported system type or has a + previously failed output path, then fail the build right + away. 
*/ + bool badStep = false; for (auto & r : newSteps) { + BuildStatus buildStatus = bsSuccess; + BuildStepStatus buildStepStatus; + bool supported = false; { auto machines_(machines.lock()); // FIXME: use shared_mutex for (auto & m : *machines_) if (m->supportsStep(r)) { supported = true; break; } } + if (!supported) { - allSupported = false; printMsg(lvlError, format("aborting unsupported build %1%") % build->id); - pqxx::work txn(conn); + buildStatus = bsUnsupported; + buildStepStatus = bssUnsupported; + } + + if (checkCachedFailure(r, conn)) { + printMsg(lvlError, format("failing build %1% due to previous failure") % build->id); + buildStatus = step == r ? bsFailed : bsFailed; + buildStepStatus = bssFailed; + } + + if (buildStatus != bsSuccess) { time_t now = time(0); + pqxx::work txn(conn); txn.parameterized - ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3 where id = $1") + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = $4 where id = $1") (build->id) - ((int) bsUnsupported) - (now).exec(); - createBuildStep(txn, now, build, r, "", bssUnsupported); + ((int) buildStatus) + (now) + (buildStatus != bsUnsupported ? 1 : 0).exec(); + createBuildStep(txn, now, build, r, "", buildStepStatus); txn.commit(); + badStep = true; break; } } - if (!allSupported) continue; + if (badStep) continue; /* Note: if we exit this scope prior to this, the build and all newly created steps are destroyed. */ @@ -881,15 +900,7 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, /* If any of the outputs have previously failed, then don't retry. 
*/ - bool cachedFailure = false; - { - pqxx::work txn(*conn); - for (auto & path : outputPaths(step->drv)) - if (!txn.parameterized("select 1 from FailedPaths where path = $1")(path).exec().empty()) { - cachedFailure = true; - break; - } - } + bool cachedFailure = checkCachedFailure(step, *conn); if (cachedFailure) result.status = RemoteResult::rrPermanentFailure; @@ -1051,6 +1062,16 @@ void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, } +bool State::checkCachedFailure(Step::ptr step, Connection & conn) +{ + pqxx::work txn(conn); + for (auto & path : outputPaths(step->drv)) + if (!txn.parameterized("select 1 from FailedPaths where path = $1")(path).exec().empty()) + return true; + return false; +} + + void State::run() { clearBusy(0); From 508ab7f8a2bd62a9c37dd791ec7dd2321980162d Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 15 Jun 2015 15:48:05 +0200 Subject: [PATCH 025/158] Tweak build steps --- src/hydra-queue-runner/hydra-queue-runner.cc | 10 +++++---- src/root/build.tt | 22 ++++++++++++-------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 632b11c0..6f6d2219 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -342,11 +342,13 @@ int State::createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, txn.parameterized ("insert into BuildSteps (build, stepnr, type, drvPath, busy, startTime, system, status, propagatedFrom, errorMsg, stopTime, machine) values ($1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12)") - (build->id)(stepNr)(0)(step->drvPath)(status == bssBusy ? 1 : 0)(startTime)(step->drv.platform) + (build->id)(stepNr)(0)(step->drvPath)(status == bssBusy ? 
1 : 0) + (startTime, startTime != 0) + (step->drv.platform) ((int) status, status != bssBusy) (propagatedFrom, propagatedFrom != 0) (errorMsg, errorMsg != "") - (startTime, status != bssBusy) + (startTime, startTime != 0 && status != bssBusy) (machine).exec(); for (auto & output : step->drv.outputs) @@ -530,7 +532,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, ((int) buildStatus) (now) (buildStatus != bsUnsupported ? 1 : 0).exec(); - createBuildStep(txn, now, build, r, "", buildStepStatus); + createBuildStep(txn, 0, build, r, "", buildStepStatus); txn.commit(); badStep = true; break; @@ -980,7 +982,7 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, on this. */ for (auto build2 : dependents) { if (build == build2) continue; - createBuildStep(txn, result.stopTime, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id); + createBuildStep(txn, 0, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id); } finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg); diff --git a/src/root/build.tt b/src/root/build.tt index dc689cdd..8da24e1b 100644 --- a/src/root/build.tt +++ b/src/root/build.tt @@ -36,7 +36,7 @@ IF step.stoptime; INCLUDE renderDuration duration = step.stoptime - step.starttime; ELSE; - %]?[% + %]n/a[% END; ELSIF build.finished; INCLUDE renderDuration duration = build.stoptime - step.starttime; @@ -186,13 +186,17 @@ [% IF cachedBuild; INCLUDE renderFullBuildLink build=cachedBuild; ELSE %]unknown[% END %] [% END %] - [% IF !isAggregate && build.finished %] - - Duration: - [% actualBuild = build.iscachedbuild ? cachedBuild : build; - INCLUDE renderDuration duration = actualBuild.stoptime - actualBuild.starttime %]; - finished at [% INCLUDE renderDateTime timestamp = actualBuild.stoptime %] - + [% IF !isAggregate && build.finished; actualBuild = build.iscachedbuild ? 
cachedBuild : build %] + [% IF actualBuild %] + + Duration: + [% INCLUDE renderDuration duration = actualBuild.stoptime - actualBuild.starttime %] + + [% END %] + + Finished at: + [% INCLUDE renderDateTime timestamp = build.stoptime; %] + [% END %] [% IF !isAggregate && buildLogExists(build) %] @@ -298,7 +302,7 @@ - + [% IF build.finished && !build.iscachedbuild %] From 147eb4fd15f439b8f14b84df285e26cf5b7108ac Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 15 Jun 2015 16:33:50 +0200 Subject: [PATCH 026/158] Support requiredSystemFeatures --- src/hydra-queue-runner/hydra-queue-runner.cc | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 6f6d2219..8e21b3ba 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -88,6 +88,7 @@ struct Step Path drvPath; Derivation drv; + std::set requiredSystemFeatures; struct State { @@ -134,7 +135,10 @@ struct Machine bool supportsStep(Step::ptr step) { if (systemTypes.find(step->drv.platform) == systemTypes.end()) return false; - // FIXME: check features + for (auto & f : mandatoryFeatures) + if (step->requiredSystemFeatures.find(f) == step->requiredSystemFeatures.end()) return false; + for (auto & f : step->requiredSystemFeatures) + if (supportedFeatures.find(f) == supportedFeatures.end()) return false; return true; } }; @@ -299,6 +303,8 @@ void State::loadMachines() machine->speedFactor = atof(tokens[4].c_str()); machine->supportedFeatures = tokenizeString(tokens[5], ","); machine->mandatoryFeatures = tokenizeString(tokens[6], ","); + for (auto & f : machine->mandatoryFeatures) + machine->supportedFeatures.insert(f); newMachines.push_back(machine); } @@ -502,7 +508,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, bool badStep = false; for (auto & r : newSteps) { BuildStatus buildStatus = bsSuccess; - 
BuildStepStatus buildStepStatus; + BuildStepStatus buildStepStatus = bssFailed; bool supported = false; { @@ -613,6 +619,11 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat auto step = std::make_shared(); step->drvPath = drvPath; step->drv = readDerivation(drvPath); + { + auto i = step->drv.env.find("requiredSystemFeatures"); + if (i != step->drv.env.end()) + step->requiredSystemFeatures = tokenizeString>(i->second); + } newSteps.insert(step); /* Are all outputs valid? */ From dd104f14ea74dd631528e7ade72f08ad30599c14 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 15 Jun 2015 16:54:52 +0200 Subject: [PATCH 027/158] Make the queue monitor more robust, and better debug output --- src/hydra-queue-runner/hydra-queue-runner.cc | 68 +++++++++++--------- 1 file changed, 39 insertions(+), 29 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 8e21b3ba..d1def645 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -76,7 +76,7 @@ struct Build ~Build() { - printMsg(lvlError, format("destroying build %1%") % id); + printMsg(lvlDebug, format("destroying build %1%") % id); } }; @@ -110,7 +110,7 @@ struct Step ~Step() { - printMsg(lvlError, format("destroying step %1%") % drvPath); + printMsg(lvlDebug, format("destroying step %1%") % drvPath); } }; @@ -223,6 +223,8 @@ public: void queueMonitor(); + void queueMonitorLoop(); + void getQueuedBuilds(Connection & conn, std::shared_ptr store, unsigned int & lastBuildId); void removeCancelledBuilds(Connection & conn); @@ -270,7 +272,7 @@ State::State() State::~State() { try { - printMsg(lvlError, "clearing active builds / build steps..."); + printMsg(lvlInfo, "clearing active builds / build steps..."); clearBusy(time(0)); } catch (...) 
{ ignoreException(); @@ -382,6 +384,19 @@ void State::finishBuildStep(pqxx::work & txn, time_t startTime, time_t stopTime, void State::queueMonitor() +{ + while (true) { + try { + queueMonitorLoop(); + } catch (std::exception & e) { + printMsg(lvlError, format("queue monitor: %1%") % e.what()); + sleep(10); // probably a DB problem, so don't retry right away + } + } +} + + +void State::queueMonitorLoop() { auto conn(dbPool.get()); @@ -417,24 +432,22 @@ void State::queueMonitor() conn->await_notification(); if (buildsAdded.get()) - printMsg(lvlError, "got notification: new builds added to the queue"); + printMsg(lvlTalkative, "got notification: new builds added to the queue"); if (buildsRestarted.get()) { - printMsg(lvlError, "got notification: builds restarted"); + printMsg(lvlTalkative, "got notification: builds restarted"); lastBuildId = 0; // check all builds } if (buildsCancelled.get()) { - printMsg(lvlError, "got notification: builds cancelled"); + printMsg(lvlTalkative, "got notification: builds cancelled"); removeCancelledBuilds(*conn); } } - - printMsg(lvlError, "queue monitor exits"); } void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, unsigned int & lastBuildId) { - printMsg(lvlError, format("checking the queue for builds > %1%...") % lastBuildId); + printMsg(lvlInfo, format("checking the queue for builds > %1%...") % lastBuildId); /* Grab the queued builds from the database, but don't process them yet (since we don't want a long-running transaction). */ @@ -467,7 +480,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, // FIXME: remove build from newBuilds to ensure quick destruction // FIXME: exception handling - printMsg(lvlInfo, format("loading build %1% (%2%)") % build->id % build->fullJobName); + printMsg(lvlTalkative, format("loading build %1% (%2%)") % build->id % build->fullJobName); if (!store->isValidPath(build->drvPath)) { /* Derivation has been GC'ed prematurely. 
*/ @@ -492,7 +505,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, Derivation drv = readDerivation(build->drvPath); BuildResult res = getBuildResult(store, drv); - printMsg(lvlInfo, format("cached build %1%") % build->id); + printMsg(lvlInfo, format("marking build %1% as cached successful") % build->id); pqxx::work txn(conn); time_t now = time(0); @@ -524,7 +537,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, } if (checkCachedFailure(r, conn)) { - printMsg(lvlError, format("failing build %1% due to previous failure") % build->id); + printMsg(lvlError, format("marking build %1% as cached failure") % build->id); buildStatus = step == r ? bsFailed : bsFailed; buildStepStatus = bssFailed; } @@ -558,7 +571,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, build->toplevel = step; } - printMsg(lvlInfo, format("added build %1% (top-level step %2%, %3% new steps, %4% new runnable steps)") + printMsg(lvlChatty, format("added build %1% (top-level step %2%, %3% new steps, %4% new runnable steps)") % build->id % step->drvPath % newSteps.size() % newRunnable.size()); /* Prior to this, the build is not visible to @@ -614,7 +627,7 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat } } - printMsg(lvlInfo, format("considering derivation ‘%1%’") % drvPath); + printMsg(lvlDebug, format("considering derivation ‘%1%’") % drvPath); auto step = std::make_shared(); step->drvPath = drvPath; @@ -639,7 +652,7 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat if (valid) return 0; /* No, we need to build. */ - printMsg(lvlInfo, format("creating build step ‘%1%’") % drvPath); + printMsg(lvlDebug, format("creating build step ‘%1%’") % drvPath); /* Create steps for the dependencies. 
*/ bool hasDeps = false; @@ -671,7 +684,7 @@ void State::destroyStep(Step::ptr step, bool proceed) if (step->destroyed) return; step->destroyed = true; - printMsg(lvlInfo, format("destroying build step ‘%1%’") % step->drvPath); + printMsg(lvlDebug, format("destroying build step ‘%1%’") % step->drvPath); { auto steps_(steps.lock()); @@ -756,7 +769,7 @@ std::set State::getDependentBuilds(Step::ptr step) void State::makeRunnable(Step::ptr step) { - printMsg(lvlInfo, format("step ‘%1%’ is now runnable") % step->drvPath); + printMsg(lvlChatty, format("step ‘%1%’ is now runnable") % step->drvPath); { auto step_(step->state.lock()); @@ -775,11 +788,11 @@ void State::makeRunnable(Step::ptr step) void State::dispatcher() { while (true) { - printMsg(lvlError, "dispatcher woken up"); + printMsg(lvlDebug, "dispatcher woken up"); { auto runnable_(runnable.lock()); - printMsg(lvlError, format("%1% runnable builds") % runnable_->size()); + printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size()); /* FIXME: we're holding the runnable lock too long here. This could be more efficient. 
*/ @@ -795,13 +808,11 @@ void State::dispatcher() auto reservation = findMachine(step); if (!reservation) { - printMsg(lvlError, format("cannot execute step ‘%1%’ right now") % step->drvPath); + printMsg(lvlDebug, format("cannot execute step ‘%1%’ right now") % step->drvPath); ++i; continue; } - printMsg(lvlInfo, format("WOOHOO: starting step ‘%1%’ on machine ‘%2%’") - % step->drvPath % reservation->machine->sshName); i = runnable_->erase(i); auto builderThread = std::thread(&State::builder, this, step, reservation); @@ -863,8 +874,6 @@ void State::builder(Step::ptr step, MachineReservation::ptr reservation) assert(reservation.unique()); reservation = 0; wakeDispatcher(); - - printMsg(lvlError, "builder exits"); } @@ -900,7 +909,8 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, if (!build) build = *dependents.begin(); - printMsg(lvlInfo, format("performing build step ‘%1%’ (needed by %2% builds)") % step->drvPath % dependents.size()); + printMsg(lvlInfo, format("performing step ‘%1%’ on ‘%2%’ (needed by %3% builds)") + % step->drvPath % machine->sshName % dependents.size()); } auto conn(dbPool.get()); @@ -934,7 +944,7 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, result.status = RemoteResult::rrMiscFailure; result.errorMsg = e.msg(); printMsg(lvlError, format("ERROR: %1%") % e.msg()); - abort(); + abort(); // FIXME } if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv); @@ -1042,7 +1052,7 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, const BuildResult & res, bool isCachedBuild, time_t startTime, time_t stopTime) { - printMsg(lvlError, format("marking build %1% as succeeded") % build->id); + printMsg(lvlInfo, format("marking build %1% as succeeded") % build->id); txn.parameterized ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, size = $5, closureSize = $6, releaseName 
= $7, isCachedBuild = $8 where id = $1") @@ -1097,8 +1107,8 @@ void State::run() queueMonitorThread.join(); - printMsg(lvlError, "exiting..."); - printMsg(lvlError, format("psql connections = %1%") % dbPool.count()); + //printMsg(lvlInfo, "exiting..."); + //printMsg(lvlInfo, format("psql connections = %1%") % dbPool.count()); } From 42e7301c08cf591ed24aee6f9c2710b35d711461 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 15 Jun 2015 18:20:14 +0200 Subject: [PATCH 028/158] Add status dump facility Doing $ psql hydra -c 'notify dump_status' will cause hydra-queue-runner to dump some internal status info on stderr. --- src/hydra-queue-runner/hydra-queue-runner.cc | 47 ++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index d1def645..1204ed0b 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -202,6 +202,10 @@ private: worth it. */ // std::vector builderThreads; + + /* Various stats. */ + std::atomic nrQueueWakeups; + public: State(); @@ -256,12 +260,16 @@ public: bool checkCachedFailure(Step::ptr step, Connection & conn); + void dumpStatus(); + void run(); }; State::State() { + nrQueueWakeups = 0; + hydraData = getEnv("HYDRA_DATA"); if (hydraData == "") throw Error("$HYDRA_DATA must be set"); @@ -419,6 +427,7 @@ void State::queueMonitorLoop() receiver buildsAdded(*conn, "builds_added"); receiver buildsRestarted(*conn, "builds_restarted"); receiver buildsCancelled(*conn, "builds_cancelled"); + receiver dumpStatus(*conn, "dump_status"); auto store = openStore(); // FIXME: pool @@ -430,6 +439,7 @@ void State::queueMonitorLoop() /* Sleep until we get notification from the database about an event. 
*/ conn->await_notification(); + nrQueueWakeups++; if (buildsAdded.get()) printMsg(lvlTalkative, "got notification: new builds added to the queue"); @@ -441,6 +451,9 @@ void State::queueMonitorLoop() printMsg(lvlTalkative, "got notification: builds cancelled"); removeCancelledBuilds(*conn); } + + if (dumpStatus.get()) + State::dumpStatus(); } } @@ -1095,6 +1108,36 @@ bool State::checkCachedFailure(Step::ptr step, Connection & conn) } +void State::dumpStatus() +{ + { + auto builds_(builds.lock()); + printMsg(lvlError, format("%1% queued builds") % builds_->size()); + } + { + auto steps_(steps.lock()); + for (auto i = steps_->begin(); i != steps_->end(); ) + if (i->second.lock()) ++i; else i = steps_->erase(i); + printMsg(lvlError, format("%1% pending/active build steps") % steps_->size()); + } + { + auto runnable_(runnable.lock()); + for (auto i = runnable_->begin(); i != runnable_->end(); ) + if (i->lock()) ++i; else i = runnable_->erase(i); + printMsg(lvlError, format("%1% runnable build steps") % runnable_->size()); + } + printMsg(lvlError, format("%1% times woken up to check the queue") % nrQueueWakeups); + { + auto machines_(machines.lock()); + for (auto & m : *machines_) { + auto currentJobs_(m->currentJobs.lock()); + printMsg(lvlError, format("machine %1%: %2%/%3% active") + % m->sshName % *currentJobs_ % m->maxJobs); + } + } +} + + void State::run() { clearBusy(0); @@ -1117,6 +1160,10 @@ int main(int argc, char * * argv) return handleExceptions(argv[0], [&]() { initNix(); + parseCmdLine(argc, argv, [&](Strings::iterator & arg, const Strings::iterator & end) { + return false; + }); + signal(SIGINT, SIG_DFL); signal(SIGTERM, SIG_DFL); signal(SIGHUP, SIG_DFL); From e02654b3a02659b364b2e89c2f48e1c368729aa7 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 16 Jun 2015 18:00:39 +0200 Subject: [PATCH 029/158] Prefer cached failure over unsupported system type --- src/hydra-queue-runner/hydra-queue-runner.cc | 34 +++++++++----------- 1 file changed, 15 
insertions(+), 19 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 1204ed0b..d59bef1e 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -197,12 +197,6 @@ private: typedef std::list Machines; Sync machines; - /* The currently active builder threads. FIXME: We could re-use - these, but since they're fairly long-running, it's probably not - worth it. */ - // std::vector builderThreads; - - /* Various stats. */ std::atomic nrQueueWakeups; @@ -536,25 +530,27 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, BuildStatus buildStatus = bsSuccess; BuildStepStatus buildStepStatus = bssFailed; - bool supported = false; - { - auto machines_(machines.lock()); // FIXME: use shared_mutex - for (auto & m : *machines_) - if (m->supportsStep(r)) { supported = true; break; } - } - - if (!supported) { - printMsg(lvlError, format("aborting unsupported build %1%") % build->id); - buildStatus = bsUnsupported; - buildStepStatus = bssUnsupported; - } - if (checkCachedFailure(r, conn)) { printMsg(lvlError, format("marking build %1% as cached failure") % build->id); buildStatus = step == r ? 
bsFailed : bsFailed; buildStepStatus = bssFailed; } + if (buildStatus == bsSuccess) { + bool supported = false; + { + auto machines_(machines.lock()); // FIXME: use shared_mutex + for (auto & m : *machines_) + if (m->supportsStep(r)) { supported = true; break; } + } + + if (!supported) { + printMsg(lvlError, format("aborting unsupported build %1%") % build->id); + buildStatus = bsUnsupported; + buildStepStatus = bssUnsupported; + } + } + if (buildStatus != bsSuccess) { time_t now = time(0); pqxx::work txn(conn); From b91a616520e3afe3b5c1b1c1a397599ea7aaeb9a Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 17 Jun 2015 11:45:20 +0200 Subject: [PATCH 030/158] Automatically retry aborted builds Aborted builds are now put back on the runnable queue and retried after a certain time interval (currently 60 seconds for the first retry, then tripled on each subsequent retry). --- src/hydra-queue-runner/build-remote.cc | 31 ++-- src/hydra-queue-runner/hydra-queue-runner.cc | 172 ++++++++++++++----- src/root/build.tt | 2 +- src/sql/hydra.sql | 1 + 4 files changed, 145 insertions(+), 61 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index f3d153e4..6fddaf82 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -84,7 +84,7 @@ static void copyClosureTo(std::shared_ptr store, for (auto i = sorted.rbegin(); i != sorted.rend(); ++i) if (present.find(*i) == present.end()) missing.push_back(*i); - printMsg(lvlError, format("sending %1% missing paths") % missing.size()); + printMsg(lvlDebug, format("sending %1% missing paths") % missing.size()); writeInt(cmdImportPaths, to); exportPaths(*store, missing, false, to); @@ -128,23 +128,28 @@ void buildRemote(std::shared_ptr store, FdSink to(child.to); /* Handshake. 
*/ - writeInt(SERVE_MAGIC_1, to); - writeInt(SERVE_PROTOCOL_VERSION, to); - to.flush(); + try { + writeInt(SERVE_MAGIC_1, to); + writeInt(SERVE_PROTOCOL_VERSION, to); + to.flush(); - unsigned int magic = readInt(from); - if (magic != SERVE_MAGIC_2) - throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % sshName); - unsigned int version = readInt(from); - if (GET_PROTOCOL_MAJOR(version) != 0x200) - throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % sshName); + unsigned int magic = readInt(from); + if (magic != SERVE_MAGIC_2) + throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % sshName); + unsigned int version = readInt(from); + if (GET_PROTOCOL_MAJOR(version) != 0x200) + throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % sshName); + } catch (EndOfFile & e) { + child.pid.wait(true); + throw Error(format("cannot connect to ‘%1%’: %2%") % sshName % chomp(readFile(logFile))); + } /* Copy the input closure. */ - printMsg(lvlError, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName); + printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName); copyClosureTo(store, from, to, PathSet({drvPath})); /* Do the build. */ - printMsg(lvlError, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName); + printMsg(lvlDebug, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName); writeInt(cmdBuildPaths, to); writeStrings(PathSet({drvPath}), to); writeInt(3600, to); // == maxSilentTime, FIXME @@ -162,7 +167,7 @@ void buildRemote(std::shared_ptr store, } /* Copy the output paths. 
*/ - printMsg(lvlError, format("copying outputs of ‘%1%’ from ‘%2%’") % drvPath % sshName); + printMsg(lvlDebug, format("copying outputs of ‘%1%’ from ‘%2%’") % drvPath % sshName); PathSet outputs; for (auto & output : drv.outputs) outputs.insert(output.second.path); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index d59bef1e..3503b8f2 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -4,6 +4,8 @@ #include #include #include +#include +#include #include @@ -20,6 +22,14 @@ using namespace nix; +const int maxTries = 5; +const int retryInterval = 60; // seconds +const float retryBackoff = 3.0; + + +typedef std::chrono::time_point system_time; + + template bool has(const C & c, const V & v) { @@ -100,6 +110,12 @@ struct Step /* Builds that have this step as the top-level derivation. */ std::vector builds; + + /* Number of times we've tried this step. */ + unsigned int tries = 0; + + /* Point in time after which the step can be retried. */ + system_time after; }; Sync state; @@ -108,10 +124,7 @@ struct Step Step() : destroyed(false) { } - ~Step() - { - printMsg(lvlDebug, format("destroying step %1%") % drvPath); - } + ~Step() { } }; @@ -198,7 +211,10 @@ private: Sync machines; /* Various stats. */ - std::atomic nrQueueWakeups; + std::atomic nrRetries; + std::atomic maxNrRetries; + std::atomic nrQueueWakeups; + std::atomic nrDispatcherWakeups; public: State(); @@ -246,7 +262,9 @@ public: void builder(Step::ptr step, MachineReservation::ptr reservation); - void doBuildStep(std::shared_ptr store, Step::ptr step, + /* Perform the given build step. Return true if the step is to be + retried. 
*/ + bool doBuildStep(std::shared_ptr store, Step::ptr step, Machine::ptr machine); void markSucceededBuild(pqxx::work & txn, Build::ptr build, @@ -262,7 +280,7 @@ public: State::State() { - nrQueueWakeups = 0; + nrRetries = maxNrRetries = nrQueueWakeups = nrDispatcherWakeups = 0; hydraData = getEnv("HYDRA_DATA"); if (hydraData == "") throw Error("$HYDRA_DATA must be set"); @@ -799,6 +817,8 @@ void State::dispatcher() while (true) { printMsg(lvlDebug, "dispatcher woken up"); + auto sleepUntil = system_time::max(); + { auto runnable_(runnable.lock()); printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size()); @@ -806,6 +826,8 @@ void State::dispatcher() /* FIXME: we're holding the runnable lock too long here. This could be more efficient. */ + system_time now = std::chrono::system_clock::now(); + for (auto i = runnable_->begin(); i != runnable_->end(); ) { auto step = i->lock(); @@ -815,6 +837,18 @@ void State::dispatcher() continue; } + /* Skip previously failed steps that aren't ready to + be retried. */ + { + auto step_(step->state.lock()); + if (step_->tries > 0 && step_->after > now) { + if (step_->after < sleepUntil) + sleepUntil = step_->after; + ++i; + continue; + } + } + auto reservation = findMachine(step); if (!reservation) { printMsg(lvlDebug, format("cannot execute step ‘%1%’ right now") % step->drvPath); @@ -833,7 +867,10 @@ void State::dispatcher() is added, or because a build finishes). 
*/ { std::unique_lock lock(dispatcherMutex); - dispatcherWakeup.wait(lock); + printMsg(lvlDebug, format("dispatcher sleeping for %1%s") % + std::chrono::duration_cast(sleepUntil - std::chrono::system_clock::now()).count()); + dispatcherWakeup.wait_until(lock, sleepUntil); + nrDispatcherWakeups++; } } @@ -871,22 +908,40 @@ MachineReservation::ptr State::findMachine(Step::ptr step) void State::builder(Step::ptr step, MachineReservation::ptr reservation) { + bool retry = true; + try { auto store = openStore(); // FIXME: pool - doBuildStep(store, step, reservation->machine); + retry = doBuildStep(store, step, reservation->machine); } catch (std::exception & e) { - printMsg(lvlError, format("error building ‘%1%’: %2%") % step->drvPath % e.what()); - // FIXME: put step back in runnable and retry + printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%") + % step->drvPath % reservation->machine->sshName % e.what()); } /* Release the machine and wake up the dispatcher. */ assert(reservation.unique()); reservation = 0; wakeDispatcher(); + + /* If there was a temporary failure, retry the step after an + exponentially increasing interval. 
*/ + if (retry) { + { + auto step_(step->state.lock()); + step_->tries++; + nrRetries++; + if (step_->tries > maxNrRetries) maxNrRetries = step_->tries; // yeah yeah, not atomic + int delta = retryInterval * powf(retryBackoff, step_->tries - 1); + printMsg(lvlInfo, format("will retry ‘%1%’ after %2%s") % step->drvPath % delta); + step_->after = std::chrono::system_clock::now() + std::chrono::seconds(delta); + } + + makeRunnable(step); + } } -void State::doBuildStep(std::shared_ptr store, Step::ptr step, +bool State::doBuildStep(std::shared_ptr store, Step::ptr step, Machine::ptr machine) { /* There can be any number of builds in the database that depend @@ -903,14 +958,16 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, if (dependents.empty()) { /* Apparently all builds that depend on this derivation - are gone (e.g. cancelled). So don't bother. (This is + are gone (e.g. cancelled). So don't bother. This is very unlikely to happen, because normally Steps are only kept alive by being reachable from a - Build). FIXME: what if a new Build gets a reference to - this step? */ + Build. However, it's possible that a new Build just + created a reference to this step. So to handle that + possibility, we retry this step (putting it back in + the runnable queue). If there are really no strong + pointers to the step, it will be deleted. */ printMsg(lvlInfo, format("cancelling build step ‘%1%’") % step->drvPath); - destroyStep(step, false); - return; + return true; } for (auto build2 : dependents) @@ -930,8 +987,8 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, result.startTime = time(0); - /* If any of the outputs have previously failed, then don't - retry. */ + /* If any of the outputs have previously failed, then don't bother + building again. 
*/ bool cachedFailure = checkCachedFailure(step, *conn); if (cachedFailure) @@ -952,8 +1009,8 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, } catch (Error & e) { result.status = RemoteResult::rrMiscFailure; result.errorMsg = e.msg(); - printMsg(lvlError, format("ERROR: %1%") % e.msg()); - abort(); // FIXME + printMsg(lvlError, format("irregular failure building ‘%1%’ on ‘%2%’: %3%") + % step->drvPath % machine->sshName % e.msg()); } if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv); @@ -963,13 +1020,19 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, if (!result.stopTime) result.stopTime = time(0); + bool retry = false; + if (result.status == RemoteResult::rrMiscFailure) { + auto step_(step->state.lock()); + retry = step_->tries + 1 < maxTries; + } + /* Remove this step. After this, incoming builds that depend on drvPath will either see that the output paths exist, or will create a new build step for drvPath. The latter is fine - it won't conflict with this one, because we're removing it. In any case, the set of dependent builds for ‘step’ can't increase anymore because ‘step’ is no longer visible to createStep(). */ - { + if (!retry) { auto steps_(steps.lock()); steps_->erase(step->drvPath); } @@ -1002,34 +1065,44 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, } else { /* Failure case. */ + BuildStatus buildStatus = + result.status == RemoteResult::rrPermanentFailure ? bsFailed : bsAborted; + BuildStepStatus buildStepStatus = + result.status == RemoteResult::rrPermanentFailure ? bssFailed : bssAborted; + /* For regular failures, we don't care about the error message. */ - if (result.status != RemoteResult::rrMiscFailure) result.errorMsg = ""; + if (buildStatus != bsAborted) result.errorMsg = ""; - if (!cachedFailure) { + if (!cachedFailure && !retry) { /* Create failed build steps for every build that depends on this. 
*/ for (auto build2 : dependents) { if (build == build2) continue; - createBuildStep(txn, 0, build2, step, machine->sshName, bssFailed, result.errorMsg, build->id); + createBuildStep(txn, 0, build2, step, machine->sshName, + buildStepStatus, result.errorMsg, build->id); } - finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssFailed, result.errorMsg); } + if (!cachedFailure) + finishBuildStep(txn, result.startTime, result.stopTime, build->id, + stepNr, machine->sshName, buildStepStatus, result.errorMsg); + /* Mark all builds that depend on this derivation as failed. */ - for (auto build2 : dependents) { - printMsg(lvlError, format("marking build %1% as failed") % build2->id); - txn.parameterized - ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1") - (build2->id) - ((int) (build2->drvPath == step->drvPath ? bsFailed : bsDepFailed)) - (result.startTime) - (result.stopTime) - (cachedFailure ? 1 : 0).exec(); - build2->finishedInDB = true; // FIXME: txn might fail - } + if (!retry) + for (auto build2 : dependents) { + printMsg(lvlError, format("marking build %1% as failed") % build2->id); + txn.parameterized + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1") + (build2->id) + ((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus)) + (result.startTime) + (result.stopTime) + (cachedFailure ? 1 : 0).exec(); + build2->finishedInDB = true; // FIXME: txn might fail + } /* Remember failed paths in the database so that they won't be built again. */ @@ -1045,16 +1118,20 @@ void State::doBuildStep(std::shared_ptr store, Step::ptr step, is the top-level derivation. In case of failure, destroy all dependent Build objects. Any Steps not referenced by other Builds will be destroyed as well. 
*/ - for (auto build2 : dependents) - if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) { - auto builds_(builds.lock()); - builds_->erase(build2->id); - } + if (!retry) + for (auto build2 : dependents) + if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) { + auto builds_(builds.lock()); + builds_->erase(build2->id); + } /* Remove the step from the graph. In case of success, make dependent build steps runnable if they have no other dependencies. */ - destroyStep(step, result.status == RemoteResult::rrSuccess); + if (!retry) + destroyStep(step, result.status == RemoteResult::rrSuccess); + + return retry; } @@ -1122,7 +1199,11 @@ void State::dumpStatus() if (i->lock()) ++i; else i = runnable_->erase(i); printMsg(lvlError, format("%1% runnable build steps") % runnable_->size()); } - printMsg(lvlError, format("%1% times woken up to check the queue") % nrQueueWakeups); + printMsg(lvlError, format("%1% build step retries") % nrRetries); + printMsg(lvlError, format("%1% most retries for any build step") % maxNrRetries); + printMsg(lvlError, format("%1% queue wakeups") % nrQueueWakeups); + printMsg(lvlError, format("%1% dispatcher wakeups") % nrDispatcherWakeups); + printMsg(lvlError, format("%1% database connections") % dbPool.count()); { auto machines_(machines.lock()); for (auto & m : *machines_) { @@ -1145,9 +1226,6 @@ void State::run() std::thread(&State::dispatcher, this).detach(); queueMonitorThread.join(); - - //printMsg(lvlInfo, "exiting..."); - //printMsg(lvlInfo, format("psql connections = %1%") % dbPool.count()); } diff --git a/src/root/build.tt b/src/root/build.tt index 8da24e1b..6d9f474a 100644 --- a/src/root/build.tt +++ b/src/root/build.tt @@ -51,7 +51,7 @@ [% ELSIF step.status == 0 %] Succeeded [% ELSIF step.status == 4 %] - Aborted + Aborted[% IF step.errormsg %]: [% HTML.escape(step.errormsg); END %] [% ELSIF step.status == 7 %] Timed out [% ELSIF step.status == 8 %] diff --git a/src/sql/hydra.sql 
b/src/sql/hydra.sql index cfeea893..feeb9452 100644 --- a/src/sql/hydra.sql +++ b/src/sql/hydra.sql @@ -372,6 +372,7 @@ create table CachedCVSInputs ( ); +-- FIXME: remove create table SystemTypes ( system text primary key not null, maxConcurrent integer not null default 2 From 2da4987bc2320d55b1980fc3ec4f35d9e602b49d Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 17 Jun 2015 11:48:38 +0200 Subject: [PATCH 031/158] Don't lock the CPU --- src/hydra-queue-runner/hydra-queue-runner.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 3503b8f2..489ec241 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1244,6 +1244,7 @@ int main(int argc, char * * argv) settings.buildVerbosity = lvlVomit; settings.useSubstitutes = false; + settings.lockCPU = false; /* FIXME: need some locking to prevent multiple instances of hydra-queue-runner. */ From 745efce828937934a2240e5170fdee3d63ee66af Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 17 Jun 2015 13:32:06 +0200 Subject: [PATCH 032/158] hydra-queue-runner: Implement timeouts Also, keep track of timeouts in the database as a distinct build status. 
--- src/hydra-queue-runner/build-remote.cc | 8 +++--- src/hydra-queue-runner/build-remote.hh | 3 ++- src/hydra-queue-runner/hydra-queue-runner.cc | 26 +++++++++++++++----- src/root/common.tt | 4 +++ src/sql/hydra.sql | 1 + 5 files changed, 32 insertions(+), 10 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 6fddaf82..762a1058 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -109,7 +109,8 @@ static void copyClosureFrom(std::shared_ptr store, void buildRemote(std::shared_ptr store, const string & sshName, const string & sshKey, const Path & drvPath, const Derivation & drv, - const nix::Path & logDir, RemoteResult & result) + const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout, + RemoteResult & result) { string base = baseNameOf(drvPath); Path logFile = logDir + "/" + string(base, 0, 2) + "/" + string(base, 2); @@ -152,8 +153,9 @@ void buildRemote(std::shared_ptr store, printMsg(lvlDebug, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName); writeInt(cmdBuildPaths, to); writeStrings(PathSet({drvPath}), to); - writeInt(3600, to); // == maxSilentTime, FIXME - writeInt(7200, to); // == buildTimeout, FIXME + writeInt(maxSilentTime, to); + writeInt(buildTimeout, to); + // FIXME: send maxLogSize. 
to.flush(); result.startTime = time(0); int res = readInt(from); diff --git a/src/hydra-queue-runner/build-remote.hh b/src/hydra-queue-runner/build-remote.hh index 6406bc58..99e79c8c 100644 --- a/src/hydra-queue-runner/build-remote.hh +++ b/src/hydra-queue-runner/build-remote.hh @@ -18,4 +18,5 @@ struct RemoteResult void buildRemote(std::shared_ptr store, const std::string & sshName, const std::string & sshKey, const nix::Path & drvPath, const nix::Derivation & drv, - const nix::Path & logDir, RemoteResult & result); + const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout, + RemoteResult & result); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 489ec241..192e75a9 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -43,6 +43,7 @@ typedef enum { bsDepFailed = 2, bsAborted = 3, bsFailedWithOutput = 6, + bsTimedOut = 7, bsUnsupported = 9, } BuildStatus; @@ -51,6 +52,7 @@ typedef enum { bssSuccess = 0, bssFailed = 1, bssAborted = 4, + bssTimedOut = 7, bssUnsupported = 9, bssBusy = 100, // not stored } BuildStepStatus; @@ -77,6 +79,7 @@ struct Build Path drvPath; std::map outputs; std::string fullJobName; + unsigned int maxSilentTime, buildTimeout; std::shared_ptr toplevel; @@ -481,7 +484,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, { pqxx::work txn(conn); - auto res = txn.parameterized("select id, project, jobset, job, drvPath from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec(); + auto res = txn.parameterized("select id, project, jobset, job, drvPath, maxsilent, timeout from Builds where id > $1 and finished = 0 order by id")(lastBuildId).exec(); for (auto const & row : res) { auto builds_(builds.lock()); @@ -493,6 +496,9 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, build->id = id; build->drvPath = row["drvPath"].as(); build->fullJobName = 
row["project"].as() + ":" + row["jobset"].as() + ":" + row["job"].as(); + build->maxSilentTime = row["maxsilent"].as(); + build->buildTimeout = row["timeout"].as(); + std::cerr << build->id << " " << build->buildTimeout << std::endl; newBuilds.push_back(build); } @@ -975,8 +981,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, if (!build) build = *dependents.begin(); - printMsg(lvlInfo, format("performing step ‘%1%’ on ‘%2%’ (needed by %3% builds)") - % step->drvPath % machine->sshName % dependents.size()); + printMsg(lvlInfo, format("performing step ‘%1%’ on ‘%2%’ (needed by build %3% and %4% others)") + % step->drvPath % machine->sshName % build->id % (dependents.size() - 1)); } auto conn(dbPool.get()); @@ -1005,7 +1011,9 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, } try { - buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, result); + /* FIXME: referring builds may have conflicting timeouts. */ + buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, + logDir, build->maxSilentTime, build->buildTimeout, result); } catch (Error & e) { result.status = RemoteResult::rrMiscFailure; result.errorMsg = e.msg(); @@ -1066,9 +1074,13 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, /* Failure case. */ BuildStatus buildStatus = - result.status == RemoteResult::rrPermanentFailure ? bsFailed : bsAborted; + result.status == RemoteResult::rrPermanentFailure ? bsFailed : + result.status == RemoteResult::rrTimedOut ? bsTimedOut : + bsAborted; BuildStepStatus buildStepStatus = - result.status == RemoteResult::rrPermanentFailure ? bssFailed : bssAborted; + result.status == RemoteResult::rrPermanentFailure ? bssFailed : + result.status == RemoteResult::rrTimedOut ? bssTimedOut : + bssAborted; /* For regular failures, we don't care about the error message. 
*/ @@ -1223,6 +1235,8 @@ void State::run() auto queueMonitorThread = std::thread(&State::queueMonitor, this); + sleep(5); + std::thread(&State::dispatcher, this).detach(); queueMonitorThread.join(); diff --git a/src/root/common.tt b/src/root/common.tt index f4425772..f02ed838 100644 --- a/src/root/common.tt +++ b/src/root/common.tt @@ -204,6 +204,8 @@ BLOCK renderBuildStatusIcon; Cancelled [% ELSIF buildstatus == 6 %] Failed (with result) + [% ELSIF buildstatus == 7 %] + Timed out [% ELSE %] Failed [% END; @@ -229,6 +231,8 @@ BLOCK renderStatus; Cancelled by user [% ELSIF buildstatus == 6 %] Build failed (with result) + [% ELSIF buildstatus == 7 %] + Timed out [% ELSIF buildstatus == 9 %] Unsupported system type [% ELSE %] diff --git a/src/sql/hydra.sql b/src/sql/hydra.sql index feeb9452..10f2d614 100644 --- a/src/sql/hydra.sql +++ b/src/sql/hydra.sql @@ -180,6 +180,7 @@ create table Builds ( -- 4 = build cancelled (removed from queue; never built) -- 5 = build not done because a dependency failed previously (obsolete) -- 6 = failure with output + -- 7 = timed out -- 9 = unsupported system type buildStatus integer, From c6d504edbb7d3ff808a20bcc2c87046106f1c84a Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 17 Jun 2015 13:49:18 +0200 Subject: [PATCH 033/158] Handle SSH hosts without a @ --- src/lib/Hydra/View/TT.pm | 12 +++++++++++- src/root/machine-status.tt | 4 ++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/lib/Hydra/View/TT.pm b/src/lib/Hydra/View/TT.pm index 2f9d4201..be3cf493 100644 --- a/src/lib/Hydra/View/TT.pm +++ b/src/lib/Hydra/View/TT.pm @@ -9,7 +9,7 @@ __PACKAGE__->config( ENCODING => 'utf-8', PRE_CHOMP => 1, POST_CHOMP => 1, - expose_methods => [qw/buildLogExists buildStepLogExists jobExists/]); + expose_methods => [qw/buildLogExists buildStepLogExists jobExists stripSSHUser/]); sub buildLogExists { my ($self, $c, $build) = @_; @@ -23,6 +23,16 @@ sub buildStepLogExists { return defined findLog($c, $step->drvpath, 
@outPaths); } + +sub stripSSHUser { + my ($self, $c, $name) = @_; + if ($name =~ /^.*@(.*)$/) { + return $1; + } else { + return $name; + } +} + # Check whether the given job is a member of the most recent jobset # evaluation. sub jobExists { diff --git a/src/root/machine-status.tt b/src/root/machine-status.tt index a1331378..5847c970 100644 --- a/src/root/machine-status.tt +++ b/src/root/machine-status.tt @@ -13,7 +13,7 @@ [% FOREACH m IN machines %] - [% name = m.key ? m.key.match('@(.*)').0 : "localhost" %] + [% name = m.key ? stripSSHUser(m.key) : "localhost" %] [% idle = 1 %] [% FOREACH step IN steps %] - [% name2 = step.machine ? step.machine.match('@(.*)').0 : "localhost" %] + [% name2 = step.machine ? stripSSHUser(step.machine) : "localhost" %] [% IF name == name2 %] [% idle = 0 %] From b1a75c7f631f22b495e2fb49b05b4679948bbdfa Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 17 Jun 2015 14:46:02 +0200 Subject: [PATCH 034/158] getQueuedBuilds(): Handle dependent builds first If a build A depends on a derivation that is the top-level derivation of some build B, then we should process B before A (meaning we shouldn't make the derivation runnable before B has been added). Otherwise, the derivation will be "accounted" to A rather than B (so the build step will show up in the wrong build). --- src/hydra-queue-runner/hydra-queue-runner.cc | 58 ++++++++++++++------ 1 file changed, 41 insertions(+), 17 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 192e75a9..492b44a0 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -479,7 +479,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, /* Grab the queued builds from the database, but don't process them yet (since we don't want a long-running transaction). 
*/ - std::list newBuilds; // FIXME: use queue + std::multimap newBuilds; { pqxx::work txn(conn); @@ -498,20 +498,18 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, build->fullJobName = row["project"].as() + ":" + row["jobset"].as() + ":" + row["job"].as(); build->maxSilentTime = row["maxsilent"].as(); build->buildTimeout = row["timeout"].as(); - std::cerr << build->id << " " << build->buildTimeout << std::endl; - newBuilds.push_back(build); + newBuilds.emplace(std::make_pair(build->drvPath, build)); } } - /* Now instantiate build steps for each new build. The builder - threads can start building the runnable build steps right away, - even while we're still processing other new builds. */ - for (auto & build : newBuilds) { - // FIXME: remove build from newBuilds to ensure quick destruction - // FIXME: exception handling + std::set newRunnable; + unsigned int nrAdded; + std::function createBuild; + createBuild = [&](Build::ptr build) { printMsg(lvlTalkative, format("loading build %1% (%2%)") % build->id % build->fullJobName); + nrAdded++; if (!store->isValidPath(build->drvPath)) { /* Derivation has been GC'ed prematurely. */ @@ -524,12 +522,26 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, (time(0)) ("derivation was garbage-collected prior to build").exec(); txn.commit(); - continue; + return; } - std::set newSteps, newRunnable; + std::set newSteps; Step::ptr step = createStep(store, build->drvPath, newSteps, newRunnable); + /* Some of the new steps may be the top level of builds that + we haven't processed yet. So do them now. This ensures that + if build A depends on build B with top-level step X, then X + will be "accounted" to B in doBuildStep(). 
*/ + for (auto & r : newSteps) { + while (true) { + auto i = newBuilds.find(r->drvPath); + if (i == newBuilds.end()) break; + Build::ptr b = i->second; + newBuilds.erase(i); + createBuild(b); + } + } + /* If we didn't get a step, it means the step's outputs are all valid. So we mark this as a finished, cached build. */ if (!step) { @@ -543,7 +555,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, markSucceededBuild(txn, build, res, true, now, now); txn.commit(); - continue; + return; } /* If any step has an unsupported system type or has a @@ -591,7 +603,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, } } - if (badStep) continue; + if (badStep) return; /* Note: if we exit this scope prior to this, the build and all newly created steps are destroyed. */ @@ -604,16 +616,30 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, build->toplevel = step; } - printMsg(lvlChatty, format("added build %1% (top-level step %2%, %3% new steps, %4% new runnable steps)") - % build->id % step->drvPath % newSteps.size() % newRunnable.size()); + printMsg(lvlChatty, format("added build %1% (top-level step %2%, %3% new steps)") + % build->id % step->drvPath % newSteps.size()); /* Prior to this, the build is not visible to getDependentBuilds(). Now it is, so the build can be failed if a dependency fails. (It can't succeed right away because its top-level is not runnable yet). */ + }; + + /* Now instantiate build steps for each new build. The builder + threads can start building the runnable build steps right away, + even while we're still processing other new builds. */ + while (!newBuilds.empty()) { + auto build = newBuilds.begin()->second; + newBuilds.erase(newBuilds.begin()); + + newRunnable.clear(); + nrAdded = 0; + createBuild(build); + /* Add the new runnable build steps to ‘runnable’ and wake up the builder threads. 
*/ + printMsg(lvlChatty, format("got %1% new runnable steps from %2% new builds") % newRunnable.size() % nrAdded); for (auto & r : newRunnable) makeRunnable(r); } @@ -1235,8 +1261,6 @@ void State::run() auto queueMonitorThread = std::thread(&State::queueMonitor, this); - sleep(5); - std::thread(&State::dispatcher, this).detach(); queueMonitorThread.join(); From 11be780948e6b89ae8619e7c85b43176f1c42a1d Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 17 Jun 2015 17:11:42 +0200 Subject: [PATCH 035/158] Handle failure with output --- src/hydra-queue-runner/build-result.cc | 3 +++ src/hydra-queue-runner/build-result.hh | 5 +++++ src/hydra-queue-runner/hydra-queue-runner.cc | 8 +------- src/root/common.tt | 4 ++-- 4 files changed, 11 insertions(+), 9 deletions(-) diff --git a/src/hydra-queue-runner/build-result.cc b/src/hydra-queue-runner/build-result.cc index e7a2fda3..78b9b25c 100644 --- a/src/hydra-queue-runner/build-result.cc +++ b/src/hydra-queue-runner/build-result.cc @@ -27,6 +27,9 @@ BuildResult getBuildResult(std::shared_ptr store, const Derivation & d bool explicitProducts = false; for (auto & output : outputs) { + Path failedFile = output + "/nix-support/failed"; + if (pathExists(failedFile)) res.failed = true; + Path productsFile = output + "/nix-support/hydra-build-products"; if (!pathExists(productsFile)) continue; explicitProducts = true; diff --git a/src/hydra-queue-runner/build-result.hh b/src/hydra-queue-runner/build-result.hh index bbe6fd7a..c8965345 100644 --- a/src/hydra-queue-runner/build-result.hh +++ b/src/hydra-queue-runner/build-result.hh @@ -17,6 +17,11 @@ struct BuildProduct struct BuildResult { + /* Whether this build has failed with output, i.e., the build + finished with exit code 0 but produced a file + $out/nix-support/failed. 
*/ + bool failed = false; + std::string releaseName; unsigned long long closureSize = 0, size = 0; diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 492b44a0..c5f657f4 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -930,10 +930,6 @@ MachineReservation::ptr State::findMachine(Step::ptr step) return std::make_shared(machine); } - /* FIXME: distinguish between permanent failures (a matching - machine doesn't exist) and temporary failures (a matching - machine is not available). */ - return 0; } @@ -1048,8 +1044,6 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, } if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv); - - // FIXME: handle failed-with-output } if (!result.stopTime) result.stopTime = time(0); @@ -1181,7 +1175,7 @@ void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, txn.parameterized ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, size = $5, closureSize = $6, releaseName = $7, isCachedBuild = $8 where id = $1") (build->id) - ((int) bsSuccess) + ((int) (res.failed ? 
bsFailedWithOutput : bsSuccess)) (startTime) (stopTime) (res.size) diff --git a/src/root/common.tt b/src/root/common.tt index f02ed838..44f962bd 100644 --- a/src/root/common.tt +++ b/src/root/common.tt @@ -203,7 +203,7 @@ BLOCK renderBuildStatusIcon; [% ELSIF buildstatus == 4 %] Cancelled [% ELSIF buildstatus == 6 %] - Failed (with result) + Failed with output [% ELSIF buildstatus == 7 %] Timed out [% ELSE %] @@ -230,7 +230,7 @@ BLOCK renderStatus; [% ELSIF buildstatus == 4 %] Cancelled by user [% ELSIF buildstatus == 6 %] - Build failed (with result) + Build failed with output [% ELSIF buildstatus == 7 %] Timed out [% ELSIF buildstatus == 9 %] From ca48818b3053270537ca074c90ddddb882a55568 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 17 Jun 2015 17:28:59 +0200 Subject: [PATCH 036/158] Fix remote building --- src/hydra-queue-runner/build-remote.cc | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 762a1058..f0adccf5 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -145,9 +145,19 @@ void buildRemote(std::shared_ptr store, throw Error(format("cannot connect to ‘%1%’: %2%") % sshName % chomp(readFile(logFile))); } + /* Gather the inputs. */ + PathSet inputs({drvPath}); + for (auto & input : drv.inputDrvs) { + Derivation drv2 = readDerivation(input.first); + for (auto & name : input.second) { + auto i = drv2.outputs.find(name); + if (i != drv2.outputs.end()) inputs.insert(i->second.path); + } + } + /* Copy the input closure. */ printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName); - copyClosureTo(store, from, to, PathSet({drvPath})); + copyClosureTo(store, from, to, inputs); /* Do the build. 
*/ printMsg(lvlDebug, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName); From ce9e859a9c0125334fdce19e96eee78d268f8baf Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 17 Jun 2015 21:35:20 +0200 Subject: [PATCH 037/158] hydra-queue-runner: Implement --unlock --- src/hydra-queue-runner/hydra-queue-runner.cc | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index c5f657f4..56feaa54 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1266,14 +1266,20 @@ int main(int argc, char * * argv) return handleExceptions(argv[0], [&]() { initNix(); - parseCmdLine(argc, argv, [&](Strings::iterator & arg, const Strings::iterator & end) { - return false; - }); - signal(SIGINT, SIG_DFL); signal(SIGTERM, SIG_DFL); signal(SIGHUP, SIG_DFL); + bool unlock = false; + + parseCmdLine(argc, argv, [&](Strings::iterator & arg, const Strings::iterator & end) { + if (*arg == "--unlock") + unlock = true; + else + return false; + return true; + }); + settings.buildVerbosity = lvlVomit; settings.useSubstitutes = false; settings.lockCPU = false; @@ -1281,6 +1287,9 @@ int main(int argc, char * * argv) /* FIXME: need some locking to prevent multiple instances of hydra-queue-runner. 
*/ State state; - state.run(); + if (unlock) + state.clearBusy(0); + else + state.run(); }); } From 4d9c74335d49474ab03160e4819077373aee12e0 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 17 Jun 2015 21:39:28 +0200 Subject: [PATCH 038/158] Add forgotten file --- src/sql/upgrade-33.sql | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 src/sql/upgrade-33.sql diff --git a/src/sql/upgrade-33.sql b/src/sql/upgrade-33.sql new file mode 100644 index 00000000..62f4a49f --- /dev/null +++ b/src/sql/upgrade-33.sql @@ -0,0 +1,7 @@ +create table FailedPaths ( + path text primary key not null +); + +create rule IdempotentInsert as on insert to FailedPaths + where exists (select 1 from FailedPaths where path = new.path) + do instead nothing; From ec8e8edc86799af88033fe5ed1d799c183e8168a Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 17 Jun 2015 22:11:01 +0200 Subject: [PATCH 039/158] hydra-queue-runner: Handle $HYDRA_DBI --- src/hydra-queue-runner/hydra-queue-runner.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 56feaa54..2abded44 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -60,7 +60,16 @@ typedef enum { struct Connection : pqxx::connection { - Connection() : pqxx::connection("dbname=hydra") { }; + Connection() : pqxx::connection(getFlags()) { }; + + string getFlags() + { + string s = getEnv("HYDRA_DBI", "dbi:Pg:dbname=hydra;"); + string prefix = "dbi:Pg:"; + if (string(s, 0, prefix.size()) != prefix) + throw Error("$HYDRA_DBI does not denote a PostgreSQL database"); + return concatStringsSep(" ", tokenizeString(string(s, prefix.size()), ";")); + } }; From 59dae60558cdade0a2ba669fb9d68902950e72f8 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 17 Jun 2015 22:38:12 +0200 Subject: [PATCH 040/158] hydra-queue-runner: More stats --- 
src/hydra-queue-runner/hydra-queue-runner.cc | 27 +++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 2abded44..aae081b4 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -37,6 +37,9 @@ bool has(const C & c, const V & v) } +typedef std::atomic counter; + + typedef enum { bsSuccess = 0, bsFailed = 1, @@ -223,10 +226,13 @@ private: Sync machines; /* Various stats. */ - std::atomic nrRetries; - std::atomic maxNrRetries; - std::atomic nrQueueWakeups; - std::atomic nrDispatcherWakeups; + counter nrBuildsRead{0}; + counter nrBuildsDone{0}; + counter nrStepsDone{0}; + counter nrRetries{0}; + counter maxNrRetries{0}; + counter nrQueueWakeups{0}; + counter nrDispatcherWakeups{0}; public: State(); @@ -292,8 +298,6 @@ public: State::State() { - nrRetries = maxNrRetries = nrQueueWakeups = nrDispatcherWakeups = 0; - hydraData = getEnv("HYDRA_DATA"); if (hydraData == "") throw Error("$HYDRA_DATA must be set"); @@ -531,6 +535,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, (time(0)) ("derivation was garbage-collected prior to build").exec(); txn.commit(); + nrBuildsDone++; return; } @@ -607,6 +612,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, (buildStatus != bsUnsupported ? 
1 : 0).exec(); createBuildStep(txn, 0, build, r, "", buildStepStatus); txn.commit(); + nrBuildsDone++; badStep = true; break; } @@ -651,6 +657,8 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, printMsg(lvlChatty, format("got %1% new runnable steps from %2% new builds") % newRunnable.size() % nrAdded); for (auto & r : newRunnable) makeRunnable(r); + + nrBuildsRead += nrAdded; } } @@ -754,6 +762,8 @@ void State::destroyStep(Step::ptr step, bool proceed) printMsg(lvlDebug, format("destroying build step ‘%1%’") % step->drvPath); + nrStepsDone++; + { auto steps_(steps.lock()); steps_->erase(step->drvPath); @@ -1143,6 +1153,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, (result.stopTime) (cachedFailure ? 1 : 0).exec(); build2->finishedInDB = true; // FIXME: txn might fail + nrBuildsDone++; } /* Remember failed paths in the database so that they @@ -1209,6 +1220,7 @@ void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, } build->finishedInDB = true; // FIXME: txn might fail + nrBuildsDone++; } @@ -1240,6 +1252,9 @@ void State::dumpStatus() if (i->lock()) ++i; else i = runnable_->erase(i); printMsg(lvlError, format("%1% runnable build steps") % runnable_->size()); } + printMsg(lvlError, format("%1% builds read from queue") % nrBuildsRead); + printMsg(lvlError, format("%1% builds done") % nrBuildsDone); + printMsg(lvlError, format("%1% build steps done") % nrStepsDone); printMsg(lvlError, format("%1% build step retries") % nrRetries); printMsg(lvlError, format("%1% most retries for any build step") % maxNrRetries); printMsg(lvlError, format("%1% queue wakeups") % nrQueueWakeups); From f57d0b0c54684231e7a49f14a334db7d94287159 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 00:24:56 +0200 Subject: [PATCH 041/158] hydra-queue-runner: Maintain count of active build steps --- src/hydra-queue-runner/build-remote.cc | 2 ++ src/hydra-queue-runner/hydra-queue-runner.cc | 11 +++++++++++ 2 files changed, 13 
insertions(+) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index f0adccf5..39529e1a 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -39,6 +39,8 @@ static void openConnection(const string & sshName, const string & sshKey, if (dup2(stderrFD, STDERR_FILENO) == -1) throw SysError("cannot dup stderr"); + // FIXME: ensure no password prompt. + // FIXME: connection timeouts Strings argv({"ssh", sshName, "-i", sshKey, "-x", "-a", "--", "nix-store", "--serve", "--write"}); execvp("ssh", (char * *) stringsToCharPtrs(argv).data()); // FIXME: remove cast diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index aae081b4..417c5202 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -39,6 +39,13 @@ bool has(const C & c, const V & v) typedef std::atomic counter; +struct MaintainCount +{ + counter & c; + MaintainCount(counter & c) : c(c) { c++; } + ~MaintainCount() { c--; } +}; + typedef enum { bsSuccess = 0, @@ -229,6 +236,7 @@ private: counter nrBuildsRead{0}; counter nrBuildsDone{0}; counter nrStepsDone{0}; + counter nrActiveSteps{0}; counter nrRetries{0}; counter maxNrRetries{0}; counter nrQueueWakeups{0}; @@ -957,6 +965,8 @@ void State::builder(Step::ptr step, MachineReservation::ptr reservation) { bool retry = true; + MaintainCount mc(nrActiveSteps); + try { auto store = openStore(); // FIXME: pool retry = doBuildStep(store, step, reservation->machine); @@ -1252,6 +1262,7 @@ void State::dumpStatus() if (i->lock()) ++i; else i = runnable_->erase(i); printMsg(lvlError, format("%1% runnable build steps") % runnable_->size()); } + printMsg(lvlError, format("%1% active build steps") % nrActiveSteps); printMsg(lvlError, format("%1% builds read from queue") % nrBuildsRead); printMsg(lvlError, format("%1% builds done") % nrBuildsDone); printMsg(lvlError, format("%1% build 
steps done") % nrStepsDone); From 3855131185bae4edbd9f7ae7271e0277e23c591f Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 00:50:48 +0200 Subject: [PATCH 042/158] hydra-queue-runner: Improve SSH flags --- src/hydra-queue-runner/build-remote.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 39529e1a..e9278705 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -39,9 +39,11 @@ static void openConnection(const string & sshName, const string & sshKey, if (dup2(stderrFD, STDERR_FILENO) == -1) throw SysError("cannot dup stderr"); - // FIXME: ensure no password prompt. // FIXME: connection timeouts - Strings argv({"ssh", sshName, "-i", sshKey, "-x", "-a", "--", "nix-store", "--serve", "--write"}); + Strings argv( + { "ssh", sshName, "-i", sshKey, "-x", "-a" + , "-oBatchMode=yes", "-oConnectTimeout=60", "-oTCPKeepAlive=yes" + , "--", "nix-store", "--serve", "--write" }); execvp("ssh", (char * *) stringsToCharPtrs(argv).data()); // FIXME: remove cast From a40ca6b76e58db9cbc05f31e187e8a7ff5ca7f53 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 01:52:20 +0200 Subject: [PATCH 043/158] hydra-queue-runner: Improve dispatcher We now take the machine speed factor into account, just like build-remote.pl. 
--- src/hydra-queue-runner/hydra-queue-runner.cc | 148 +++++++++++-------- 1 file changed, 85 insertions(+), 63 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 417c5202..30a1d856 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -6,6 +6,7 @@ #include #include #include +#include #include @@ -159,13 +160,7 @@ struct Machine unsigned int maxJobs = 1; float speedFactor = 1.0; - Sync currentJobs; - - Machine() - { - auto currentJobs_(currentJobs.lock()); - *currentJobs_ = 0; - } + std::atomic currentJobs{0}; bool supportsStep(Step::ptr step) { @@ -187,13 +182,11 @@ struct MachineReservation Machine::ptr machine; MachineReservation(Machine::ptr machine) : machine(machine) { - auto currentJobs_(machine->currentJobs.lock()); - (*currentJobs_)++; + machine->currentJobs++; } ~MachineReservation() { - auto currentJobs_(machine->currentJobs.lock()); - if (*currentJobs_ > 0) (*currentJobs_)--; + machine->currentJobs--; } }; @@ -284,8 +277,6 @@ public: void wakeDispatcher(); - MachineReservation::ptr findMachine(Step::ptr step); - void builder(Step::ptr step, MachineReservation::ptr reservation); /* Perform the given build step. Return true if the step is to be @@ -878,49 +869,98 @@ void State::dispatcher() auto sleepUntil = system_time::max(); - { - auto runnable_(runnable.lock()); - printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size()); + bool keepGoing; - /* FIXME: we're holding the runnable lock too long - here. This could be more efficient. */ + do { + /* Bail out when there are no slots left. */ + std::vector machinesSorted; + { + auto machines_(machines.lock()); + machinesSorted.insert(machinesSorted.end(), + machines_->begin(), machines_->end()); + } + /* Sort the machines by a combination of speed factor and + available slots. 
Prioritise the available machines as + follows: + + - First by load divided by speed factor, rounded to the + nearest integer. This causes fast machines to be + preferred over slow machines with similar loads. + + - Then by speed factor. + + - Finally by load. */ + sort(machinesSorted.begin(), machinesSorted.end(), + [](const Machine::ptr & a, const Machine::ptr & b) -> bool + { + float ta = roundf(a->currentJobs / a->speedFactor); + float tb = roundf(b->currentJobs / b->speedFactor); + return + ta != tb ? ta > tb : + a->speedFactor != b->speedFactor ? a->speedFactor > b->speedFactor : + a->maxJobs > b->maxJobs; + }); + + /* Find a machine with a free slot and find a step to run + on it. Once we find such a pair, we restart the outer + loop because the machine sorting will have changed. */ + keepGoing = false; system_time now = std::chrono::system_clock::now(); - for (auto i = runnable_->begin(); i != runnable_->end(); ) { - auto step = i->lock(); + for (auto & machine : machinesSorted) { + // FIXME: can we lose a wakeup if a builder exits concurrently? + if (machine->currentJobs >= machine->maxJobs) continue; - /* Delete dead steps. */ - if (!step) { - i = runnable_->erase(i); - continue; - } + auto runnable_(runnable.lock()); + printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size()); - /* Skip previously failed steps that aren't ready to - be retried. */ - { - auto step_(step->state.lock()); - if (step_->tries > 0 && step_->after > now) { - if (step_->after < sleepUntil) - sleepUntil = step_->after; + /* FIXME: we're holding the runnable lock too long + here. This could be more efficient. */ + + for (auto i = runnable_->begin(); i != runnable_->end(); ) { + auto step = i->lock(); + + /* Delete dead steps. */ + if (!step) { + i = runnable_->erase(i); + continue; + } + + /* Can this machine do this step? */ + if (!machine->supportsStep(step)) { ++i; continue; } + + /* Skip previously failed steps that aren't ready + to be retried. 
*/ + { + auto step_(step->state.lock()); + if (step_->tries > 0 && step_->after > now) { + if (step_->after < sleepUntil) + sleepUntil = step_->after; + ++i; + continue; + } + } + + /* Make a slot reservation and start a thread to + do the build. */ + auto reservation = std::make_shared(machine); + i = runnable_->erase(i); + + auto builderThread = std::thread(&State::builder, this, step, reservation); + builderThread.detach(); // FIXME? + + keepGoing = true; + break; } - auto reservation = findMachine(step); - if (!reservation) { - printMsg(lvlDebug, format("cannot execute step ‘%1%’ right now") % step->drvPath); - ++i; - continue; - } - - i = runnable_->erase(i); - - auto builderThread = std::thread(&State::builder, this, step, reservation); - builderThread.detach(); // FIXME? + if (keepGoing) break; } - } + + } while (keepGoing); /* Sleep until we're woken up (either because a runnable build is added, or because a build finishes). */ @@ -944,23 +984,6 @@ void State::wakeDispatcher() } -MachineReservation::ptr State::findMachine(Step::ptr step) -{ - auto machines_(machines.lock()); - - for (auto & machine : *machines_) { - if (!machine->supportsStep(step)) continue; - { - auto currentJobs_(machine->currentJobs.lock()); - if (*currentJobs_ >= machine->maxJobs) continue; - } - return std::make_shared(machine); - } - - return 0; -} - - void State::builder(Step::ptr step, MachineReservation::ptr reservation) { bool retry = true; @@ -1274,9 +1297,8 @@ void State::dumpStatus() { auto machines_(machines.lock()); for (auto & m : *machines_) { - auto currentJobs_(m->currentJobs.lock()); printMsg(lvlError, format("machine %1%: %2%/%3% active") - % m->sshName % *currentJobs_ % m->maxJobs); + % m->sshName % m->currentJobs % m->maxJobs); } } } From 69be3cfe93f4c303de5fa43d29409e9b7fa76bf6 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 01:57:01 +0200 Subject: [PATCH 044/158] hydra-queue-runner: Handle status queries on the main thread Doing it on the queue 
monitor thread was problematic because processing the queue can take a while. --- src/hydra-queue-runner/hydra-queue-runner.cc | 52 +++++++++++++------- 1 file changed, 33 insertions(+), 19 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 30a1d856..6824fde6 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -84,6 +84,23 @@ struct Connection : pqxx::connection }; +struct receiver : public pqxx::notification_receiver +{ + bool status = false; + receiver(pqxx::connection_base & c, const std::string & channel) + : pqxx::notification_receiver(c, channel) { } + void operator() (const string & payload, int pid) override + { + status = true; + }; + bool get() { + bool b = status; + status = false; + return b; + } +}; + + typedef unsigned int BuildID; @@ -435,26 +452,9 @@ void State::queueMonitorLoop() { auto conn(dbPool.get()); - struct receiver : public pqxx::notification_receiver - { - bool status = false; - receiver(pqxx::connection_base & c, const std::string & channel) - : pqxx::notification_receiver(c, channel) { } - void operator() (const string & payload, int pid) override - { - status = true; - }; - bool get() { - bool b = status; - status = false; - return b; - } - }; - receiver buildsAdded(*conn, "builds_added"); receiver buildsRestarted(*conn, "builds_restarted"); receiver buildsCancelled(*conn, "builds_cancelled"); - receiver dumpStatus(*conn, "dump_status"); auto store = openStore(); // FIXME: pool @@ -479,8 +479,6 @@ void State::queueMonitorLoop() removeCancelledBuilds(*conn); } - if (dumpStatus.get()) - State::dumpStatus(); } } @@ -1314,6 +1312,22 @@ void State::run() std::thread(&State::dispatcher, this).detach(); + while (true) { + try { + auto conn(dbPool.get()); + receiver dumpStatus(*conn, "dump_status"); + while (true) { + conn->await_notification(); + if (dumpStatus.get()) + State::dumpStatus(); + } + } catch 
(std::exception & e) { + printMsg(lvlError, format("main thread: %1%") % e.what()); + sleep(10); // probably a DB problem, so don't retry right away + } + } + + // Never reached. queueMonitorThread.join(); } From 8257812d0a823cd0645c64f5aced2410da319379 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 02:44:29 +0200 Subject: [PATCH 045/158] Acquire exclusive table lock earlier --- src/hydra-queue-runner/hydra-queue-runner.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 6824fde6..21f747f0 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -601,13 +601,13 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, if (buildStatus != bsSuccess) { time_t now = time(0); pqxx::work txn(conn); + createBuildStep(txn, 0, build, r, "", buildStepStatus); txn.parameterized ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = $4 where id = $1") (build->id) ((int) buildStatus) (now) (buildStatus != bsUnsupported ? 1 : 0).exec(); - createBuildStep(txn, 0, build, r, "", buildStepStatus); txn.commit(); nrBuildsDone++; badStep = true; From 47367451c7bcf0bee640f1319ffab63859c0eeb5 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 03:28:58 +0200 Subject: [PATCH 046/158] hydra-queue-runner: Set isCachedBuild --- src/hydra-queue-runner/hydra-queue-runner.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 21f747f0..6cd5c828 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1138,7 +1138,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, /* Mark all builds of which this derivation is the top level as succeeded. 
*/ for (auto build2 : direct) - markSucceededBuild(txn, build2, res, false, result.startTime, result.stopTime); + markSucceededBuild(txn, build2, res, build != build2, + result.startTime, result.stopTime); } else { /* Failure case. */ From 92ea800cfb8dc3395fd92ab0c0bc7dd915d0b8b4 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 04:19:21 +0200 Subject: [PATCH 047/158] Set finishedInDB in a few more places --- src/hydra-queue-runner/hydra-queue-runner.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 6cd5c828..6f13a15f 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -532,6 +532,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, (time(0)) ("derivation was garbage-collected prior to build").exec(); txn.commit(); + build->finishedInDB = true; nrBuildsDone++; return; } @@ -609,6 +610,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, (now) (buildStatus != bsUnsupported ? 1 : 0).exec(); txn.commit(); + build->finishedInDB = true; nrBuildsDone++; badStep = true; break; From e039f5f840e6fad10600a794cded31232c134a0a Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 04:35:37 +0200 Subject: [PATCH 048/158] Create failed build steps for cached failures --- src/hydra-queue-runner/hydra-queue-runner.cc | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 6f13a15f..a45e8bb9 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1159,12 +1159,16 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, message. 
*/ if (buildStatus != bsAborted) result.errorMsg = ""; - if (!cachedFailure && !retry) { + if (!retry) { - /* Create failed build steps for every build that depends - on this. */ + /* Create failed build steps for every build that + depends on this. For cached failures, only create a + step for builds that don't have this step as + top-level (otherwise the user won't be able to see + what caused the build to fail). */ for (auto build2 : dependents) { if (build == build2) continue; + if (cachedFailure && build2->drvPath == step->drvPath) continue; createBuildStep(txn, 0, build2, step, machine->sshName, buildStepStatus, result.errorMsg, build->id); } From 9c03b11ca8ab76b2c8e25301ce96eaf864fc49ab Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 14:51:08 +0200 Subject: [PATCH 049/158] Simplify retry handling --- src/hydra-queue-runner/hydra-queue-runner.cc | 88 ++++++++++---------- 1 file changed, 45 insertions(+), 43 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index a45e8bb9..3b70197c 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1100,10 +1100,21 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, if (!result.stopTime) result.stopTime = time(0); - bool retry = false; + /* The step had a hopefully temporary failure (e.g. network + issue). Retry a number of times. */ if (result.status == RemoteResult::rrMiscFailure) { - auto step_(step->state.lock()); - retry = step_->tries + 1 < maxTries; + bool retry; + { + auto step_(step->state.lock()); + retry = step_->tries + 1 < maxTries; + } + if (retry) { + pqxx::work txn(*conn); + finishBuildStep(txn, result.startTime, result.stopTime, build->id, + stepNr, machine->sshName, bssAborted, result.errorMsg); + txn.commit(); + return true; + } } /* Remove this step. 
After this, incoming builds that depend on @@ -1112,10 +1123,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, won't conflict with this one, because we're removing it. In any case, the set of dependent builds for ‘step’ can't increase anymore because ‘step’ is no longer visible to createStep(). */ - if (!retry) { - auto steps_(steps.lock()); - steps_->erase(step->drvPath); - } + auto steps_(steps.lock()); + steps_->erase(step->drvPath); /* Get the final set of dependent builds. */ auto dependents = getDependentBuilds(step); @@ -1159,20 +1168,16 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, message. */ if (buildStatus != bsAborted) result.errorMsg = ""; - if (!retry) { - - /* Create failed build steps for every build that - depends on this. For cached failures, only create a - step for builds that don't have this step as - top-level (otherwise the user won't be able to see - what caused the build to fail). */ - for (auto build2 : dependents) { - if (build == build2) continue; - if (cachedFailure && build2->drvPath == step->drvPath) continue; - createBuildStep(txn, 0, build2, step, machine->sshName, - buildStepStatus, result.errorMsg, build->id); - } - + /* Create failed build steps for every build that depends + on this. For cached failures, only create a step for + builds that don't have this step as top-level + (otherwise the user won't be able to see what caused + the build to fail). */ + for (auto build2 : dependents) { + if (build == build2) continue; + if (cachedFailure && build2->drvPath == step->drvPath) continue; + createBuildStep(txn, 0, build2, step, machine->sshName, + buildStepStatus, result.errorMsg, build->id); } if (!cachedFailure) @@ -1180,19 +1185,18 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, stepNr, machine->sshName, buildStepStatus, result.errorMsg); /* Mark all builds that depend on this derivation as failed. 
*/ - if (!retry) - for (auto build2 : dependents) { - printMsg(lvlError, format("marking build %1% as failed") % build2->id); - txn.parameterized - ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1") - (build2->id) - ((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus)) - (result.startTime) - (result.stopTime) - (cachedFailure ? 1 : 0).exec(); - build2->finishedInDB = true; // FIXME: txn might fail - nrBuildsDone++; - } + for (auto build2 : dependents) { + printMsg(lvlError, format("marking build %1% as failed") % build2->id); + txn.parameterized + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1") + (build2->id) + ((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus)) + (result.startTime) + (result.stopTime) + (cachedFailure ? 1 : 0).exec(); + build2->finishedInDB = true; // FIXME: txn might fail + nrBuildsDone++; + } /* Remember failed paths in the database so that they won't be built again. */ @@ -1208,20 +1212,18 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, is the top-level derivation. In case of failure, destroy all dependent Build objects. Any Steps not referenced by other Builds will be destroyed as well. */ - if (!retry) - for (auto build2 : dependents) - if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) { - auto builds_(builds.lock()); - builds_->erase(build2->id); - } + for (auto build2 : dependents) + if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) { + auto builds_(builds.lock()); + builds_->erase(build2->id); + } /* Remove the step from the graph. In case of success, make dependent build steps runnable if they have no other dependencies. 
*/ - if (!retry) - destroyStep(step, result.status == RemoteResult::rrSuccess); + destroyStep(step, result.status == RemoteResult::rrSuccess); - return retry; + return false; } From 948473c90976d5e862e8572de72d329b7c1f3116 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 16:30:28 +0200 Subject: [PATCH 050/158] Fix race between the queue monitor and the builder threads --- src/hydra-queue-runner/hydra-queue-runner.cc | 437 ++++++++++--------- 1 file changed, 239 insertions(+), 198 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 3b70197c..e9c5078e 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -120,9 +120,7 @@ struct Build std::shared_ptr toplevel; - bool finishedInDB; - - Build() : finishedInDB(false) { } + std::atomic_bool finishedInDB{false}; ~Build() { @@ -158,13 +156,15 @@ struct Step system_time after; }; + std::atomic_bool created{false}; // debugging + std::atomic_bool finished{false}; // debugging + Sync state; - std::atomic_bool destroyed; - - Step() : destroyed(false) { } - - ~Step() { } + ~Step() + { + printMsg(lvlError, format("destroying step %1%") % drvPath); + } }; @@ -280,13 +280,9 @@ public: void removeCancelledBuilds(Connection & conn); Step::ptr createStep(std::shared_ptr store, const Path & drvPath, + Build::ptr referringBuild, Step::ptr referringStep, std::set & newSteps, std::set & newRunnable); - void destroyStep(Step::ptr step, bool proceed); - - /* Get the builds that depend on the given step. */ - std::set getDependentBuilds(Step::ptr step); - void makeRunnable(Step::ptr step); /* The thread that selects and starts runnable builds. */ @@ -525,6 +521,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, /* Derivation has been GC'ed prematurely. 
*/ printMsg(lvlError, format("aborting GC'ed build %1%") % build->id); pqxx::work txn(conn); + assert(!build->finishedInDB); txn.parameterized ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1") (build->id) @@ -538,7 +535,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, } std::set newSteps; - Step::ptr step = createStep(store, build->drvPath, newSteps, newRunnable); + Step::ptr step = createStep(store, build->drvPath, build, 0, newSteps, newRunnable); /* Some of the new steps may be the top level of builds that we haven't processed yet. So do them now. This ensures that @@ -560,13 +557,13 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, Derivation drv = readDerivation(build->drvPath); BuildResult res = getBuildResult(store, drv); - printMsg(lvlInfo, format("marking build %1% as cached successful") % build->id); - pqxx::work txn(conn); time_t now = time(0); markSucceededBuild(txn, build, res, true, now, now); txn.commit(); + build->finishedInDB = true; + return; } @@ -603,6 +600,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, time_t now = time(0); pqxx::work txn(conn); createBuildStep(txn, 0, build, r, "", buildStepStatus); + assert(!build->finishedInDB); txn.parameterized ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = $4 where id = $1") (build->id) @@ -624,20 +622,12 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, { auto builds_(builds.lock()); - auto step_(step->state.lock()); (*builds_)[build->id] = build; - step_->builds.push_back(build); build->toplevel = step; } printMsg(lvlChatty, format("added build %1% (top-level step %2%, %3% new steps)") % build->id % step->drvPath % newSteps.size()); - - /* Prior to this, the build is not visible to - getDependentBuilds(). Now it is, so the build can be - failed if a dependency fails. 
(It can't succeed right away - because its top-level is not runnable yet). */ - }; /* Now instantiate build steps for each new build. The builder @@ -687,32 +677,65 @@ void State::removeCancelledBuilds(Connection & conn) Step::ptr State::createStep(std::shared_ptr store, const Path & drvPath, + Build::ptr referringBuild, Step::ptr referringStep, std::set & newSteps, std::set & newRunnable) { - /* Check if the requested step already exists. */ + /* Check if the requested step already exists. If not, create a + new step. In any case, make the step reachable from + referringBuild or referringStep. This is done atomically (with + ‘steps’ locked), to ensure that this step can never become + reachable from a new build after doBuildStep has removed it + from ‘steps’. */ + Step::ptr step; + bool isNew = false; { auto steps_(steps.lock()); + + /* See if the step already exists in ‘steps’ and is not + stale. */ auto prev = steps_->find(drvPath); if (prev != steps_->end()) { - auto step = prev->second.lock(); + step = prev->second.lock(); /* Since ‘step’ is a strong pointer, the referred Step object won't be deleted after this. */ - if (step) return step; - steps_->erase(drvPath); // remove stale entry + if (!step) steps_->erase(drvPath); // remove stale entry } + + /* If it doesn't exist, create it. */ + if (!step) { + step = std::make_shared(); + step->drvPath = drvPath; + isNew = true; + } + + auto step_(step->state.lock()); + + if (referringBuild) + step_->builds.push_back(referringBuild); + + if (referringStep) + step_->rdeps.push_back(referringStep); + + (*steps_)[drvPath] = step; } printMsg(lvlDebug, format("considering derivation ‘%1%’") % drvPath); - auto step = std::make_shared(); - step->drvPath = drvPath; + if (!isNew) { + assert(step->created); + return step; + } + + /* Initialize the step. 
Note that the step may be visible in + ‘steps’ before this point, but that doesn't matter because + it's not runnable yet, and other threads won't make it + runnable while step->created == false. */ step->drv = readDerivation(drvPath); { auto i = step->drv.env.find("requiredSystemFeatures"); if (i != step->drv.env.end()) step->requiredSystemFeatures = tokenizeString>(i->second); } - newSteps.insert(step); /* Are all outputs valid? */ bool valid = true; @@ -728,94 +751,39 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat /* No, we need to build. */ printMsg(lvlDebug, format("creating build step ‘%1%’") % drvPath); + newSteps.insert(step); /* Create steps for the dependencies. */ - bool hasDeps = false; for (auto & i : step->drv.inputDrvs) { - Step::ptr dep = createStep(store, i.first, newSteps, newRunnable); + auto dep = createStep(store, i.first, 0, step, newSteps, newRunnable); if (dep) { - hasDeps = true; auto step_(step->state.lock()); - auto dep_(dep->state.lock()); step_->deps.insert(dep); - dep_->rdeps.push_back(step); } } + /* If the step has no (remaining) dependencies, make it + runnable. */ { - auto steps_(steps.lock()); - assert(steps_->find(drvPath) == steps_->end()); - (*steps_)[drvPath] = step; + auto step_(step->state.lock()); + assert(!step->created); + step->created = true; + if (step_->deps.empty()) + newRunnable.insert(step); } - if (!hasDeps) newRunnable.insert(step); - return step; } -void State::destroyStep(Step::ptr step, bool proceed) +/* Get the steps and unfinished builds that depend on the given step. */ +void getDependents(Step::ptr step, std::set & builds, std::set & steps) { - if (step->destroyed) return; - step->destroyed = true; - - printMsg(lvlDebug, format("destroying build step ‘%1%’") % step->drvPath); - - nrStepsDone++; - - { - auto steps_(steps.lock()); - steps_->erase(step->drvPath); - } - - std::vector rdeps; - - { - auto step_(step->state.lock()); - rdeps = step_->rdeps; - - /* Sanity checks. 
*/ - for (auto & build_ : step_->builds) { - auto build = build_.lock(); - if (!build) continue; - assert(build->drvPath == step->drvPath); - assert(build->finishedInDB); - } - } - - for (auto & rdep_ : rdeps) { - auto rdep = rdep_.lock(); - if (!rdep) continue; - bool runnable = false; - { - auto rdep_(rdep->state.lock()); - assert(has(rdep_->deps, step)); - rdep_->deps.erase(step); - if (rdep_->deps.empty()) runnable = true; - } - if (proceed) { - /* If this rdep has no other dependencies, then we can now - build it. */ - if (runnable) - makeRunnable(rdep); - } else - /* If ‘step’ failed or was cancelled, then delete all - dependent steps as well. */ - destroyStep(rdep, false); - } -} - - -std::set State::getDependentBuilds(Step::ptr step) -{ - std::set done; - std::set res; - std::function visit; visit = [&](Step::ptr step) { - if (has(done, step)) return; - done.insert(step); + if (has(steps, step)) return; + steps.insert(step); std::vector rdeps; @@ -824,7 +792,7 @@ std::set State::getDependentBuilds(Step::ptr step) for (auto & build : step_->builds) { auto build_ = build.lock(); - if (build_) res.insert(build_); + if (build_ && !build_->finishedInDB) builds.insert(build_); } /* Make a copy of rdeps so that we don't hold the lock for @@ -839,8 +807,6 @@ std::set State::getDependentBuilds(Step::ptr step) }; visit(step); - - return res; } @@ -850,6 +816,8 @@ void State::makeRunnable(Step::ptr step) { auto step_(step->state.lock()); + assert(step->created); + assert(!step->finished); assert(step_->deps.empty()); } @@ -913,7 +881,7 @@ void State::dispatcher() if (machine->currentJobs >= machine->maxJobs) continue; auto runnable_(runnable.lock()); - printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size()); + //printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size()); /* FIXME: we're holding the runnable lock too long here. This could be more efficient. 
*/ @@ -1024,6 +992,12 @@ void State::builder(Step::ptr step, MachineReservation::ptr reservation) bool State::doBuildStep(std::shared_ptr store, Step::ptr step, Machine::ptr machine) { + { + auto step_(step->state.lock()); + assert(step->created); + assert(!step->finished); + } + /* There can be any number of builds in the database that depend on this derivation. Arbitrarily pick one (though preferring a build of which this is the top-level derivation) for the @@ -1034,7 +1008,9 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, Build::ptr build; { - auto dependents = getDependentBuilds(step); + std::set dependents; + std::set steps; + getDependents(step, dependents, steps); if (dependents.empty()) { /* Apparently all builds that depend on this derivation @@ -1117,112 +1093,176 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, } } - /* Remove this step. After this, incoming builds that depend on - drvPath will either see that the output paths exist, or will - create a new build step for drvPath. The latter is fine - it - won't conflict with this one, because we're removing it. In any - case, the set of dependent builds for ‘step’ can't increase - anymore because ‘step’ is no longer visible to createStep(). */ - auto steps_(steps.lock()); - steps_->erase(step->drvPath); + if (result.status == RemoteResult::rrSuccess) { - /* Get the final set of dependent builds. */ - auto dependents = getDependentBuilds(step); + /* Register success in the database for all Build objects that + have this step as the top-level step. Since the queue + monitor thread may be creating new referring Builds + concurrently, and updating the database may fail, we do + this in a loop, marking all known builds, repeating until + there are no unmarked builds. 
+ */ + while (true) { - std::set direct; - { - auto step_(step->state.lock()); - for (auto & build : step_->builds) { - auto build_ = build.lock(); - if (build_) direct.insert(build_); - } - } + /* Get the builds that have this one as the top-level. */ + std::vector direct; + { + auto steps_(steps.lock()); + auto step_(step->state.lock()); - /* Update the database. */ - { - pqxx::work txn(*conn); + for (auto & b_ : step_->builds) { + auto b = b_.lock(); + if (b && !b->finishedInDB) direct.push_back(b); + } - if (result.status == RemoteResult::rrSuccess) { - - finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssSuccess); - - /* Mark all builds of which this derivation is the top - level as succeeded. */ - for (auto build2 : direct) - markSucceededBuild(txn, build2, res, build != build2, - result.startTime, result.stopTime); - - } else { - /* Failure case. */ - - BuildStatus buildStatus = - result.status == RemoteResult::rrPermanentFailure ? bsFailed : - result.status == RemoteResult::rrTimedOut ? bsTimedOut : - bsAborted; - BuildStepStatus buildStepStatus = - result.status == RemoteResult::rrPermanentFailure ? bssFailed : - result.status == RemoteResult::rrTimedOut ? bssTimedOut : - bssAborted; - - /* For regular failures, we don't care about the error - message. */ - if (buildStatus != bsAborted) result.errorMsg = ""; - - /* Create failed build steps for every build that depends - on this. For cached failures, only create a step for - builds that don't have this step as top-level - (otherwise the user won't be able to see what caused - the build to fail). */ - for (auto build2 : dependents) { - if (build == build2) continue; - if (cachedFailure && build2->drvPath == step->drvPath) continue; - createBuildStep(txn, 0, build2, step, machine->sshName, - buildStepStatus, result.errorMsg, build->id); + /* If there are no builds left to update in the DB, + then we're done. Delete the step from + ‘steps’. 
Since we've been holding the ‘steps’ lock, + no new referrers can have been added in the + meantime or be added afterwards. */ + if (direct.empty()) { + printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath); + nrStepsDone++; + steps_->erase(step->drvPath); + break; + } } - if (!cachedFailure) - finishBuildStep(txn, result.startTime, result.stopTime, build->id, - stepNr, machine->sshName, buildStepStatus, result.errorMsg); + /* Update the database. */ + { + pqxx::work txn(*conn); - /* Mark all builds that depend on this derivation as failed. */ - for (auto build2 : dependents) { - printMsg(lvlError, format("marking build %1% as failed") % build2->id); - txn.parameterized - ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1") - (build2->id) - ((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus)) - (result.startTime) - (result.stopTime) - (cachedFailure ? 1 : 0).exec(); - build2->finishedInDB = true; // FIXME: txn might fail - nrBuildsDone++; + finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssSuccess); + + for (auto & b : direct) + markSucceededBuild(txn, b, res, build != b, + result.startTime, result.stopTime); + + txn.commit(); } - /* Remember failed paths in the database so that they - won't be built again. */ - if (!cachedFailure && result.status == RemoteResult::rrPermanentFailure) - for (auto & path : outputPaths(step->drv)) - txn.parameterized("insert into FailedPaths values ($1)")(path).exec(); + /* Remove the direct dependencies from ‘builds’. This will + cause them to be destroyed. */ + for (auto & b : direct) { + auto builds_(builds.lock()); + b->finishedInDB = true; + builds_->erase(b->id); + } + } + + /* Wake up any dependent steps that have no other + dependencies. 
*/ + { + auto step_(step->state.lock()); + for (auto & rdepWeak : step_->rdeps) { + auto rdep = rdepWeak.lock(); + if (!rdep) continue; + + bool runnable = false; + { + auto rdep_(rdep->state.lock()); + rdep_->deps.erase(step); + if (rdep_->deps.empty()) runnable = true; + } + + if (runnable) makeRunnable(rdep); + } + } + + } else { + + /* Register failure in the database for all Build objects that + directly or indirectly depend on this step. */ + + while (true) { + + /* Get the builds and steps that depend on this step. */ + std::set indirect; + { + auto steps_(steps.lock()); + std::set steps; + getDependents(step, indirect, steps); + + /* If there are no builds left, delete all referring + steps from ‘steps’. As for the success case, we can + be certain no new referrers can be added. */ + if (indirect.empty()) { + for (auto & s : steps) { + printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath); + nrStepsDone++; + steps_->erase(s->drvPath); + } + break; + } + } + + /* Update the database. */ + { + pqxx::work txn(*conn); + + BuildStatus buildStatus = + result.status == RemoteResult::rrPermanentFailure ? bsFailed : + result.status == RemoteResult::rrTimedOut ? bsTimedOut : + bsAborted; + BuildStepStatus buildStepStatus = + result.status == RemoteResult::rrPermanentFailure ? bssFailed : + result.status == RemoteResult::rrTimedOut ? bssTimedOut : + bssAborted; + + /* For regular failures, we don't care about the error + message. */ + if (buildStatus != bsAborted) result.errorMsg = ""; + + /* Create failed build steps for every build that depends + on this. For cached failures, only create a step for + builds that don't have this step as top-level + (otherwise the user won't be able to see what caused + the build to fail). 
*/ + for (auto & build2 : indirect) { + if (build == build2) continue; + if (cachedFailure && build2->drvPath == step->drvPath) continue; + createBuildStep(txn, 0, build2, step, machine->sshName, + buildStepStatus, result.errorMsg, build->id); + } + + if (!cachedFailure) + finishBuildStep(txn, result.startTime, result.stopTime, build->id, + stepNr, machine->sshName, buildStepStatus, result.errorMsg); + + /* Mark all builds that depend on this derivation as failed. */ + for (auto & build2 : indirect) { + printMsg(lvlError, format("marking build %1% as failed") % build2->id); + assert(!build->finishedInDB); + txn.parameterized + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1") + (build2->id) + ((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus)) + (result.startTime) + (result.stopTime) + (cachedFailure ? 1 : 0).exec(); + nrBuildsDone++; + } + + /* Remember failed paths in the database so that they + won't be built again. */ + if (!cachedFailure && result.status == RemoteResult::rrPermanentFailure) + for (auto & path : outputPaths(step->drv)) + txn.parameterized("insert into FailedPaths values ($1)")(path).exec(); + + txn.commit(); + } + + /* Remove the indirect dependencies from ‘builds’. This + will cause them to be destroyed. */ + for (auto & b : indirect) { + auto builds_(builds.lock()); + b->finishedInDB = true; + builds_->erase(b->id); + } } - txn.commit(); } - /* In case of success, destroy all Build objects of which ‘step’ - is the top-level derivation. In case of failure, destroy all - dependent Build objects. Any Steps not referenced by other - Builds will be destroyed as well. */ - for (auto build2 : dependents) - if (build2->toplevel == step || result.status != RemoteResult::rrSuccess) { - auto builds_(builds.lock()); - builds_->erase(build2->id); - } - - /* Remove the step from the graph. 
In case of success, make - dependent build steps runnable if they have no other - dependencies. */ - destroyStep(step, result.status == RemoteResult::rrSuccess); - return false; } @@ -1232,6 +1272,8 @@ void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, { printMsg(lvlInfo, format("marking build %1% as succeeded") % build->id); + assert(!build->finishedInDB); + txn.parameterized ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, size = $5, closureSize = $6, releaseName = $7, isCachedBuild = $8 where id = $1") (build->id) @@ -1259,7 +1301,6 @@ void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, (product.defaultPath).exec(); } - build->finishedInDB = true; // FIXME: txn might fail nrBuildsDone++; } From 9cdbff2fdf58bae49bf4fff5bc70beb5a78a97d3 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 17:12:51 +0200 Subject: [PATCH 051/158] Handle concurrent finishing of the same build There is a slight possibility that the queue monitor and a builder thread simultaneously decide to mark a build as finished. That's fine, as long as we ensure the DB update is idempotent (as ensured by doing "update Builds set finished = 1 ... where finished = 0"). --- src/hydra-queue-runner/hydra-queue-runner.cc | 61 ++++++++++---------- src/hydra-queue-runner/sync.hh | 3 + 2 files changed, 35 insertions(+), 29 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index e9c5078e..b2911559 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -163,7 +163,7 @@ struct Step ~Step() { - printMsg(lvlError, format("destroying step %1%") % drvPath); + //printMsg(lvlError, format("destroying step %1%") % drvPath); } }; @@ -520,17 +520,18 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, if (!store->isValidPath(build->drvPath)) { /* Derivation has been GC'ed prematurely. 
*/ printMsg(lvlError, format("aborting GC'ed build %1%") % build->id); - pqxx::work txn(conn); - assert(!build->finishedInDB); - txn.parameterized - ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1") - (build->id) - ((int) bsAborted) - (time(0)) - ("derivation was garbage-collected prior to build").exec(); - txn.commit(); - build->finishedInDB = true; - nrBuildsDone++; + if (!build->finishedInDB) { + pqxx::work txn(conn); + txn.parameterized + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1 and finished = 0") + (build->id) + ((int) bsAborted) + (time(0)) + ("derivation was garbage-collected prior to build").exec(); + txn.commit(); + build->finishedInDB = true; + nrBuildsDone++; + } return; } @@ -598,18 +599,19 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, if (buildStatus != bsSuccess) { time_t now = time(0); - pqxx::work txn(conn); - createBuildStep(txn, 0, build, r, "", buildStepStatus); - assert(!build->finishedInDB); - txn.parameterized - ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = $4 where id = $1") - (build->id) - ((int) buildStatus) - (now) - (buildStatus != bsUnsupported ? 1 : 0).exec(); - txn.commit(); - build->finishedInDB = true; - nrBuildsDone++; + if (!build->finishedInDB) { + pqxx::work txn(conn); + createBuildStep(txn, 0, build, r, "", buildStepStatus); + txn.parameterized + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $3, isCachedBuild = $4 where id = $1 and finished = 0") + (build->id) + ((int) buildStatus) + (now) + (buildStatus != bsUnsupported ? 
1 : 0).exec(); + txn.commit(); + build->finishedInDB = true; + nrBuildsDone++; + } badStep = true; break; } @@ -622,7 +624,8 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, { auto builds_(builds.lock()); - (*builds_)[build->id] = build; + if (!build->finishedInDB) // FIXME: can this happen? + (*builds_)[build->id] = build; build->toplevel = step; } @@ -1232,9 +1235,9 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, /* Mark all builds that depend on this derivation as failed. */ for (auto & build2 : indirect) { printMsg(lvlError, format("marking build %1% as failed") % build2->id); - assert(!build->finishedInDB); + if (build->finishedInDB) continue; txn.parameterized - ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1") + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1 and finished = 0") (build2->id) ((int) (build2->drvPath != step->drvPath && buildStatus == bsFailed ? bsDepFailed : buildStatus)) (result.startTime) @@ -1272,10 +1275,10 @@ void State::markSucceededBuild(pqxx::work & txn, Build::ptr build, { printMsg(lvlInfo, format("marking build %1% as succeeded") % build->id); - assert(!build->finishedInDB); + if (build->finishedInDB) return; txn.parameterized - ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, size = $5, closureSize = $6, releaseName = $7, isCachedBuild = $8 where id = $1") + ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, size = $5, closureSize = $6, releaseName = $7, isCachedBuild = $8 where id = $1 and finished = 0") (build->id) ((int) (res.failed ? 
bsFailedWithOutput : bsSuccess)) (startTime) diff --git a/src/hydra-queue-runner/sync.hh b/src/hydra-queue-runner/sync.hh index 6f5f9e6a..34b97285 100644 --- a/src/hydra-queue-runner/sync.hh +++ b/src/hydra-queue-runner/sync.hh @@ -28,6 +28,9 @@ private: public: + Sync() { } + Sync(const T & data) : data(data) { } + class Lock { private: From 89b629eeb18397f865600ff0d425c19ade0cfc72 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 17:37:35 +0200 Subject: [PATCH 052/158] Fix finishing steps that are not top-level of any build --- src/hydra-queue-runner/hydra-queue-runner.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index b2911559..18a59786 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1119,7 +1119,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, } /* If there are no builds left to update in the DB, - then we're done. Delete the step from + then we're done (except for calling + finishBuildStep()). Delete the step from ‘steps’. Since we've been holding the ‘steps’ lock, no new referrers can have been added in the meantime or be added afterwards. */ @@ -1127,7 +1128,6 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath); nrStepsDone++; steps_->erase(step->drvPath); - break; } } @@ -1144,6 +1144,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, txn.commit(); } + if (direct.empty()) break; + /* Remove the direct dependencies from ‘builds’. This will cause them to be destroyed. 
*/ for (auto & b : direct) { From 8db1ae285567882f41164fa81f737cf637b0f0f3 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 18 Jun 2015 17:43:13 +0200 Subject: [PATCH 053/158] Less verbosity --- src/hydra-queue-runner/hydra-queue-runner.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 18a59786..7558dfc8 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -722,13 +722,13 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat (*steps_)[drvPath] = step; } - printMsg(lvlDebug, format("considering derivation ‘%1%’") % drvPath); - if (!isNew) { assert(step->created); return step; } + printMsg(lvlDebug, format("considering derivation ‘%1%’") % drvPath); + /* Initialize the step. Note that the step may be visible in ‘steps’ before this point, but that doesn't matter because it's not runnable yet, and other threads won't make it From 77c8bfd392c60b4d23acd9c3d91ef73416e4d3a4 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 19 Jun 2015 10:37:22 +0200 Subject: [PATCH 054/158] Improve logging for aborts --- src/hydra-queue-runner/hydra-queue-runner.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 7558dfc8..c2a1fdd4 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1070,8 +1070,6 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, } catch (Error & e) { result.status = RemoteResult::rrMiscFailure; result.errorMsg = e.msg(); - printMsg(lvlError, format("irregular failure building ‘%1%’ on ‘%2%’: %3%") - % step->drvPath % machine->sshName % e.msg()); } if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv); @@ -1082,6 +1080,8 @@ bool 
State::doBuildStep(std::shared_ptr store, Step::ptr step, /* The step had a hopefully temporary failure (e.g. network issue). Retry a number of times. */ if (result.status == RemoteResult::rrMiscFailure) { + printMsg(lvlError, format("irregular failure building ‘%1%’ on ‘%2%’: %3%") + % step->drvPath % machine->sshName % result.errorMsg); bool retry; { auto step_(step->state.lock()); From 8e408048e2c10c1e9e33797ae2bf576bb15f11c0 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 19 Jun 2015 11:33:15 +0200 Subject: [PATCH 055/158] Create build step for non-top-level cached failures This fixes the missing build step on failures like http://hydra.nixos.org/build/23222231 --- src/hydra-queue-runner/hydra-queue-runner.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index c2a1fdd4..498297fc 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1224,8 +1224,9 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, (otherwise the user won't be able to see what caused the build to fail). 
*/ for (auto & build2 : indirect) { - if (build == build2) continue; - if (cachedFailure && build2->drvPath == step->drvPath) continue; + if ((cachedFailure && build2->drvPath == step->drvPath) || + (!cachedFailure && build == build2)) + continue; createBuildStep(txn, 0, build2, step, machine->sshName, buildStepStatus, result.errorMsg, build->id); } From 133d298e26186566c584bcaad17e128e6f7fa29b Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 19 Jun 2015 14:51:59 +0200 Subject: [PATCH 056/158] Asynchronously compress build logs --- src/hydra-queue-runner/build-remote.cc | 14 +-- src/hydra-queue-runner/build-remote.hh | 1 + src/hydra-queue-runner/build-result.cc | 4 +- src/hydra-queue-runner/hydra-queue-runner.cc | 89 ++++++++++++++++++-- src/lib/Hydra/Helper/Nix.pm | 2 +- 5 files changed, 97 insertions(+), 13 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index e9278705..7f02d081 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -117,12 +117,13 @@ void buildRemote(std::shared_ptr store, RemoteResult & result) { string base = baseNameOf(drvPath); - Path logFile = logDir + "/" + string(base, 0, 2) + "/" + string(base, 2); + result.logFile = logDir + "/" + string(base, 0, 2) + "/" + string(base, 2); + AutoDelete autoDelete(result.logFile, false); - createDirs(dirOf(logFile)); + createDirs(dirOf(result.logFile)); - AutoCloseFD logFD(open(logFile.c_str(), O_CREAT | O_TRUNC | O_WRONLY, 0666)); - if (logFD == -1) throw SysError(format("creating log file ‘%1%’") % logFile); + AutoCloseFD logFD(open(result.logFile.c_str(), O_CREAT | O_TRUNC | O_WRONLY, 0666)); + if (logFD == -1) throw SysError(format("creating log file ‘%1%’") % result.logFile); Child child; openConnection(sshName, sshKey, logFD, child); @@ -146,7 +147,8 @@ void buildRemote(std::shared_ptr store, throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % sshName); } 
catch (EndOfFile & e) { child.pid.wait(true); - throw Error(format("cannot connect to ‘%1%’: %2%") % sshName % chomp(readFile(logFile))); + string s = chomp(readFile(result.logFile)); + throw Error(format("cannot connect to ‘%1%’: %2%") % sshName % s); } /* Gather the inputs. */ @@ -163,6 +165,8 @@ void buildRemote(std::shared_ptr store, printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName); copyClosureTo(store, from, to, inputs); + autoDelete.cancel(); + /* Do the build. */ printMsg(lvlDebug, format("building ‘%1%’ on ‘%2%’") % drvPath % sshName); writeInt(cmdBuildPaths, to); diff --git a/src/hydra-queue-runner/build-remote.hh b/src/hydra-queue-runner/build-remote.hh index 99e79c8c..d932e8ae 100644 --- a/src/hydra-queue-runner/build-remote.hh +++ b/src/hydra-queue-runner/build-remote.hh @@ -13,6 +13,7 @@ struct RemoteResult } status = rrMiscFailure; std::string errorMsg; time_t startTime = 0, stopTime = 0; + nix::Path logFile; }; void buildRemote(std::shared_ptr store, diff --git a/src/hydra-queue-runner/build-result.cc b/src/hydra-queue-runner/build-result.cc index 78b9b25c..bb431623 100644 --- a/src/hydra-queue-runner/build-result.cc +++ b/src/hydra-queue-runner/build-result.cc @@ -58,8 +58,8 @@ BuildResult getBuildResult(std::shared_ptr store, const Derivation & d } product.defaultPath = words.empty() ? "" : words.front(); - /* Ensure that the path exists and points into the - Nix store. */ + /* Ensure that the path exists and points into the Nix + store. 
*/ if (product.path == "" || product.path[0] != '/') continue; product.path = canonPath(product.path, true); if (!isInStore(product.path) || !pathExists(product.path)) continue; diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 498297fc..a9de0560 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -10,6 +11,10 @@ #include +#include +#include +#include + #include "build-result.hh" #include "build-remote.hh" #include "sync.hh" @@ -229,8 +234,6 @@ private: typedef std::list Runnable; Sync runnable; - std::condition_variable_any runnableWakeup; - /* CV for waking up the dispatcher. */ std::condition_variable dispatcherWakeup; std::mutex dispatcherMutex; @@ -252,15 +255,21 @@ private: counter nrQueueWakeups{0}; counter nrDispatcherWakeups{0}; + /* Log compressor work queue. */ + Sync> logCompressorQueue; + std::condition_variable_any logCompressorWakeup; + public: State(); ~State(); - void loadMachines(); - void clearBusy(time_t stopTime); +private: + + void loadMachines(); + int createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step, const std::string & machine, BuildStepStatus status, const std::string & errorMsg = "", BuildID propagatedFrom = 0); @@ -302,6 +311,11 @@ public: bool checkCachedFailure(Step::ptr step, Connection & conn); + /* Thread that asynchronously bzips logs of finished steps. */ + void logCompressor(); + +public: + void dumpStatus(); void run(); @@ -951,7 +965,7 @@ void State::dispatcher() void State::wakeDispatcher() { { std::lock_guard lock(dispatcherMutex); } // barrier - dispatcherWakeup.notify_all(); + dispatcherWakeup.notify_one(); } @@ -1063,6 +1077,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, txn.commit(); } + /* Do the build. 
*/ try { /* FIXME: referring builds may have conflicting timeouts. */ buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, @@ -1077,6 +1092,15 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, if (!result.stopTime) result.stopTime = time(0); + /* Asynchronously compress the log. */ + if (result.logFile != "") { + { + auto logCompressorQueue_(logCompressorQueue.lock()); + logCompressorQueue_->push(result.logFile); + } + logCompressorWakeup.notify_one(); + } + /* The step had a hopefully temporary failure (e.g. network issue). Retry a number of times. */ if (result.status == RemoteResult::rrMiscFailure) { @@ -1321,6 +1345,57 @@ bool State::checkCachedFailure(Step::ptr step, Connection & conn) } +void State::logCompressor() +{ + while (true) { + try { + + Path logPath; + { + auto logCompressorQueue_(logCompressorQueue.lock()); + while (logCompressorQueue_->empty()) + logCompressorQueue_.wait(logCompressorWakeup); + logPath = logCompressorQueue_->front(); + logCompressorQueue_->pop(); + } + + if (!pathExists(logPath)) continue; + + printMsg(lvlChatty, format("compressing log file ‘%1%’") % logPath); + + Path tmpPath = logPath + ".bz2.tmp"; + + AutoCloseFD fd = open(tmpPath.c_str(), O_CREAT | O_TRUNC | O_WRONLY, 0644); + + // FIXME: use libbz2 + + Pid pid = startProcess([&]() { + if (dup2(fd, STDOUT_FILENO) == -1) + throw SysError("cannot dup output pipe to stdout"); + execlp("bzip2", "bzip2", "-c", logPath.c_str(), nullptr); + throw SysError("cannot start ssh"); + }); + + int res = pid.wait(true); + + if (res != 0) + throw Error(format("bzip2 returned exit code %1% while compressing ‘%2%’") + % res % logPath); + + if (rename(tmpPath.c_str(), (logPath + ".bz2").c_str()) != 0) + throw SysError(format("renaming ‘%1%’") % tmpPath); + + if (unlink(logPath.c_str()) != 0) + throw SysError(format("unlinking ‘%1%’") % logPath); + + } catch (std::exception & e) { + printMsg(lvlError, format("log compressor: %1%") % e.what()); + 
sleep(5); + } + } +} + + void State::dumpStatus() { { @@ -1368,6 +1443,10 @@ void State::run() std::thread(&State::dispatcher, this).detach(); + /* Run a log compressor thread. If needed, we could start more + than one. */ + std::thread(&State::logCompressor, this).detach(); + while (true) { try { auto conn(dbPool.get()); diff --git a/src/lib/Hydra/Helper/Nix.pm b/src/lib/Hydra/Helper/Nix.pm index c54c8f10..23f807dd 100644 --- a/src/lib/Hydra/Helper/Nix.pm +++ b/src/lib/Hydra/Helper/Nix.pm @@ -134,7 +134,7 @@ sub getDrvLogPath { my $bucketed = substr($base, 0, 2) . "/" . substr($base, 2); my $fn = ($ENV{NIX_LOG_DIR} || "/nix/var/log/nix") . "/drvs/"; my $fn2 = Hydra::Model::DB::getHydraPath . "/build-logs/"; - for ($fn2 . $bucketed, $fn . $bucketed . ".bz2", $fn . $bucketed, $fn . $base . ".bz2", $fn . $base) { + for ($fn2 . $bucketed, $fn2 . $bucketed . ".bz2", $fn . $bucketed . ".bz2", $fn . $bucketed, $fn . $base . ".bz2", $fn . $base) { return $_ if -f $_; } return undef; From 7afc61691ba2febab8d1c5966fe9c480f342b51d Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 19 Jun 2015 15:27:49 +0200 Subject: [PATCH 057/158] Doh --- src/hydra-queue-runner/hydra-queue-runner.cc | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index a9de0560..ce725592 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1249,7 +1249,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, the build to fail). 
*/ for (auto & build2 : indirect) { if ((cachedFailure && build2->drvPath == step->drvPath) || - (!cachedFailure && build == build2)) + (!cachedFailure && build == build2) || + build2->finishedInDB) continue; createBuildStep(txn, 0, build2, step, machine->sshName, buildStepStatus, result.errorMsg, build->id); @@ -1261,8 +1262,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, /* Mark all builds that depend on this derivation as failed. */ for (auto & build2 : indirect) { + if (build2->finishedInDB) continue; printMsg(lvlError, format("marking build %1% as failed") % build2->id); - if (build->finishedInDB) continue; txn.parameterized ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1 and finished = 0") (build2->id) From f196967c4347241894001d720e2775423f18f817 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 19 Jun 2015 15:33:37 +0200 Subject: [PATCH 058/158] Don't create a propagated build step to the same build --- src/hydra-queue-runner/hydra-queue-runner.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index ce725592..2e7ee9d5 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1253,7 +1253,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, build2->finishedInDB) continue; createBuildStep(txn, 0, build2, step, machine->sshName, - buildStepStatus, result.errorMsg, build->id); + buildStepStatus, result.errorMsg, build == build2 ? 
0 : build->id); } if (!cachedFailure) From e13477bdf2be93b090d0d1a9c3f4ae450bb7d9e1 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 19 Jun 2015 16:35:49 +0200 Subject: [PATCH 059/158] Robustness --- src/hydra-queue-runner/hydra-queue-runner.cc | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 2e7ee9d5..71b77f54 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -656,7 +656,12 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, newRunnable.clear(); nrAdded = 0; - createBuild(build); + try { + createBuild(build); + } catch (Error & e) { + printMsg(lvlError, format("while loading build %1%: %2%") % build->id % e.what()); + continue; // FIXME: retry later? + } /* Add the new runnable build steps to ‘runnable’ and wake up the builder threads. */ From 81abb6e166bf9b42efe88d024e7cf65b0776cc6d Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 19 Jun 2015 17:20:20 +0200 Subject: [PATCH 060/158] Improve parsing of hydra-build-products --- src/hydra-queue-runner/build-result.cc | 51 ++++++++++++++++---------- 1 file changed, 31 insertions(+), 20 deletions(-) diff --git a/src/hydra-queue-runner/build-result.cc b/src/hydra-queue-runner/build-result.cc index bb431623..339ba887 100644 --- a/src/hydra-queue-runner/build-result.cc +++ b/src/hydra-queue-runner/build-result.cc @@ -2,6 +2,7 @@ #include "store-api.hh" #include "misc.hh" #include "util.hh" +#include "regex.hh" using namespace nix; @@ -26,6 +27,15 @@ BuildResult getBuildResult(std::shared_ptr store, const Derivation & d /* Get build products. */ bool explicitProducts = false; + Regex regex( + "(([a-zA-Z0-9_-]+)" // type (e.g. "doc") + "[[:space:]]+" + "([a-zA-Z0-9_-]+)" // subtype (e.g. "readme") + "[[:space:]]+" + "(\"[^\"]+\"|[^[:space:]\"]+))" // path (may be quoted) + "([[:space:]]+([^[:space:]]+))?" 
// entry point + , true); + for (auto & output : outputs) { Path failedFile = output + "/nix-support/failed"; if (pathExists(failedFile)) res.failed = true; @@ -35,33 +45,33 @@ BuildResult getBuildResult(std::shared_ptr store, const Derivation & d explicitProducts = true; /* For security, resolve symlinks. */ - productsFile = canonPath(productsFile, true); + try { + productsFile = canonPath(productsFile, true); + } catch (Error & e) { continue; } if (!isInStore(productsFile)) continue; - // FIXME: handle I/O errors + string contents; + try { + contents = readFile(productsFile); + } catch (Error & e) { continue; } - auto contents = readFile(productsFile); - auto lines = tokenizeString(contents, "\n"); - - for (auto & line : lines) { + for (auto & line : tokenizeString(contents, "\n")) { BuildProduct product; - auto words = tokenizeString(line); - if (words.size() < 3) continue; - product.type = words.front(); words.pop_front(); - product.subtype = words.front(); words.pop_front(); - if (string(words.front(), 0, 1) == "\"") { - // FIXME: - throw Error("FIXME"); - } else { - product.path = words.front(); words.pop_front(); - } - product.defaultPath = words.empty() ? "" : words.front(); + Regex::Subs subs; + if (!regex.matches(line, subs)) continue; + + product.type = subs[1]; + product.subtype = subs[2]; + product.path = subs[3][0] == '"' ? string(subs[3], 1, subs[3].size() - 2) : subs[3]; + product.defaultPath = subs[5]; /* Ensure that the path exists and points into the Nix store. 
*/ if (product.path == "" || product.path[0] != '/') continue; - product.path = canonPath(product.path, true); + try { + product.path = canonPath(product.path, true); + } catch (Error & e) { continue; } if (!isInStore(product.path) || !pathExists(product.path)) continue; /* FIXME: check that the path is in the input closure @@ -106,8 +116,9 @@ BuildResult getBuildResult(std::shared_ptr store, const Derivation & d for (auto & output : outputs) { Path p = output + "/nix-support/hydra-release-name"; if (!pathExists(p)) continue; - // FIXME: handle I/O error - res.releaseName = trim(readFile(p)); + try { + res.releaseName = trim(readFile(p)); + } catch (Error & e) { continue; } // FIXME: validate release name } From a0c4120232094e7751c272380afca2bdadbd9a18 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 19 Jun 2015 17:45:11 +0200 Subject: [PATCH 061/158] Don't copy src for nix-shell --- release.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/release.nix b/release.nix index 0dd984cc..0e4feec2 100644 --- a/release.nix +++ b/release.nix @@ -32,7 +32,7 @@ in rec { releaseTools.makeSourceTarball { name = "hydra-tarball"; - src = hydraSrc; + src = if lib.inNixShell then null else hydraSrc; inherit officialRelease; version = builtins.readFile ./version; From a0eff6fc15972af442ee9e4ef152451bba60f9f9 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 19 Jun 2015 17:45:26 +0200 Subject: [PATCH 062/158] Fix machine selection --- src/hydra-queue-runner/hydra-queue-runner.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 71b77f54..2bacf735 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -887,9 +887,9 @@ void State::dispatcher() float ta = roundf(a->currentJobs / a->speedFactor); float tb = roundf(b->currentJobs / b->speedFactor); return - ta != tb ? 
ta > tb : + ta != tb ? ta < tb : a->speedFactor != b->speedFactor ? a->speedFactor > b->speedFactor : - a->maxJobs > b->maxJobs; + a->currentJobs > b->currentJobs; }); /* Find a machine with a free slot and find a step to run From d744362e4a249a92ecfc3e48eae135e3a46db49e Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Sun, 21 Jun 2015 16:21:42 +0200 Subject: [PATCH 063/158] hydra-queue-runner: Fix segfault sorting machines by load While sorting machines by load, the load of a machine (machine->currentJobs) can be changed by other threads. If that happens, the comparator is no longer a proper ordering, in which case std::sort() can segfault. So we now make a copy of currentJobs before sorting. --- src/hydra-queue-runner/hydra-queue-runner.cc | 33 ++++++++++++-------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 2bacf735..3e55c868 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -862,12 +862,19 @@ void State::dispatcher() bool keepGoing; do { - /* Bail out when there are no slots left. */ - std::vector machinesSorted; + /* Copy the currentJobs field of each machine. This is + necessary to ensure that the sort comparator below is a + ordering. std::sort() can segfault if it isn't. */ + struct MachineInfo + { + Machine::ptr machine; + unsigned int currentJobs; + }; + std::vector machinesSorted; { auto machines_(machines.lock()); - machinesSorted.insert(machinesSorted.end(), - machines_->begin(), machines_->end()); + for (auto & m : *machines_) + machinesSorted.push_back({m, m->currentJobs}); } /* Sort the machines by a combination of speed factor and @@ -882,14 +889,14 @@ void State::dispatcher() - Finally by load. 
*/ sort(machinesSorted.begin(), machinesSorted.end(), - [](const Machine::ptr & a, const Machine::ptr & b) -> bool + [](const MachineInfo & a, const MachineInfo & b) -> bool { - float ta = roundf(a->currentJobs / a->speedFactor); - float tb = roundf(b->currentJobs / b->speedFactor); + float ta = roundf(a.currentJobs / a.machine->speedFactor); + float tb = roundf(b.currentJobs / b.machine->speedFactor); return ta != tb ? ta < tb : - a->speedFactor != b->speedFactor ? a->speedFactor > b->speedFactor : - a->currentJobs > b->currentJobs; + a.machine->speedFactor != b.machine->speedFactor ? a.machine->speedFactor > b.machine->speedFactor : + a.currentJobs > b.currentJobs; }); /* Find a machine with a free slot and find a step to run @@ -898,9 +905,9 @@ void State::dispatcher() keepGoing = false; system_time now = std::chrono::system_clock::now(); - for (auto & machine : machinesSorted) { + for (auto & mi : machinesSorted) { // FIXME: can we lose a wakeup if a builder exits concurrently? - if (machine->currentJobs >= machine->maxJobs) continue; + if (mi.machine->currentJobs >= mi.machine->maxJobs) continue; auto runnable_(runnable.lock()); //printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size()); @@ -918,7 +925,7 @@ void State::dispatcher() } /* Can this machine do this step? */ - if (!machine->supportsStep(step)) { + if (!mi.machine->supportsStep(step)) { ++i; continue; } @@ -937,7 +944,7 @@ void State::dispatcher() /* Make a slot reservation and start a thread to do the build. 
*/ - auto reservation = std::make_shared(machine); + auto reservation = std::make_shared(mi.machine); i = runnable_->erase(i); auto builderThread = std::thread(&State::builder, this, step, reservation); From 90a08db241951f818f19ffd1cb79f8bc0b5c636c Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 22 Jun 2015 10:59:07 +0200 Subject: [PATCH 064/158] hydra-queue-runner: Fix assertion failure --- src/hydra-queue-runner/hydra-queue-runner.cc | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 3e55c868..c71d7691 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -863,8 +863,8 @@ void State::dispatcher() do { /* Copy the currentJobs field of each machine. This is - necessary to ensure that the sort comparator below is a - ordering. std::sort() can segfault if it isn't. */ + necessary to ensure that the sort comparator below is + an ordering. std::sort() can segfault if it isn't. */ struct MachineInfo { Machine::ptr machine; @@ -1203,7 +1203,10 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, { auto rdep_(rdep->state.lock()); rdep_->deps.erase(step); - if (rdep_->deps.empty()) runnable = true; + /* Note: if the step has not finished + initialisation yet, it will be made runnable in + createStep(), if appropriate. 
*/ + if (rdep_->deps.empty() && rdep->created) runnable = true; } if (runnable) makeRunnable(rdep); From fed71d3fe956f2583a35cb22da4e40bed838ae03 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 22 Jun 2015 11:02:01 +0200 Subject: [PATCH 065/158] Move "created" field into Step::State --- src/hydra-queue-runner/hydra-queue-runner.cc | 21 ++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index c71d7691..44e84587 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -145,6 +145,9 @@ struct Step struct State { + /* Whether the step has finished initialisation. */ + bool created = false; + /* The build steps on which this step depends. */ std::set deps; @@ -161,7 +164,6 @@ struct Step system_time after; }; - std::atomic_bool created{false}; // debugging std::atomic_bool finished{false}; // debugging Sync state; @@ -732,6 +734,8 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat auto step_(step->state.lock()); + assert(step_->created != isNew); + if (referringBuild) step_->builds.push_back(referringBuild); @@ -741,10 +745,7 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat (*steps_)[drvPath] = step; } - if (!isNew) { - assert(step->created); - return step; - } + if (!isNew) return step; printMsg(lvlDebug, format("considering derivation ‘%1%’") % drvPath); @@ -788,8 +789,8 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat runnable. 
*/ { auto step_(step->state.lock()); - assert(!step->created); - step->created = true; + assert(!step_->created); + step_->created = true; if (step_->deps.empty()) newRunnable.insert(step); } @@ -838,7 +839,7 @@ void State::makeRunnable(Step::ptr step) { auto step_(step->state.lock()); - assert(step->created); + assert(step_->created); assert(!step->finished); assert(step_->deps.empty()); } @@ -1023,7 +1024,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, { { auto step_(step->state.lock()); - assert(step->created); + assert(step_->created); assert(!step->finished); } @@ -1206,7 +1207,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, /* Note: if the step has not finished initialisation yet, it will be made runnable in createStep(), if appropriate. */ - if (rdep_->deps.empty() && rdep->created) runnable = true; + if (rdep_->deps.empty() && rdep_->created) runnable = true; } if (runnable) makeRunnable(rdep); From 44a2b74f5a94a23d2e4c3417709e348775b83515 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 22 Jun 2015 11:23:00 +0200 Subject: [PATCH 066/158] Keep track of the number of build steps that are being built (As opposed to being in the closure copying stage.) 
--- src/hydra-queue-runner/build-remote.cc | 8 ++++++-- src/hydra-queue-runner/build-remote.hh | 4 +++- src/hydra-queue-runner/counter.hh | 12 ++++++++++++ src/hydra-queue-runner/hydra-queue-runner.cc | 15 ++++----------- 4 files changed, 25 insertions(+), 14 deletions(-) create mode 100644 src/hydra-queue-runner/counter.hh diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 7f02d081..b9696733 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -114,7 +114,7 @@ void buildRemote(std::shared_ptr store, const string & sshName, const string & sshKey, const Path & drvPath, const Derivation & drv, const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout, - RemoteResult & result) + RemoteResult & result, counter & nrStepsBuilding) { string base = baseNameOf(drvPath); result.logFile = logDir + "/" + string(base, 0, 2) + "/" + string(base, 2); @@ -176,7 +176,11 @@ void buildRemote(std::shared_ptr store, // FIXME: send maxLogSize. 
to.flush(); result.startTime = time(0); - int res = readInt(from); + int res; + { + MaintainCount mc(nrStepsBuilding); + res = readInt(from); + } result.stopTime = time(0); if (res) { result.errorMsg = (format("%1% on ‘%2%’") % readString(from) % sshName).str(); diff --git a/src/hydra-queue-runner/build-remote.hh b/src/hydra-queue-runner/build-remote.hh index d932e8ae..68d612e3 100644 --- a/src/hydra-queue-runner/build-remote.hh +++ b/src/hydra-queue-runner/build-remote.hh @@ -3,6 +3,8 @@ #include "store-api.hh" #include "derivations.hh" +#include "counter.hh" + struct RemoteResult { enum { @@ -20,4 +22,4 @@ void buildRemote(std::shared_ptr store, const std::string & sshName, const std::string & sshKey, const nix::Path & drvPath, const nix::Derivation & drv, const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout, - RemoteResult & result); + RemoteResult & result, counter & nrStepsBuilding); diff --git a/src/hydra-queue-runner/counter.hh b/src/hydra-queue-runner/counter.hh new file mode 100644 index 00000000..4d6b4163 --- /dev/null +++ b/src/hydra-queue-runner/counter.hh @@ -0,0 +1,12 @@ +#pragma once + +#include + +typedef std::atomic counter; + +struct MaintainCount +{ + counter & c; + MaintainCount(counter & c) : c(c) { c++; } + ~MaintainCount() { c--; } +}; diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 44e84587..ca35d3fb 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -19,6 +19,7 @@ #include "build-remote.hh" #include "sync.hh" #include "pool.hh" +#include "counter.hh" #include "store-api.hh" #include "derivations.hh" @@ -43,16 +44,6 @@ bool has(const C & c, const V & v) } -typedef std::atomic counter; - -struct MaintainCount -{ - counter & c; - MaintainCount(counter & c) : c(c) { c++; } - ~MaintainCount() { c--; } -}; - - typedef enum { bsSuccess = 0, bsFailed = 1, @@ -252,6 +243,7 @@ private: counter 
nrBuildsDone{0}; counter nrStepsDone{0}; counter nrActiveSteps{0}; + counter nrStepsBuilding{0}; counter nrRetries{0}; counter maxNrRetries{0}; counter nrQueueWakeups{0}; @@ -1094,7 +1086,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, try { /* FIXME: referring builds may have conflicting timeouts. */ buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, - logDir, build->maxSilentTime, build->buildTimeout, result); + logDir, build->maxSilentTime, build->buildTimeout, result, nrStepsBuilding); } catch (Error & e) { result.status = RemoteResult::rrMiscFailure; result.errorMsg = e.msg(); @@ -1432,6 +1424,7 @@ void State::dumpStatus() printMsg(lvlError, format("%1% runnable build steps") % runnable_->size()); } printMsg(lvlError, format("%1% active build steps") % nrActiveSteps); + printMsg(lvlError, format("%1% build steps currently building") % nrStepsBuilding); printMsg(lvlError, format("%1% builds read from queue") % nrBuildsRead); printMsg(lvlError, format("%1% builds done") % nrBuildsDone); printMsg(lvlError, format("%1% build steps done") % nrStepsDone); From 4f4141e1db9e10d9f5a34080ecdbf1afe55a017e Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 22 Jun 2015 14:06:44 +0200 Subject: [PATCH 067/158] =?UTF-8?q?Add=20command=20=E2=80=98hydra-queue-ru?= =?UTF-8?q?nner=20--status=E2=80=99=20to=20show=20current=20status?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/hydra-queue-runner/hydra-queue-runner.cc | 189 +++++++++++++------ src/sql/hydra.sql | 6 + src/sql/upgrade-34.sql | 4 + 3 files changed, 145 insertions(+), 54 deletions(-) create mode 100644 src/sql/upgrade-34.sql diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index ca35d3fb..68d000cb 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -25,6 +25,7 @@ #include "derivations.hh" #include 
"shared.hh" #include "globals.hh" +#include "value-to-json.hh" using namespace nix; @@ -256,12 +257,10 @@ private: public: State(); - ~State(); - - void clearBusy(time_t stopTime); - private: + void clearBusy(Connection & conn, time_t stopTime); + void loadMachines(); int createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step, @@ -308,9 +307,13 @@ private: /* Thread that asynchronously bzips logs of finished steps. */ void logCompressor(); + void dumpStatus(Connection & conn); + public: - void dumpStatus(); + void showStatus(); + + void unlock(); void run(); }; @@ -325,17 +328,6 @@ State::State() } -State::~State() -{ - try { - printMsg(lvlInfo, "clearing active builds / build steps..."); - clearBusy(time(0)); - } catch (...) { - ignoreException(); - } -} - - void State::loadMachines() { Path machinesFile = getEnv("NIX_REMOTE_SYSTEMS", "/etc/nix/machines"); @@ -381,10 +373,9 @@ void State::loadMachines() } -void State::clearBusy(time_t stopTime) +void State::clearBusy(Connection & conn, time_t stopTime) { - auto conn(dbPool.get()); - pqxx::work txn(*conn); + pqxx::work txn(conn); txn.parameterized ("update BuildSteps set busy = 0, status = $1, stopTime = $2 where busy = 1") ((int) bssAborted) @@ -1405,47 +1396,132 @@ void State::logCompressor() } -void State::dumpStatus() +void State::dumpStatus(Connection & conn) { + std::ostringstream out; + { - auto builds_(builds.lock()); - printMsg(lvlError, format("%1% queued builds") % builds_->size()); - } - { - auto steps_(steps.lock()); - for (auto i = steps_->begin(); i != steps_->end(); ) - if (i->second.lock()) ++i; else i = steps_->erase(i); - printMsg(lvlError, format("%1% pending/active build steps") % steps_->size()); - } - { - auto runnable_(runnable.lock()); - for (auto i = runnable_->begin(); i != runnable_->end(); ) - if (i->lock()) ++i; else i = runnable_->erase(i); - printMsg(lvlError, format("%1% runnable build steps") % runnable_->size()); - } - printMsg(lvlError, format("%1% 
active build steps") % nrActiveSteps); - printMsg(lvlError, format("%1% build steps currently building") % nrStepsBuilding); - printMsg(lvlError, format("%1% builds read from queue") % nrBuildsRead); - printMsg(lvlError, format("%1% builds done") % nrBuildsDone); - printMsg(lvlError, format("%1% build steps done") % nrStepsDone); - printMsg(lvlError, format("%1% build step retries") % nrRetries); - printMsg(lvlError, format("%1% most retries for any build step") % maxNrRetries); - printMsg(lvlError, format("%1% queue wakeups") % nrQueueWakeups); - printMsg(lvlError, format("%1% dispatcher wakeups") % nrDispatcherWakeups); - printMsg(lvlError, format("%1% database connections") % dbPool.count()); - { - auto machines_(machines.lock()); - for (auto & m : *machines_) { - printMsg(lvlError, format("machine %1%: %2%/%3% active") - % m->sshName % m->currentJobs % m->maxJobs); + JSONObject root(out); + root.attr("status", "up"); + root.attr("time", time(0)); + root.attr("pid", getpid()); + { + auto builds_(builds.lock()); + root.attr("nrQueuedBuilds", builds_->size()); } + { + auto steps_(steps.lock()); + for (auto i = steps_->begin(); i != steps_->end(); ) + if (i->second.lock()) ++i; else i = steps_->erase(i); + root.attr("nrUnfinishedSteps", steps_->size()); + } + { + auto runnable_(runnable.lock()); + for (auto i = runnable_->begin(); i != runnable_->end(); ) + if (i->lock()) ++i; else i = runnable_->erase(i); + root.attr("nrRunnableSteps", runnable_->size()); + } + root.attr("nrActiveSteps", nrActiveSteps); + root.attr("nrStepsBuilding", nrStepsBuilding); + root.attr("nrBuildsRead", nrBuildsRead); + root.attr("nrBuildsDone", nrBuildsDone); + root.attr("nrStepsDone", nrStepsDone); + root.attr("nrRetries", nrRetries); + root.attr("maxNrRetries", maxNrRetries); + root.attr("nrQueueWakeups", nrQueueWakeups); + root.attr("nrDispatcherWakeups", nrDispatcherWakeups); + root.attr("nrDbConnections", dbPool.count()); + { + root.attr("machines"); + JSONObject nested(out); + auto 
machines_(machines.lock()); + for (auto & m : *machines_) { + nested.attr(m->sshName); + JSONObject nested2(out); + nested2.attr("currentJobs", m->currentJobs); + nested2.attr("maxJobs", m->maxJobs); + } + } + } + + { + pqxx::work txn(conn); + // FIXME: use PostgreSQL 9.5 upsert. + txn.exec("delete from SystemStatus where what = 'queue-runner'"); + txn.parameterized("insert into SystemStatus values ('queue-runner', $1)")(out.str()).exec(); + txn.exec("notify status_dumped"); + txn.commit(); + } +} + + +void State::showStatus() +{ + auto conn(dbPool.get()); + receiver statusDumped(*conn, "status_dumped"); + + string status; + bool barf = false; + + /* Get the last JSON status dump from the database. */ + { + pqxx::work txn(*conn); + auto res = txn.exec("select status from SystemStatus where what = 'queue-runner'"); + if (res.size()) status = res[0][0].as(); + } + + if (status != "") { + + /* If the status is not empty, then the queue runner is + running. Ask it to update the status dump. */ + { + pqxx::work txn(*conn); + txn.exec("notify dump_status"); + txn.commit(); + } + + /* Wait until it has done so. */ + barf = conn->await_notification(5, 0) == 0; + + /* Get the new status. 
*/ + { + pqxx::work txn(*conn); + auto res = txn.exec("select status from SystemStatus where what = 'queue-runner'"); + if (res.size()) status = res[0][0].as(); + } + + } + + if (status == "") status = R"({"status":"down"})"; + + std::cout << status << "\n"; + + if (barf) + throw Error("queue runner did not respond; status information may be wrong"); +} + + +void State::unlock() +{ + auto conn(dbPool.get()); + + clearBusy(*conn, 0); + + { + pqxx::work txn(*conn); + txn.exec("delete from SystemStatus where what = 'queue-runner'"); + txn.commit(); } } void State::run() { - clearBusy(0); + { + auto conn(dbPool.get()); + clearBusy(*conn, 0); + dumpStatus(*conn); + } loadMachines(); @@ -1464,7 +1540,7 @@ void State::run() while (true) { conn->await_notification(); if (dumpStatus.get()) - State::dumpStatus(); + State::dumpStatus(*conn); } } catch (std::exception & e) { printMsg(lvlError, format("main thread: %1%") % e.what()); @@ -1487,10 +1563,13 @@ int main(int argc, char * * argv) signal(SIGHUP, SIG_DFL); bool unlock = false; + bool status = false; parseCmdLine(argc, argv, [&](Strings::iterator & arg, const Strings::iterator & end) { if (*arg == "--unlock") unlock = true; + else if (*arg == "--status") + status = true; else return false; return true; @@ -1503,8 +1582,10 @@ int main(int argc, char * * argv) /* FIXME: need some locking to prevent multiple instances of hydra-queue-runner. */ State state; - if (unlock) - state.clearBusy(0); + if (status) + state.showStatus(); + else if (unlock) + state.unlock(); else state.run(); }); diff --git a/src/sql/hydra.sql b/src/sql/hydra.sql index 10f2d614..2bbb04d4 100644 --- a/src/sql/hydra.sql +++ b/src/sql/hydra.sql @@ -531,6 +531,12 @@ create rule IdempotentInsert as on insert to FailedPaths #endif +create table SystemStatus ( + what text primary key not null, + status json not null +); + + -- Cache of the number of finished builds. 
create table NrBuilds ( what text primary key not null, diff --git a/src/sql/upgrade-34.sql b/src/sql/upgrade-34.sql new file mode 100644 index 00000000..8e93cbcc --- /dev/null +++ b/src/sql/upgrade-34.sql @@ -0,0 +1,4 @@ +create table SystemStatus ( + what text primary key not null, + status json not null +); From fbd7c0221732e5f9626588e09db555e95ded6053 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 22 Jun 2015 14:15:43 +0200 Subject: [PATCH 068/158] Periodically dump/log status --- src/hydra-queue-runner/hydra-queue-runner.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 68d000cb..d006ede9 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -307,7 +307,7 @@ private: /* Thread that asynchronously bzips logs of finished steps. */ void logCompressor(); - void dumpStatus(Connection & conn); + void dumpStatus(Connection & conn, bool log); public: @@ -1396,7 +1396,7 @@ void State::logCompressor() } -void State::dumpStatus(Connection & conn) +void State::dumpStatus(Connection & conn, bool log) { std::ostringstream out; @@ -1444,6 +1444,8 @@ void State::dumpStatus(Connection & conn) } } + if (log) printMsg(lvlInfo, format("status: %1%") % out.str()); + { pqxx::work txn(conn); // FIXME: use PostgreSQL 9.5 upsert. 
@@ -1520,7 +1522,7 @@ void State::run() { auto conn(dbPool.get()); clearBusy(*conn, 0); - dumpStatus(*conn); + dumpStatus(*conn, false); } loadMachines(); @@ -1538,9 +1540,8 @@ void State::run() auto conn(dbPool.get()); receiver dumpStatus(*conn, "dump_status"); while (true) { - conn->await_notification(); - if (dumpStatus.get()) - State::dumpStatus(*conn); + bool timeout = conn->await_notification(300, 0) == 0; + State::dumpStatus(*conn, timeout); } } catch (std::exception & e) { printMsg(lvlError, format("main thread: %1%") % e.what()); From 62b53a0a473bc3f52f3fce227b55f4ef83cf57eb Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 22 Jun 2015 14:24:03 +0200 Subject: [PATCH 069/158] Guard against concurrent invocations of hydra-queue-runner --- src/hydra-queue-runner/hydra-queue-runner.cc | 24 ++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index d006ede9..8927329c 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -26,6 +26,7 @@ #include "shared.hh" #include "globals.hh" #include "value-to-json.hh" +#include "pathlocks.hh" using namespace nix; @@ -307,6 +308,10 @@ private: /* Thread that asynchronously bzips logs of finished steps. */ void logCompressor(); + /* Acquire the global queue runner lock, or null if somebody else + has it. 
*/ + std::shared_ptr acquireGlobalLock(); + void dumpStatus(Connection & conn, bool log); public: @@ -1396,6 +1401,17 @@ void State::logCompressor() } +std::shared_ptr State::acquireGlobalLock() +{ + Path lockPath = hydraData + "/queue-runner"; + + auto lock = std::make_shared(); + if (!lock->lockPaths(PathSet({lockPath}), "", false)) return 0; + + return lock; +} + + void State::dumpStatus(Connection & conn, bool log) { std::ostringstream out; @@ -1505,6 +1521,10 @@ void State::showStatus() void State::unlock() { + auto lock = acquireGlobalLock(); + if (!lock) + throw Error("hydra-queue-runner is currently running"); + auto conn(dbPool.get()); clearBusy(*conn, 0); @@ -1519,6 +1539,10 @@ void State::unlock() void State::run() { + auto lock = acquireGlobalLock(); + if (!lock) + throw Error("hydra-queue-runner is already running"); + { auto conn(dbPool.get()); clearBusy(*conn, 0); From 41ba7418e27a741697f114a6ab07ceb656909bb4 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 22 Jun 2015 15:34:33 +0200 Subject: [PATCH 070/158] hydra-queue-runner: More stats --- src/hydra-queue-runner/hydra-queue-runner.cc | 21 +++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 8927329c..f70d81dc 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -248,6 +248,8 @@ private: counter nrStepsBuilding{0}; counter nrRetries{0}; counter maxNrRetries{0}; + counter totalStepTime{0}; // total time for steps, including closure copying + counter totalStepBuildTime{0}; // total build time for steps counter nrQueueWakeups{0}; counter nrDispatcherWakeups{0}; @@ -1040,7 +1042,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, possibility, we retry this step (putting it back in the runnable queue). If there are really no strong pointers to the step, it will be deleted. 
*/ - printMsg(lvlInfo, format("cancelling build step ‘%1%’") % step->drvPath); + printMsg(lvlInfo, format("maybe cancelling build step ‘%1%’") % step->drvPath); return true; } @@ -1059,7 +1061,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, BuildResult res; int stepNr = 0; - result.startTime = time(0); + time_t stepStartTime = result.startTime = time(0); /* If any of the outputs have previously failed, then don't bother building again. */ @@ -1091,7 +1093,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, if (result.status == RemoteResult::rrSuccess) res = getBuildResult(store, step->drv); } - if (!result.stopTime) result.stopTime = time(0); + time_t stepStopTime = time(0); + if (!result.stopTime) result.stopTime = stepStopTime; /* Asynchronously compress the log. */ if (result.logFile != "") { @@ -1151,7 +1154,6 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, meantime or be added afterwards. */ if (direct.empty()) { printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath); - nrStepsDone++; steps_->erase(step->drvPath); } } @@ -1222,7 +1224,6 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, if (indirect.empty()) { for (auto & s : steps) { printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath); - nrStepsDone++; steps_->erase(s->drvPath); } break; @@ -1298,6 +1299,10 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, } + nrStepsDone++; + totalStepTime += stepStopTime - stepStartTime; + totalStepBuildTime += result.stopTime - result.startTime; + return false; } @@ -1444,6 +1449,12 @@ void State::dumpStatus(Connection & conn, bool log) root.attr("nrStepsDone", nrStepsDone); root.attr("nrRetries", nrRetries); root.attr("maxNrRetries", maxNrRetries); + root.attr("totalStepTime", totalStepTime); + root.attr("totalStepBuildTime", totalStepBuildTime); + if (nrStepsDone) { + root.attr("avgStepTime"); out << (float) totalStepTime / nrStepsDone; + 
root.attr("avgStepBuildTime"); out << (float) totalStepBuildTime / nrStepsDone; + } root.attr("nrQueueWakeups", nrQueueWakeups); root.attr("nrDispatcherWakeups", nrDispatcherWakeups); root.attr("nrDbConnections", dbPool.count()); From e32ee3d5b9b756c2c49ad0f2c44eb2de5c0e31d8 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 22 Jun 2015 15:43:15 +0200 Subject: [PATCH 071/158] Remove hydra-build and the old hydra-queue-runner --- doc/manual/installation.xml | 9 +- src/script/Makefile.am | 2 - src/script/hydra-build | 385 ---------------------------- src/script/hydra-eval-guile-jobs.in | 2 +- src/script/hydra-queue-runner | 279 -------------------- 5 files changed, 3 insertions(+), 674 deletions(-) delete mode 100755 src/script/hydra-build delete mode 100755 src/script/hydra-queue-runner diff --git a/doc/manual/installation.xml b/doc/manual/installation.xml index fb7ef5ce..64af9374 100644 --- a/doc/manual/installation.xml +++ b/doc/manual/installation.xml @@ -100,13 +100,8 @@ nix-env -i hydra - Command completion should reveal a number of command-line tools from Hydra: - - -hydra-build hydra-init hydra-update-gc-roots -hydra-eval-jobs hydra-queue-runner -hydra-evaluator hydra-server - + Command completion should reveal a number of command-line tools + from Hydra, such as hydra-queue-runner. diff --git a/src/script/Makefile.am b/src/script/Makefile.am index c05c9a12..cfdeea8c 100644 --- a/src/script/Makefile.am +++ b/src/script/Makefile.am @@ -4,9 +4,7 @@ EXTRA_DIST = \ distributable_scripts = \ hydra-init \ - hydra-build \ hydra-evaluator \ - hydra-queue-runner \ hydra-server \ hydra-update-gc-roots \ hydra-s3-backup-collect-garbage \ diff --git a/src/script/hydra-build b/src/script/hydra-build deleted file mode 100755 index 7013c3a4..00000000 --- a/src/script/hydra-build +++ /dev/null @@ -1,385 +0,0 @@ -#! 
/var/run/current-system/sw/bin/perl - -use strict; -use List::MoreUtils qw(all); -use File::Basename; -use File::stat; -use Nix::Store; -use Hydra::Plugin; -use Hydra::Schema; -use Hydra::Helper::Nix; -use Hydra::Helper::PluginHooks; -use Hydra::Model::DB; -use Hydra::Helper::AddBuilds; -use Set::Scalar; - -STDOUT->autoflush(); - -my $db = Hydra::Model::DB->new(); - -my $config = getHydraConfig(); - -my @plugins = Hydra::Plugin->instantiate(db => $db, config => $config); - - -sub addBuildStepOutputs { - my ($step) = @_; - my $drv = derivationFromPath($step->drvpath); - $step->buildstepoutputs->create({ name => $_, path => $drv->{outputs}->{$_} }) - foreach keys %{$drv->{outputs}}; -} - - -sub nextFreeStepNr { - my ($build) = @_; - my $max = $build->buildsteps->find( - {}, {select => {max => 'stepnr + 1'}, as => ['max']}); - return (defined $max && defined $max->get_column('max')) ? $max->get_column('max') : 1; -} - - -sub failDependents { - my ($drvPath, $status, $errorMsg, $dependents, $startTime, $stopTime, $machine, $propagatedFrom) = @_; - - # Get the referrer closure of $drvPath. - my $dependentDrvs = Set::Scalar->new(computeFSClosure(1, 0, $drvPath)); - - my $time = time(); - - txn_do($db, sub { - - my @dependentBuilds = $db->resultset('Builds')->search( - { finished => 0, busy => 0 }, - { columns => ["id", "project", "jobset", "job", "drvpath", "finished", "busy"] }); - - for my $d (@dependentBuilds) { - next unless $dependentDrvs->has($d->drvpath); - print STDERR "failing dependent build ", $d->id, " of ", $d->project->name, ":", $d->jobset->name, ":", $d->job->name, "\n"; - $d->update( - { finished => 1 - , logfile => '' - , iscachedbuild => 0 - , buildstatus => $drvPath eq $d->drvpath ? 
1 : 2 - , starttime => $time - , stoptime => $time - , errormsg => undef - }); - - my $step = $d->buildsteps->create( - { stepnr => nextFreeStepNr($d) - , type => 0 # = build - , drvpath => $drvPath - , busy => 0 - , status => $status - , starttime => $startTime - , stoptime => $stopTime - , errormsg => $errorMsg - , machine => $machine - , propagatedfrom => $propagatedFrom->id - }); - addBuildStepOutputs($step); - - push @$dependents, $d; - } - - }); -} - - -sub doBuild { - my ($build) = @_; - - my %outputs; - $outputs{$_->name} = $_->path foreach $build->buildoutputs->all; - - my $drvPath = $build->drvpath; - my $maxsilent = $build->maxsilent; - my $timeout = $build->timeout; - - my $isCachedBuild = 1; - my $outputCreated = 1; # i.e., the Nix build succeeded (but it could be a positive failure) - my $startTime = time(); - my $stopTime = undef; - - my $buildStatus = 0; # = succeeded - - my $errormsg = undef; - - my $dependents = []; - - if (!isValidPath($drvPath)) { - $buildStatus = 3; - $errormsg = "derivation was garbage-collected prior to build"; - goto done; - } - - unless (all { isValidPath($_) } values(%outputs)) { - $isCachedBuild = 0; - - # Do the build. - my $thisBuildFailed = 0; - my $someBuildFailed = 0; - - # Run Nix to perform the build, and monitor the stderr output - # to get notifications about specific build steps, the - # associated log files, etc. - my $cmd = "nix-store --realise $drvPath " . - "--timeout $timeout " . - "--max-silent-time $maxsilent " . - "--option build-max-log-size 67108864 " . - "--option print-missing false " . - "--keep-going --fallback " . - "--no-build-output --log-type flat --print-build-trace " . - "--add-root " . gcRootFor($outputs{out} // $outputs{(sort keys %outputs)[0]}) . 
" 2>&1"; - - my $buildStepNr = nextFreeStepNr($build); - my %buildSteps; - - open OUT, "$cmd |" or die; - - while () { - $errormsg .= $_; - - unless (/^@\s+/) { - print STDERR "$_"; - next; - } - - if (/^@\s+build-started\s+(\S+)\s+(\S+)\s+(\S+)\s+(\S+)$/) { - my $drvPathStep = $1; - txn_do($db, sub { - my $step = $build->buildsteps->create( - { stepnr => ($buildSteps{$drvPathStep} = $buildStepNr++) - , type => 0 # = build - , drvpath => $drvPathStep - , system => $3 - , busy => 1 - , starttime => time - }); - addBuildStepOutputs($step); - }); - } - - elsif (/^@\s+build-remote\s+(\S+)\s+(\S+)$/) { - my $drvPathStep = $1; - my $machine = $2; - txn_do($db, sub { - my $step = $build->buildsteps->find({stepnr => $buildSteps{$drvPathStep}}) or die; - $step->update({machine => $machine}); - }); - } - - elsif (/^@\s+build-remote-start\s+(\S+)\s+/) { - my $drvPathStep = $1; - txn_do($db, sub { - my $step = $build->buildsteps->find({stepnr => $buildSteps{$drvPathStep}}) or die; - $step->update({starttime => time}); - }); - } - - elsif (/^@\s+build-remote-done\s+(\S+)\s+/) { - my $drvPathStep = $1; - txn_do($db, sub { - my $step = $build->buildsteps->find({stepnr => $buildSteps{$drvPathStep}}) or die; - $step->update({stoptime => time}); - }); - } - - elsif (/^@\s+build-succeeded\s+(\S+)\s+(\S+)$/) { - my $drvPathStep = $1; - txn_do($db, sub { - my $step = $build->buildsteps->find({stepnr => $buildSteps{$drvPathStep}}) or die; - $step->update({busy => 0, status => 0, stoptime => time}); - $step->update({stoptime => time}) unless defined $step->update; - }); - } - - elsif (/^@\s+build-failed\s+(\S+)\s+(\S+)\s+(\S+)\s+(.*)$/) { - my $drvPathStep = $1; - $someBuildFailed = 1; - $thisBuildFailed = 1 if $drvPath eq $drvPathStep; - my $errorMsg; - my $status = 1; - if ($3 eq "cached") { - $status = 8; - } elsif ($3 eq "timeout") { - $status = 7; - } else { - $errorMsg = $4; - } - my $now = time; - my $stepStartTime = $now; - my $stepStopTime = $now; - my $machine = ""; - 
txn_do($db, sub { - if ($buildSteps{$drvPathStep}) { - my $step = $build->buildsteps->find({stepnr => $buildSteps{$drvPathStep}}) or die; - $stepStartTime = $step->starttime; - $stepStopTime = $now; - $machine = $step->machine; - $step->update({busy => 0, status => $status, errormsg => $errorMsg, stoptime => $now}); - } - # Don't write a record if this derivation already - # failed previously. This can happen if this is a - # restarted build. - elsif (scalar $build->buildsteps->search({drvpath => $drvPathStep, type => 0, busy => 0, status => 1}) == 0) { - my $step = $build->buildsteps->create( - { stepnr => ($buildSteps{$drvPathStep} = $buildStepNr++) - , type => 0 # = build - , drvpath => $drvPathStep - , busy => 0 - , status => $status - , starttime => $now - , stoptime => $now - , errormsg => $errorMsg - }); - addBuildStepOutputs($step); - } - }); - - # Immediately fail all builds that depend on this derivation. - failDependents($drvPathStep, $status, $errorMsg, $dependents, $stepStartTime, $stepStopTime, $machine, $build); - } - - elsif (/^@\s+substituter-started\s+(\S+)\s+(\S+)$/) { - my $path = $1; - txn_do($db, sub { - my $step = $build->buildsteps->create( - { stepnr => ($buildSteps{$path} = $buildStepNr++) - , type => 1 # = substitution - , busy => 1 - , starttime => time - }); - # "out" is kinda fake (substitutions don't have named outputs). 
- $step->buildstepoutputs->create({ name => "out", path => $path }); - }); - } - - elsif (/^@\s+substituter-succeeded\s+(\S+)$/) { - my $path = $1; - txn_do($db, sub { - my $step = $build->buildsteps->find({stepnr => $buildSteps{$path}}) or die; - $step->update({busy => 0, status => 0, stoptime => time}); - }); - } - - elsif (/^@\s+substituter-failed\s+(\S+)\s+(\S+)\s+(\S+)$/) { - my $path = $1; - txn_do($db, sub { - my $step = $build->buildsteps->find({stepnr => $buildSteps{$path}}) or die; - $step->update({busy => 0, status => 1, errormsg => $3, stoptime => time}); - }); - } - - else { - print STDERR "unknown Nix trace message: $_"; - } - } - - close OUT; - - my $res = $?; - - $stopTime = time(); - - if ($res != 0) { - if ($thisBuildFailed) { $buildStatus = 1; } - elsif ($someBuildFailed) { $buildStatus = 2; } - else { $buildStatus = 3; } - } - - # Only store the output of running Nix if we have a miscellaneous error. - $errormsg = undef unless $buildStatus == 3; - } - - done: - - txn_do($db, sub { - if ($buildStatus == 0) { - - my $size = 0; - my $closureSize = 0; - my $releaseName; - - my @closure = computeFSClosure(0, 0, values %outputs); - foreach my $path (@closure) { - my ($deriver, $hash, $time, $narSize, $refs) = queryPathInfo($path, 0); - $closureSize += $narSize; - $size += $narSize if grep { $path eq $_ } values(%outputs); - } - - foreach my $path (values %outputs) { - $buildStatus = 6 if $buildStatus == 0 && -f "$path/nix-support/failed"; - $releaseName //= getReleaseName($path); - } - - $build->update( - { releasename => $releaseName - , size => $size - , closuresize => $closureSize - }); - - addBuildProducts($db, $build); - } - - # Mark any remaining active build steps as aborted. 
- $build->buildsteps->search({ busy => 1 })->update({ busy => 0, status => 4, stoptime => time }); - - $build->update( - { finished => 1 - , busy => 0 - , locker => '' - , logfile => '' - , iscachedbuild => $isCachedBuild - , buildstatus => $buildStatus - , starttime => $startTime - , stoptime => $stopTime // time() - , errormsg => $errormsg - }); - - }); - - notifyBuildFinished(\@plugins, $build, $dependents); -} - - -my $buildId = $ARGV[0] or die "syntax: $0 BUILD-ID\n"; -print STDERR "performing build $buildId\n"; - -if ($ENV{'HYDRA_MAIL_TEST'}) { - my $build = $db->resultset('Builds')->find($buildId); - notifyBuildFinished(\@plugins, $build, []); - exit 0; -} - -# Lock the build. If necessary, steal the lock from the parent -# process (runner.pl). This is so that if the runner dies, the -# children (i.e. the build.pl instances) can continue to run and won't -# have the lock taken away. -my $build; -txn_do($db, sub { - $build = $db->resultset('Builds')->find($buildId); - die "build $buildId doesn't exist\n" unless defined $build; - die "build $buildId already done\n" if $build->finished; - if ($build->busy != 0 && $build->locker != getppid) { - die "build $buildId is already being built"; - } - $build->update({busy => 1, locker => $$}); - $build->buildsteps->search({busy => 1})->delete; - $build->buildproducts->delete; -}); - -die unless $build; - -# Do the build. If it throws an error, unlock the build so that it -# can be retried. -eval { - doBuild $build; - print "done\n"; -}; -if ($@) { - warn $@; - txn_do($db, sub { - $build->update({busy => 0, locker => $$}); - }); -} diff --git a/src/script/hydra-eval-guile-jobs.in b/src/script/hydra-eval-guile-jobs.in index 70550db7..8c5df125 100644 --- a/src/script/hydra-eval-guile-jobs.in +++ b/src/script/hydra-eval-guile-jobs.in @@ -104,7 +104,7 @@ symbol/thunk pairs." (when gc-roots-dir ;; Register DRV as a GC root so that it's not collected by - ;; the time 'hydra-build' attempts to build it. 
+ ;; the time 'hydra-queue-runner' attempts to build it. (register-gc-root drv gc-roots-dir)) ;; XXX: Add tags? diff --git a/src/script/hydra-queue-runner b/src/script/hydra-queue-runner deleted file mode 100755 index 4caae3c8..00000000 --- a/src/script/hydra-queue-runner +++ /dev/null @@ -1,279 +0,0 @@ -#! /var/run/current-system/sw/bin/perl - -use strict; -use Cwd; -use File::Basename; -use POSIX qw(dup2 :sys_wait_h); -use Hydra::Schema; -use Hydra::Helper::Nix; -use Hydra::Model::DB; -use IO::Handle; -use Nix::Store; -use Set::Scalar; - -chdir Hydra::Model::DB::getHydraPath or die; -my $db = Hydra::Model::DB->new(); - -STDOUT->autoflush(); - -my $lastTime; - -#$SIG{CHLD} = 'IGNORE'; - - -sub unlockDeadBuilds { - # Unlock builds whose building process has died. - txn_do($db, sub { - my @builds = $db->resultset('Builds')->search({finished => 0, busy => 1}); - foreach my $build (@builds) { - my $pid = $build->locker; - my $unlock = 0; - if ($pid == $$) { - if (!defined $lastTime || $build->starttime < $lastTime - 300) { - $unlock = 1; - } - } elsif (kill(0, $pid) != 1) { # see if we can signal the process - $unlock = 1; - } - if ($unlock) { - print "build ", $build->id, " pid $pid died, unlocking\n"; - $build->update({ busy => 0, locker => "" }); - $build->buildsteps->search({ busy => 1 })->update({ busy => 0, status => 4, stoptime => time }); - } - } - }); -} - - -# Given a build, return an arbitrary queued build on which this build -# depends; or undef if no such build exists. 
-sub findBuildDependencyInQueue { - my ($buildsByDrv, $build) = @_; - return undef unless isValidPath($build->drvpath); - my @deps = grep { /\.drv$/ && $_ ne $build->drvpath } computeFSClosure(0, 0, $build->drvpath); - return unless scalar @deps > 0; - foreach my $d (@deps) { - my $bs = $buildsByDrv->{$d}; - next unless defined $bs; - return $db->resultset('Builds')->find((@$bs)[0]); - } - return undef; -} - - -sub blockBuilds { - my ($buildsByDrv, $blockedBuilds, $build) = @_; - my @rdeps = grep { /\.drv$/ && $_ ne $build->drvpath } computeFSClosure(1, 0, $build->drvpath); - foreach my $drv (@rdeps) { - my $bs = $buildsByDrv->{$drv}; - next if !defined $bs; - $blockedBuilds->insert($_) foreach @$bs; - } -} - - -sub checkBuilds { - # print "looking for runnable builds...\n"; - - my @buildsStarted; - - my $machines = getMachines; - - my %maxConcurrent; - - foreach my $machineName (keys %{$machines}) { - foreach my $system (@{${$machines}{$machineName}{'systemTypes'}}) { - $maxConcurrent{$system} = (${$machines}{$machineName}{'maxJobs'} or 0) + ($maxConcurrent{$system} or 0) - } - } - - txn_do($db, sub { - - # Cache scheduled builds by derivation path to speed up - # findBuildDependencyInQueue. - my $buildsByDrv = {}; - push @{$buildsByDrv->{$_->drvpath}}, $_->id - foreach $db->resultset('Builds')->search({ finished => 0 }); - - # Builds in the queue of which a dependency is already building. - my $blockedBuilds = Set::Scalar->new(); - blockBuilds($buildsByDrv, $blockedBuilds, $_) - foreach $db->resultset('Builds')->search({ finished => 0, busy => 1 }); - - # Get the system types for the runnable builds. - my @systemTypes = $db->resultset('Builds')->search( - { finished => 0, busy => 0 }, - { join => ['project'], select => ['system'], as => ['system'], distinct => 1 }); - - # Get the total number of scheduling shares. - my $totalShares = getTotalShares($db) || 1; - - # For each system type, select up to the maximum number of - # concurrent build for that system type. 
- foreach my $system (@systemTypes) { - # How many builds are already currently executing for this - # system type? - my $nrActive = $db->resultset('Builds')->search( - {finished => 0, busy => 1, system => $system->system})->count; - - (my $systemTypeInfo) = $db->resultset('SystemTypes')->search({system => $system->system}); - my $max = defined $systemTypeInfo ? $systemTypeInfo->maxconcurrent : $maxConcurrent{$system->system} // 2; - - my $extraAllowed = $max - $nrActive; - next if $extraAllowed <= 0; - - print STDERR "starting at most $extraAllowed builds for system ${\$system->system}\n"; - - my $timeSpentPerJobset; - - j: while ($extraAllowed-- > 0) { - - my @runnableJobsets = $db->resultset('Builds')->search( - { finished => 0, busy => 0, system => $system->system }, - { select => ['project', 'jobset'], distinct => 1 }); - - next if @runnableJobsets == 0; - - my $windowSize = 24 * 3600; - my $costPerBuild = 30; - my $totalWindowSize = $windowSize * $max; - - my @res; - - foreach my $b (@runnableJobsets) { - my $jobset = $db->resultset('Jobsets')->find($b->get_column('project'), $b->get_column('jobset')) or die; - - my $timeSpent = $timeSpentPerJobset->{$b->get_column('project')}->{$b->get_column('jobset')}; - - if (!defined $timeSpent) { - $timeSpent = $jobset->builds->search( - { }, - { where => \ ("(finished = 0)") - , join => 'buildsteps' - , select => \ "sum(coalesce(buildsteps.stoptime, ${\time}) - buildsteps.starttime)" - , as => "sum" })->single->get_column("sum") // 0; - - $timeSpent += $jobset->builds->search( - { }, - { where => \ ("(me.stoptime >= " . (time() - $windowSize) . ")") - , join => 'buildsteps' - , select => \ "sum(coalesce(buildsteps.stoptime, ${\time}) - buildsteps.starttime)" - , as => "sum" })->single->get_column("sum") // 0; - - # Add a 30s penalty for each started build. This - # is to account for jobsets that have running - # builds but no build steps yet. 
- $timeSpent += $jobset->builds->search({ finished => 0, busy => 1 })->count * $costPerBuild; - - $timeSpentPerJobset->{$b->get_column('project')}->{$b->get_column('jobset')} = $timeSpent; - } - - my $share = $jobset->schedulingshares || 1; # prevent division by zero - my $used = $timeSpent / ($totalWindowSize * ($share / $totalShares)); - - #printf STDERR "%s:%s: %d s, total used = %.2f%%, share used = %.2f%%\n", $jobset->get_column('project'), $jobset->name, $timeSpent, $timeSpent / $totalWindowSize * 100, $used * 100; - - push @res, { jobset => $jobset, used => $used }; - } - - foreach my $r (sort { $a->{used} <=> $b->{used} } @res) { - my $jobset = $r->{jobset}; - #print STDERR "selected ", $jobset->get_column('project'), ':', $jobset->name, "\n"; - - # Select the highest-priority build for this jobset. - my @builds = $jobset->builds->search( - { finished => 0, busy => 0, system => $system->system }, - { order_by => ["priority DESC", "id"] }); - - foreach my $build (@builds) { - next if $blockedBuilds->has($build->id); - - # Find a dependency of $build that has no queued - # dependencies itself. This isn't strictly necessary, - # but it ensures that Nix builds are done as part of - # their corresponding Hydra builds, rather than as a - # dependency of some other Hydra build. - while (my $dep = findBuildDependencyInQueue($buildsByDrv, $build)) { - $build = $dep; - } - next if $build->busy; - - printf STDERR "starting build %d (%s:%s:%s) on %s; jobset at %.2f%% of its share\n", - $build->id, $build->project->name, $build->jobset->name, $build->job->name, $build->system, $r->{used} * 100; - - my $logfile = getcwd . "/logs/" . 
$build->id; - mkdir(dirname $logfile); - unlink($logfile); - $build->update( - { busy => 1 - , locker => $$ - , logfile => $logfile - }); - push @buildsStarted, $build; - - $timeSpentPerJobset->{$jobset->get_column('project')}->{$jobset->name} += $costPerBuild; - - blockBuilds($buildsByDrv, $blockedBuilds, $build); - - next j; - } - } - - last; # nothing found, give up on this system type - } - } - - $lastTime = time(); - - $_->update({ starttime => time() }) foreach @buildsStarted; - }); - - # Actually start the builds we just selected. We need to do this - # outside the transaction in case it aborts or something. - foreach my $build (@buildsStarted) { - my $id = $build->id; - eval { - my $logfile = $build->logfile; - my $child = fork(); - die unless defined $child; - if ($child == 0) { - eval { - open LOG, ">$logfile" or die "cannot create logfile $logfile"; - POSIX::dup2(fileno(LOG), 1) or die; - POSIX::dup2(fileno(LOG), 2) or die; - exec("hydra-build", $id); - }; - warn "cannot start build $id: $@"; - POSIX::_exit(1); - } - }; - if ($@) { - warn $@; - txn_do($db, sub { - $build->update({ busy => 0, locker => $$ }); - }); - } - } -} - - -if (scalar(@ARGV) == 1 && $ARGV[0] eq "--unlock") { - unlockDeadBuilds; - exit 0; -} - - -while (1) { - eval { - # Clean up zombies. 
- while ((waitpid(-1, &WNOHANG)) > 0) { }; - - unlockDeadBuilds; - - checkBuilds; - }; - warn $@ if $@; - - # print "sleeping...\n"; - sleep(5); -} From a757b783f463dbebad6373070b31cb217c48c827 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 22 Jun 2015 16:15:16 +0200 Subject: [PATCH 072/158] Update $PATH --- release.nix | 1 + 1 file changed, 1 insertion(+) diff --git a/release.nix b/release.nix index 0e4feec2..31ac8ada 100644 --- a/release.nix +++ b/release.nix @@ -47,6 +47,7 @@ in rec { addToSearchPath PATH $(pwd)/src/script addToSearchPath PATH $(pwd)/src/hydra-eval-jobs + addToSearchPath PATH $(pwd)/src/hydra-queue-runner addToSearchPath PERL5LIB $(pwd)/src/lib ''; From e069ee960e9d00cff548dd9b00e58897846192b4 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 22 Jun 2015 16:58:40 +0200 Subject: [PATCH 073/158] Doh --- src/hydra-queue-runner/hydra-queue-runner.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index f70d81dc..44ac05db 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -584,7 +584,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, if (checkCachedFailure(r, conn)) { printMsg(lvlError, format("marking build %1% as cached failure") % build->id); - buildStatus = step == r ? bsFailed : bsFailed; + buildStatus = step == r ? 
bsFailed : bsDepFailed; buildStepStatus = bssFailed; } From d06366e7cfbc863329ffe35f1205e5135ba5e270 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 22 Jun 2015 16:59:50 +0200 Subject: [PATCH 074/158] Remove obsolete comment --- src/hydra-queue-runner/hydra-queue-runner.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 44ac05db..e1d7443a 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1615,8 +1615,6 @@ int main(int argc, char * * argv) settings.useSubstitutes = false; settings.lockCPU = false; - /* FIXME: need some locking to prevent multiple instances of - hydra-queue-runner. */ State state; if (status) state.showStatus(); From 5312e1209bc1b80b9c1f35eed1b654545a7d1b78 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 22 Jun 2015 17:11:17 +0200 Subject: [PATCH 075/158] Keep per-machine stats --- src/hydra-queue-runner/hydra-queue-runner.cc | 26 +++++++++++++++++--- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index e1d7443a..37f9f9e9 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -177,7 +177,10 @@ struct Machine unsigned int maxJobs = 1; float speedFactor = 1.0; - std::atomic currentJobs{0}; + counter currentJobs{0}; + counter nrStepsDone{0}; + counter totalStepTime{0}; // total time for steps, including closure copying + counter totalStepBuildTime{0}; // total build time for steps bool supportsStep(Step::ptr step) { @@ -241,6 +244,7 @@ private: Sync machines; /* Various stats. */ + time_t startedAt; counter nrBuildsRead{0}; counter nrBuildsDone{0}; counter nrStepsDone{0}; @@ -1299,9 +1303,13 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, } + // FIXME: keep stats about aborted steps? 
nrStepsDone++; totalStepTime += stepStopTime - stepStartTime; totalStepBuildTime += result.stopTime - result.startTime; + machine->nrStepsDone++; + machine->totalStepTime += stepStopTime - stepStartTime; + machine->totalStepBuildTime += result.stopTime - result.startTime; return false; } @@ -1423,8 +1431,10 @@ void State::dumpStatus(Connection & conn, bool log) { JSONObject root(out); + time_t now = time(0); root.attr("status", "up"); root.attr("time", time(0)); + root.attr("uptime", now - startedAt); root.attr("pid", getpid()); { auto builds_(builds.lock()); @@ -1449,9 +1459,9 @@ void State::dumpStatus(Connection & conn, bool log) root.attr("nrStepsDone", nrStepsDone); root.attr("nrRetries", nrRetries); root.attr("maxNrRetries", maxNrRetries); - root.attr("totalStepTime", totalStepTime); - root.attr("totalStepBuildTime", totalStepBuildTime); if (nrStepsDone) { + root.attr("totalStepTime", totalStepTime); + root.attr("totalStepBuildTime", totalStepBuildTime); root.attr("avgStepTime"); out << (float) totalStepTime / nrStepsDone; root.attr("avgStepBuildTime"); out << (float) totalStepBuildTime / nrStepsDone; } @@ -1466,7 +1476,13 @@ void State::dumpStatus(Connection & conn, bool log) nested.attr(m->sshName); JSONObject nested2(out); nested2.attr("currentJobs", m->currentJobs); - nested2.attr("maxJobs", m->maxJobs); + nested2.attr("nrStepsDone", m->nrStepsDone); + if (m->nrStepsDone) { + nested2.attr("totalStepTime", m->totalStepTime); + nested2.attr("totalStepBuildTime", m->totalStepBuildTime); + nested2.attr("avgStepTime"); out << (float) m->totalStepTime / m->nrStepsDone; + nested2.attr("avgStepBuildTime"); out << (float) m->totalStepBuildTime / m->nrStepsDone; + } } } } @@ -1550,6 +1566,8 @@ void State::unlock() void State::run() { + startedAt = time(0); + auto lock = acquireGlobalLock(); if (!lock) throw Error("hydra-queue-runner is already running"); From a317d24b294f81c609fc99ea0e695d067f679406 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 23 Jun 2015 
00:14:49 +0200 Subject: [PATCH 076/158] hydra-queue-runner: Send build notifications MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since our notification plugins are written in Perl, sending notification from C++ requires a small Perl helper named ‘hydra-notify’. --- src/hydra-queue-runner/hydra-queue-runner.cc | 79 ++++++++++++++++++-- src/script/Makefile.am | 1 + src/script/hydra-evaluator | 2 +- src/script/hydra-notify | 35 +++++++++ 4 files changed, 111 insertions(+), 6 deletions(-) create mode 100755 src/script/hydra-notify diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 37f9f9e9..8a15a96e 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -261,6 +261,14 @@ private: Sync> logCompressorQueue; std::condition_variable_any logCompressorWakeup; + /* Notification sender work queue. FIXME: if hydra-queue-runner is + killed before it has finished sending notifications about a + build, then the notifications may be lost. It would be better + to mark builds with pending notification in the database. */ + typedef std::pair> NotificationItem; + Sync> notificationSenderQueue; + std::condition_variable_any notificationSenderWakeup; + public: State(); @@ -314,6 +322,10 @@ private: /* Thread that asynchronously bzips logs of finished steps. */ void logCompressor(); + /* Thread that asynchronously invokes hydra-notify to send build + notifications. */ + void notificationSender(); + /* Acquire the global queue runner lock, or null if somebody else has it. */ std::shared_ptr acquireGlobalLock(); @@ -1186,6 +1198,13 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, } } + /* Send notification about this build. 
*/ + { + auto notificationSenderQueue_(notificationSenderQueue.lock()); + notificationSenderQueue_->push(NotificationItem(build->id, std::vector())); + } + notificationSenderWakeup.notify_one(); + /* Wake up any dependent steps that have no other dependencies. */ { @@ -1213,6 +1232,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, /* Register failure in the database for all Build objects that directly or indirectly depend on this step. */ + std::vector dependentIDs; + while (true) { /* Get the builds and steps that depend on this step. */ @@ -1273,6 +1294,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, for (auto & build2 : indirect) { if (build2->finishedInDB) continue; printMsg(lvlError, format("marking build %1% as failed") % build2->id); + dependentIDs.push_back(build2->id); txn.parameterized ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1 and finished = 0") (build2->id) @@ -1301,6 +1323,13 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, } } + /* Send notification about this build and its dependents. */ + { + auto notificationSenderQueue_(notificationSenderQueue.lock()); + notificationSenderQueue_->push(NotificationItem(build->id, dependentIDs)); + } + notificationSenderWakeup.notify_one(); + } // FIXME: keep stats about aborted steps? 
@@ -1391,7 +1420,7 @@ void State::logCompressor() if (dup2(fd, STDOUT_FILENO) == -1) throw SysError("cannot dup output pipe to stdout"); execlp("bzip2", "bzip2", "-c", logPath.c_str(), nullptr); - throw SysError("cannot start ssh"); + throw SysError("cannot start bzip2"); }); int res = pid.wait(true); @@ -1414,6 +1443,44 @@ void State::logCompressor() } +void State::notificationSender() +{ + while (true) { + try { + + NotificationItem item; + { + auto notificationSenderQueue_(notificationSenderQueue.lock()); + while (notificationSenderQueue_->empty()) + notificationSenderQueue_.wait(notificationSenderWakeup); + item = notificationSenderQueue_->front(); + notificationSenderQueue_->pop(); + } + + printMsg(lvlChatty, format("sending notification about build %1%") % item.first); + + Pid pid = startProcess([&]() { + Strings argv({"hydra-notify", "build", int2String(item.first)}); + for (auto id : item.second) + argv.push_back(int2String(id)); + execvp("hydra-notify", (char * *) stringsToCharPtrs(argv).data()); // FIXME: remove cast + throw SysError("cannot start hydra-notify"); + }); + + int res = pid.wait(true); + + if (res != 0) + throw Error(format("hydra-build returned exit code %1% notifying about build %2%") + % res % item.first); + + } catch (std::exception & e) { + printMsg(lvlError, format("notification sender: %1%") % e.what()); + sleep(5); + } + } +} + + std::shared_ptr State::acquireGlobalLock() { Path lockPath = hydraData + "/queue-runner"; @@ -1580,7 +1647,7 @@ void State::run() loadMachines(); - auto queueMonitorThread = std::thread(&State::queueMonitor, this); + std::thread(&State::queueMonitor, this).detach(); std::thread(&State::dispatcher, this).detach(); @@ -1588,6 +1655,11 @@ void State::run() than one. */ std::thread(&State::logCompressor, this).detach(); + /* Idem for notification sending. */ + std::thread(&State::notificationSender, this).detach(); + + /* Monitor the database for status dump requests (e.g. from + ‘hydra-queue-runner --status’). 
*/ while (true) { try { auto conn(dbPool.get()); @@ -1601,9 +1673,6 @@ void State::run() sleep(10); // probably a DB problem, so don't retry right away } } - - // Never reached. - queueMonitorThread.join(); } diff --git a/src/script/Makefile.am b/src/script/Makefile.am index cfdeea8c..ce612f37 100644 --- a/src/script/Makefile.am +++ b/src/script/Makefile.am @@ -9,6 +9,7 @@ distributable_scripts = \ hydra-update-gc-roots \ hydra-s3-backup-collect-garbage \ hydra-create-user \ + hydra-notify \ nix-prefetch-git \ nix-prefetch-bzr \ nix-prefetch-hg diff --git a/src/script/hydra-evaluator b/src/script/hydra-evaluator index bcdb948e..7dce10c7 100755 --- a/src/script/hydra-evaluator +++ b/src/script/hydra-evaluator @@ -1,4 +1,4 @@ -#! /var/run/current-system/sw/bin/perl +#! /run/current-system/sw/bin/perl use strict; use utf8; diff --git a/src/script/hydra-notify b/src/script/hydra-notify new file mode 100755 index 00000000..cf8599d3 --- /dev/null +++ b/src/script/hydra-notify @@ -0,0 +1,35 @@ +#! 
/run/current-system/sw/bin/perl + +use strict; +use utf8; +use Hydra::Plugin; +use Hydra::Helper::Nix; +use Hydra::Helper::PluginHooks; + +STDERR->autoflush(1); +binmode STDERR, ":encoding(utf8)"; + +my $config = getHydraConfig(); + +my $db = Hydra::Model::DB->new(); + +my @plugins = Hydra::Plugin->instantiate(db => $db, config => $config); + +my $cmd = shift @ARGV or die "Syntax: hydra-notify build BUILD-ID [BUILD-IDs...]\n"; + +if ($cmd eq "build") { + my $buildId = shift @ARGV or die; + my $build = $db->resultset('Builds')->find($buildId) + or die "build $buildId does not exist\n"; + my @dependents; + foreach my $id (@ARGV) { + my $dep = $db->resultset('Builds')->find($id) + or die "build $id does not exist\n"; + push @dependents, $dep; + } + notifyBuildFinished(\@plugins, $build, [@dependents]); +} + +else { + die "unknown action ‘$cmd’"; +} From 4db7c51b5c9b595e3bde1f7ae4e74f54945af484 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 23 Jun 2015 01:49:14 +0200 Subject: [PATCH 077/158] Rate-limit the number of threads copying closures at the same time Having a hundred threads doing I/O at the same time is bad on magnetic disks because of the excessive disk seeks. So allow only 4 threads to copy closures in parallel. 
--- src/hydra-queue-runner/build-remote.cc | 17 ++++- src/hydra-queue-runner/build-remote.hh | 2 + src/hydra-queue-runner/hydra-queue-runner.cc | 14 +++- src/hydra-queue-runner/sync.hh | 10 +++ src/hydra-queue-runner/token-server.hh | 67 ++++++++++++++++++++ 5 files changed, 106 insertions(+), 4 deletions(-) create mode 100644 src/hydra-queue-runner/token-server.hh diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index b9696733..a2b69c2b 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -60,6 +60,7 @@ static void openConnection(const string & sshName, const string & sshKey, static void copyClosureTo(std::shared_ptr store, FdSource & from, FdSink & to, const PathSet & paths, + TokenServer & copyClosureTokenServer, bool useSubstitutes = false) { PathSet closure; @@ -88,6 +89,19 @@ static void copyClosureTo(std::shared_ptr store, for (auto i = sorted.rbegin(); i != sorted.rend(); ++i) if (present.find(*i) == present.end()) missing.push_back(*i); + /* Ensure that only a limited number of threads can copy closures + at the same time. However, proceed anyway after a timeout to + prevent starvation by a handful of really huge closures. 
*/ + time_t start = time(0); + int timeout = 60 * (10 + rand() % 5); + auto token(copyClosureTokenServer.get(timeout)); + time_t stop = time(0); + + if (token()) + printMsg(lvlDebug, format("got copy closure token after %1%s") % (stop - start)); + else + printMsg(lvlDebug, format("dit not get copy closure token after %1%s") % (stop - start)); + printMsg(lvlDebug, format("sending %1% missing paths") % missing.size()); writeInt(cmdImportPaths, to); @@ -114,6 +128,7 @@ void buildRemote(std::shared_ptr store, const string & sshName, const string & sshKey, const Path & drvPath, const Derivation & drv, const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout, + TokenServer & copyClosureTokenServer, RemoteResult & result, counter & nrStepsBuilding) { string base = baseNameOf(drvPath); @@ -163,7 +178,7 @@ void buildRemote(std::shared_ptr store, /* Copy the input closure. */ printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName); - copyClosureTo(store, from, to, inputs); + copyClosureTo(store, from, to, inputs, copyClosureTokenServer); autoDelete.cancel(); diff --git a/src/hydra-queue-runner/build-remote.hh b/src/hydra-queue-runner/build-remote.hh index 68d612e3..86ec767f 100644 --- a/src/hydra-queue-runner/build-remote.hh +++ b/src/hydra-queue-runner/build-remote.hh @@ -4,6 +4,7 @@ #include "derivations.hh" #include "counter.hh" +#include "token-server.hh" struct RemoteResult { @@ -22,4 +23,5 @@ void buildRemote(std::shared_ptr store, const std::string & sshName, const std::string & sshKey, const nix::Path & drvPath, const nix::Derivation & drv, const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout, + TokenServer & copyClosureTokenServer, RemoteResult & result, counter & nrStepsBuilding); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 8a15a96e..524372f6 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ 
b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -20,6 +20,7 @@ #include "sync.hh" #include "pool.hh" #include "counter.hh" +#include "token-server.hh" #include "store-api.hh" #include "derivations.hh" @@ -31,9 +32,11 @@ using namespace nix; -const int maxTries = 5; -const int retryInterval = 60; // seconds +// FIXME: Make configurable. +const unsigned int maxTries = 5; +const unsigned int retryInterval = 60; // seconds const float retryBackoff = 3.0; +const unsigned int maxParallelCopyClosure = 4; typedef std::chrono::time_point system_time; @@ -243,6 +246,10 @@ private: typedef std::list Machines; Sync machines; + /* Token server limiting the number of threads copying closures in + parallel to prevent excessive I/O load. */ + TokenServer copyClosureTokenServer{maxParallelCopyClosure}; + /* Various stats. */ time_t startedAt; counter nrBuildsRead{0}; @@ -1100,7 +1107,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, try { /* FIXME: referring builds may have conflicting timeouts. */ buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, - logDir, build->maxSilentTime, build->buildTimeout, result, nrStepsBuilding); + logDir, build->maxSilentTime, build->buildTimeout, copyClosureTokenServer, + result, nrStepsBuilding); } catch (Error & e) { result.status = RemoteResult::rrMiscFailure; result.errorMsg = e.msg(); diff --git a/src/hydra-queue-runner/sync.hh b/src/hydra-queue-runner/sync.hh index 34b97285..aadaa838 100644 --- a/src/hydra-queue-runner/sync.hh +++ b/src/hydra-queue-runner/sync.hh @@ -2,6 +2,7 @@ #include #include +#include /* This template class ensures synchronized access to a value of type T. 
It is used as follows: @@ -50,6 +51,15 @@ public: assert(s); cv.wait(s->mutex); } + + template + bool wait_for(std::condition_variable_any & cv, + const std::chrono::duration & duration, + Predicate pred) + { + assert(s); + return cv.wait_for(s->mutex, duration, pred); + } }; Lock lock() { return Lock(this); } diff --git a/src/hydra-queue-runner/token-server.hh b/src/hydra-queue-runner/token-server.hh new file mode 100644 index 00000000..2ff748e3 --- /dev/null +++ b/src/hydra-queue-runner/token-server.hh @@ -0,0 +1,67 @@ +#pragma once + +#include + +#include "sync.hh" + +/* This class hands out tokens. There are only ‘maxTokens’ tokens + available. Calling get() will return a Token object, representing + ownership of a token. If no token is available, get() will sleep + until another thread returns a token. */ + +class TokenServer +{ + unsigned int maxTokens; + + Sync curTokens{0}; + std::condition_variable_any wakeup; + +public: + TokenServer(unsigned int maxTokens) : maxTokens(maxTokens) { } + + class Token + { + friend TokenServer; + + TokenServer * ts; + + bool acquired = false; + + Token(TokenServer * ts, unsigned int timeout) : ts(ts) + { + auto curTokens(ts->curTokens.lock()); + while (*curTokens >= ts->maxTokens) + if (timeout) { + if (!curTokens.wait_for(ts->wakeup, std::chrono::seconds(timeout), + [&]() { return *curTokens < ts->maxTokens; })) + return; + } else + curTokens.wait(ts->wakeup); + (*curTokens)++; + acquired = true; + } + + public: + + Token(Token && t) : ts(t.ts) { t.ts = 0; } + Token(const Token & l) = delete; + + ~Token() + { + if (!ts || !acquired) return; + { + auto curTokens(ts->curTokens.lock()); + assert(*curTokens); + (*curTokens)--; + } + ts->wakeup.notify_one(); + } + + bool operator ()() { return acquired; } + }; + + Token get(unsigned int timeout = 0) + { + return Token(this, timeout); + } +}; From 524ee295e0c88c6c11ca4a68ac523ae57c7069ad Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 23 Jun 2015 02:13:06 +0200 Subject: 
[PATCH 078/158] Fix sending notifications in the successful case --- src/hydra-queue-runner/hydra-queue-runner.cc | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 524372f6..8e221484 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1157,6 +1157,9 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, this in a loop, marking all known builds, repeating until there are no unmarked builds. */ + + std::vector buildIDs; + while (true) { /* Get the builds that have this one as the top-level. */ @@ -1203,15 +1206,19 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, auto builds_(builds.lock()); b->finishedInDB = true; builds_->erase(b->id); + buildIDs.push_back(b->id); } } - /* Send notification about this build. */ - { - auto notificationSenderQueue_(notificationSenderQueue.lock()); - notificationSenderQueue_->push(NotificationItem(build->id, std::vector())); + /* Send notification about the builds that have this step as + the top-level. */ + for (auto id : buildIDs) { + { + auto notificationSenderQueue_(notificationSenderQueue.lock()); + notificationSenderQueue_->push(NotificationItem(id, std::vector())); + } + notificationSenderWakeup.notify_one(); } - notificationSenderWakeup.notify_one(); /* Wake up any dependent steps that have no other dependencies. 
*/ @@ -1302,7 +1309,6 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, for (auto & build2 : indirect) { if (build2->finishedInDB) continue; printMsg(lvlError, format("marking build %1% as failed") % build2->id); - dependentIDs.push_back(build2->id); txn.parameterized ("update Builds set finished = 1, busy = 0, buildStatus = $2, startTime = $3, stopTime = $4, isCachedBuild = $5 where id = $1 and finished = 0") (build2->id) @@ -1328,6 +1334,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, auto builds_(builds.lock()); b->finishedInDB = true; builds_->erase(b->id); + dependentIDs.push_back(b->id); } } From 681f63a382dbe63c57cf9d3daa6cc956c2039ecb Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 23 Jun 2015 02:15:11 +0200 Subject: [PATCH 079/158] Typo --- src/hydra-queue-runner/build-remote.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index a2b69c2b..befb13fa 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -100,7 +100,7 @@ static void copyClosureTo(std::shared_ptr store, if (token()) printMsg(lvlDebug, format("got copy closure token after %1%s") % (stop - start)); else - printMsg(lvlDebug, format("dit not get copy closure token after %1%s") % (stop - start)); + printMsg(lvlDebug, format("did not get copy closure token after %1%s") % (stop - start)); printMsg(lvlDebug, format("sending %1% missing paths") % missing.size()); From af5cbe97aae7bd35e21a1beedf901e7b991b105e Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 23 Jun 2015 03:25:31 +0200 Subject: [PATCH 080/158] createStep(): Cache finished derivations This gets rid of a lot of redundant calls to readDerivation(). 
--- src/hydra-queue-runner/hydra-queue-runner.cc | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 8e221484..9e25ee6a 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -304,7 +304,7 @@ private: void removeCancelledBuilds(Connection & conn); Step::ptr createStep(std::shared_ptr store, const Path & drvPath, - Build::ptr referringBuild, Step::ptr referringStep, + Build::ptr referringBuild, Step::ptr referringStep, std::set & finishedDrvs, std::set & newSteps, std::set & newRunnable); void makeRunnable(Step::ptr step); @@ -565,7 +565,8 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, } std::set newSteps; - Step::ptr step = createStep(store, build->drvPath, build, 0, newSteps, newRunnable); + std::set finishedDrvs; // FIXME: re-use? + Step::ptr step = createStep(store, build->drvPath, build, 0, finishedDrvs, newSteps, newRunnable); /* Some of the new steps may be the top level of builds that we haven't processed yet. So do them now. This ensures that @@ -714,9 +715,11 @@ void State::removeCancelledBuilds(Connection & conn) Step::ptr State::createStep(std::shared_ptr store, const Path & drvPath, - Build::ptr referringBuild, Step::ptr referringStep, + Build::ptr referringBuild, Step::ptr referringStep, std::set & finishedDrvs, std::set & newSteps, std::set & newRunnable) { + if (finishedDrvs.find(drvPath) != finishedDrvs.end()) return 0; + /* Check if the requested step already exists. If not, create a new step. In any case, make the step reachable from referringBuild or referringStep. This is done atomically (with @@ -783,7 +786,10 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat } // FIXME: check whether all outputs are in the binary cache. 
- if (valid) return 0; + if (valid) { + finishedDrvs.insert(drvPath); + return 0; + } /* No, we need to build. */ printMsg(lvlDebug, format("creating build step ‘%1%’") % drvPath); @@ -791,7 +797,7 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat /* Create steps for the dependencies. */ for (auto & i : step->drv.inputDrvs) { - auto dep = createStep(store, i.first, 0, step, newSteps, newRunnable); + auto dep = createStep(store, i.first, 0, step, finishedDrvs, newSteps, newRunnable); if (dep) { auto step_(step->state.lock()); step_->deps.insert(dep); From 62219adaf37f9b2f3cbbab8197213237d91c057c Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 23 Jun 2015 14:54:34 +0200 Subject: [PATCH 081/158] Send queue runner stats to statsd This is currently done by a separate program that periodically calls "hydra-queue-runner --status". Eventually, I'll do this in the queue runner directly. Fixes #220. --- release.nix | 13 ++++++++++ src/script/hydra-send-stats | 48 +++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100755 src/script/hydra-send-stats diff --git a/release.nix b/release.nix index 31ac8ada..813c3975 100644 --- a/release.nix +++ b/release.nix @@ -81,6 +81,18 @@ in rec { #nix = nixUnstable; + NetStatsd = buildPerlPackage { + name = "Net-Statsd-0.11"; + src = fetchurl { + url = mirror://cpan/authors/id/C/CO/COSIMO/Net-Statsd-0.11.tar.gz; + sha256 = "0f56c95846c7e65e6d32cec13ab9df65716429141f106d2dc587f1de1e09e163"; + }; + meta = { + description = "Sends statistics to the stats daemon over UDP"; + license = "perl"; + }; + }; + perlDeps = buildEnv { name = "hydra-perl-deps"; paths = with perlPackages; @@ -116,6 +128,7 @@ in rec { LWP LWPProtocolHttps NetAmazonS3 + NetStatsd PadWalker Readonly SQLSplitStatement diff --git a/src/script/hydra-send-stats b/src/script/hydra-send-stats new file mode 100755 index 00000000..dd893d77 --- /dev/null +++ b/src/script/hydra-send-stats @@ -0,0 +1,48 @@ +#! 
/run/current-system/sw/bin/perl + +use strict; +use utf8; +use Net::Statsd; +use JSON; + +STDERR->autoflush(1); +binmode STDERR, ":encoding(utf8)"; + +sub gauge { + my ($name, $val) = @_; + die unless defined $val; + Net::Statsd::gauge($name, $val); +} + +sub sendQueueRunnerStats { + my $s = `hydra-queue-runner --status`; + die "cannot get queue runner stats\n" if $? != 0; + + my $json = decode_json($s) or die "cannot decode queue runner status"; + + return if $json->{status} ne "up"; + + gauge("hydra.queue.steps.active", $json->{nrActiveSteps}); + gauge("hydra.queue.steps.building", $json->{nrStepsBuilding}); + gauge("hydra.queue.steps.runnable", $json->{nrRunnableSteps}); + gauge("hydra.queue.steps.unfinished", $json->{nrUnfinishedSteps}); + gauge("hydra.queue.steps.finished", $json->{nrStepsDone}); + gauge("hydra.queue.steps.retries", $json->{nrRetries}); + gauge("hydra.queue.steps.max_retries", $json->{maxNrRetries}); + if ($json->{nrStepsDone}) { + gauge("hydra.queue.steps.avg_total_time", $json->{avgStepTime}); + gauge("hydra.queue.steps.avg_build_time", $json->{avgStepBuildTime}); + } + + gauge("hydra.queue.builds.read", $json->{nrBuildsRead}); + gauge("hydra.queue.builds.unfinished", $json->{nrQueuedBuilds}); + gauge("hydra.queue.builds.finished", $json->{nrBuildsDone}); + + gauge("hydra.queue.checks", $json->{nrQueueWakeups}); +} + +while (1) { + eval { sendQueueRunnerStats(); }; + if ($@) { warn "$@"; } + sleep(30); +} From 3f8891b6ff84c5a9e81d1812e2fb74f768245943 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 23 Jun 2015 17:53:08 +0200 Subject: [PATCH 082/158] Fix incorrect debug message --- src/hydra-queue-runner/hydra-queue-runner.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 9e25ee6a..c4ddee89 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1269,7 +1269,7 @@ 
bool State::doBuildStep(std::shared_ptr store, Step::ptr step, be certain no new referrers can be added. */ if (indirect.empty()) { for (auto & s : steps) { - printMsg(lvlDebug, format("finishing build step ‘%1%’") % step->drvPath); + printMsg(lvlDebug, format("finishing build step ‘%1%’") % s->drvPath); steps_->erase(s->drvPath); } break; From 1a0e1eb5a007588b1cbaacd8ccb1921ddfa2df77 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 24 Jun 2015 13:19:16 +0200 Subject: [PATCH 083/158] More stats --- src/hydra-queue-runner/build-remote.cc | 14 ++++++++++---- src/hydra-queue-runner/build-remote.hh | 4 ++-- src/hydra-queue-runner/hydra-queue-runner.cc | 6 +++++- src/script/hydra-send-stats | 13 +++++++++++++ 4 files changed, 30 insertions(+), 7 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index befb13fa..73cf294c 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -128,8 +128,8 @@ void buildRemote(std::shared_ptr store, const string & sshName, const string & sshKey, const Path & drvPath, const Derivation & drv, const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout, - TokenServer & copyClosureTokenServer, - RemoteResult & result, counter & nrStepsBuilding) + TokenServer & copyClosureTokenServer, RemoteResult & result, + counter & nrStepsBuilding, counter & nrStepsCopyingTo, counter & nrStepsCopyingFrom) { string base = baseNameOf(drvPath); result.logFile = logDir + "/" + string(base, 0, 2) + "/" + string(base, 2); @@ -178,7 +178,10 @@ void buildRemote(std::shared_ptr store, /* Copy the input closure. 
*/ printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName); - copyClosureTo(store, from, to, inputs, copyClosureTokenServer); + { + MaintainCount mc(nrStepsCopyingTo); + copyClosureTo(store, from, to, inputs, copyClosureTokenServer); + } autoDelete.cancel(); @@ -210,7 +213,10 @@ void buildRemote(std::shared_ptr store, PathSet outputs; for (auto & output : drv.outputs) outputs.insert(output.second.path); - copyClosureFrom(store, from, to, outputs); + { + MaintainCount mc(nrStepsCopyingFrom); + copyClosureFrom(store, from, to, outputs); + } /* Shut down the connection. */ child.to.close(); diff --git a/src/hydra-queue-runner/build-remote.hh b/src/hydra-queue-runner/build-remote.hh index 86ec767f..19625b19 100644 --- a/src/hydra-queue-runner/build-remote.hh +++ b/src/hydra-queue-runner/build-remote.hh @@ -23,5 +23,5 @@ void buildRemote(std::shared_ptr store, const std::string & sshName, const std::string & sshKey, const nix::Path & drvPath, const nix::Derivation & drv, const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout, - TokenServer & copyClosureTokenServer, - RemoteResult & result, counter & nrStepsBuilding); + TokenServer & copyClosureTokenServer, RemoteResult & result, + counter & nrStepsBuilding, counter & nrStepsCopyingTo, counter & nrStepsCopyingFrom); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index c4ddee89..f6ba6179 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -257,6 +257,8 @@ private: counter nrStepsDone{0}; counter nrActiveSteps{0}; counter nrStepsBuilding{0}; + counter nrStepsCopyingTo{0}; + counter nrStepsCopyingFrom{0}; counter nrRetries{0}; counter maxNrRetries{0}; counter totalStepTime{0}; // total time for steps, including closure copying @@ -1114,7 +1116,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, /* FIXME: referring builds may have 
conflicting timeouts. */ buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, build->maxSilentTime, build->buildTimeout, copyClosureTokenServer, - result, nrStepsBuilding); + result, nrStepsBuilding, nrStepsCopyingTo, nrStepsCopyingFrom); } catch (Error & e) { result.status = RemoteResult::rrMiscFailure; result.errorMsg = e.msg(); @@ -1542,6 +1544,8 @@ void State::dumpStatus(Connection & conn, bool log) } root.attr("nrActiveSteps", nrActiveSteps); root.attr("nrStepsBuilding", nrStepsBuilding); + root.attr("nrStepsCopyingTo", nrStepsCopyingTo); + root.attr("nrStepsCopyingFrom", nrStepsCopyingFrom); root.attr("nrBuildsRead", nrBuildsRead); root.attr("nrBuildsDone", nrBuildsDone); root.attr("nrStepsDone", nrStepsDone); diff --git a/src/script/hydra-send-stats b/src/script/hydra-send-stats index dd893d77..b3e28968 100755 --- a/src/script/hydra-send-stats +++ b/src/script/hydra-send-stats @@ -3,6 +3,7 @@ use strict; use utf8; use Net::Statsd; +use File::Slurp; use JSON; STDERR->autoflush(1); @@ -20,10 +21,14 @@ sub sendQueueRunnerStats { my $json = decode_json($s) or die "cannot decode queue runner status"; + gauge("hydra.queue.up", $json->{status} eq "up" ? 
1 : 0); + return if $json->{status} ne "up"; gauge("hydra.queue.steps.active", $json->{nrActiveSteps}); gauge("hydra.queue.steps.building", $json->{nrStepsBuilding}); + gauge("hydra.queue.steps.copying_to", $json->{nrStepsCopyingTo}); + gauge("hydra.queue.steps.copying_from", $json->{nrStepsCopyingFrom}); gauge("hydra.queue.steps.runnable", $json->{nrRunnableSteps}); gauge("hydra.queue.steps.unfinished", $json->{nrUnfinishedSteps}); gauge("hydra.queue.steps.finished", $json->{nrStepsDone}); @@ -44,5 +49,13 @@ sub sendQueueRunnerStats { while (1) { eval { sendQueueRunnerStats(); }; if ($@) { warn "$@"; } + + my $meminfo = read_file("/proc/meminfo", err_mode => 'quiet') // ""; + $meminfo =~ m/Dirty:\s*(\d+) kB/; + if (defined $1) { + my $dirty = $1 / (1024.0 * 1024.0); + gauge("hydra.mem.dirty", $dirty); + } + sleep(30); } From 32210905d81b152c79e31c95ca2b3a6d90f6c4d3 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 25 Jun 2015 12:24:11 +0200 Subject: [PATCH 084/158] Automatically reload $NIX_REMOTE_SYSTEMS when it changes Otherwise, you'd have to restart the queue runner to add or remove machines. 
--- src/hydra-queue-runner/hydra-queue-runner.cc | 196 +++++++++++-------- 1 file changed, 118 insertions(+), 78 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index f6ba6179..ccb3e829 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -180,10 +180,15 @@ struct Machine unsigned int maxJobs = 1; float speedFactor = 1.0; - counter currentJobs{0}; - counter nrStepsDone{0}; - counter totalStepTime{0}; // total time for steps, including closure copying - counter totalStepBuildTime{0}; // total build time for steps + struct State { + typedef std::shared_ptr ptr; + counter currentJobs{0}; + counter nrStepsDone{0}; + counter totalStepTime{0}; // total time for steps, including closure copying + counter totalStepBuildTime{0}; // total build time for steps + }; + + State::ptr state; bool supportsStep(Step::ptr step) { @@ -197,23 +202,6 @@ struct Machine }; -/* A RAII helper that manages the currentJobs field of Machine - objects. */ -struct MachineReservation -{ - typedef std::shared_ptr ptr; - Machine::ptr machine; - MachineReservation(Machine::ptr machine) : machine(machine) - { - machine->currentJobs++; - } - ~MachineReservation() - { - machine->currentJobs--; - } -}; - - class State { private: @@ -243,9 +231,12 @@ private: Pool dbPool; /* The build machines. */ - typedef std::list Machines; + typedef std::map Machines; Sync machines; + Path machinesFile; + struct stat machinesFileStat; + /* Token server limiting the number of threads copying closures in parallel to prevent excessive I/O load. */ TokenServer copyClosureTokenServer{maxParallelCopyClosure}; @@ -285,7 +276,11 @@ private: void clearBusy(Connection & conn, time_t stopTime); - void loadMachines(); + /* (Re)load /etc/nix/machines. */ + void loadMachinesFile(); + + /* Thread to reload /etc/nix/machines periodically. 
*/ + void monitorMachinesFile(); int createBuildStep(pqxx::work & txn, time_t startTime, Build::ptr build, Step::ptr step, const std::string & machine, BuildStepStatus status, const std::string & errorMsg = "", @@ -316,7 +311,7 @@ private: void wakeDispatcher(); - void builder(Step::ptr step, MachineReservation::ptr reservation); + void builder(Step::ptr step, Machine::ptr machine, std::shared_ptr reservation); /* Perform the given build step. Return true if the step is to be retried. */ @@ -357,54 +352,95 @@ State::State() if (hydraData == "") throw Error("$HYDRA_DATA must be set"); logDir = canonPath(hydraData + "/build-logs"); + + machinesFile = getEnv("NIX_REMOTE_SYSTEMS", "/etc/nix/machines"); + machinesFileStat.st_ino = 0; + machinesFileStat.st_mtime = 0; } -void State::loadMachines() +void State::loadMachinesFile() { - Path machinesFile = getEnv("NIX_REMOTE_SYSTEMS", "/etc/nix/machines"); - - Machines newMachines; - + string contents; if (pathExists(machinesFile)) { - - for (auto line : tokenizeString(readFile(machinesFile), "\n")) { - line = trim(string(line, 0, line.find('#'))); - auto tokens = tokenizeString>(line); - if (tokens.size() < 3) continue; - tokens.resize(7); - - auto machine = std::make_shared(); - machine->sshName = tokens[0]; - machine->systemTypes = tokenizeString(tokens[1], ","); - machine->sshKey = tokens[2]; - if (tokens[3] != "") - string2Int(tokens[3], machine->maxJobs); - else - machine->maxJobs = 1; - machine->speedFactor = atof(tokens[4].c_str()); - machine->supportedFeatures = tokenizeString(tokens[5], ","); - machine->mandatoryFeatures = tokenizeString(tokens[6], ","); - for (auto & f : machine->mandatoryFeatures) - machine->supportedFeatures.insert(f); - newMachines.push_back(machine); - } - + struct stat st; + if (stat(machinesFile.c_str(), &st) != 0) + throw SysError(format("getting stats about ‘%1%’") % machinesFile); + if (st.st_ino == machinesFileStat.st_ino && st.st_mtime == machinesFileStat.st_mtime) + return; + 
printMsg(lvlDebug, "reloading machines"); + contents = readFile(machinesFile); + machinesFileStat = st; } else { - auto machine = std::make_shared(); - machine->sshName = "localhost"; - machine->systemTypes = StringSet({settings.thisSystem}); + StringSet systems = StringSet({settings.thisSystem}); if (settings.thisSystem == "x86_64-linux") - machine->systemTypes.insert("i686-linux"); - machine->maxJobs = settings.maxBuildJobs; - newMachines.push_back(machine); + systems.insert("i686-linux"); + contents = "localhost " + concatStringsSep(",", systems) + + " - " + int2String(settings.maxBuildJobs) + " 1"; } + Machines newMachines, oldMachines; + { + auto machines_(machines.lock()); + oldMachines = *machines_; + } + + for (auto line : tokenizeString(contents, "\n")) { + line = trim(string(line, 0, line.find('#'))); + auto tokens = tokenizeString>(line); + if (tokens.size() < 3) continue; + tokens.resize(7); + + auto machine = std::make_shared(); + machine->sshName = tokens[0]; + machine->systemTypes = tokenizeString(tokens[1], ","); + machine->sshKey = tokens[2]; + if (tokens[3] != "") + string2Int(tokens[3], machine->maxJobs); + else + machine->maxJobs = 1; + machine->speedFactor = atof(tokens[4].c_str()); + machine->supportedFeatures = tokenizeString(tokens[5], ","); + machine->mandatoryFeatures = tokenizeString(tokens[6], ","); + for (auto & f : machine->mandatoryFeatures) + machine->supportedFeatures.insert(f); + + /* Re-use the State object of the previous machine with the + same name. */ + auto i = oldMachines.find(machine->sshName); + if (i == oldMachines.end()) + printMsg(lvlChatty, format("adding new machine ‘%1%’") % machine->sshName); + else + printMsg(lvlChatty, format("updating machine ‘%1%’") % machine->sshName); + machine->state = i == oldMachines.end() + ? 
std::make_shared() + : i->second->state; + newMachines[machine->sshName] = machine; + } + + for (auto & m : oldMachines) + if (newMachines.find(m.first) == newMachines.end()) + printMsg(lvlInfo, format("removing machine ‘%1%’") % m.first); + auto machines_(machines.lock()); *machines_ = newMachines; } +void State::monitorMachinesFile() +{ + while (true) { + try { + // FIXME: use inotify. + sleep(60); + loadMachinesFile(); + } catch (std::exception & e) { + printMsg(lvlError, format("reloading machines file: %1%") % e.what()); + } + } +} + + void State::clearBusy(Connection & conn, time_t stopTime) { pqxx::work txn(conn); @@ -619,7 +655,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, { auto machines_(machines.lock()); // FIXME: use shared_mutex for (auto & m : *machines_) - if (m->supportsStep(r)) { supported = true; break; } + if (m.second->supportsStep(r)) { supported = true; break; } } if (!supported) { @@ -896,7 +932,7 @@ void State::dispatcher() { auto machines_(machines.lock()); for (auto & m : *machines_) - machinesSorted.push_back({m, m->currentJobs}); + machinesSorted.push_back({m.second, m.second->state->currentJobs}); } /* Sort the machines by a combination of speed factor and @@ -929,7 +965,7 @@ void State::dispatcher() for (auto & mi : machinesSorted) { // FIXME: can we lose a wakeup if a builder exits concurrently? - if (mi.machine->currentJobs >= mi.machine->maxJobs) continue; + if (mi.machine->state->currentJobs >= mi.machine->maxJobs) continue; auto runnable_(runnable.lock()); //printMsg(lvlDebug, format("%1% runnable builds") % runnable_->size()); @@ -966,10 +1002,10 @@ void State::dispatcher() /* Make a slot reservation and start a thread to do the build. 
*/ - auto reservation = std::make_shared(mi.machine); + auto reservation = std::make_shared(mi.machine->state->currentJobs); i = runnable_->erase(i); - auto builderThread = std::thread(&State::builder, this, step, reservation); + auto builderThread = std::thread(&State::builder, this, step, mi.machine, reservation); builderThread.detach(); // FIXME? keepGoing = true; @@ -1003,7 +1039,7 @@ void State::wakeDispatcher() } -void State::builder(Step::ptr step, MachineReservation::ptr reservation) +void State::builder(Step::ptr step, Machine::ptr machine, std::shared_ptr reservation) { bool retry = true; @@ -1011,10 +1047,10 @@ void State::builder(Step::ptr step, MachineReservation::ptr reservation) try { auto store = openStore(); // FIXME: pool - retry = doBuildStep(store, step, reservation->machine); + retry = doBuildStep(store, step, machine); } catch (std::exception & e) { printMsg(lvlError, format("uncaught exception building ‘%1%’ on ‘%2%’: %3%") - % step->drvPath % reservation->machine->sshName % e.what()); + % step->drvPath % machine->sshName % e.what()); } /* Release the machine and wake up the dispatcher. 
*/ @@ -1359,9 +1395,9 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, nrStepsDone++; totalStepTime += stepStopTime - stepStartTime; totalStepBuildTime += result.stopTime - result.startTime; - machine->nrStepsDone++; - machine->totalStepTime += stepStopTime - stepStartTime; - machine->totalStepBuildTime += result.stopTime - result.startTime; + machine->state->nrStepsDone++; + machine->state->totalStepTime += stepStopTime - stepStartTime; + machine->state->totalStepBuildTime += result.stopTime - result.startTime; return false; } @@ -1564,16 +1600,18 @@ void State::dumpStatus(Connection & conn, bool log) root.attr("machines"); JSONObject nested(out); auto machines_(machines.lock()); - for (auto & m : *machines_) { + for (auto & i : *machines_) { + auto & m(i.second); + auto & s(m->state); nested.attr(m->sshName); JSONObject nested2(out); - nested2.attr("currentJobs", m->currentJobs); - nested2.attr("nrStepsDone", m->nrStepsDone); - if (m->nrStepsDone) { - nested2.attr("totalStepTime", m->totalStepTime); - nested2.attr("totalStepBuildTime", m->totalStepBuildTime); - nested2.attr("avgStepTime"); out << (float) m->totalStepTime / m->nrStepsDone; - nested2.attr("avgStepBuildTime"); out << (float) m->totalStepBuildTime / m->nrStepsDone; + nested2.attr("currentJobs", s->currentJobs); + nested2.attr("nrStepsDone", s->nrStepsDone); + if (m->state->nrStepsDone) { + nested2.attr("totalStepTime", s->totalStepTime); + nested2.attr("totalStepBuildTime", s->totalStepBuildTime); + nested2.attr("avgStepTime"); out << (float) s->totalStepTime / s->nrStepsDone; + nested2.attr("avgStepBuildTime"); out << (float) s->totalStepBuildTime / s->nrStepsDone; } } } @@ -1670,7 +1708,9 @@ void State::run() dumpStatus(*conn, false); } - loadMachines(); + loadMachinesFile(); + + std::thread(&State::monitorMachinesFile, this).detach(); std::thread(&State::queueMonitor, this).detach(); From 18a3c3ff1c1cb6b601a1db36b28ca052c765a85c Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: 
Thu, 25 Jun 2015 15:29:22 +0200 Subject: [PATCH 085/158] Update "make check" for the new queue runner Also, if the machines file contains an entry for localhost, then run "nix-store --serve" directly, without going through SSH. --- src/hydra-queue-runner/Makefile.am | 3 ++- src/hydra-queue-runner/build-remote.cc | 23 +++++++++++++----- src/hydra-queue-runner/hydra-queue-runner.cc | 25 ++++++++++++++++---- tests/Makefile.am | 14 ++++------- tests/Setup.pm | 3 +-- tests/evaluation-tests.pl | 2 +- tests/query-all-tables.pl | 21 ---------------- tests/s3-backup-test.pl | 0 tests/set-up.pl | 3 +++ tests/tear-down.pl | 5 ++++ 10 files changed, 54 insertions(+), 45 deletions(-) mode change 100755 => 100644 tests/evaluation-tests.pl delete mode 100755 tests/query-all-tables.pl mode change 100755 => 100644 tests/s3-backup-test.pl create mode 100644 tests/set-up.pl create mode 100644 tests/tear-down.pl diff --git a/src/hydra-queue-runner/Makefile.am b/src/hydra-queue-runner/Makefile.am index 00aa254d..699a22a5 100644 --- a/src/hydra-queue-runner/Makefile.am +++ b/src/hydra-queue-runner/Makefile.am @@ -1,6 +1,7 @@ bin_PROGRAMS = hydra-queue-runner -hydra_queue_runner_SOURCES = hydra-queue-runner.cc build-result.cc build-remote.cc +hydra_queue_runner_SOURCES = hydra-queue-runner.cc build-result.cc build-remote.cc \ + build-remote.hh build-result.hh counter.hh pool.hh sync.hh token-server.hh hydra_queue_runner_LDADD = $(NIX_LIBS) -lpqxx AM_CXXFLAGS = $(NIX_CFLAGS) -Wall diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 73cf294c..4fa9da0c 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -21,6 +21,12 @@ struct Child }; +static void append(Strings & dst, const Strings & src) +{ + dst.insert(dst.end(), src.begin(), src.end()); +} + + static void openConnection(const string & sshName, const string & sshKey, int stderrFD, Child & child) { @@ -39,13 +45,18 @@ static void 
openConnection(const string & sshName, const string & sshKey, if (dup2(stderrFD, STDERR_FILENO) == -1) throw SysError("cannot dup stderr"); - // FIXME: connection timeouts - Strings argv( - { "ssh", sshName, "-i", sshKey, "-x", "-a" - , "-oBatchMode=yes", "-oConnectTimeout=60", "-oTCPKeepAlive=yes" - , "--", "nix-store", "--serve", "--write" }); + Strings argv; + if (sshName == "localhost") + argv = {"nix-store", "--serve", "--write"}; + else { + argv = {"ssh", sshName}; + if (sshKey != "" && sshKey != "-") append(argv, {"-i", sshKey}); + append(argv, + { "-x", "-a", "-oBatchMode=yes", "-oConnectTimeout=60", "-oTCPKeepAlive=yes" + , "--", "nix-store", "--serve", "--write" }); + } - execvp("ssh", (char * *) stringsToCharPtrs(argv).data()); // FIXME: remove cast + execvp(argv.front().c_str(), (char * *) stringsToCharPtrs(argv).data()); // FIXME: remove cast throw SysError("cannot start ssh"); }); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index ccb3e829..9037ee10 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -232,7 +232,7 @@ private: /* The build machines. */ typedef std::map Machines; - Sync machines; + Sync machines; // FIXME: use atomic_shared_ptr Path machinesFile; struct stat machinesFileStat; @@ -269,6 +269,9 @@ private: Sync> notificationSenderQueue; std::condition_variable_any notificationSenderWakeup; + /* Specific build to do for --build-one (testing only). 
*/ + BuildID buildOne; + public: State(); @@ -342,7 +345,7 @@ public: void unlock(); - void run(); + void run(BuildID buildOne = 0); }; @@ -562,6 +565,7 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, for (auto const & row : res) { auto builds_(builds.lock()); BuildID id = row["id"].as(); + if (buildOne && id != buildOne) continue; if (id > lastBuildId) lastBuildId = id; if (has(*builds_, id)) continue; @@ -1122,6 +1126,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, % step->drvPath % machine->sshName % build->id % (dependents.size() - 1)); } + bool quit = build->id == buildOne; + auto conn(dbPool.get()); RemoteResult result; @@ -1188,6 +1194,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, finishBuildStep(txn, result.startTime, result.stopTime, build->id, stepNr, machine->sshName, bssAborted, result.errorMsg); txn.commit(); + if (quit) exit(1); return true; } } @@ -1379,6 +1386,7 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, b->finishedInDB = true; builds_->erase(b->id); dependentIDs.push_back(b->id); + if (buildOne == b->id) quit = true; } } @@ -1399,6 +1407,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, machine->state->totalStepTime += stepStopTime - stepStartTime; machine->state->totalStepBuildTime += result.stopTime - result.startTime; + if (quit) exit(0); // testing hack + return false; } @@ -1694,9 +1704,10 @@ void State::unlock() } -void State::run() +void State::run(BuildID buildOne) { startedAt = time(0); + this->buildOne = buildOne; auto lock = acquireGlobalLock(); if (!lock) @@ -1752,13 +1763,17 @@ int main(int argc, char * * argv) bool unlock = false; bool status = false; + BuildID buildOne = 0; parseCmdLine(argc, argv, [&](Strings::iterator & arg, const Strings::iterator & end) { if (*arg == "--unlock") unlock = true; else if (*arg == "--status") status = true; - else + else if (*arg == "--build-one") { + if (!string2Int(getArg(*arg, arg, end), 
buildOne)) + throw Error("‘--build-one’ requires a build ID"); + } else return false; return true; }); @@ -1773,6 +1788,6 @@ int main(int argc, char * * argv) else if (unlock) state.unlock(); else - state.run(); + state.run(buildOne); }); } diff --git a/tests/Makefile.am b/tests/Makefile.am index 165903d3..8b8880f7 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,6 +1,6 @@ TESTS_ENVIRONMENT = \ BZR_HOME="$(abs_builddir)/data" \ - HYDRA_DBI="dbi:SQLite:db.sqlite" \ + HYDRA_DBI="dbi:Pg:dbname=hydra-test-suite;" \ HYDRA_DATA="$(abs_builddir)/data" \ HYDRA_HOME="$(top_srcdir)/src" \ HYDRA_CONFIG= \ @@ -22,15 +22,11 @@ EXTRA_DIST = \ $(TESTS) TESTS = \ - query-all-tables.pl \ - evaluation-tests.pl + set-up.pl \ + evaluation-tests.pl \ + tear-down.pl -clean: - chmod -R a+w nix || true - rm -rf db.sqlite data nix git-repo hg-repo svn-repo svn-checkout svn-checkout-repo bzr-repo bzr-checkout-repo darcs-repo - rm -f .*-state - -check_SCRIPTS = db.sqlite repos +check_SCRIPTS = repos db.sqlite: $(top_srcdir)/src/sql/hydra-sqlite.sql $(TESTS_ENVIRONMENT) $(top_srcdir)/src/script/hydra-init diff --git a/tests/Setup.pm b/tests/Setup.pm index 96aecde6..209d950d 100644 --- a/tests/Setup.pm +++ b/tests/Setup.pm @@ -71,8 +71,7 @@ sub evalSucceeds { sub runBuild { my ($build) = @_; - my ($res, $stdout, $stderr) = captureStdoutStderr(60, ("hydra-build", $build->id)); - print "STDERR: $stderr" if $stderr ne ""; + my ($res, $stdout, $stderr) = captureStdoutStderr(60, ("hydra-queue-runner", "-vvvv", "--build-one", $build->id)); return !$res; } diff --git a/tests/evaluation-tests.pl b/tests/evaluation-tests.pl old mode 100755 new mode 100644 index 90ae41df..71673f79 --- a/tests/evaluation-tests.pl +++ b/tests/evaluation-tests.pl @@ -28,7 +28,7 @@ ok(nrQueuedBuildsForJobset($jobset) == 3 , "Evaluating jobs/basic.nix should res for my $build (queuedBuildsForJobset($jobset)) { ok(runBuild($build), "Build '".$build->job->name."' from jobs/basic.nix should exit with code 0"); my 
$newbuild = $db->resultset('Builds')->find($build->id); - my $expected = $build->job->name eq "fails" ? 1 : 0; + my $expected = $build->job->name eq "fails" ? 1 : $build->job->name =~ /with_failed/ ? 6 : 0; ok($newbuild->finished == 1 && $newbuild->buildstatus == $expected, "Build '".$build->job->name."' from jobs/basic.nix should have buildstatus $expected"); } diff --git a/tests/query-all-tables.pl b/tests/query-all-tables.pl deleted file mode 100755 index eb67a7f2..00000000 --- a/tests/query-all-tables.pl +++ /dev/null @@ -1,21 +0,0 @@ -use strict; -use Hydra::Schema; -use Hydra::Model::DB; - -my $db = Hydra::Model::DB->new; - -my @sources = $db->schema->sources; -my $nrtables = scalar(@sources); - -use Test::Simple tests => 38; - -foreach my $source (@sources) { - my $title = "Basic select query for $source"; - if ($source eq "SchemaVersion" || $source eq "NrBuilds") { - ok(scalar($db->resultset($source)->all) == 1, $title); - } elsif( $source !~ m/^LatestSucceeded/) { - ok(scalar($db->resultset($source)->all) == 0, $title); - } else { - ok(scalar($db->resultset($source)->search({},{ bind => ["", "", ""] })) == 0, $title); - } -} diff --git a/tests/s3-backup-test.pl b/tests/s3-backup-test.pl old mode 100755 new mode 100644 diff --git a/tests/set-up.pl b/tests/set-up.pl new file mode 100644 index 00000000..ff72483f --- /dev/null +++ b/tests/set-up.pl @@ -0,0 +1,3 @@ +use strict; +system("createdb hydra-test-suite") == 0 or die; +system("hydra-init") == 0 or die; diff --git a/tests/tear-down.pl b/tests/tear-down.pl new file mode 100644 index 00000000..32e51a87 --- /dev/null +++ b/tests/tear-down.pl @@ -0,0 +1,5 @@ +use strict; +system("chmod -R a+w nix") == 0 or die; +system("rm -rf data nix git-repo hg-repo svn-repo svn-checkout svn-checkout-repo bzr-repo bzr-checkout-repo darcs-repo") == 0 or die; +system("rm -f .*-state") == 0 or die; +system("dropdb hydra-test-suite") == 0 or die; From b5815e2aa68bb7fade4adbcad48e53445d1c7e72 Mon Sep 17 00:00:00 2001 From: 
Eelco Dolstra Date: Thu, 25 Jun 2015 15:51:44 +0200 Subject: [PATCH 086/158] Run PostgreSQL during "make check" --- release.nix | 1 + tests/Makefile.am | 4 ++-- tests/set-up.pl | 4 +++- tests/tear-down.pl | 15 +++++++++++---- 4 files changed, 17 insertions(+), 7 deletions(-) diff --git a/release.nix b/release.nix index 813c3975..9d2af8f9 100644 --- a/release.nix +++ b/release.nix @@ -154,6 +154,7 @@ in rec { gitAndTools.topGit mercurial darcs subversion bazaar openssl bzip2 guile # optional, for Guile + Guix support perlDeps perl + postgresql92 # for running the tests ]; hydraPath = lib.makeSearchPath "bin" ( diff --git a/tests/Makefile.am b/tests/Makefile.am index 8b8880f7..7a11bca5 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -1,6 +1,6 @@ TESTS_ENVIRONMENT = \ BZR_HOME="$(abs_builddir)/data" \ - HYDRA_DBI="dbi:Pg:dbname=hydra-test-suite;" \ + HYDRA_DBI="dbi:Pg:dbname=hydra-test-suite;port=6433" \ HYDRA_DATA="$(abs_builddir)/data" \ HYDRA_HOME="$(top_srcdir)/src" \ HYDRA_CONFIG= \ @@ -12,7 +12,7 @@ TESTS_ENVIRONMENT = \ NIX_LOG_DIR="$(abs_builddir)/nix/var/log/nix" \ NIX_BUILD_HOOK= \ PERL5LIB="$(srcdir):$(top_srcdir)/src/lib:$$PERL5LIB" \ - PATH=$(abs_top_srcdir)/src/script:$(abs_top_srcdir)/src/hydra-eval-jobs:$$PATH \ + PATH=$(abs_top_srcdir)/src/script:$(abs_top_srcdir)/src/hydra-eval-jobs:$(abs_top_srcdir)/src/hydra-queue-runner:$$PATH \ perl -w EXTRA_DIST = \ diff --git a/tests/set-up.pl b/tests/set-up.pl index ff72483f..63679b63 100644 --- a/tests/set-up.pl +++ b/tests/set-up.pl @@ -1,3 +1,5 @@ use strict; -system("createdb hydra-test-suite") == 0 or die; +system("initdb -D postgres") == 0 or die; +system("pg_ctl -D postgres -o \"-F -p 6433 -h ''\" -w start") == 0 or die; +system("createdb -p 6433 hydra-test-suite") == 0 or die; system("hydra-init") == 0 or die; diff --git a/tests/tear-down.pl b/tests/tear-down.pl index 32e51a87..f30bb278 100644 --- a/tests/tear-down.pl +++ b/tests/tear-down.pl @@ -1,5 +1,12 @@ use strict; -system("chmod -R a+w 
nix") == 0 or die; -system("rm -rf data nix git-repo hg-repo svn-repo svn-checkout svn-checkout-repo bzr-repo bzr-checkout-repo darcs-repo") == 0 or die; -system("rm -f .*-state") == 0 or die; -system("dropdb hydra-test-suite") == 0 or die; + +my $fail = 0; + +system("dropdb -p 6433 hydra-test-suite") == 0 or $fail = 1; +system("pg_ctl -D postgres -w stop") == 0 or $fail = 1; + +system("chmod -R a+w nix") == 0 or $fail = 1; +system("rm -rf postgres data nix git-repo hg-repo svn-repo svn-checkout svn-checkout-repo bzr-repo bzr-checkout-repo darcs-repo") == 0 or $fail = 1; +system("rm -f .*-state") == 0 or $fail = 1; + +exit $fail; From c6fcce3b3b476256a6aec720e2fee1482d50a565 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 25 Jun 2015 16:46:59 +0200 Subject: [PATCH 087/158] Moar stats --- src/hydra-queue-runner/build-remote.cc | 31 ++++++++++++-------- src/hydra-queue-runner/build-remote.hh | 3 +- src/hydra-queue-runner/counter.hh | 2 +- src/hydra-queue-runner/hydra-queue-runner.cc | 7 ++++- src/script/hydra-send-stats | 3 ++ 5 files changed, 31 insertions(+), 15 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index 4fa9da0c..f9a0483a 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -71,7 +71,7 @@ static void openConnection(const string & sshName, const string & sshKey, static void copyClosureTo(std::shared_ptr store, FdSource & from, FdSink & to, const PathSet & paths, - TokenServer & copyClosureTokenServer, + TokenServer & copyClosureTokenServer, counter & bytesSent, bool useSubstitutes = false) { PathSet closure; @@ -115,6 +115,9 @@ static void copyClosureTo(std::shared_ptr store, printMsg(lvlDebug, format("sending %1% missing paths") % missing.size()); + for (auto & p : missing) + bytesSent += store->queryPathInfo(p).narSize; + writeInt(cmdImportPaths, to); exportPaths(*store, missing, false, to); to.flush(); @@ -125,13 +128,16 @@ static void 
copyClosureTo(std::shared_ptr store, static void copyClosureFrom(std::shared_ptr store, - FdSource & from, FdSink & to, const PathSet & paths) + FdSource & from, FdSink & to, const PathSet & paths, counter & bytesReceived) { writeInt(cmdExportPaths, to); writeInt(0, to); // == don't sign writeStrings(paths, to); to.flush(); store->importPaths(false, from); + + for (auto & p : paths) + bytesReceived += store->queryPathInfo(p).narSize; } @@ -140,7 +146,8 @@ void buildRemote(std::shared_ptr store, const Path & drvPath, const Derivation & drv, const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout, TokenServer & copyClosureTokenServer, RemoteResult & result, - counter & nrStepsBuilding, counter & nrStepsCopyingTo, counter & nrStepsCopyingFrom) + counter & nrStepsBuilding, counter & nrStepsCopyingTo, counter & nrStepsCopyingFrom, + counter & bytesSent, counter & bytesReceived) { string base = baseNameOf(drvPath); result.logFile = logDir + "/" + string(base, 0, 2) + "/" + string(base, 2); @@ -188,10 +195,10 @@ void buildRemote(std::shared_ptr store, } /* Copy the input closure. */ - printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName); - { + if (sshName != "localhost") { + printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % drvPath % sshName); MaintainCount mc(nrStepsCopyingTo); - copyClosureTo(store, from, to, inputs, copyClosureTokenServer); + copyClosureTo(store, from, to, inputs, copyClosureTokenServer, bytesSent); } autoDelete.cancel(); @@ -220,13 +227,13 @@ void buildRemote(std::shared_ptr store, } /* Copy the output paths. 
*/ - printMsg(lvlDebug, format("copying outputs of ‘%1%’ from ‘%2%’") % drvPath % sshName); - PathSet outputs; - for (auto & output : drv.outputs) - outputs.insert(output.second.path); - { + if (sshName != "localhost") { + printMsg(lvlDebug, format("copying outputs of ‘%1%’ from ‘%2%’") % drvPath % sshName); + PathSet outputs; + for (auto & output : drv.outputs) + outputs.insert(output.second.path); MaintainCount mc(nrStepsCopyingFrom); - copyClosureFrom(store, from, to, outputs); + copyClosureFrom(store, from, to, outputs, bytesReceived); } /* Shut down the connection. */ diff --git a/src/hydra-queue-runner/build-remote.hh b/src/hydra-queue-runner/build-remote.hh index 19625b19..3aa2d919 100644 --- a/src/hydra-queue-runner/build-remote.hh +++ b/src/hydra-queue-runner/build-remote.hh @@ -24,4 +24,5 @@ void buildRemote(std::shared_ptr store, const nix::Path & drvPath, const nix::Derivation & drv, const nix::Path & logDir, unsigned int maxSilentTime, unsigned int buildTimeout, TokenServer & copyClosureTokenServer, RemoteResult & result, - counter & nrStepsBuilding, counter & nrStepsCopyingTo, counter & nrStepsCopyingFrom); + counter & nrStepsBuilding, counter & nrStepsCopyingTo, counter & nrStepsCopyingFrom, + counter & bytesSent, counter & bytesReceived); diff --git a/src/hydra-queue-runner/counter.hh b/src/hydra-queue-runner/counter.hh index 4d6b4163..912cb499 100644 --- a/src/hydra-queue-runner/counter.hh +++ b/src/hydra-queue-runner/counter.hh @@ -2,7 +2,7 @@ #include -typedef std::atomic counter; +typedef std::atomic counter; struct MaintainCount { diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 9037ee10..79ef3a04 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -256,6 +256,8 @@ private: counter totalStepBuildTime{0}; // total build time for steps counter nrQueueWakeups{0}; counter nrDispatcherWakeups{0}; + counter bytesSent{0}; + counter 
bytesReceived{0}; /* Log compressor work queue. */ Sync> logCompressorQueue; @@ -1158,7 +1160,8 @@ bool State::doBuildStep(std::shared_ptr store, Step::ptr step, /* FIXME: referring builds may have conflicting timeouts. */ buildRemote(store, machine->sshName, machine->sshKey, step->drvPath, step->drv, logDir, build->maxSilentTime, build->buildTimeout, copyClosureTokenServer, - result, nrStepsBuilding, nrStepsCopyingTo, nrStepsCopyingFrom); + result, nrStepsBuilding, nrStepsCopyingTo, nrStepsCopyingFrom, + bytesSent, bytesReceived); } catch (Error & e) { result.status = RemoteResult::rrMiscFailure; result.errorMsg = e.msg(); @@ -1592,6 +1595,8 @@ void State::dumpStatus(Connection & conn, bool log) root.attr("nrStepsBuilding", nrStepsBuilding); root.attr("nrStepsCopyingTo", nrStepsCopyingTo); root.attr("nrStepsCopyingFrom", nrStepsCopyingFrom); + root.attr("bytesSent", bytesSent); + root.attr("bytesReceived", bytesReceived); root.attr("nrBuildsRead", nrBuildsRead); root.attr("nrBuildsDone", nrBuildsDone); root.attr("nrStepsDone", nrStepsDone); diff --git a/src/script/hydra-send-stats b/src/script/hydra-send-stats index b3e28968..9ef2dce1 100755 --- a/src/script/hydra-send-stats +++ b/src/script/hydra-send-stats @@ -44,6 +44,9 @@ sub sendQueueRunnerStats { gauge("hydra.queue.builds.finished", $json->{nrBuildsDone}); gauge("hydra.queue.checks", $json->{nrQueueWakeups}); + + gauge("hydra.queue.bytes_sent", $json->{bytesSent}); + gauge("hydra.queue.bytes_received", $json->{bytesReceived}); } while (1) { From c54a04688e24e77426f4ca89225ad9cf90c27ee9 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 24 Jun 2015 13:45:58 +0200 Subject: [PATCH 088/158] Fix email sender address when notification_sender is not set --- src/lib/Hydra/Helper/Email.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/Hydra/Helper/Email.pm b/src/lib/Hydra/Helper/Email.pm index 47fc5629..e628fca3 100644 --- a/src/lib/Hydra/Helper/Email.pm +++ 
b/src/lib/Hydra/Helper/Email.pm @@ -13,7 +13,7 @@ sub sendEmail { my ($config, $to, $subject, $body, $extraHeaders) = @_; my $url = getBaseUrl($config); - my $sender = $config->{'notification_sender'} // (($ENV{'USER'} // "hydra") . "@" . $url); + my $sender = $config->{'notification_sender'} // (($ENV{'USER'} // "hydra") . "@" . hostname_long); my @headers = ( To => $to, From 2f4676bd9780fae433ae207aa715986aae78d67b Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Thu, 25 Jun 2015 16:59:41 +0200 Subject: [PATCH 089/158] JSONObject doesn't handle 64-bit integers --- src/hydra-queue-runner/hydra-queue-runner.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 79ef3a04..9a1c2652 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -1595,8 +1595,8 @@ void State::dumpStatus(Connection & conn, bool log) root.attr("nrStepsBuilding", nrStepsBuilding); root.attr("nrStepsCopyingTo", nrStepsCopyingTo); root.attr("nrStepsCopyingFrom", nrStepsCopyingFrom); - root.attr("bytesSent", bytesSent); - root.attr("bytesReceived", bytesReceived); + root.attr("bytesSent"); out << bytesSent; + root.attr("bytesReceived"); out << bytesReceived; root.attr("nrBuildsRead", nrBuildsRead); root.attr("nrBuildsDone", nrBuildsDone); root.attr("nrStepsDone", nrStepsDone); From 9a041f9a362aa89eb0b77cf7c40fe774df9e5875 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 26 Jun 2015 11:28:38 +0200 Subject: [PATCH 090/158] Restart builds failed due to unsupported system type --- src/lib/Hydra/Controller/JobsetEval.pm | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lib/Hydra/Controller/JobsetEval.pm b/src/lib/Hydra/Controller/JobsetEval.pm index 65ae7fb0..7a618b7f 100644 --- a/src/lib/Hydra/Controller/JobsetEval.pm +++ b/src/lib/Hydra/Controller/JobsetEval.pm @@ -169,7 +169,7 @@ sub cancel : 
Chained('eval') PathPart('cancel') Args(0) { sub restart_aborted : Chained('eval') PathPart('restart-aborted') Args(0) { my ($self, $c) = @_; requireProjectOwner($c, $c->stash->{eval}->project); - my $builds = $c->stash->{eval}->builds->search({ finished => 1, buildstatus => { -in => [3, 4] } }); + my $builds = $c->stash->{eval}->builds->search({ finished => 1, buildstatus => { -in => [3, 4, 9] } }); my $n = restartBuilds($c->model('DB')->schema, $builds); $c->flash->{successMsg} = "$n builds have been restarted."; $c->res->redirect($c->uri_for($c->controller('JobsetEval')->action_for('view'), $c->req->captures)); From 73a78c20730462c82c7b6f7e54dec2b86be165d8 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 26 Jun 2015 11:29:19 +0200 Subject: [PATCH 091/158] Give services.hydra.package a reasonable default --- hydra-module.nix | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hydra-module.nix b/hydra-module.nix index 00f3e439..12c76e35 100644 --- a/hydra-module.nix +++ b/hydra-module.nix @@ -144,6 +144,8 @@ in config = mkIf cfg.enable { + services.hydra.package = mkDefault ((import ./release.nix {}).build.x86_64-linux); + services.hydra.extraConfig = '' using_frontend_proxy 1 From aabe514d07c8a91fd138e2283753dafe76eaabe7 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 26 Jun 2015 11:29:30 +0200 Subject: [PATCH 092/158] Update the hydra-queue-runner unit --- hydra-module.nix | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/hydra-module.nix b/hydra-module.nix index 12c76e35..470f842d 100644 --- a/hydra-module.nix +++ b/hydra-module.nix @@ -232,8 +232,8 @@ in path = [ pkgs.nettools ]; environment = env; serviceConfig = - { ExecStartPre = "${cfg.package}/bin/hydra-queue-runner --unlock"; - ExecStart = "@${cfg.package}/bin/hydra-queue-runner hydra-queue-runner"; + { ExecStart = "@${cfg.package}/bin/hydra-queue-runner hydra-queue-runner"; + ExecStopPost = "${cfg.package}/bin/hydra-queue-runner --unlock"; User = "hydra"; Restart = 
"always"; }; From 17924ce012740f8565e9e43f43d52920ad47eb3f Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 26 Jun 2015 11:56:11 +0200 Subject: [PATCH 093/158] Note in the log when the queue runner / evaluator gets killed --- hydra-module.nix | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hydra-module.nix b/hydra-module.nix index 470f842d..0933e9a6 100644 --- a/hydra-module.nix +++ b/hydra-module.nix @@ -271,9 +271,11 @@ in '' #! ${pkgs.stdenv.shell} if [ $(($(stat -f -c '%a' /nix/store) * $(stat -f -c '%S' /nix/store))) -lt $((${toString cfg.minimumDiskFree} * 1024**3)) ]; then + echo "stopping Hydra queue runner due to lack of free space..." systemctl stop hydra-queue-runner fi if [ $(($(stat -f -c '%a' /nix/store) * $(stat -f -c '%S' /nix/store))) -lt $((${toString cfg.minimumDiskFreeEvaluator} * 1024**3)) ]; then + echo "stopping Hydra evaluator due to lack of free space..." systemctl stop hydra-evaluator fi ''; From 401f5bdce2721af4e53d755f59f5d437bfb427e1 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 26 Jun 2015 15:24:12 +0200 Subject: [PATCH 094/158] Add a unit for hydra-send-stats --- hydra-module.nix | 20 +++++++++++++++----- src/script/Makefile.am | 1 + 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/hydra-module.nix b/hydra-module.nix index 0933e9a6..823ca0c0 100644 --- a/hydra-module.nix +++ b/hydra-module.nix @@ -186,7 +186,7 @@ in build-compress-log = false ''; - systemd.services."hydra-init" = + systemd.services.hydra-init = { wantedBy = [ "multi-user.target" ]; requires = [ "postgresql.service" ]; after = [ "postgresql.service" ]; @@ -210,7 +210,7 @@ in serviceConfig.RemainAfterExit = true; }; - systemd.services."hydra-server" = + systemd.services.hydra-server = { wantedBy = [ "multi-user.target" ]; requires = [ "hydra-init.service" ]; after = [ "hydra-init.service" ]; @@ -225,7 +225,7 @@ in }; }; - systemd.services."hydra-queue-runner" = + systemd.services.hydra-queue-runner = { wantedBy = [ "multi-user.target" 
]; requires = [ "hydra-init.service" ]; after = [ "hydra-init.service" "network.target" ]; @@ -239,7 +239,7 @@ in }; }; - systemd.services."hydra-evaluator" = + systemd.services.hydra-evaluator = { wantedBy = [ "multi-user.target" ]; requires = [ "hydra-init.service" ]; after = [ "hydra-init.service" "network.target" ]; @@ -252,7 +252,7 @@ in }; }; - systemd.services."hydra-update-gc-roots" = + systemd.services.hydra-update-gc-roots = { requires = [ "hydra-init.service" ]; after = [ "hydra-init.service" ]; environment = env; @@ -263,6 +263,16 @@ in startAt = "02:15"; }; + systemd.services.hydra-send-stats = + { wantedBy = [ "multi-user.target" ]; + after = [ "hydra-init.service" ]; + environment = env; + serviceConfig = + { ExecStart = "@${cfg.package}/bin/hydra-send-stats hydra-send-stats"; + User = "hydra"; + }; + }; + services.cron.systemCronJobs = let # If there is less than ... GiB of free disk space, stop the queue diff --git a/src/script/Makefile.am b/src/script/Makefile.am index ce612f37..9b389bd4 100644 --- a/src/script/Makefile.am +++ b/src/script/Makefile.am @@ -10,6 +10,7 @@ distributable_scripts = \ hydra-s3-backup-collect-garbage \ hydra-create-user \ hydra-notify \ + hydra-send-stats \ nix-prefetch-git \ nix-prefetch-bzr \ nix-prefetch-hg From f5e5a1b96e04e2cb53674ef0a7da7d6cf2349bb1 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 26 Jun 2015 20:59:14 +0200 Subject: [PATCH 095/158] Don't wake up the queue runner for cached evals --- src/script/hydra-evaluator | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/script/hydra-evaluator b/src/script/hydra-evaluator index 7dce10c7..b8fe9d1b 100755 --- a/src/script/hydra-evaluator +++ b/src/script/hydra-evaluator @@ -237,6 +237,8 @@ sub checkJobsetWrapped { print STDERR " created new eval ", $ev->id, "\n"; $ev->builds->update({iscurrent => 1}); + + $db->storage->dbh->do("notify builds_added"); } else { print STDERR " created cached eval ", $ev->id, "\n"; 
$prevEval->builds->update({iscurrent => 1}) if defined $prevEval; @@ -246,8 +248,6 @@ sub checkJobsetWrapped { $jobset->update({ enabled => 0 }) if $jobset->enabled == 2; $jobset->update({ lastcheckedtime => time }); - - $db->storage->dbh->do("notify builds_added"); }); # Store the error messages for jobs that failed to evaluate. From 008d61046758226800a71aa7e9988e692cdb461b Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Fri, 26 Jun 2015 21:06:35 +0200 Subject: [PATCH 096/158] getQueuedBuilds(): Don't catch errors while loading a build from the queue Otherwise we never recover from reset daemon connections, e.g. hydra-queue-runner[16106]: while loading build 599369: cannot start daemon worker: reading from file: Connection reset by peer hydra-queue-runner[16106]: while loading build 599236: writing to file: Broken pipe ... The error is now handled by queueMonitor(), causing the next call to queueMonitorLoop() to create a new connection. --- src/hydra-queue-runner/hydra-queue-runner.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 9a1c2652..3ae69813 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -719,8 +719,8 @@ void State::getQueuedBuilds(Connection & conn, std::shared_ptr store, try { createBuild(build); } catch (Error & e) { - printMsg(lvlError, format("while loading build %1%: %2%") % build->id % e.what()); - continue; // FIXME: retry later? 
+ e.addPrefix(format("while loading build %1%: ") % build->id); + throw; } /* Add the new runnable build steps to ‘runnable’ and wake up From 171303864e70e670eecf86aab56e96850ebc741c Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 29 Jun 2015 09:28:34 +0000 Subject: [PATCH 097/158] Fix type --- hydra-module.nix | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hydra-module.nix b/hydra-module.nix index 823ca0c0..d7cd7c8f 100644 --- a/hydra-module.nix +++ b/hydra-module.nix @@ -41,7 +41,7 @@ in }; dbi = mkOption { - type = types.string; + type = types.str; default = "dbi:Pg:dbname=hydra;user=hydra;"; example = "dbi:Pg:dbname=hydra;host=postgres.example.org;user=foo;"; description = '' From 2ece42b2b9f9d567ba3235c498c53d222c3346dc Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 30 Jun 2015 00:20:19 +0200 Subject: [PATCH 098/158] Support preferLocalBuild Derivations with "preferLocalBuild = true" can now be executed on specific machines (typically localhost) by setting the mandatory system features field to include "local". For example: localhost x86_64-linux,i686-linux - 10 100 - local says that "localhost" can *only* do builds with "preferLocalBuild = true". The speed factor of 100 will make the machine almost always win over other machines. 
--- src/hydra-queue-runner/hydra-queue-runner.cc | 23 +++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 3ae69813..d1f748c4 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -138,6 +138,7 @@ struct Step Path drvPath; Derivation drv; std::set requiredSystemFeatures; + bool preferLocalBuild; struct State { @@ -194,7 +195,9 @@ struct Machine { if (systemTypes.find(step->drv.platform) == systemTypes.end()) return false; for (auto & f : mandatoryFeatures) - if (step->requiredSystemFeatures.find(f) == step->requiredSystemFeatures.end()) return false; + if (step->requiredSystemFeatures.find(f) == step->requiredSystemFeatures.end() + && !(step->preferLocalBuild && f == "local")) + return false; for (auto & f : step->requiredSystemFeatures) if (supportedFeatures.find(f) == supportedFeatures.end()) return false; return true; @@ -208,6 +211,8 @@ private: Path hydraData, logDir; + StringSet localPlatforms; + /* The queued builds. 
*/ typedef std::map Builds; Sync builds; @@ -361,6 +366,10 @@ State::State() machinesFile = getEnv("NIX_REMOTE_SYSTEMS", "/etc/nix/machines"); machinesFileStat.st_ino = 0; machinesFileStat.st_mtime = 0; + + localPlatforms = {settings.thisSystem}; + if (settings.thisSystem == "x86_64-linux") + localPlatforms.insert("i686-linux"); } @@ -377,10 +386,7 @@ void State::loadMachinesFile() contents = readFile(machinesFile); machinesFileStat = st; } else { - StringSet systems = StringSet({settings.thisSystem}); - if (settings.thisSystem == "x86_64-linux") - systems.insert("i686-linux"); - contents = "localhost " + concatStringsSep(",", systems) + contents = "localhost " + concatStringsSep(",", localPlatforms) + " - " + int2String(settings.maxBuildJobs) + " 1"; } @@ -405,7 +411,9 @@ void State::loadMachinesFile() else machine->maxJobs = 1; machine->speedFactor = atof(tokens[4].c_str()); + if (tokens[5] == "-") tokens[5] = ""; machine->supportedFeatures = tokenizeString(tokens[5], ","); + if (tokens[6] == "-") tokens[6] = ""; machine->mandatoryFeatures = tokenizeString(tokens[6], ","); for (auto & f : machine->mandatoryFeatures) machine->supportedFeatures.insert(f); @@ -820,6 +828,11 @@ Step::ptr State::createStep(std::shared_ptr store, const Path & drvPat step->requiredSystemFeatures = tokenizeString>(i->second); } + auto attr = step->drv.env.find("preferLocalBuild"); + step->preferLocalBuild = + attr != step->drv.env.end() && attr->second == "1" + && has(localPlatforms, step->drv.platform); + /* Are all outputs valid? */ bool valid = true; for (auto & i : step->drv.outputs) { From 7e6135a8c676ce95b80d919b9cc20355445cfb30 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Tue, 30 Jun 2015 00:27:31 +0200 Subject: [PATCH 099/158] Don't repeat links to build step logs Hydra only stores the last log for a particular derivation, so only show log links for the last one. 
--- src/root/build.tt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/root/build.tt b/src/root/build.tt index 6d9f474a..5ab62a9a 100644 --- a/src/root/build.tt +++ b/src/root/build.tt @@ -18,9 +18,10 @@ - [% FOREACH step IN build.buildsteps %] + [% FOREACH step IN build.buildsteps.reverse %] [% IF ( type == "All" ) || ( type == "Failed" && step.status != 0 ) || ( type == "Running" && step.busy == 1 ) %] - [% has_log = buildStepLogExists(step); + [% has_log = seen.${step.drvpath} ? 0 : buildStepLogExists(step); + seen.${step.drvpath} = 1; log = c.uri_for('/build' build.id 'nixlog' step.stepnr); %] From 60e8b9881bf52a92a542832f817b84b195ad3f58 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 1 Jul 2015 10:47:18 +0200 Subject: [PATCH 100/158] The new queue runner requires "hydra" to be a trusted Nix user (Because it does unsigned imports into the store.) --- hydra-module.nix | 2 ++ 1 file changed, 2 insertions(+) diff --git a/hydra-module.nix b/hydra-module.nix index d7cd7c8f..56ac6fee 100644 --- a/hydra-module.nix +++ b/hydra-module.nix @@ -186,6 +186,8 @@ in build-compress-log = false ''; + nix.trustedUsers = [ "hydra" ]; + systemd.services.hydra-init = { wantedBy = [ "multi-user.target" ]; requires = [ "postgresql.service" ]; From 3c665dac82f5c6f1ab7a1d1f3d613c1004d87606 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 1 Jul 2015 11:34:19 +0200 Subject: [PATCH 101/158] Remove superfluous HYDRA_LOGO environment variable --- src/lib/Hydra/Controller/Root.pm | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lib/Hydra/Controller/Root.pm b/src/lib/Hydra/Controller/Root.pm index c4765065..cf437be7 100644 --- a/src/lib/Hydra/Controller/Root.pm +++ b/src/lib/Hydra/Controller/Root.pm @@ -30,7 +30,7 @@ sub begin :Private { $c->stash->{version} = $ENV{"HYDRA_RELEASE"} || ""; $c->stash->{nixVersion} = $ENV{"NIX_RELEASE"} || ""; $c->stash->{curTime} = time; - $c->stash->{logo} = ($c->config->{hydra_logo} // 
$ENV{"HYDRA_LOGO"}) ? "/logo" : ""; + $c->stash->{logo} = defined $c->config->{hydra_logo} ? "/logo" : ""; $c->stash->{tracker} = $ENV{"HYDRA_TRACKER"}; $c->stash->{flashMsg} = $c->flash->{flashMsg}; $c->stash->{successMsg} = $c->flash->{successMsg}; @@ -270,7 +270,7 @@ sub narinfo :LocalRegex('^([a-z0-9]+).narinfo$') :Args(0) { sub logo :Local { my ($self, $c) = @_; - my $path = $c->config->{hydra_logo} // $ENV{"HYDRA_LOGO"} // die("Logo not set!"); + my $path = $c->config->{hydra_logo} // die("Logo not set!"); $c->serve_static_file($path); } From 1e87b15f911d59ae4790e97b4b8fed3469998f82 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 1 Jul 2015 11:40:00 +0200 Subject: [PATCH 102/158] Remove Nix options not needed by the new queue runner --- hydra-module.nix | 9 --------- 1 file changed, 9 deletions(-) diff --git a/hydra-module.nix b/hydra-module.nix index 56ac6fee..6c395e76 100644 --- a/hydra-module.nix +++ b/hydra-module.nix @@ -175,15 +175,6 @@ in # The default (`true') slows Nix down a lot since the build farm # has so many GC roots. gc-check-reachability = false - - # Hydra needs caching of build failures. - build-cache-failure = true - - build-poll-interval = 10 - - # Online log compression makes it impossible to get the tail of - # builds that are in progress. - build-compress-log = false ''; nix.trustedUsers = [ "hydra" ]; From 85a1ce99c9f5f7bebf042231109fb8d80572ea46 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Wed, 1 Jul 2015 14:24:18 +0200 Subject: [PATCH 103/158] Only include Persona JS when Persona is enabled --- src/root/layout.tt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/root/layout.tt b/src/root/layout.tt index daa65b6d..cf221cd1 100644 --- a/src/root/layout.tt +++ b/src/root/layout.tt @@ -117,7 +117,7 @@ - [% ELSE %] + [% ELSIF personaEnabled %]
Queued:Queued at: [% INCLUDE renderDateTime timestamp = build.timestamp %]
@@ -27,7 +27,7 @@
NrWhatDurationMachineStatus
[% step.stepnr %]