From 7cd08c7c46621faed76bfb11fa3775d02a42eb0c Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 29 Feb 2016 15:10:30 +0100 Subject: [PATCH] Warn if PostgreSQL appears stalled --- src/hydra-queue-runner/builder.cc | 6 ++++++ src/hydra-queue-runner/counter.hh | 5 +++++ src/hydra-queue-runner/hydra-queue-runner.cc | 12 ++++++++++++ src/hydra-queue-runner/queue-monitor.cc | 6 ++++++ src/hydra-queue-runner/state.hh | 3 +++ 5 files changed, 32 insertions(+) diff --git a/src/hydra-queue-runner/builder.cc b/src/hydra-queue-runner/builder.cc index b8cb3c87..9179fe42 100644 --- a/src/hydra-queue-runner/builder.cc +++ b/src/hydra-queue-runner/builder.cc @@ -112,6 +112,7 @@ bool State::doBuildStep(nix::ref destStore, Step::ptr step, /* Create a build step record indicating that we started building. */ { + auto mc = startDbUpdate(); pqxx::work txn(*conn); stepNr = createBuildStep(txn, result.startTime, build, step, machine->sshName, bssBusy); txn.commit(); @@ -165,6 +166,7 @@ bool State::doBuildStep(nix::ref destStore, Step::ptr step, retry = step_->tries + 1 < maxTries; } if (retry) { + auto mc = startDbUpdate(); pqxx::work txn(*conn); finishBuildStep(txn, result.startTime, result.stopTime, result.overhead, build->id, stepNr, machine->sshName, bssAborted, result.errorMsg); @@ -213,6 +215,8 @@ bool State::doBuildStep(nix::ref destStore, Step::ptr step, /* Update the database. */ { + auto mc = startDbUpdate(); + pqxx::work txn(*conn); finishBuildStep(txn, result.startTime, result.stopTime, result.overhead, @@ -299,6 +303,8 @@ bool State::doBuildStep(nix::ref destStore, Step::ptr step, /* Update the database. */ { + auto mc = startDbUpdate(); + pqxx::work txn(*conn); BuildStatus buildStatus = diff --git a/src/hydra-queue-runner/counter.hh b/src/hydra-queue-runner/counter.hh index 1943d1c3..6afff99d 100644 --- a/src/hydra-queue-runner/counter.hh +++ b/src/hydra-queue-runner/counter.hh @@ -1,6 +1,7 @@ #pragma once #include +#include typedef std::atomic counter; @@ -8,5 +9,9 @@ struct MaintainCount { counter & c; MaintainCount(counter & c) : c(c) { c++; } + MaintainCount(counter & c, std::function warn) : c(c) + { + warn(++c); + } ~MaintainCount() { auto prev = c--; assert(prev); } }; diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 2e4229c2..d7e80f82 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -45,6 +45,16 @@ State::State() } +MaintainCount State::startDbUpdate() +{ + return MaintainCount(nrActiveDbUpdates, [](unsigned long c) { + if (c > 6) { + printMsg(lvlError, format("warning: %d concurrent database updates; PostgreSQL may be stalled") % c); + } + }); +} + + ref State::getLocalStore() { return ref(_localStore); @@ -552,6 +562,7 @@ void State::dumpStatus(Connection & conn, bool log) root.attr("nrQueueWakeups", nrQueueWakeups); root.attr("nrDispatcherWakeups", nrDispatcherWakeups); root.attr("nrDbConnections", dbPool.count()); + root.attr("nrActiveDbUpdates", nrActiveDbUpdates); { root.attr("machines"); JSONObject nested(out); @@ -661,6 +672,7 @@ void State::dumpStatus(Connection & conn, bool log) if (log) printMsg(lvlInfo, format("status: %1%") % out.str()); { + auto mc = startDbUpdate(); pqxx::work txn(conn); // FIXME: use PostgreSQL 9.5 upsert. txn.exec("delete from SystemStatus where what = 'queue-runner'"); diff --git a/src/hydra-queue-runner/queue-monitor.cc b/src/hydra-queue-runner/queue-monitor.cc index b44e0836..c9ea6da2 100644 --- a/src/hydra-queue-runner/queue-monitor.cc +++ b/src/hydra-queue-runner/queue-monitor.cc @@ -124,6 +124,7 @@ bool State::getQueuedBuilds(Connection & conn, ref localStore, /* Derivation has been GC'ed prematurely. */ printMsg(lvlError, format("aborting GC'ed build %1%") % build->id); if (!build->finishedInDB) { + auto mc = startDbUpdate(); pqxx::work txn(conn); txn.parameterized ("update Builds set finished = 1, buildStatus = $2, startTime = $3, stopTime = $3, errorMsg = $4 where id = $1 and finished = 0") @@ -161,10 +162,13 @@ bool State::getQueuedBuilds(Connection & conn, ref localStore, Derivation drv = readDerivation(build->drvPath); BuildOutput res = getBuildOutput(destStore, destStore->getFSAccessor(), drv); + { + auto mc = startDbUpdate(); pqxx::work txn(conn); time_t now = time(0); markSucceededBuild(txn, build, res, true, now, now); txn.commit(); + } build->finishedInDB = true; @@ -178,6 +182,7 @@ bool State::getQueuedBuilds(Connection & conn, ref localStore, if (checkCachedFailure(r, conn)) { printMsg(lvlError, format("marking build %1% as cached failure") % build->id); if (!build->finishedInDB) { + auto mc = startDbUpdate(); pqxx::work txn(conn); /* Find the previous build step record, first by @@ -421,6 +426,7 @@ Step::ptr State::createStep(ref destStore, time_t stopTime = time(0); { + auto mc = startDbUpdate(); pqxx::work txn(conn); createSubstitutionStep(txn, startTime, stopTime, build, drvPath, "out", i.second.path); txn.commit(); diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh index c1a5cfb6..be3ead24 100644 --- a/src/hydra-queue-runner/state.hh +++ b/src/hydra-queue-runner/state.hh @@ -313,6 +313,7 @@ private: counter nrDispatcherWakeups{0}; counter bytesSent{0}; counter bytesReceived{0}; + counter nrActiveDbUpdates{0}; /* Log compressor work queue. */ nix::Sync> logCompressorQueue; @@ -359,6 +360,8 @@ public: private: + MaintainCount startDbUpdate(); + /* Return a store object that can access derivations produced by hydra-evaluator. */ nix::ref getLocalStore();