From cc6bafe538783ea2a3812cb9c5638a7495c55e64 Mon Sep 17 00:00:00 2001 From: Pierre Bourdon Date: Sat, 20 Apr 2024 16:48:03 +0200 Subject: [PATCH] queue-runner: add prom metrics to allow detecting internal bottlenecks By looking at the ratio of running vs. waiting for the dispatcher and the queue monitor, we should get better visibility into what hydra is currently bottlenecked on. There are other side effects we can try to measure to get to the same result, but having a simple way doesn't cost us much. --- src/hydra-queue-runner/dispatcher.cc | 12 ++++++--- src/hydra-queue-runner/hydra-queue-runner.cc | 28 ++++++++++++++++++++ src/hydra-queue-runner/queue-monitor.cc | 11 ++++++++ src/hydra-queue-runner/state.hh | 6 +++++ 4 files changed, 54 insertions(+), 3 deletions(-) diff --git a/src/hydra-queue-runner/dispatcher.cc b/src/hydra-queue-runner/dispatcher.cc index f5d9d3e3..8c7c0db8 100644 --- a/src/hydra-queue-runner/dispatcher.cc +++ b/src/hydra-queue-runner/dispatcher.cc @@ -39,13 +39,15 @@ void State::dispatcher() printMsg(lvlDebug, "dispatcher woken up"); nrDispatcherWakeups++; - auto now1 = std::chrono::steady_clock::now(); + auto t_before_work = std::chrono::steady_clock::now(); auto sleepUntil = doDispatch(); - auto now2 = std::chrono::steady_clock::now(); + auto t_after_work = std::chrono::steady_clock::now(); - dispatchTimeMs += std::chrono::duration_cast(now2 - now1).count(); + prom.dispatcher_time_spent_running.Increment( + std::chrono::duration_cast(t_after_work - t_before_work).count()); + dispatchTimeMs += std::chrono::duration_cast(t_after_work - t_before_work).count(); /* Sleep until we're woken up (either because a runnable build is added, or because a build finishes). */ @@ -59,6 +61,10 @@ void State::dispatcher() *dispatcherWakeup_ = false; } + auto t_after_sleep = std::chrono::steady_clock::now(); + prom.dispatcher_time_spent_waiting.Increment( + std::chrono::duration_cast(t_after_sleep - t_after_work).count()); + } catch (std::exception & e) { printError("dispatcher: %s", e.what()); sleep(1); diff --git a/src/hydra-queue-runner/hydra-queue-runner.cc b/src/hydra-queue-runner/hydra-queue-runner.cc index 6e7a1816..0caea7d3 100644 --- a/src/hydra-queue-runner/hydra-queue-runner.cc +++ b/src/hydra-queue-runner/hydra-queue-runner.cc @@ -77,6 +77,34 @@ State::PromMetrics::PromMetrics() .Register(*registry) .Add({}) ) + , dispatcher_time_spent_running( + prometheus::BuildCounter() + .Name("hydraqueuerunner_dispatcher_time_spent_running") + .Help("Time (in micros) spent running the dispatcher") + .Register(*registry) + .Add({}) + ) + , dispatcher_time_spent_waiting( + prometheus::BuildCounter() + .Name("hydraqueuerunner_dispatcher_time_spent_waiting") + .Help("Time (in micros) spent waiting for the dispatcher to obtain work") + .Register(*registry) + .Add({}) + ) + , queue_monitor_time_spent_running( + prometheus::BuildCounter() + .Name("hydraqueuerunner_queue_monitor_time_spent_running") + .Help("Time (in micros) spent running the queue monitor") + .Register(*registry) + .Add({}) + ) + , queue_monitor_time_spent_waiting( + prometheus::BuildCounter() + .Name("hydraqueuerunner_queue_monitor_time_spent_waiting") + .Help("Time (in micros) spent waiting for the queue monitor to obtain work") + .Register(*registry) + .Add({}) + ) { } diff --git a/src/hydra-queue-runner/queue-monitor.cc b/src/hydra-queue-runner/queue-monitor.cc index ce5eef3f..6b643ec3 100644 --- a/src/hydra-queue-runner/queue-monitor.cc +++ b/src/hydra-queue-runner/queue-monitor.cc @@ -41,12 +41,19 @@ void State::queueMonitorLoop(Connection & conn) bool quit = false; while (!quit) { + auto t_before_work = std::chrono::steady_clock::now(); + localStore->clearPathInfoCache(); bool done = getQueuedBuilds(conn, destStore, lastBuildId); if (buildOne && buildOneDone) quit = true; + auto t_after_work = std::chrono::steady_clock::now(); + + prom.queue_monitor_time_spent_running.Increment( + std::chrono::duration_cast(t_after_work - t_before_work).count()); + /* Sleep until we get notification from the database about an event. */ if (done && !quit) { @@ -71,6 +78,10 @@ void State::queueMonitorLoop(Connection & conn) printMsg(lvlTalkative, "got notification: jobset shares changed"); processJobsetSharesChange(conn); } + + auto t_after_sleep = std::chrono::steady_clock::now(); + prom.queue_monitor_time_spent_waiting.Increment( + std::chrono::duration_cast(t_after_sleep - t_after_work).count()); } exit(0); diff --git a/src/hydra-queue-runner/state.hh b/src/hydra-queue-runner/state.hh index 89a7567d..3fd57be3 100644 --- a/src/hydra-queue-runner/state.hh +++ b/src/hydra-queue-runner/state.hh @@ -492,6 +492,12 @@ private: prometheus::Counter& queue_checks_finished; prometheus::Gauge& queue_max_id; + prometheus::Counter& dispatcher_time_spent_running; + prometheus::Counter& dispatcher_time_spent_waiting; + + prometheus::Counter& queue_monitor_time_spent_running; + prometheus::Counter& queue_monitor_time_spent_waiting; + PromMetrics(); }; PromMetrics prom;