queue-runner: add prom metrics to allow detecting internal bottlenecks

By looking at the ratio of running vs. waiting for the dispatcher and
the queue monitor, we should get better visibility into what hydra is
currently bottlenecked on.

There are other side effects we can try to measure to get to the same
result, but having a simple way doesn't cost us much.
This commit is contained in:
Pierre Bourdon 2024-04-20 16:48:03 +02:00
parent 6189ba9c5e
commit cc6bafe538
Signed by untrusted user: delroth
GPG key ID: 6FB80DCD84DA0F1C
4 changed files with 54 additions and 3 deletions

View file

@ -39,13 +39,15 @@ void State::dispatcher()
printMsg(lvlDebug, "dispatcher woken up"); printMsg(lvlDebug, "dispatcher woken up");
nrDispatcherWakeups++; nrDispatcherWakeups++;
auto now1 = std::chrono::steady_clock::now(); auto t_before_work = std::chrono::steady_clock::now();
auto sleepUntil = doDispatch(); auto sleepUntil = doDispatch();
auto now2 = std::chrono::steady_clock::now(); auto t_after_work = std::chrono::steady_clock::now();
dispatchTimeMs += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count(); prom.dispatcher_time_spent_running.Increment(
std::chrono::duration_cast<std::chrono::microseconds>(t_after_work - t_before_work).count());
dispatchTimeMs += std::chrono::duration_cast<std::chrono::milliseconds>(t_after_work - t_before_work).count();
/* Sleep until we're woken up (either because a runnable build /* Sleep until we're woken up (either because a runnable build
is added, or because a build finishes). */ is added, or because a build finishes). */
@ -59,6 +61,10 @@ void State::dispatcher()
*dispatcherWakeup_ = false; *dispatcherWakeup_ = false;
} }
auto t_after_sleep = std::chrono::steady_clock::now();
prom.dispatcher_time_spent_waiting.Increment(
std::chrono::duration_cast<std::chrono::microseconds>(t_after_sleep - t_after_work).count());
} catch (std::exception & e) { } catch (std::exception & e) {
printError("dispatcher: %s", e.what()); printError("dispatcher: %s", e.what());
sleep(1); sleep(1);

View file

@ -77,6 +77,34 @@ State::PromMetrics::PromMetrics()
.Register(*registry) .Register(*registry)
.Add({}) .Add({})
) )
, dispatcher_time_spent_running(
prometheus::BuildCounter()
.Name("hydraqueuerunner_dispatcher_time_spent_running")
.Help("Time (in micros) spent running the dispatcher")
.Register(*registry)
.Add({})
)
, dispatcher_time_spent_waiting(
prometheus::BuildCounter()
.Name("hydraqueuerunner_dispatcher_time_spent_waiting")
.Help("Time (in micros) spent waiting for the dispatcher to obtain work")
.Register(*registry)
.Add({})
)
, queue_monitor_time_spent_running(
prometheus::BuildCounter()
.Name("hydraqueuerunner_queue_monitor_time_spent_running")
.Help("Time (in micros) spent running the queue monitor")
.Register(*registry)
.Add({})
)
, queue_monitor_time_spent_waiting(
prometheus::BuildCounter()
.Name("hydraqueuerunner_queue_monitor_time_spent_waiting")
.Help("Time (in micros) spent waiting for the queue monitor to obtain work")
.Register(*registry)
.Add({})
)
{ {
} }

View file

@ -41,12 +41,19 @@ void State::queueMonitorLoop(Connection & conn)
bool quit = false; bool quit = false;
while (!quit) { while (!quit) {
auto t_before_work = std::chrono::steady_clock::now();
localStore->clearPathInfoCache(); localStore->clearPathInfoCache();
bool done = getQueuedBuilds(conn, destStore, lastBuildId); bool done = getQueuedBuilds(conn, destStore, lastBuildId);
if (buildOne && buildOneDone) quit = true; if (buildOne && buildOneDone) quit = true;
auto t_after_work = std::chrono::steady_clock::now();
prom.queue_monitor_time_spent_running.Increment(
std::chrono::duration_cast<std::chrono::microseconds>(t_after_work - t_before_work).count());
/* Sleep until we get notification from the database about an /* Sleep until we get notification from the database about an
event. */ event. */
if (done && !quit) { if (done && !quit) {
@ -71,6 +78,10 @@ void State::queueMonitorLoop(Connection & conn)
printMsg(lvlTalkative, "got notification: jobset shares changed"); printMsg(lvlTalkative, "got notification: jobset shares changed");
processJobsetSharesChange(conn); processJobsetSharesChange(conn);
} }
auto t_after_sleep = std::chrono::steady_clock::now();
prom.queue_monitor_time_spent_waiting.Increment(
std::chrono::duration_cast<std::chrono::microseconds>(t_after_sleep - t_after_work).count());
} }
exit(0); exit(0);

View file

@ -492,6 +492,12 @@ private:
prometheus::Counter& queue_checks_finished; prometheus::Counter& queue_checks_finished;
prometheus::Gauge& queue_max_id; prometheus::Gauge& queue_max_id;
prometheus::Counter& dispatcher_time_spent_running;
prometheus::Counter& dispatcher_time_spent_waiting;
prometheus::Counter& queue_monitor_time_spent_running;
prometheus::Counter& queue_monitor_time_spent_waiting;
PromMetrics(); PromMetrics();
}; };
PromMetrics prom; PromMetrics prom;