forked from lix-project/hydra
queue-runner: add prom metrics to allow detecting internal bottlenecks
By looking at the ratio of time spent running vs. time spent waiting, for both the dispatcher and the queue monitor, we should get better visibility into what Hydra is currently bottlenecked on. There are other side effects we could try to measure to reach the same conclusion, but exposing these timings directly is simple and doesn't cost us much.
This commit is contained in:
parent
6189ba9c5e
commit
cc6bafe538
|
@ -39,13 +39,15 @@ void State::dispatcher()
|
||||||
printMsg(lvlDebug, "dispatcher woken up");
|
printMsg(lvlDebug, "dispatcher woken up");
|
||||||
nrDispatcherWakeups++;
|
nrDispatcherWakeups++;
|
||||||
|
|
||||||
auto now1 = std::chrono::steady_clock::now();
|
auto t_before_work = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
auto sleepUntil = doDispatch();
|
auto sleepUntil = doDispatch();
|
||||||
|
|
||||||
auto now2 = std::chrono::steady_clock::now();
|
auto t_after_work = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
dispatchTimeMs += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();
|
prom.dispatcher_time_spent_running.Increment(
|
||||||
|
std::chrono::duration_cast<std::chrono::microseconds>(t_after_work - t_before_work).count());
|
||||||
|
dispatchTimeMs += std::chrono::duration_cast<std::chrono::milliseconds>(t_after_work - t_before_work).count();
|
||||||
|
|
||||||
/* Sleep until we're woken up (either because a runnable build
|
/* Sleep until we're woken up (either because a runnable build
|
||||||
is added, or because a build finishes). */
|
is added, or because a build finishes). */
|
||||||
|
@ -59,6 +61,10 @@ void State::dispatcher()
|
||||||
*dispatcherWakeup_ = false;
|
*dispatcherWakeup_ = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto t_after_sleep = std::chrono::steady_clock::now();
|
||||||
|
prom.dispatcher_time_spent_waiting.Increment(
|
||||||
|
std::chrono::duration_cast<std::chrono::microseconds>(t_after_sleep - t_after_work).count());
|
||||||
|
|
||||||
} catch (std::exception & e) {
|
} catch (std::exception & e) {
|
||||||
printError("dispatcher: %s", e.what());
|
printError("dispatcher: %s", e.what());
|
||||||
sleep(1);
|
sleep(1);
|
||||||
|
|
|
@ -77,6 +77,34 @@ State::PromMetrics::PromMetrics()
|
||||||
.Register(*registry)
|
.Register(*registry)
|
||||||
.Add({})
|
.Add({})
|
||||||
)
|
)
|
||||||
|
, dispatcher_time_spent_running(
|
||||||
|
prometheus::BuildCounter()
|
||||||
|
.Name("hydraqueuerunner_dispatcher_time_spent_running")
|
||||||
|
.Help("Time (in micros) spent running the dispatcher")
|
||||||
|
.Register(*registry)
|
||||||
|
.Add({})
|
||||||
|
)
|
||||||
|
, dispatcher_time_spent_waiting(
|
||||||
|
prometheus::BuildCounter()
|
||||||
|
.Name("hydraqueuerunner_dispatcher_time_spent_waiting")
|
||||||
|
.Help("Time (in micros) spent waiting for the dispatcher to obtain work")
|
||||||
|
.Register(*registry)
|
||||||
|
.Add({})
|
||||||
|
)
|
||||||
|
, queue_monitor_time_spent_running(
|
||||||
|
prometheus::BuildCounter()
|
||||||
|
.Name("hydraqueuerunner_queue_monitor_time_spent_running")
|
||||||
|
.Help("Time (in micros) spent running the queue monitor")
|
||||||
|
.Register(*registry)
|
||||||
|
.Add({})
|
||||||
|
)
|
||||||
|
, queue_monitor_time_spent_waiting(
|
||||||
|
prometheus::BuildCounter()
|
||||||
|
.Name("hydraqueuerunner_queue_monitor_time_spent_waiting")
|
||||||
|
.Help("Time (in micros) spent waiting for the queue monitor to obtain work")
|
||||||
|
.Register(*registry)
|
||||||
|
.Add({})
|
||||||
|
)
|
||||||
{
|
{
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -41,12 +41,19 @@ void State::queueMonitorLoop(Connection & conn)
|
||||||
|
|
||||||
bool quit = false;
|
bool quit = false;
|
||||||
while (!quit) {
|
while (!quit) {
|
||||||
|
auto t_before_work = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
localStore->clearPathInfoCache();
|
localStore->clearPathInfoCache();
|
||||||
|
|
||||||
bool done = getQueuedBuilds(conn, destStore, lastBuildId);
|
bool done = getQueuedBuilds(conn, destStore, lastBuildId);
|
||||||
|
|
||||||
if (buildOne && buildOneDone) quit = true;
|
if (buildOne && buildOneDone) quit = true;
|
||||||
|
|
||||||
|
auto t_after_work = std::chrono::steady_clock::now();
|
||||||
|
|
||||||
|
prom.queue_monitor_time_spent_running.Increment(
|
||||||
|
std::chrono::duration_cast<std::chrono::microseconds>(t_after_work - t_before_work).count());
|
||||||
|
|
||||||
/* Sleep until we get notification from the database about an
|
/* Sleep until we get notification from the database about an
|
||||||
event. */
|
event. */
|
||||||
if (done && !quit) {
|
if (done && !quit) {
|
||||||
|
@ -71,6 +78,10 @@ void State::queueMonitorLoop(Connection & conn)
|
||||||
printMsg(lvlTalkative, "got notification: jobset shares changed");
|
printMsg(lvlTalkative, "got notification: jobset shares changed");
|
||||||
processJobsetSharesChange(conn);
|
processJobsetSharesChange(conn);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
auto t_after_sleep = std::chrono::steady_clock::now();
|
||||||
|
prom.queue_monitor_time_spent_waiting.Increment(
|
||||||
|
std::chrono::duration_cast<std::chrono::microseconds>(t_after_sleep - t_after_work).count());
|
||||||
}
|
}
|
||||||
|
|
||||||
exit(0);
|
exit(0);
|
||||||
|
|
|
@ -492,6 +492,12 @@ private:
|
||||||
prometheus::Counter& queue_checks_finished;
|
prometheus::Counter& queue_checks_finished;
|
||||||
prometheus::Gauge& queue_max_id;
|
prometheus::Gauge& queue_max_id;
|
||||||
|
|
||||||
|
prometheus::Counter& dispatcher_time_spent_running;
|
||||||
|
prometheus::Counter& dispatcher_time_spent_waiting;
|
||||||
|
|
||||||
|
prometheus::Counter& queue_monitor_time_spent_running;
|
||||||
|
prometheus::Counter& queue_monitor_time_spent_waiting;
|
||||||
|
|
||||||
PromMetrics();
|
PromMetrics();
|
||||||
};
|
};
|
||||||
PromMetrics prom;
|
PromMetrics prom;
|
||||||
|
|
Loading…
Reference in a new issue