forked from the-distro/infra
Hydra metrics
Yoink the nixos org exporter, rewrite most of it, deploy
This commit is contained in: parent 4749d204bf, commit e2c6550796
@@ -13,6 +13,8 @@
     hydra.builders = lib.genList (i: "builder-${builtins.toString i}") 10;
   };

+  bagel.monitoring.exporters.hydra.enable = true;
+
   # Hydra is proxied.
   bagel.raito.v6-proxy-awareness.enable = true;
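(For reference, the genList call in this hunk produces ten builder names, builder-0 through builder-9. A Python equivalent of that list construction, purely illustrative and not part of the commit:)

# Python sketch of the Nix expression
#   lib.genList (i: "builder-${builtins.toString i}") 10
builders = [f"builder-{i}" for i in range(10)]
# -> ['builder-0', 'builder-1', ..., 'builder-9']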
@@ -1,6 +1,7 @@
 {
   imports = [
     ./cadvisor.nix
+    ./hydra
     ./nginx.nix
     ./postgres.nix
   ];
services/monitoring/exporters/hydra/default.nix (new file, 39 lines)
@@ -0,0 +1,39 @@
{
  config,
  lib,
  pkgs,
  ...
}:
let
  cfg = config.bagel.monitoring.exporters.hydra;

  python = pkgs.python3.withPackages(ps: [
    ps.aioprometheus
    ps.click
    ps.httpx
    ps.starlette
    ps.uvicorn
  ]);

  inherit (lib) escapeShellArg getExe mkEnableOption mkIf mkOption types;
in
{
  options.bagel.monitoring.exporters.hydra = {
    enable = mkEnableOption "bagel flavored Hydra exporter";
    hydraUrl = mkOption {
      type = types.str;
      default = "https://hydra.forkos.org/";
      description = "URL to the Hydra to monitor";
    };
  };

  config = mkIf cfg.enable {
    systemd.services.hydra-exporter = {
      wantedBy = [ "multi-user.target" ];
      description = "Hydra exporter";
      script = "${getExe python} ${./hydra-exporter.py} --hydra-url=${escapeShellArg cfg.hydraUrl} --port=9105";
    };

    bagel.monitoring.grafana-agent.exporters.hydra.port = 9105;
  };
}
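(With the module enabled, the hydra-exporter unit serves Prometheus exposition text on port 9105, which the grafana-agent option above wires up for scraping. A minimal smoke check, assuming the unit is running on the local host; this sketch is not part of the commit:)

# Hypothetical smoke check against the exporter started by the unit above.
# Assumes the service is up on localhost:9105, the port passed via --port.
import httpx

body = httpx.get("http://localhost:9105/metrics").text
assert "hydra_up" in body  # one of the gauges defined in hydra-exporter.py below
print(len(body.splitlines()), "metric lines")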
services/monitoring/exporters/hydra/hydra-exporter.py (new executable file, 360 lines)
@@ -0,0 +1,360 @@
#!/usr/bin/env nix-shell
#!nix-shell -i python3 -p "python3.withPackages(ps: [ps.aioprometheus ps.click ps.httpx ps.starlette ps.uvicorn])"
import asyncio
from contextlib import asynccontextmanager
import logging
from aioprometheus import Counter, Gauge
from aioprometheus.asgi.starlette import metrics
import click
import httpx
from starlette.applications import Starlette
from starlette.routing import Route
import uvicorn


up = Gauge("hydra_up", "Is Hydra running")
time = Gauge("hydra_time", "Hydra's current time")
uptime = Gauge("hydra_uptime", "Hydra's uptime")

builds_queued = Gauge("hydra_builds_queued", "Number of jobs in build queue")
steps_active = Gauge("hydra_steps_active", "Number of active steps in build queue")
steps_building = Gauge("hydra_steps_building", "Number of steps currently building")
steps_copying_to = Gauge(
    "hydra_steps_copying_to", "Number of steps copying inputs to a worker"
)
steps_copying_from = Gauge(
    "hydra_steps_copying_from", "Number of steps copying outputs from a worker"
)
steps_waiting = Gauge(
    "hydra_steps_waiting", "Number of steps currently waiting for a worker slot"
)
steps_unsupported = Gauge(
    "hydra_steps_unsupported", "Number of unsupported steps in build queue"
)

bytes_sent = Counter(
    "hydra_build_inputs_sent_bytes_total",
    "Total number of bytes copied to workers as build inputs",
)
bytes_received = Counter(
    "hydra_build_outputs_received_bytes_total",
    "Total number of bytes copied from workers as build outputs",
)

builds_read = Counter(
    "hydra_builds_read_total",
    "Total number of builds whose outputs have been copied from workers",
)
builds_read_seconds = Counter(
    "hydra_builds_read_seconds_total",
    "Total time spent copying build outputs, in seconds",
)

builds_done = Counter("hydra_builds_done_total", "Total number of builds completed")
steps_started = Counter("hydra_steps_started_total", "Total number of steps started")
steps_done = Counter("hydra_steps_done_total", "Total number of steps completed")

retries = Counter("hydra_retries_total", "Total number of retries")
max_retries = Gauge(
    "hydra_max_retries", "Maximum observed number of retries for a single step"
)

queue_wakeups = Counter(
    "hydra_queue_wakeup_total",
    "Count of the times the queue runner has been notified of queue changes",
)
dispatcher_wakeups = Counter(
    "hydra_dispatcher_wakeup_total",
    "Count of the times the queue runner work dispatcher woke up due to new runnable builds and completed builds.",
)

dispatch_time = Counter(
    "hydra_dispatch_execution_seconds_total",
    "Total time the dispatcher has spent working, in seconds",
)

db_connections = Gauge("hydra_db_connections", "Number of connections to the database")
active_db_updates = Gauge("hydra_db_updates", "Number of in-progress database updates")

steps_queued = Gauge("hydra_steps_queued", "Number of steps in build queue")
steps_runnable = Gauge(
    "hydra_steps_runnable", "Number of runnable steps in build queue"
)

step_time = Counter(
    "hydra_step_time_total", "Total time spent executing steps, in seconds"
)
step_build_time = Counter(
    "hydra_step_build_time_total", "Total time spent executing build steps, in seconds"
)

machine_enabled = Gauge("hydra_machine_enabled", "Whether machine is enabled")
machine_steps_done = Counter(
    "hydra_machine_steps_done_total", "Total number of steps completed by this worker"
)
machine_current_jobs = Gauge(
    "hydra_machine_current_jobs", "Number of jobs currently running on this worker"
)
machine_disabled_until = Gauge(
    "hydra_machine_disabled_until",
    "Timestamp of when this worker will next become active",
)
machine_last_failure = Gauge(
    "hydra_machine_last_failure", "Timestamp of when a build last failed on this worker"
)
machine_consecutive_failures = Gauge(
    "hydra_machine_consecutive_failures",
    "Number of consecutive failed builds on this worker",
)

machine_idle_since = Gauge(
    "hydra_machine_idle_since", "Timestamp of when this worker last had jobs running"
)
machine_step_time = Counter(
    "hydra_machine_step_time_total",
    "Total time this worker spent executing steps, in seconds",
)
machine_step_build_time = Counter(
    "hydra_machine_step_build_time_total",
    "Total time this worker spent executing build steps, in seconds",
)

jobset_time = Counter(
    "hydra_jobset_seconds_total",
    "Total time this jobset has been building for, in seconds",
)
jobset_shares_used = Gauge(
    "hydra_jobset_shares_used", "Number of shares currently consumed by this jobset"
)

machine_type_runnable = Gauge(
    "hydra_machine_type_runnable",
    "Number of steps currently runnable on this machine type",
)
machine_type_running = Gauge(
    "hydra_machine_type_running",
    "Number of steps currently running on this machine type",
)
machine_type_wait_time = Counter(
    "hydra_machine_type_wait_time_total",
    "Total time spent waiting for a build slot of this machine type",
)
machine_type_last_active = Gauge(
    "hydra_machine_type_last_active",
    "Timestamp of when a machine of this type was last active",
)

store_nar_info_read = Counter(
    "hydra_store_nar_info_read_total",
    "Total number of narinfo files read from the remote store",
)
store_nar_info_read_averted = Counter(
    "hydra_store_nar_info_read_averted_total",
    "Total number of narinfo file reads averted (already loaded)",
)
store_nar_info_missing = Counter(
    "hydra_store_nar_info_missing_total",
    "Total number of narinfo files found to be missing",
)
store_nar_info_write = Counter(
    "hydra_store_nar_info_write_total",
    "Total number of narinfo files written to the remote store",
)
store_nar_info_cache_size = Gauge(
    "hydra_store_nar_info_cache_size",
    "Size of the in-memory store path information cache",
)
store_nar_read = Counter(
    "hydra_store_nar_read_total", "Total number of NAR files read from the remote store"
)
store_nar_read_bytes = Counter(
    "hydra_store_nar_read_bytes_total",
    "Total number of NAR file bytes read from the remote store (uncompressed)",
)
store_nar_read_compressed_bytes = Counter(
    "hydra_store_nar_read_compressed_bytes_total",
    "Total number of NAR file bytes read from the remote store (compressed)",
)
store_nar_write = Counter(
    "hydra_store_nar_write_total",
    "Total number of NAR files written to the remote store",
)
store_nar_write_averted = Counter(
    "hydra_store_nar_write_averted_total",
    "Total number of NAR file writes averted (already exists on remote)",
)
store_nar_write_bytes = Counter(
    "hydra_store_nar_write_bytes_total",
    "Total number of NAR file bytes written to the remote store (uncompressed)",
)
store_nar_write_compressed_bytes = Counter(
    "hydra_store_nar_write_compressed_bytes_total",
    "Total number of NAR file bytes written to the remote store (compressed)",
)
store_nar_write_compression_seconds = Counter(
    "hydra_store_nar_write_compression_seconds_total",
    "Total time spent compressing NAR files for writing to the remote store",
)

store_s3_put = Counter(
    "hydra_store_s3_put_total", "Total number of PUT requests to S3 store"
)
store_s3_put_bytes = Counter(
    "hydra_store_s3_put_bytes_total", "Total number of bytes written to S3 store"
)
store_s3_put_seconds = Counter(
    "hydra_store_s3_put_seconds_total",
    "Total time spent writing to S3 store, in seconds",
)
store_s3_get = Counter(
    "hydra_store_s3_get_total", "Total number of GET requests to S3 store"
)
store_s3_get_bytes = Counter(
    "hydra_store_s3_get_bytes_total", "Total number of bytes read from S3 store"
)
store_s3_get_seconds = Counter(
    "hydra_store_s3_get_seconds_total",
    "Total time spent reading from S3 store, in seconds",
)
store_s3_head = Counter(
    "hydra_store_s3_head_total", "Total number of HEAD requests to S3 store"
)


def update_metrics(status):
    up.set({}, int(status["status"] == "up"))
    time.set({}, status["time"])
    uptime.set({}, status["uptime"])

    builds_queued.set({}, status["nrQueuedBuilds"])
    steps_active.set({}, status["nrActiveSteps"])
    steps_building.set({}, status["nrStepsBuilding"])
    steps_copying_to.set({}, status["nrStepsCopyingTo"])
    steps_copying_from.set({}, status["nrStepsCopyingFrom"])
    steps_waiting.set({}, status["nrStepsWaiting"])
    steps_unsupported.set({}, status["nrUnsupportedSteps"])

    bytes_sent.set({}, status["bytesSent"])
    bytes_received.set({}, status["bytesReceived"])

    builds_read.set({}, status["nrBuildsRead"])
    builds_read_seconds.set({}, status["buildReadTimeMs"] / 1000)

    builds_done.set({}, status["nrBuildsDone"])
    steps_started.set({}, status["nrStepsStarted"])
    steps_done.set({}, status["nrStepsDone"])

    retries.set({}, status["nrRetries"])
    max_retries.set({}, status["maxNrRetries"])

    queue_wakeups.set({}, status["nrQueueWakeups"])
    dispatcher_wakeups.set({}, status["nrDispatcherWakeups"])
    dispatch_time.set({}, status["dispatchTimeMs"] / 1000)

    db_connections.set({}, status["nrDbConnections"])
    active_db_updates.set({}, status["nrActiveDbUpdates"])

    steps_queued.set({}, status["nrUnfinishedSteps"])
    steps_runnable.set({}, status["nrRunnableSteps"])

    if st := status.get("totalStepTime"):
        step_time.set({}, st)

    if sbt := status.get("totalStepBuildTime"):
        step_build_time.set({}, sbt)

    for machine_name, machine_status in status["machines"].items():
        labels = {"host": machine_name}
        machine_enabled.set(labels, int(machine_status["enabled"]))
        machine_steps_done.set(labels, machine_status["nrStepsDone"])
        machine_current_jobs.set(labels, machine_status["currentJobs"])
        machine_disabled_until.set(labels, machine_status["disabledUntil"])
        machine_last_failure.set(labels, machine_status["lastFailure"])
        machine_consecutive_failures.set(labels, machine_status["consecutiveFailures"])

        if isn := machine_status.get("idleSince"):
            machine_idle_since.set(labels, isn)

        if st := machine_status.get("totalStepTime"):
            machine_step_time.set(labels, st)

        if sbt := machine_status.get("totalStepBuildTime"):
            machine_step_build_time.set(labels, sbt)

    for jobset_name, jobset_status in status["jobsets"].items():
        labels = {"name": jobset_name}
        jobset_time.set(labels, jobset_status["seconds"])
        jobset_shares_used.set(labels, jobset_status["shareUsed"])

    for type_name, type_status in status["machineTypes"].items():
        labels = {"machineType": type_name}
        machine_type_runnable.set(labels, type_status["runnable"])
        machine_type_running.set(labels, type_status["running"])

        if wt := type_status.get("waitTime"):
            machine_type_wait_time.set(labels, wt)

        if la := type_status.get("lastActive"):
            machine_type_last_active.set(labels, la)

    store = status["store"]
    store_nar_info_read.set({}, store["narInfoRead"])
    store_nar_info_read_averted.set({}, store["narInfoReadAverted"])
    store_nar_info_missing.set({}, store["narInfoMissing"])
    store_nar_info_write.set({}, store["narInfoWrite"])
    store_nar_info_cache_size.set({}, store["narInfoCacheSize"])
    store_nar_read.set({}, store["narRead"])
    store_nar_read_bytes.set({}, store["narReadBytes"])
    store_nar_read_compressed_bytes.set({}, store["narReadCompressedBytes"])
    store_nar_write.set({}, store["narWrite"])
    store_nar_write_averted.set({}, store["narWriteAverted"])
    store_nar_write_bytes.set({}, store["narWriteBytes"])
    store_nar_write_compressed_bytes.set({}, store["narWriteCompressedBytes"])
    store_nar_write_compression_seconds.set(
        {}, store["narWriteCompressionTimeMs"] / 1000
    )

    if s3 := status.get("s3"):
        store_s3_put.set({}, s3["put"])
        store_s3_put_bytes.set({}, s3["putBytes"])
        store_s3_put_seconds.set({}, s3["putTimeMs"] / 1000)
        store_s3_get.set({}, s3["get"])
        store_s3_get_bytes.set({}, s3["getBytes"])
        store_s3_get_seconds.set({}, s3["getTimeMs"] / 1000)
        store_s3_head.set({}, s3["head"])


async def update_metrics_loop(hydra_url, scrape_interval):
    async with httpx.AsyncClient(base_url=hydra_url) as client:
        while True:
            try:
                response = await client.get(
                    "/queue-runner-status",
                    headers={"Content-Type": "application/json"},
                )

                update_metrics(response.json())

                await asyncio.sleep(scrape_interval)
            except Exception as ex:
                logging.exception("Failed to update metrics", exc_info=ex)


@click.command()
@click.option("--hydra-url", default="https://hydra.forkos.org/")
@click.option("--port", default=9200)
@click.option("--scrape-interval", default=15)
def main(hydra_url, port, scrape_interval):
    @asynccontextmanager
    async def lifespan(_):
        loop = asyncio.get_event_loop()
        loop.create_task(update_metrics_loop(hydra_url, scrape_interval))
        yield

    app = Starlette(routes=[Route("/metrics", metrics)], lifespan=lifespan)

    uvicorn.run(app, port=port, log_level="info")


if __name__ == "__main__":
    main()
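(For a quick look at the raw data feeding these metrics, the endpoint the exporter polls can be queried directly. A sketch, not part of the commit, using the module's default URL and the same header the exporter sends; the keys match those read in update_metrics above:)

# One-off probe of Hydra's /queue-runner-status endpoint.
import httpx

status = httpx.get(
    "https://hydra.forkos.org/queue-runner-status",
    headers={"Content-Type": "application/json"},
).json()

print("status:", status["status"])                 # feeds hydra_up
print("queued builds:", status["nrQueuedBuilds"])  # feeds hydra_builds_queued
print("machines:", list(status["machines"]))       # label values for hydra_machine_*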