Enable Mimir Alertmanager, add example alert #33

Merged
k900 merged 6 commits from alertmanager into main 2024-07-08 06:35:33 +00:00
14 changed files with 105 additions and 150 deletions
Showing only changes of commit ba0d50624d

View file

@@ -40,7 +40,6 @@
 hydra.enable = true;
 hydra.dbi = "dbi:Pg:dbname=hydra;user=hydra";
 };
-bagel.meta.monitoring.address = "bagel-box.infra.forkos.org";
 security.acme.acceptTerms = true;
 security.acme.defaults.email = "infra@forkos.org";

View file

@@ -24,7 +24,6 @@
 };
 };
 };
-bagel.meta.monitoring.address = "gerrit01.infra.forkos.org";
 fileSystems."/gerrit-data" = {
 device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4";

View file

@@ -24,8 +24,6 @@
 };
 };
-bagel.meta.monitoring.address = "fodwatch.infra.forkos.org";
 i18n.defaultLocale = "en_US.UTF-8";
 system.stateVersion = "24.05";

View file

@@ -21,7 +21,6 @@
 enable = true;
 domain = "netbox.forkos.org";
 };
-bagel.meta.monitoring.address = "meta01.infra.forkos.org";
 bagel.services.prometheus.enable = true;
 bagel.services.loki.enable = true;
 bagel.services.grafana.enable = true;

View file

@@ -0,0 +1,100 @@
{
config,
lib,
...
}:
let
cfg = config.bagel.monitoring.grafana-agent;
inherit (lib) mkEnableOption mkOption mkIf types;
passwordAsCredential = "\${CREDENTIALS_DIRECTORY}/password";
in
{
options.bagel.monitoring.grafana-agent = {
enable = (mkEnableOption "Grafana Agent") // { default = true; };
exporters = mkOption {
description = "List of all exporters to scrape";
type = types.listOf (types.submodule {
options.port = mkOption {
description = "Exporter port";
type = types.int;
};
});
default = [];
};
};
config = mkIf cfg.enable {
age.secrets.grafana-agent-password.file = ../../secrets/metrics-push-password.age;
services.grafana-agent = {
enable = true;
credentials.password = config.age.secrets.grafana-agent-password.path;
settings = {
metrics = {
global.remote_write = [
{
url = "https://mimir.forkos.org/api/v1/push";
basic_auth = {
username = "promtail";
password_file = passwordAsCredential;
};
}
];
configs = [
{
name = config.networking.hostName;
scrape_configs = [
{
job_name = config.networking.hostName;

job_name is exposed as job in the resulting metrics.

I get that the previous version -- the pull-based metrics collection using Prometheus -- had one job per machine as well.

And I might be missing something. But I do feel like we can do with multiple jobs here. One job for each exporter, with the job_name set to the exporter name.

This allows one to use the metrics browser in Grafana to list all metrics of /one/ exporter by simply

  1. selecting "job" in "2. Select label to search in"
  2. and then in "3. Select (multiple) values for your labels" your job name (exporter name)

I find that helpful when wondering which metrics are actually exported and available.

And if you want all metrics of a single instance, you can throw together a simple regex with the instance label.
Regex, because instance contains both the hostname and a port.

Or add another static label, e.g. hostname, similar to what you do down below in the logs section with host.

Let me know what you think. I can do the implementation if you want.
config.bagel.monitoring.grafana-agent.exporters would need to become an attrset and all.

And the hostname thingy is as simple as

diff --git a/services/monitoring/agent.nix b/services/monitoring/agent.nix
index e538cb7..e52ea98 100644
--- a/services/monitoring/agent.nix
+++ b/services/monitoring/agent.nix
@@ -48,7 +48,10 @@ in
                 {
                   job_name = config.networking.hostName;
                   static_configs = [
-                    { targets = map (e: "localhost:" + (toString e.port)) config.bagel.monitoring.grafana-agent.exporters; }
+                    {
+                      targets = map (e: "localhost:" + (toString e.port)) config.bagel.monitoring.grafana-agent.exporters;
+                      labels.hostname = config.networking.hostName;
+                    }
                   ];
                 }
               ];
Yeah, I can do that. I've never really bothered beyond the one job per machine thing but this can also work and if it makes life easier for people, I don't care either way.
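For illustration only (not part of this PR or the thread above): a rough sketch of what the per-exporter layout suggested in the review could look like if `exporters` became an attrset keyed by exporter name. The attrset shape and the `mapAttrsToList` mapping are assumptions, not anything the PR defines.

```nix
# Hypothetical sketch: exporters keyed by name, yielding one scrape job per exporter.
# The option shape below is an assumption; the PR currently uses a list of { port }.
options.bagel.monitoring.grafana-agent.exporters = mkOption {
  description = "Exporters to scrape, keyed by exporter name";
  type = types.attrsOf (types.submodule {
    options.port = mkOption {
      description = "Exporter port";
      type = types.int;
    };
  });
  default = { };
};

# ...and in the grafana-agent metrics config, one job per exporter,
# plus a static hostname label as suggested in the review comment:
scrape_configs = lib.mapAttrsToList (name: exporter: {
  job_name = name;
  static_configs = [
    {
      targets = [ "localhost:${toString exporter.port}" ];
      labels.hostname = config.networking.hostName;
    }
  ];
}) config.bagel.monitoring.grafana-agent.exporters;
```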
static_configs = [
{ targets = map (e: "localhost:" + (toString e.port)) config.bagel.monitoring.grafana-agent.exporters; }
];
}
];
}
];
};
logs = {
global.clients = [
{
url = "https://loki.forkos.org/loki/api/v1/push";
basic_auth = {
username = "promtail";
password_file = passwordAsCredential;
};
}
];
configs = [
{
name = "journald";
scrape_configs = [
{
job_name = "system";
journal = {
max_age = "12h";
labels = {
job = "systemd-journal";
host = config.networking.hostName;
};
};
relabel_configs = [
{
source_labels = [ "__journal__systemd_unit" ];
target_label = "unit";
}
];
}
];
}
];
positions_directory = "\${STATE_DIRECTORY}/positions";
};
integrations.node_exporter.enable_collectors = [
"processes"
"systemd"
];
};
emilylange marked this conversation as resolved

I actually did this intentionally, but I can add all the exporters manually too.

Whoops I misread the previous `services.prometheus.exporters.node.enabledCollectors` that this is meant to replace. Sorry for that. Will drop my commit. One sec.
};
};
}

View file

@@ -2,6 +2,6 @@
 imports = [
 ./exporters
 ./lgtm
-./promtail.nix
+./agent.nix
 ];
 }

View file

@@ -17,6 +17,6 @@ in
 listenAddress = "0.0.0.0";
 };
-bagel.meta.monitoring.exporters = [ { port = 9102; } ];
+bagel.monitoring.grafana-agent.exporters = [ { port = 9102; } ];
 };
 }

View file

@@ -1,37 +1,7 @@
-{
-config,
-lib,
-...
-}:
-let
-inherit (lib) mkOption types;
-in
 {
 imports = [
 ./cadvisor.nix
-./node.nix
 ./nginx.nix
 ./postgres.nix
 ];
-options.bagel = {
-meta.monitoring = {
-address = mkOption {
-description = "Node's public address";
-type = types.str;
-};
-exporters = mkOption {
-description = "List of all exporters to scrape";
-type = types.listOf (types.submodule {
-options.port = mkOption {
-description = "Exporter port";
-type = types.int;
-};
-});
-default = [];
-};
-};
-};
-config.networking.firewall.allowedTCPPorts = map (e: e.port) config.bagel.meta.monitoring.exporters;
 }

View file

@@ -30,7 +30,7 @@ in
 ];
 };
-bagel.meta.monitoring.exporters = [
+bagel.monitoring.grafana-agent.exporters = [
 { port = 9103; }
 ];
 };

View file

@@ -1,25 +0,0 @@
{
config,
lib,
...
}:
let
cfg = config.bagel.monitoring.exporters.node;
inherit (lib) mkEnableOption mkIf;
in
{
options.bagel.monitoring.exporters.node.enable = (mkEnableOption "Standard node_exporter") // { default = true; };
config = mkIf cfg.enable {
services.prometheus.exporters.node = {
enable = true;
enabledCollectors = [
"processes"
"systemd"
];
port = 9101;
};
bagel.meta.monitoring.exporters = [ { port = 9101; } ];
};
}

View file

@@ -24,7 +24,7 @@ in
 services.postgresql.settings.shared_preload_libraries = "pg_stat_statements";
-bagel.meta.monitoring.exporters = [
+bagel.monitoring.grafana-agent.exporters = [
 { port = 9104; }
 ];
 };

View file

@@ -2,6 +2,6 @@
 imports = [
 ./grafana.nix
 ./loki.nix
-./prometheus.nix
+./mimir.nix
 ];
 }

View file

@@ -1,7 +1,6 @@
 {
 config,
 lib,
-nodes,
 pkgs,
 ...
 }:
@@ -9,25 +8,6 @@ let
 cfg = config.bagel.services.prometheus;
 inherit (lib) mkEnableOption mkIf;
-forEachMachine = fn: map fn (builtins.attrValues nodes);
-allMetas = forEachMachine (machine: {
-name = machine.config.networking.hostName;
-address = machine.config.bagel.meta.monitoring.address or null;
-exporters = machine.config.bagel.meta.monitoring.exporters or [];
-});
-scrapableMetas = builtins.filter (m: m.address != null && m.exporters != []) allMetas;
-toJobConfig = m: {
-job_name = m.name;
-static_configs = [
-{ targets = map (e: m.address + ":" + (toString e.port)) m.exporters; }
-];
-};
-jobConfigs = map toJobConfig scrapableMetas;
 mimirPort = config.services.mimir.configuration.server.http_listen_port;
 in
 {
@@ -42,18 +22,6 @@ in
 mimir-environment.file = ../../../secrets/mimir-environment.age;
 };
-services.prometheus = {
-enable = true;
-enableAgentMode = true;
-listenAddress = "127.0.0.1";
-port = 9001;
-globalConfig.scrape_interval = "15s";
-scrapeConfigs = jobConfigs;
-remoteWrite = [
-{ url = "http://localhost:${toString mimirPort}/api/v1/push"; }
-];
-};
 services.mimir = {
 enable = true;
 extraFlags = ["--config.expand-env=true"];

View file

@@ -1,53 +0,0 @@
{
config,
lib,
...
}:
let
cfg = config.bagel.monitoring.promtail;
inherit (lib) mkEnableOption mkIf;
in
{
options.bagel.monitoring.promtail.enable = (mkEnableOption "Promtail log export") // { default = true; };
config = mkIf cfg.enable {
age.secrets.promtail-password = {
file = ../../secrets/metrics-push-password.age;
owner = "promtail";
};
services.promtail = {
enable = true;
configuration = {
server.disable = true;
clients = [
{
url = "https://loki.forkos.org/loki/api/v1/push";
basic_auth = {
username = "promtail";
password_file = config.age.secrets.promtail-password.path;
};
}
];
scrape_configs = [
{
job_name = "system";
journal = {
max_age = "12h";
labels = {
job = "systemd-journal";
host = config.networking.hostName;
};
};
relabel_configs = [
{
source_labels = [ "__journal__systemd_unit" ];
target_label = "unit";
}
];
}
];
};
};
};
}