forked from the-distro/infra
Compare commits
51 commits
9ebba0f74d
...
b534a4e6c8
Author | SHA1 | Date | |
---|---|---|---|
KFears | b534a4e6c8 | ||
raito | 8c0c7b517f | ||
raito | d5500d7c4e | ||
raito | eaf48a0cdd | ||
raito | e3129fec51 | ||
raito | 437293bdaa | ||
mei (ckie) | df8a57f91a | ||
Yureka | 97bee26977 | ||
Luke Granger-Brown | 84cfbdb050 | ||
Luke Granger-Brown | 6a8f49f180 | ||
Yureka | 06dd4d6e85 | ||
Luke Granger-Brown | de085155a6 | ||
Luke Granger-Brown | 2001012325 | ||
raito | fbf26302b6 | ||
raito | 1701a2b388 | ||
raito | decc9963ee | ||
raito | daa99e83e8 | ||
raito | 160e7c5ecb | ||
raito | b56b8963a2 | ||
raito | 192ba49f7c | ||
raito | 9ad7e7b139 | ||
raito | 96f5d45ff3 | ||
raito | 3df1697289 | ||
raito | 76276a8da3 | ||
raito | 7e205b16d0 | ||
raito | 1e421889e4 | ||
raito | 8838709a95 | ||
raito | 002db9a78f | ||
raito | 6978c1271d | ||
raito | 92560708b8 | ||
raito | 3b6be269d6 | ||
raito | acaaad68bb | ||
raito | 3c9b077bb2 | ||
raito | c23d290647 | ||
raito | c0689e6832 | ||
raito | a2eecd1886 | ||
raito | b5d412a5ba | ||
Yureka | 01f8322df9 | ||
Yureka | 3072dfad55 | ||
Maxine Aubrey | 86e833f52a | ||
raito | 1a862b2b0f | ||
raito | 6d3e14ec27 | ||
Ilya K | 5582a0a29b | ||
Ilya K | 4ddf87fa8e | ||
Ilya K | 98d899fabc | ||
Kiara Grouwstra | b291caac46 | ||
Ilya K | e2c6550796 | ||
raito | 4749d204bf | ||
raito | c86cefe21f | ||
raito | f321ab6450 | ||
Janik Haag | d462e8ca9c |
33
README.md
33
README.md
|
@ -1 +1,32 @@
|
|||
Infrastructure for the donut shaped thing that is absolutely not a donut.
|
||||
# Infrastructure for the donut shaped thing that is absolutely not a donut.
|
||||
|
||||
## Quick start
|
||||
|
||||
### Build the infrastructure
|
||||
|
||||
```
|
||||
$ colmena build --on @localboot
|
||||
```
|
||||
|
||||
Note that `@localboot` is load-bearing, as we have some machines that _cannot be_ deployed with vanilla Colmena. Fixes for this are welcome.
|
||||
|
||||
### Recommended deploy process
|
||||
|
||||
```
|
||||
$ colmena apply dry-activate --on $machine # Verify that the nvd log is reasonable.
|
||||
$ colmena apply --on $machine
|
||||
```
|
||||
|
||||
### Recommended upgrade process
|
||||
|
||||
```
|
||||
$ nix flake update
|
||||
$ colmena apply dry-activate --on @localboot # Verify that the nvd log is reasonable. Run it twice to get only NVD logs shown.
|
||||
$ colmena apply --on @localboot
|
||||
```
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
### I failed to deploy `gerrit01`
|
||||
|
||||
Our Gerrit source build is known to have occasional hiccups. We are always interested in build logs, so feel free to attach them to a new issue so that we can make the build more reliable.
|
||||
|
|
|
@ -1,16 +1,45 @@
|
|||
{ lib, ... }:
|
||||
let
|
||||
keys = import ./ssh-keys.nix;
|
||||
in {
|
||||
users.users.root.openssh.authorizedKeys.keys =
|
||||
keys.users.delroth ++
|
||||
keys.users.emilylange ++
|
||||
keys.users.hexchen ++
|
||||
keys.users.jade ++
|
||||
keys.users.janik ++
|
||||
keys.users.k900 ++
|
||||
keys.users.lukegb ++
|
||||
keys.users.maxine ++
|
||||
keys.users.raito ++
|
||||
keys.users.thubrecht ++
|
||||
keys.users.yuka;
|
||||
inherit (lib) genAttrs;
|
||||
in
|
||||
# Note: to add someone to this list:
|
||||
# Ensure their SSH key is already in common/ssh-keys.nix with
|
||||
# the same username as here, so that the key is automatically added.
|
||||
{
|
||||
bagel.groups = {
|
||||
floral-infra.members = [
|
||||
"delroth"
|
||||
"emilylange"
|
||||
"hexchen"
|
||||
"jade"
|
||||
"janik"
|
||||
"k900"
|
||||
"maxine"
|
||||
"raito"
|
||||
"thubrecht"
|
||||
"winter"
|
||||
"yuka"
|
||||
"ckie"
|
||||
];
|
||||
|
||||
lix-infra.members = [
|
||||
"raito"
|
||||
"hexchen"
|
||||
"jade"
|
||||
];
|
||||
};
|
||||
bagel.users = genAttrs [
|
||||
"delroth"
|
||||
"emilylange"
|
||||
"hexchen"
|
||||
"jade"
|
||||
"janik"
|
||||
"k900"
|
||||
"maxine"
|
||||
"raito"
|
||||
"thubrecht"
|
||||
"winter"
|
||||
"yuka"
|
||||
"ckie"
|
||||
] (name: {});
|
||||
}
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
{ lib, pkgs, ... }: {
|
||||
imports = [
|
||||
./known-ssh-keys.nix
|
||||
./cgroups.nix
|
||||
];
|
||||
|
||||
nixpkgs.overlays = import ../overlays;
|
||||
|
@ -30,7 +31,7 @@
|
|||
automatic = true;
|
||||
persistent = true;
|
||||
dates = lib.mkDefault "daily";
|
||||
options = "--delete-older-than 30d";
|
||||
options = lib.mkDefault "--delete-older-than 30d";
|
||||
};
|
||||
|
||||
services.journald.extraConfig = "SystemMaxUse=512M";
|
||||
|
|
83
common/cgroups.nix
Normal file
83
common/cgroups.nix
Normal file
|
@ -0,0 +1,83 @@
|
|||
# Relatively inspired by fbtax2:
|
||||
# https://facebookmicrosites.github.io/cgroup2/docs/fbtax-results.html
|
||||
#
|
||||
# See also the Chris Down talk at LISA'21:
|
||||
# https://www.usenix.org/conference/lisa21/presentation/down
|
||||
{ ... }:
|
||||
let
|
||||
systemCriticalSliceConfig = {
|
||||
ManagedOOMMemoryPressure = "kill";
|
||||
|
||||
# guarantee availability of memory
|
||||
MemoryMin = "192M";
|
||||
# default 100
|
||||
IOWeight = 1000;
|
||||
# default 100
|
||||
CPUWeight = 1000;
|
||||
};
|
||||
in
|
||||
{
|
||||
systemd.oomd = {
|
||||
enable = true;
|
||||
# why not, we have cgroups at user level now so it'll just kill the
|
||||
# terminal
|
||||
enableRootSlice = true;
|
||||
enableSystemSlice = true;
|
||||
enableUserSlices = true;
|
||||
};
|
||||
|
||||
systemd.enableCgroupAccounting = true;
|
||||
|
||||
systemd.services.nix-daemon = {
|
||||
serviceConfig = {
|
||||
# FIXME: how do i deprioritize this for memory
|
||||
CPUWeight = 10;
|
||||
IOWeight = 10;
|
||||
};
|
||||
};
|
||||
|
||||
systemd.slices.hostcritical = {
|
||||
description = "Ensures that services to keep the system alive remain alive";
|
||||
|
||||
unitConfig = {
|
||||
# required to avoid a dependency cycle on systemd-oomd. systemd will
|
||||
# actually guess this right but we should fix it anyway.
|
||||
DefaultDependencies = false;
|
||||
};
|
||||
|
||||
sliceConfig = systemCriticalSliceConfig;
|
||||
};
|
||||
|
||||
# make root logins higher priority for resources
|
||||
systemd.slices."user-0" = {
|
||||
sliceConfig = systemCriticalSliceConfig;
|
||||
};
|
||||
|
||||
|
||||
systemd.slices.system = {
|
||||
sliceConfig = {
|
||||
ManagedOOMMemoryPressure = "kill";
|
||||
ManagedOOMMemoryPressureLimit = "50%";
|
||||
|
||||
IOWeight = 100;
|
||||
};
|
||||
};
|
||||
|
||||
systemd.services.sshd = {
|
||||
serviceConfig = {
|
||||
Slice = "hostcritical.slice";
|
||||
};
|
||||
};
|
||||
|
||||
systemd.services.systemd-oomd = {
|
||||
serviceConfig = {
|
||||
Slice = "hostcritical.slice";
|
||||
};
|
||||
};
|
||||
|
||||
systemd.services.systemd-journald = {
|
||||
serviceConfig = {
|
||||
Slice = "hostcritical.slice";
|
||||
};
|
||||
};
|
||||
}
|
|
@ -1,12 +1,14 @@
|
|||
{
|
||||
imports = [
|
||||
./admins.nix
|
||||
./server-acl.nix
|
||||
./base-server.nix
|
||||
./hardening.nix
|
||||
./nix.nix
|
||||
./raito-proxy-aware-nginx.nix
|
||||
./raito-vm.nix
|
||||
./sysadmin
|
||||
./hardware
|
||||
./zsh.nix
|
||||
./secrets.nix
|
||||
];
|
||||
}
|
||||
|
|
7
common/hardware/default.nix
Normal file
7
common/hardware/default.nix
Normal file
|
@ -0,0 +1,7 @@
|
|||
{ ... }: {
|
||||
imports = [
|
||||
./raito-vm.nix
|
||||
./oracle-vm.nix
|
||||
./hetzner.nix
|
||||
];
|
||||
}
|
76
common/hardware/hetzner.nix
Normal file
76
common/hardware/hetzner.nix
Normal file
|
@ -0,0 +1,76 @@
|
|||
|
||||
{ lib, config, ... }:
|
||||
let
|
||||
cfg = config.bagel.hardware.hetzner;
|
||||
inherit (lib) mkEnableOption mkIf mkOption types;
|
||||
in
|
||||
{
|
||||
options.bagel.hardware.hetzner = {
|
||||
enable = mkEnableOption "Hetzner's hardware defaults";
|
||||
|
||||
platformType = mkOption {
|
||||
# Only VMs are supported.
|
||||
type = types.enum [ "virtual-machine" ];
|
||||
};
|
||||
|
||||
system = mkOption {
|
||||
# Only the aarch64-linux VM Hetzner is supported.
|
||||
type = types.enum [ "aarch64-linux" ];
|
||||
};
|
||||
|
||||
networking.wan = {
|
||||
mac = mkOption {
|
||||
type = types.str;
|
||||
description = "MAC address of the WAN interface in the Hetzner machine";
|
||||
};
|
||||
address = mkOption {
|
||||
type = types.listOf types.str;
|
||||
description = "List of static addresses attached to the WAN interface";
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
# A bunch of stuff is virtio.
|
||||
boot.initrd.availableKernelModules = [
|
||||
"xhci_pci"
|
||||
"usbhid"
|
||||
"sr_mod"
|
||||
"virtio_gpu"
|
||||
"virtio_scsi"
|
||||
"virtio_rng"
|
||||
"virtio_pci"
|
||||
];
|
||||
|
||||
boot.loader.systemd-boot.enable = true;
|
||||
boot.loader.efi.canTouchEfiVariables = true;
|
||||
|
||||
networking.useDHCP = lib.mkDefault false;
|
||||
|
||||
# Stolen from the netplan provided by aarch64 Ubuntu images.
|
||||
systemd.network.enable = true;
|
||||
systemd.network.links."10-wan" = {
|
||||
linkConfig.Name = "wan";
|
||||
# Match the physical WAN NIC by MAC address. The option is declared as
# `bagel.hardware.hetzner.networking.wan.mac`, so it must be read via `wan`.
matchConfig.MACAddress = cfg.networking.wan.mac;
|
||||
};
|
||||
systemd.network.networks."10-wan" = {
|
||||
matchConfig.Name = "wan";
|
||||
# Static addresses for the WAN interface, taken from the
# `bagel.hardware.hetzner.networking.wan.address` option (a list of strings).
# The networkd [Network] section is exposed as `networkConfig`.
networkConfig.Address = cfg.networking.wan.address;
|
||||
linkConfig.RequiredForOnline = true;
|
||||
DHCP = "ipv4";
|
||||
routes = [
|
||||
{
|
||||
routeConfig = {
|
||||
Destination = "::/0";
|
||||
GatewayOnLink = true;
|
||||
Gateway = "fe80::1";
|
||||
};
|
||||
}
|
||||
];
|
||||
dhcpV4Config = {
|
||||
RouteMetric = 100;
|
||||
UseMTU = true;
|
||||
};
|
||||
};
|
||||
};
|
||||
}
|
52
common/hardware/oracle-vm.nix
Normal file
52
common/hardware/oracle-vm.nix
Normal file
|
@ -0,0 +1,52 @@
|
|||
|
||||
{ lib, config, modulesPath, ... }:
|
||||
let
|
||||
cfg = config.bagel.hardware.oracle-vm;
|
||||
inherit (lib) mkEnableOption mkIf mkOption types;
|
||||
in
|
||||
{
|
||||
options.bagel.hardware.oracle-vm = {
|
||||
enable = mkEnableOption "Oracle's VM hardware defaults";
|
||||
|
||||
system = mkOption {
|
||||
# Only the free Oracle VMs are supported.
|
||||
type = types.enum [ "aarch64-linux" ];
|
||||
};
|
||||
};
|
||||
|
||||
# Imports a bunch of virtio modules.
|
||||
imports = [
|
||||
"${modulesPath}/profiles/qemu-guest.nix"
|
||||
];
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
boot.loader.systemd-boot.enable = true;
|
||||
boot.loader.efi.canTouchEfiVariables = true;
|
||||
boot.initrd.systemd.enable = true;
|
||||
|
||||
boot.initrd.availableKernelModules = [
|
||||
"xhci_pci" "virtio_pci" "usbhid" "sr_mod"
|
||||
];
|
||||
boot.initrd.kernelModules = [ ];
|
||||
boot.kernelModules = [ ];
|
||||
boot.extraModulePackages = [ ];
|
||||
|
||||
nixpkgs.hostPlatform = cfg.system;
|
||||
|
||||
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
|
||||
# (the default) this is the recommended approach. When using systemd-networkd it's
|
||||
# still possible to use this option, but it's recommended to use it in conjunction
|
||||
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
|
||||
networking.useDHCP = lib.mkDefault false;
|
||||
# Examples:
|
||||
# 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
|
||||
# link/ether 02:00:17:00:91:6e brd ff:ff:ff:ff:ff:ff
|
||||
# inet 10.0.0.94/24 brd 10.0.0.255 scope global dynamic noprefixroute enp0s3
|
||||
# valid_lft 44162sec preferred_lft 33362sec
|
||||
# inet6 fe80::17ff:fe00:916e/64 scope link
|
||||
# valid_lft forever preferred_lft forever
|
||||
# [root@build02-aarch64-lahfa:~]# ip r
|
||||
# default via 10.0.0.1 dev enp0s3 proto dhcp src 10.0.0.94 metric 1002 mtu 9000
|
||||
networking.interfaces.enp0s3.useDHCP = lib.mkDefault true;
|
||||
};
|
||||
}
|
|
@ -2,5 +2,6 @@
|
|||
{
|
||||
programs.ssh.knownHosts = {
|
||||
"[cl.forkos.org]:29418".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIM82mJ259C8Nc+BHHNBeRWXWhL3dfirQhmFbDAwHMle3";
|
||||
"[gerrit.lix.systems]:2022".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICC/S6Z56uhv7zBMutkV0nU8eDuRcl3trykGWBch4L/l";
|
||||
};
|
||||
}
|
||||
|
|
22
common/secrets.nix
Normal file
22
common/secrets.nix
Normal file
|
@ -0,0 +1,22 @@
|
|||
## This is a simple secret abstraction with multi-tenancy awareness.
|
||||
{ config, lib, ... }:
|
||||
let
|
||||
cfg = config.bagel.secrets;
|
||||
inherit (lib) mkOption types genAttrs;
|
||||
in
|
||||
{
|
||||
options.bagel.secrets = {
|
||||
tenant = mkOption {
|
||||
type = types.enum [ "lix" "floral" ];
|
||||
};
|
||||
|
||||
files = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [ ];
|
||||
};
|
||||
};
|
||||
|
||||
config.age.secrets = genAttrs cfg.files (secretFile: {
|
||||
file = ../secrets/${cfg.tenant}/${secretFile}.age;
|
||||
});
|
||||
}
|
69
common/server-acl.nix
Normal file
69
common/server-acl.nix
Normal file
|
@ -0,0 +1,69 @@
|
|||
{ lib, config, ... }:
|
||||
let
|
||||
keys = import ./ssh-keys.nix;
|
||||
inherit (lib) mkOption types length concatMap listToAttrs catAttrs attrValues;
|
||||
cfgAdmins = config.bagel.admins;
|
||||
cfgGroups = config.bagel.groups;
|
||||
cfgUsers = config.bagel.users;
|
||||
|
||||
userOpts = { name, ... }: {
|
||||
options = {
|
||||
sshKeys = mkOption {
|
||||
type = types.listOf types.str;
|
||||
description = "List of SSH keys associated to this user, defaults to `ssh-keys.nix` entries.";
|
||||
default = keys.users.${name} or [ ];
|
||||
};
|
||||
};
|
||||
};
|
||||
groupOpts = { name, ... }: {
|
||||
options = {
|
||||
members = mkOption {
|
||||
type = types.listOf types.str;
|
||||
description = "List of users member of this group";
|
||||
example = [ "raito" ];
|
||||
default = [ ];
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
# There might be duplicates in that list. We will turn it into an attribute set.
|
||||
allowedMembers = listToAttrs (
|
||||
map (member: {
|
||||
name = member;
|
||||
value = cfgUsers.${member};
|
||||
}) (concatMap (allowedGroup: cfgGroups.${allowedGroup}.members) cfgAdmins.allowedGroups));
|
||||
|
||||
rootKeys = concatMap ({ sshKeys, ... }: sshKeys) (attrValues allowedMembers);
|
||||
in
|
||||
{
|
||||
options.bagel.users = mkOption {
|
||||
type = types.attrsOf (types.submodule userOpts);
|
||||
description = "User configuration for server ACLs";
|
||||
};
|
||||
|
||||
options.bagel.groups = mkOption {
|
||||
type = types.attrsOf (types.submodule groupOpts);
|
||||
description = "Group configuration for server ACLs";
|
||||
};
|
||||
|
||||
options.bagel.admins = {
|
||||
allowedGroups = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = [ "catch-all" ];
|
||||
description = "List of groups which are allowed to admin this machine.";
|
||||
example = [ "lix" "build-infra" ];
|
||||
};
|
||||
};
|
||||
|
||||
config = {
|
||||
assertions = [
|
||||
{ assertion = length config.users.users.root.openssh.authorizedKeys.keys > 0;
|
||||
# TODO: you can add printing of `concatStringsSep ", " cfgAdmins.allowedGroups` to diagnose
|
||||
# which are the allowed groups and existing admins.
|
||||
message = "root@${config.networking.fqdnOrHostName} has no SSH key attached, this machine will lose its access if you deploy it successfully! Set a valid `bagel.admins.allowedGroups` or ensure you have at least one administrator of the relevant group registered";
|
||||
}
|
||||
];
|
||||
|
||||
users.users.root.openssh.authorizedKeys.keys = rootKeys;
|
||||
};
|
||||
}
|
|
@ -1,5 +1,6 @@
|
|||
{
|
||||
machines = {
|
||||
# Floral
|
||||
bagel-box = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAsO4bNqY04uG13Pg3ubHfRDssTphDLzZ4YUniE5/p+M";
|
||||
meta01 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIM5t9gYorOWgpCFDJgb24pyCKIabGpeI2H/UfdvXODcT";
|
||||
public01 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPBy8G8rfLA6E9i+t5kjVafxU1c2NXATXKxoXTH4Kgtm";
|
||||
|
@ -20,6 +21,16 @@
|
|||
builder-9 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOhws9zGgocVY36dMtOL+CXadpvRMffxoWMkfEcTBJm7";
|
||||
builder-10 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE7sgIuTSqZiZhp8TvObSbIEhcHHsL5hcmYA22uzwxth";
|
||||
wob-vpn-gw = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINVytPPW8XnXf/rD5TFzsw//CZc2lBjQLmDzlVGPZsjh";
|
||||
|
||||
# Lix
|
||||
build01-aarch64-lix = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICC69NZD/zhIB/wUb5odg46bss5g8hH2fDl22bk4qeSW";
|
||||
build02-aarch64-lix = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGdJE375pe58RJbhKwXRp3D//+SJ3ssiVZrLsM9CLHn0";
|
||||
build01-aarch64-darwin-lix = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMVf1uO0lv5UBti/naW/+amqLxvWZg+StXk9aM+lJ7e4";
|
||||
|
||||
buildbot-lix = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFoVSh35UqNQZ6ZZ1c6CzqERC40ovQ/KDXz8pC7nNlkR";
|
||||
|
||||
# Raito infrastructure
|
||||
epyc-newtype-fr = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOXT9Init1MhKt4rjBANLq0t0bPww/WQZ96uB4AEDrml";
|
||||
};
|
||||
|
||||
users = {
|
||||
|
@ -50,6 +61,8 @@
|
|||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKiXXYkhRh+s7ixZ8rvG8ntIqd6FELQ9hh7HoaHQJRPU"
|
||||
];
|
||||
thubrecht = [ "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPM1jpXR7BWQa7Sed7ii3SbvIPRRlKb3G91qC0vOwfJn" ];
|
||||
yuka = [ "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKath4/fDnlv/4fzxkPrQN1ttmoPRNu/m9bEtdPJBDfY cardno:16_933_242" ];
|
||||
yuka = [ "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIxQ3NYBi8v1f/vhxLKDcA6upmX0pctRDbnK6SER5OUR yureka" ];
|
||||
winter = [ "sk-ssh-ed25519@openssh.com AAAAGnNrLXNzaC1lZDI1NTE5QG9wZW5zc2guY29tAAAAIH/LDRUG+U+++UmlxvA2kspioTjktQZ8taDcHq8gVlkfAAAABHNzaDo=" ];
|
||||
ckie = [ "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIH3uTwzSSMAPg84fwbNp2cq9+BdLFeA1VzDGth4zCAbz https://mei.puppycat.house" ];
|
||||
};
|
||||
}
|
||||
|
|
|
@ -13,7 +13,11 @@ in
|
|||
tmux
|
||||
rsync
|
||||
fd
|
||||
eza
|
||||
grc
|
||||
ripgrep
|
||||
delta
|
||||
tshark
|
||||
pv
|
||||
kitty.terminfo
|
||||
config.boot.kernelPackages.perf
|
||||
|
|
63
flake.lock
63
flake.lock
|
@ -87,16 +87,16 @@
|
|||
"treefmt-nix": "treefmt-nix"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1722939563,
|
||||
"narHash": "sha256-lMe8aXgF550iQLRaoU+yn8yYQ4x2qiyqANgsFyjfWwA=",
|
||||
"ref": "refs/heads/non-flakes",
|
||||
"rev": "4a162a8aa5dad6cecdb33bd8534e67e0bdaeb13f",
|
||||
"revCount": 295,
|
||||
"lastModified": 1728837991,
|
||||
"narHash": "sha256-+jXVHPmX9eUtH2JhMKye0Tm2KMQTmD8FlHHfbcaXMOI=",
|
||||
"ref": "refs/heads/bring-back-old-gerrit-reporting",
|
||||
"rev": "879e9cdcdf2d7e6566ee512d015acc4d23f35517",
|
||||
"revCount": 302,
|
||||
"type": "git",
|
||||
"url": "https://git.lix.systems/lix-project/buildbot-nix.git"
|
||||
},
|
||||
"original": {
|
||||
"ref": "refs/heads/non-flakes",
|
||||
"ref": "refs/heads/bring-back-old-gerrit-reporting",
|
||||
"type": "git",
|
||||
"url": "https://git.lix.systems/lix-project/buildbot-nix.git"
|
||||
}
|
||||
|
@ -436,11 +436,11 @@
|
|||
},
|
||||
"locked": {
|
||||
"host": "gitlab.computer.surgery",
|
||||
"lastModified": 1723576377,
|
||||
"narHash": "sha256-sTa4XT5xMQkhhLknOfVd433YS1TvkMrE45qAsI1ZB6U=",
|
||||
"lastModified": 1727994504,
|
||||
"narHash": "sha256-FC6M1KKX58HbU9LG+cG6EJRr02J9lE/o0iiDi6m1gv8=",
|
||||
"owner": "matrix",
|
||||
"repo": "grapevine-fork",
|
||||
"rev": "3b99032456700d06dd937db6a85976a8be9d4fa7",
|
||||
"rev": "5a490a4397f0c6a36dab1cb631dadc67a849deab",
|
||||
"type": "gitlab"
|
||||
},
|
||||
"original": {
|
||||
|
@ -480,11 +480,11 @@
|
|||
]
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1724616313,
|
||||
"narHash": "sha256-9syppf9Gm/6F4wQQAbsf7rGY1DooMsprnsEY/0eaewg=",
|
||||
"lastModified": 1728321752,
|
||||
"narHash": "sha256-GbBAoBF7ZObz0IP+g0LZKxMafpMvNKjTEu9haiZbV54=",
|
||||
"ref": "refs/heads/main",
|
||||
"rev": "44b9a7b95d23e7a8587cb963f00382046707f2db",
|
||||
"revCount": 4202,
|
||||
"rev": "ee1234c15cdcb427dbd4828e0add09d02cd606c9",
|
||||
"revCount": 4220,
|
||||
"type": "git",
|
||||
"url": "https://git.lix.systems/lix-project/hydra.git"
|
||||
},
|
||||
|
@ -505,11 +505,11 @@
|
|||
"pre-commit-hooks": "pre-commit-hooks"
|
||||
},
|
||||
"locked": {
|
||||
"lastModified": 1723919517,
|
||||
"narHash": "sha256-D6+zmRXzr85p7riphuIrJQqangoJe70XM5jHhMWwXws=",
|
||||
"lastModified": 1728163191,
|
||||
"narHash": "sha256-SW0IEBsPN1EysqzvfDT+8Kimtzy03O1BxQQm7ZB6fRY=",
|
||||
"ref": "refs/heads/main",
|
||||
"rev": "278fddc317cf0cf4d3602d0ec0f24d1dd281fadb",
|
||||
"revCount": 16138,
|
||||
"rev": "ed9b7f4f84fd60ad8618645cc1bae2d686ff0db6",
|
||||
"revCount": 16323,
|
||||
"type": "git",
|
||||
"url": "https://git.lix.systems/lix-project/lix"
|
||||
},
|
||||
|
@ -670,11 +670,11 @@
|
|||
},
|
||||
"nixpkgs_2": {
|
||||
"locked": {
|
||||
"lastModified": 1723221148,
|
||||
"narHash": "sha256-7pjpeQlZUNQ4eeVntytU3jkw9dFK3k1Htgk2iuXjaD8=",
|
||||
"lastModified": 1728093190,
|
||||
"narHash": "sha256-CAZF2NRuHmqTtRTNAruWpHA43Gg2UvuCNEIzabP0l6M=",
|
||||
"owner": "NixOS",
|
||||
"repo": "nixpkgs",
|
||||
"rev": "154bcb95ad51bc257c2ce4043a725de6ca700ef6",
|
||||
"rev": "e2f08f4d8b3ecb5cf5c9fd9cb2d53bb3c71807da",
|
||||
"type": "github"
|
||||
},
|
||||
"original": {
|
||||
|
@ -715,6 +715,7 @@
|
|||
],
|
||||
"nix-gerrit": "nix-gerrit",
|
||||
"nixpkgs": "nixpkgs_2",
|
||||
"stateless-uptime-kuma": "stateless-uptime-kuma",
|
||||
"terranix": "terranix"
|
||||
}
|
||||
},
|
||||
|
@ -738,13 +739,13 @@
|
|||
"rust-manifest": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"narHash": "sha256-aZFye4UrtlcvLHrISldx4g9uGt3thDbVlLMK5keBSj0=",
|
||||
"narHash": "sha256-tB9BZB6nRHDk5ELIVlGYlIjViLKBjQl52nC1avhcCwA=",
|
||||
"type": "file",
|
||||
"url": "https://static.rust-lang.org/dist/channel-rust-1.78.0.toml"
|
||||
"url": "https://static.rust-lang.org/dist/channel-rust-1.81.0.toml"
|
||||
},
|
||||
"original": {
|
||||
"type": "file",
|
||||
"url": "https://static.rust-lang.org/dist/channel-rust-1.78.0.toml"
|
||||
"url": "https://static.rust-lang.org/dist/channel-rust-1.81.0.toml"
|
||||
}
|
||||
},
|
||||
"stable": {
|
||||
|
@ -763,6 +764,22 @@
|
|||
"type": "github"
|
||||
}
|
||||
},
|
||||
"stateless-uptime-kuma": {
|
||||
"flake": false,
|
||||
"locked": {
|
||||
"lastModified": 1728243069,
|
||||
"narHash": "sha256-l9fgwesnmFxasCaYUCD7L9bGGJXytLuwtx3CZMgpwJg=",
|
||||
"ref": "refs/heads/master",
|
||||
"rev": "880f444ff7862d6127b051cf1a993ad1585b1652",
|
||||
"revCount": 25,
|
||||
"type": "git",
|
||||
"url": "https://git.dgnum.eu/DGNum/stateless-uptime-kuma.git"
|
||||
},
|
||||
"original": {
|
||||
"type": "git",
|
||||
"url": "https://git.dgnum.eu/DGNum/stateless-uptime-kuma.git"
|
||||
}
|
||||
},
|
||||
"systems": {
|
||||
"locked": {
|
||||
"lastModified": 1681028828,
|
||||
|
|
96
flake.nix
96
flake.nix
|
@ -22,12 +22,15 @@
|
|||
gerrit-dashboard.url = "git+https://git.lix.systems/the-distro/gerrit-monitoring.git";
|
||||
gerrit-dashboard.flake = false;
|
||||
|
||||
buildbot-nix.url = "git+https://git.lix.systems/lix-project/buildbot-nix.git?ref=refs/heads/non-flakes";
|
||||
buildbot-nix.url = "git+https://git.lix.systems/lix-project/buildbot-nix.git?ref=refs/heads/bring-back-old-gerrit-reporting";
|
||||
buildbot-nix.inputs.nixpkgs.follows = "nixpkgs";
|
||||
|
||||
channel-scripts.url = "git+https://git.lix.systems/the-distro/channel-scripts.git";
|
||||
channel-scripts.inputs.nixpkgs.follows = "nixpkgs";
|
||||
|
||||
stateless-uptime-kuma.url = "git+https://git.dgnum.eu/DGNum/stateless-uptime-kuma.git";
|
||||
stateless-uptime-kuma.flake = false;
|
||||
|
||||
lix.follows = "hydra/lix";
|
||||
|
||||
grapevine = {
|
||||
|
@ -55,6 +58,7 @@
|
|||
inputs.lix.overlays.default
|
||||
inputs.nix-gerrit.overlays.default
|
||||
inputs.channel-scripts.overlays.default
|
||||
(import "${inputs.stateless-uptime-kuma}/overlay.nix")
|
||||
];
|
||||
};
|
||||
terraform = pkgs.opentofu;
|
||||
|
@ -64,7 +68,6 @@
|
|||
./terraform
|
||||
{
|
||||
bagel.dnsimple.enable = true;
|
||||
bagel.gandi.enable = true;
|
||||
bagel.hydra.enable = true;
|
||||
}
|
||||
];
|
||||
|
@ -112,25 +115,90 @@
|
|||
./common
|
||||
];
|
||||
|
||||
makeBuilder = i: lib.nameValuePair "builder-${toString i}" {
|
||||
imports = commonModules;
|
||||
bagel.baremetal.builders = { enable = true; num = i; netboot = i >= 6; };
|
||||
floralInfraModules = commonModules ++ [
|
||||
({ config, lib, ... }: {
|
||||
# This means that anyone with @floral-infra permissions
|
||||
# can SSH as root into every machine handled here.
|
||||
bagel.admins.allowedGroups = [
|
||||
"floral-infra"
|
||||
];
|
||||
|
||||
# Tag all machines which have local boot as local bootables.
|
||||
deployment.tags = lib.mkMerge [
|
||||
[ "floral" ]
|
||||
(lib.mkIf (config.bagel.baremetal.builders.enable -> !config.bagel.baremetal.builders.netboot)
|
||||
[ "localboot" ]
|
||||
)
|
||||
];
|
||||
|
||||
bagel.monitoring.grafana-agent.tenant = "floral";
|
||||
bagel.secrets.tenant = "floral";
|
||||
bagel.builders.extra-build-capacity.provider.tenant = "floral";
|
||||
bagel.services.buildbot.tenant = "floral";
|
||||
})
|
||||
];
|
||||
|
||||
# These are Floral baremetal builders.
|
||||
makeBuilder = i:
|
||||
let
|
||||
enableNetboot = i >= 6;
|
||||
in
|
||||
lib.nameValuePair "builder-${toString i}" {
|
||||
imports = floralInfraModules;
|
||||
bagel.baremetal.builders = { enable = true; num = i; netboot = enableNetboot; };
|
||||
};
|
||||
|
||||
lixInfraModules = commonModules ++ [
|
||||
{
|
||||
# This means that anyone with @lix-infra permissions
|
||||
# can SSH as root into every machine handled here.
|
||||
bagel.admins.allowedGroups = [
|
||||
"lix-infra"
|
||||
];
|
||||
|
||||
# Tag all machines which have local boot as local bootables.
|
||||
# Lix has no netbootable machine.
|
||||
deployment.tags = [ "localboot" "lix" ];
|
||||
|
||||
bagel.monitoring.grafana-agent.tenant = "lix";
|
||||
bagel.secrets.tenant = "lix";
|
||||
bagel.builders.extra-build-capacity.provider = {
|
||||
tenant = "lix";
|
||||
buildfarmPublicKeys = [
|
||||
# buildbot.lix.systems SSH key
|
||||
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDu4cEqZzAI/1vZjSQkTJ4ijIg9nuloOuSKUrnkJIOFn"
|
||||
];
|
||||
};
|
||||
bagel.services.buildbot.tenant = "lix";
|
||||
}
|
||||
];
|
||||
|
||||
builders = lib.listToAttrs (lib.genList makeBuilder 11);
|
||||
in {
|
||||
meta.nixpkgs = systemBits.x86_64-linux.pkgs;
|
||||
# Add any non-x86_64 native systems here.
|
||||
# Cross compilation is not supported yet.
|
||||
meta.nodeNixpkgs =
|
||||
let
|
||||
aarch64-systems = systems: lib.genAttrs systems (system: systemBits.aarch64-linux.pkgs);
|
||||
in
|
||||
aarch64-systems [
|
||||
"build01-aarch64-lix"
|
||||
];
|
||||
meta.specialArgs.inputs = inputs;
|
||||
|
||||
bagel-box.imports = commonModules ++ [ ./hosts/bagel-box ];
|
||||
meta01.imports = commonModules ++ [ ./hosts/meta01 ];
|
||||
gerrit01.imports = commonModules ++ [ ./hosts/gerrit01 ];
|
||||
fodwatch.imports = commonModules ++ [ ./hosts/fodwatch ];
|
||||
git.imports = commonModules ++ [ ./hosts/git ];
|
||||
wob-vpn-gw.imports = commonModules ++ [ ./hosts/wob-vpn-gw ];
|
||||
buildbot.imports = commonModules ++ [ ./hosts/buildbot ];
|
||||
public01.imports = commonModules ++ [ ./hosts/public01 ];
|
||||
build-coord.imports = commonModules ++ [ ./hosts/build-coord ];
|
||||
bagel-box.imports = floralInfraModules ++ [ ./hosts/bagel-box ];
|
||||
meta01.imports = floralInfraModules ++ [ ./hosts/meta01 ];
|
||||
gerrit01.imports = floralInfraModules ++ [ ./hosts/gerrit01 ];
|
||||
fodwatch.imports = floralInfraModules ++ [ ./hosts/fodwatch ];
|
||||
git.imports = floralInfraModules ++ [ ./hosts/git ];
|
||||
wob-vpn-gw.imports = floralInfraModules ++ [ ./hosts/wob-vpn-gw ];
|
||||
buildbot.imports = floralInfraModules ++ [ ./hosts/buildbot ];
|
||||
public01.imports = floralInfraModules ++ [ ./hosts/public01 ];
|
||||
build-coord.imports = floralInfraModules ++ [ ./hosts/build-coord ];
|
||||
|
||||
build01-aarch64-lix.imports = lixInfraModules ++ [ ./hosts/build01-aarch64-lix ];
|
||||
buildbot-lix.imports = lixInfraModules ++ [ ./hosts/buildbot-lix ];
|
||||
} // builders;
|
||||
|
||||
hydraJobs = builtins.mapAttrs (n: v: v.config.system.build.netbootDir or v.config.system.build.toplevel) self.nixosConfigurations;
|
||||
|
|
|
@ -13,6 +13,8 @@
|
|||
hydra.builders = lib.genList (i: "builder-${builtins.toString i}") 10;
|
||||
};
|
||||
|
||||
bagel.monitoring.exporters.hydra.enable = true;
|
||||
|
||||
# Hydra is proxied.
|
||||
bagel.raito.v6-proxy-awareness.enable = true;
|
||||
|
||||
|
|
27
hosts/build01-aarch64-lix/default.nix
Normal file
27
hosts/build01-aarch64-lix/default.nix
Normal file
|
@ -0,0 +1,27 @@
|
|||
{ ... }: {
|
||||
networking.hostName = "build01";
|
||||
networking.domain = "aarch64.lix.systems";
|
||||
|
||||
# Those free sweet VMs.
|
||||
bagel.hardware.oracle-vm = {
|
||||
enable = true;
|
||||
system = "aarch64-linux";
|
||||
};
|
||||
|
||||
fileSystems."/" =
|
||||
{ device = "/dev/disk/by-uuid/a333323c-99f0-4258-8f68-496858d56f71";
|
||||
fsType = "ext4";
|
||||
};
|
||||
|
||||
fileSystems."/boot" =
|
||||
{ device = "/dev/disk/by-uuid/3E74-C937";
|
||||
fsType = "vfat";
|
||||
};
|
||||
|
||||
swapDevices = [ ];
|
||||
|
||||
bagel.builders.extra-build-capacity.provider.enable = true;
|
||||
i18n.defaultLocale = "en_US.UTF-8";
|
||||
system.stateVersion = "24.05";
|
||||
deployment.targetHost = "build01.aarch64.lix.systems";
|
||||
}
|
71
hosts/buildbot-lix/default.nix
Normal file
71
hosts/buildbot-lix/default.nix
Normal file
|
@ -0,0 +1,71 @@
|
|||
# Configuration for a virtual machine in Raito's micro-DC basement.
|
||||
# 32 vCPU (2014 grade Xeon though)
|
||||
# 32GB RAM
|
||||
# 30GB SSD
|
||||
# 500GB HDD
|
||||
# All specifications can be upgraded to a certain extent, just ask Raito.
|
||||
# Hosts the coordinator for Buildbot.
|
||||
#
|
||||
# vim: et:ts=2:sw=2:
|
||||
#
|
||||
{ lib, modulesPath, ... }: {
|
||||
networking.hostName = "buildbot";
|
||||
networking.domain = "lix.systems";
|
||||
|
||||
zramSwap.enable = true;
|
||||
|
||||
bagel.sysadmin.enable = true;
|
||||
# Buildbot is proxied.
|
||||
bagel.raito.v6-proxy-awareness.enable = true;
|
||||
bagel.hardware.raito-vm = {
|
||||
enable = true;
|
||||
networking = {
|
||||
nat-lan-mac = "BC:24:11:75:62:42";
|
||||
wan = {
|
||||
mac = "BC:24:11:B2:5F:2E";
|
||||
address = "2001:bc8:38ee:100::200/56";
|
||||
};
|
||||
};
|
||||
};
|
||||
i18n.defaultLocale = "en_US.UTF-8";
|
||||
|
||||
bagel.services.buildbot = {
|
||||
enable = true;
|
||||
domain = "buildbot.lix.systems";
|
||||
gerrit =
|
||||
{
|
||||
domain = "gerrit.lix.systems";
|
||||
port = 2022;
|
||||
username = "buildbot";
|
||||
};
|
||||
cors.allowedOrigins = [
|
||||
"https://*.lix.systems"
|
||||
];
|
||||
projects = [
|
||||
"lix"
|
||||
"lix-installer"
|
||||
];
|
||||
buildSystems = [
|
||||
"x86_64-linux"
|
||||
"aarch64-linux"
|
||||
"aarch64-darwin"
|
||||
# Too slow.
|
||||
/* "x86_64-darwin" */
|
||||
];
|
||||
# Lix is not yet allowed to use Floral's x86_64 builders.
|
||||
builders = [ ];
|
||||
};
|
||||
|
||||
# This machine does not use /nix from btrfs, and instead uses a store on a bigger disk.
|
||||
fileSystems."/nix" =
|
||||
lib.mkForce
|
||||
{ device = "/dev/disk/by-uuid/1815ca49-d0b0-4b99-8aec-0d790498ba6f";
|
||||
fsType = "xfs";
|
||||
neededForBoot = true;
|
||||
options = [ "relatime" ];
|
||||
};
|
||||
|
||||
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
|
||||
system.stateVersion = "24.05";
|
||||
deployment.targetHost = "buildbot.lix.systems";
|
||||
}
|
|
@ -2,6 +2,7 @@
|
|||
config,
|
||||
lib,
|
||||
pkgs,
|
||||
nodes,
|
||||
...
|
||||
}:
|
||||
{
|
||||
|
@ -26,7 +27,24 @@
|
|||
bagel.services.buildbot = {
|
||||
enable = true;
|
||||
domain = "buildbot.forkos.org";
|
||||
builders = [ "builder-10" ];
|
||||
gerrit =
|
||||
let
|
||||
cfgGerrit = nodes.gerrit01.config.bagel.services.gerrit;
|
||||
in
|
||||
{
|
||||
domain = cfgGerrit.canonicalDomain;
|
||||
port = cfgGerrit.port;
|
||||
username = "buildbot";
|
||||
};
|
||||
cors.allowedOrigins = [
|
||||
"https://*.forkos.org"
|
||||
];
|
||||
projects = [
|
||||
"buildbot-test"
|
||||
"nixpkgs"
|
||||
"infra"
|
||||
];
|
||||
builders = [ "builder-9" ];
|
||||
};
|
||||
|
||||
i18n.defaultLocale = "en_US.UTF-8";
|
||||
|
|
|
@ -23,6 +23,9 @@
|
|||
};
|
||||
};
|
||||
|
||||
# Block all these crawlers!!
|
||||
bagel.services.nginx.crawler-blocker.enable = true;
|
||||
|
||||
fileSystems."/gerrit-data" = {
|
||||
device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4";
|
||||
fsType = "ext4";
|
||||
|
@ -39,7 +42,7 @@
|
|||
};
|
||||
|
||||
age.secrets.ows-deploy-key = {
|
||||
file = ../../secrets/ows-deploy-key.age;
|
||||
file = ../../secrets/floral/ows-deploy-key.age;
|
||||
mode = "0600";
|
||||
owner = "git";
|
||||
group = "git";
|
||||
|
@ -121,7 +124,7 @@
|
|||
};
|
||||
};
|
||||
|
||||
age.secrets.s3-channel-staging-keys.file = ../../secrets/s3-channel-staging-keys.age;
|
||||
age.secrets.s3-channel-staging-keys.file = ../../secrets/floral/s3-channel-staging-keys.age;
|
||||
bagel.nixpkgs.channel-scripts = {
|
||||
enable = true;
|
||||
otlp.enable = true;
|
||||
|
|
|
@ -9,6 +9,11 @@
|
|||
# TODO: make it the default
|
||||
networking.domain = "infra.forkos.org";
|
||||
|
||||
bagel.status = {
|
||||
enable = true;
|
||||
domain = "status.forkos.org";
|
||||
};
|
||||
|
||||
bagel.sysadmin.enable = true;
|
||||
# Newsletter is proxied.
|
||||
bagel.raito.v6-proxy-awareness.enable = true;
|
||||
|
|
97
secrets.nix
97
secrets.nix
|
@ -1,51 +1,76 @@
|
|||
let
|
||||
keys = import common/ssh-keys.nix;
|
||||
|
||||
commonKeys = keys.users.delroth ++ keys.users.raito;
|
||||
commonKeys = {
|
||||
# WARNING: `keys.users.*` are *lists*, so you need to concatenate them; don't wrap them in another list!
|
||||
# Otherwise, agenix will be confused!
|
||||
global = keys.users.raito;
|
||||
lix = keys.users.hexchen ++ keys.users.jade;
|
||||
floral = keys.users.delroth;
|
||||
};
|
||||
|
||||
secrets = with keys; {
|
||||
hydra-postgres-key = [ machines.build-coord ];
|
||||
hydra-s3-credentials = [ machines.build-coord ];
|
||||
hydra-signing-priv = [ machines.build-coord ];
|
||||
hydra-ssh-key-priv = [ machines.build-coord ];
|
||||
floral = {
|
||||
hydra-postgres-key = [ machines.build-coord ];
|
||||
hydra-s3-credentials = [ machines.build-coord ];
|
||||
hydra-signing-priv = [ machines.build-coord ];
|
||||
hydra-ssh-key-priv = [ machines.build-coord ];
|
||||
|
||||
netbox-environment = [ machines.meta01 ];
|
||||
mimir-environment = [ machines.meta01 ];
|
||||
mimir-webhook-url = [ machines.meta01 ];
|
||||
grafana-oauth-secret = [ machines.meta01 ];
|
||||
loki-environment = [ machines.meta01 ];
|
||||
gerrit-prometheus-bearer-token = [ machines.gerrit01 machines.meta01 ];
|
||||
pyroscope-secrets = [ machines.meta01 ];
|
||||
tempo-environment = [ machines.meta01 ];
|
||||
netbox-environment = [ machines.meta01 ];
|
||||
mimir-environment = [ machines.meta01 ];
|
||||
mimir-webhook-url = [ machines.meta01 ];
|
||||
grafana-oauth-secret = [ machines.meta01 ];
|
||||
loki-environment = [ machines.meta01 ];
|
||||
gerrit-prometheus-bearer-token = [ machines.gerrit01 machines.meta01 ];
|
||||
pyroscope-secrets = [ machines.meta01 ];
|
||||
tempo-environment = [ machines.meta01 ];
|
||||
|
||||
buildbot-worker-password = [ machines.buildbot ];
|
||||
buildbot-oauth-secret = [ machines.buildbot ];
|
||||
buildbot-workers = [ machines.buildbot ];
|
||||
# Private SSH key to Gerrit
|
||||
# ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHx52RUPWzTa2rBA96xcnGjjzAboNN/hm6gW+Q6JiSos
|
||||
buildbot-service-key = [ machines.buildbot ];
|
||||
# Signing key for Buildbot's specific cache
|
||||
buildbot-signing-key = [ machines.buildbot ];
|
||||
buildbot-remote-builder-key = [ machines.buildbot ];
|
||||
buildbot-worker-password = [ machines.buildbot ];
|
||||
buildbot-oauth-secret = [ machines.buildbot ];
|
||||
buildbot-workers = [ machines.buildbot ];
|
||||
# Private SSH key to Gerrit
|
||||
# ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHx52RUPWzTa2rBA96xcnGjjzAboNN/hm6gW+Q6JiSos
|
||||
buildbot-service-key = [ machines.buildbot ];
|
||||
# Signing key for Buildbot's specific cache
|
||||
buildbot-signing-key = [ machines.buildbot ];
|
||||
buildbot-remote-builder-key = [ machines.buildbot ];
|
||||
|
||||
# These are the same password, but nginx wants it in htpasswd format
|
||||
metrics-push-htpasswd = [ machines.meta01 ];
|
||||
metrics-push-password = builtins.attrValues machines;
|
||||
# These are the same password, but nginx wants it in htpasswd format
|
||||
metrics-push-htpasswd = [ machines.meta01 ];
|
||||
# Yes, even Lix machines are included in this monitoring infrastructure.
|
||||
metrics-push-password = builtins.attrValues machines;
|
||||
|
||||
ows-deploy-key = [ machines.gerrit01 ];
|
||||
s3-channel-staging-keys = [ machines.gerrit01 ];
|
||||
s3-channel-keys = [ machines.gerrit01 ];
|
||||
ows-deploy-key = [ machines.gerrit01 ];
|
||||
s3-channel-staging-keys = [ machines.gerrit01 ];
|
||||
s3-channel-keys = [ machines.gerrit01 ];
|
||||
|
||||
postgres-ca-priv = [ machines.bagel-box ];
|
||||
postgres-tls-priv = [ machines.bagel-box ];
|
||||
postgres-ca-priv = [ machines.bagel-box ];
|
||||
postgres-tls-priv = [ machines.bagel-box ];
|
||||
|
||||
newsletter-secrets = [ machines.public01 ];
|
||||
s3-revproxy-api-keys = [ machines.public01 ];
|
||||
newsletter-secrets = [ machines.public01 ];
|
||||
s3-revproxy-api-keys = [ machines.public01 ];
|
||||
stateless-uptime-kuma-password = [ machines.public01 ];
|
||||
};
|
||||
|
||||
lix = {
|
||||
buildbot-worker-password = [ machines.buildbot-lix ];
|
||||
buildbot-oauth-secret = [ machines.buildbot-lix ];
|
||||
buildbot-workers = [ machines.buildbot-lix ];
|
||||
# Private SSH key to Gerrit
|
||||
# ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHx52RUPWzTa2rBA96xcnGjjzAboNN/hm6gW+Q6JiSos
|
||||
buildbot-service-key = [ machines.buildbot-lix ];
|
||||
# Signing key for Buildbot's specific cache
|
||||
buildbot-signing-key = [ machines.buildbot-lix ];
|
||||
buildbot-remote-builder-key = [ machines.buildbot-lix ];
|
||||
};
|
||||
};
|
||||
|
||||
mkSecretListFor = tenant:
|
||||
map (secretName: {
|
||||
name = "secrets/${tenant}/${secretName}.age";
|
||||
value.publicKeys = secrets.${tenant}."${secretName}" ++ commonKeys.global ++ commonKeys.${tenant};
|
||||
}) (builtins.attrNames secrets.${tenant});
|
||||
in
|
||||
builtins.listToAttrs (
|
||||
map (secretName: {
|
||||
name = "secrets/${secretName}.age";
|
||||
value.publicKeys = secrets."${secretName}" ++ commonKeys;
|
||||
}) (builtins.attrNames secrets)
|
||||
(mkSecretListFor "floral") ++ (mkSecretListFor "lix")
|
||||
)
|
||||
|
|
68
secrets/floral/metrics-push-password.age
Normal file
68
secrets/floral/metrics-push-password.age
Normal file
|
@ -0,0 +1,68 @@
|
|||
age-encryption.org/v1
|
||||
-> ssh-ed25519 +HUDfA d5f2ESneC0wsoc9rwTjNfNXMBjCbjAQ7euthH2Buq1E
|
||||
5CynaQ8zhDRBvcmifhCsiDtllztCVAqs8rU36DOxgPw
|
||||
-> ssh-ed25519 +uvEmw EtYRis2LP0jv1W8mx8vFYNzkgi8OoqnA8cM2huS6NBk
|
||||
ll1csFIO+hVYk+I0uSVJmlDKj9aTWvf4kaYI5LJcm7w
|
||||
-> ssh-ed25519 DMaM1w ex4QJN8CG99J15i+yvqGEiEZn9OlGIC+cmLHL4u8ZEI
|
||||
VXnOv4CGK68q5t6hUV3oKAtxGZ+4FVbrmE1yMn16A0Q
|
||||
-> ssh-ed25519 sixKXw drXN6+q1y7L7ZU4chTfHfelu5GcTdff+i/UMFV0+3RQ
|
||||
+8jmgnMh2OpQ3vhAuyQYWslfx7KO84a8KsCpoRD3Yl8
|
||||
-> ssh-ed25519 aHbF7w Af7NgjZ/Nvh5FHrX2VlF5riTIhJ+fdxTo6OR+8PcNwA
|
||||
ktKpm/HnOnw2Ym7xee3N1rneEX7+/xDhcp71N1NNHAA
|
||||
-> ssh-ed25519 87T2Ig 8mEUxJ/5NUvV+qQCDQH2Tm6Ryr5hf4xgsQlqXGf03Fw
|
||||
EavMcnsg/3EYBLQEBHX+0oTDKq5ZL4vj+mZntPM8UMU
|
||||
-> ssh-ed25519 Ao+7Wg UphWbatIaa+R1oZbfHazFhrawf0vax/3ZZS7YuX03Hs
|
||||
dwBbwoV0jpjiKr+nj+CRfUDgDl7ISpsCintVAzHnIFQ
|
||||
-> ssh-ed25519 wIR2ZA ZM58Nq7eJX9JVeYkoJf+mw8hxhYGoTx042ow1u3mJkw
|
||||
UtEaf7e4xsPO0ISlIF9LF+GcwTBqw4AXdMO4MASfgLQ
|
||||
-> ssh-ed25519 oGiV/Q G5KX/Eox+9md0yFRUZvGIsio2gWM17soHsL6H6zEX2g
|
||||
vI8jPjBAoFF0xhvRRLPzCMSiQOQ0fKuRb3CYVu3KUUo
|
||||
-> ssh-ed25519 gO3aog p9nZtjzoA0zJM+7Y6R16mpdub3dhu67yOYTUNKRytgI
|
||||
YL9vAp1+CK7jgmXkB47ufZMz+/swngkdUvEGR1zFZwc
|
||||
-> ssh-ed25519 N/+Clw 6LzFdtNsWewuJK2r97ZXJbRazvK3raN78UGanR/zWVU
|
||||
WT0y+sfDP3ffVwRcbYw51ArFR3OzXnoyi9IXwZZKEL8
|
||||
-> ssh-ed25519 CtkSZw CV0jQ5dIbgFtMxGK1X9b1qJOKmske8VgIPW5NW9mAwc
|
||||
clv7P3de61nZmXrvbOgL7Llw8ZqBMm2WFqgpznDwKv8
|
||||
-> ssh-ed25519 keg2lg 3Nk40ByQj8RThj4QDY2BdAkw55mXAJprXQRGjQqGvz0
|
||||
f8OFszJ8p90crFd+awEE12CNd7b22zgpH2XRKmH/Hf0
|
||||
-> ssh-ed25519 H885DA GDiJYH+gaC++TSqfoPDOTFcsCZRhEl0EeTeab7tgcWU
|
||||
kMILmwNMnMS7rgC3kKsAksu4Txn5owPU2y09h4aHKY8
|
||||
-> ssh-ed25519 Rq7K4Q VCNxGtCSCD2OYSWWwl0+yf189xV3QwRiwo80h4NPTkE
|
||||
hHkgYHLbISdl/RRdlInp9ub854M9ZKFSXpLgKW2YkmQ
|
||||
-> ssh-ed25519 vvyRpw XSCCrqEOtvzQRssI0U1DHirKoPgbOUKJxNKnioHhT2Y
|
||||
HGey1j0Kxae5Qs0aw6eqFziQGiRmNA+lEwbRdf5hhbM
|
||||
-> ssh-ed25519 aSEktQ mXY70Lgl76J4O5dPdDcIqmJ40EinigDuZrUghpGOq2I
|
||||
U2qeVFxGCYCEFWU+7vHc5Mu9EuzScowrjnwUyoqPj5U
|
||||
-> ssh-ed25519 cD6JxA at89poimBZyeeM8CQrxDxN0yCNDT2k04++py1fFycj8
|
||||
cQV/K5zc5x/oYnJ4N0MX3sTboT4G4ZNvVUVdHuJRzbA
|
||||
-> ssh-ed25519 ec6XRQ spJtb/xy4k4dmwKz8R2CPhC1WcuNV/rnDT978GkjHHk
|
||||
KrGEVGts/AhzbRNreqQ/CVanXL3l/9oMWxnpBLj23qU
|
||||
-> ssh-ed25519 1qYEfw KRkTYlvvnsCIExKQNmCyU7YxnGZsiI03kzecXNpLzUQ
|
||||
h2YagV7BzlsF7banzwXbOudTdlFzT7LC8PvtxAsX36U
|
||||
-> ssh-ed25519 2D+APA 4hdYlOnNIT9Q6tyKwXzy+u66Ti2EJopK43Sipebd0As
|
||||
tuesc9/QcEu4q9bTFJ5zJr0qvgLcmpn4at4cYtHrtbE
|
||||
-> ssh-ed25519 eTSU6g i1qT6PtepHXnoLCqDbhk86QG+SR9luQaw34a34gy5mw
|
||||
YE9VBAT5SLW2ECHRU+dMg9na6OQNVRVGuhY8vOdmE/Q
|
||||
-> ssh-ed25519 j2r2qQ TTTbSB/8UIDmmI3C9+u24PYZNfjl9jGADKHNWIwLfGE
|
||||
SNDforwii/GFp82TpyOcVIVrZWCe2QQKrjzPA6XA7Jc
|
||||
-> ssh-ed25519 CyxfgQ P5EiJ54v65Sz1gHuI0s170Z7c1WjcZLlb7NYigElfVs
|
||||
iYJUGpoE9LBIlv+O1navSSsy3EJ8tusXXX+/QAQvjNI
|
||||
-> ssh-ed25519 C/bBAQ hlBDpQRkcVCr3B6TCrBjxauhUX6Ndpm0s6x8W4hU6gM
|
||||
OFG3EuGJkSoEEXhbJ/Tp2DBdnBcs+hzxjNRdvcOSpQs
|
||||
-> ssh-ed25519 +qVung cGEGpO8NJfpj9ixAH9lhYkPKPDdQWryVxSOhMGQdnWM
|
||||
+MycbIEab3P/AOS9i/YmPBDXB76hp3xUcWI4VMihV2w
|
||||
-> ssh-rsa krWCLQ
|
||||
Zv3dPYERlX1MaVaJTBDwIcjt1yLmu4Z7MovPgjGg01p+XsdBXeepTyOl+gRBwGgo
|
||||
AW5CIuaChYxtSNJ6nOgSaUpqzILycUF1xE1jROe3MIX2MZ4KGD1qoqcHbiCAng+a
|
||||
RqYrwAKnNea9FQMVfhYZBkRoYE6ne1R+0G6BoFM/okz24pAAFPBx+sMMhfTkt0uV
|
||||
kHVx0dgRw1pxa7Na98WH/7E0zp9VuBvVHGXfk1rfW/UQlbIO5RP3nldFoa6OmOWS
|
||||
JZ022UvjyC1re0KCurka4y+qmaiRKnTBmpIXxJFMwNCAQ8O8SeAQ3DHKHmXNMOIL
|
||||
ZVICtRRk0uX36AVU8DWDog
|
||||
-> ssh-ed25519 /vwQcQ kF8+hsA+0Msjd3q0SL52cae5RDqx4ls5kPKnc3UZyms
|
||||
Q33kIKJL3Vjxu7LQ5l4M3tlEuj+OW4uGh1x+JxthW8A
|
||||
-> ssh-ed25519 0R97PA gWBH71l6w9upTE0DwqOMSvWXc5VyJiKFAQLaSpWQ43E
|
||||
IrOrvzEa0bABw6UOpP8pM8WhuRNMaWJ2khljJIKwOS8
|
||||
-> ssh-ed25519 K3b7BA oS14iav9pSioLecMkOanJz89OJygLugvrnnTs5pKzz8
|
||||
akupMSiqXussXJyHwFm/f0imKALjowJVqd8/LFcC/58
|
||||
--- bCJXTEDaKg4FF2dQp2zFMU4Zm0zACekCuDvthcAyncM
|
||||
&Ÿ€Waïãà›BD R(¯¥Ñ”ufj<>úVÁ8nÆ>‚ß›øëæðZúâ{Idƒ„©,³*„%Ç“È‚z«
|
20
secrets/floral/stateless-uptime-kuma-password.age
Normal file
20
secrets/floral/stateless-uptime-kuma-password.age
Normal file
|
@ -0,0 +1,20 @@
|
|||
age-encryption.org/v1
|
||||
-> ssh-ed25519 CyxfgQ D2o8bUccO13DKF4COLBQ9mJbACsE2XsRa5S+N71WnTk
|
||||
ZaldT7HhQxbxf2ptIwdMYkC60eGtzihc7uwcAkq7s00
|
||||
-> ssh-ed25519 K3b7BA AiUCG5CnNyv1DPu+iEwEgW9GqZ8zgpgxKJTAp350ADc
|
||||
cUVaDv7F1haQIF11/UhhDAR5DrfJlPttGfDjkv+z9vY
|
||||
-> ssh-ed25519 +qVung 1JXeXyea+2Pcwoln/NLRiR8IPPIiB3gaFCP4imyv4DA
|
||||
JWmAY6ZnyU46KxzhRrQigGmUPba9lJDDyRQ2GjQShqc
|
||||
-> ssh-rsa krWCLQ
|
||||
ciLu/+cXfQrB1ms8oTv+xi4eADyL4j0qwnY/6TE0wAXkQHuNXDmpF6ccWZoS2DqN
|
||||
NcnGXL6+WyWxmwlyBEq/rsBPvi1g0M6Md7Z4gXn2UvjJ+S7WyA8QEwkxoTDkJS7x
|
||||
k/NvtunmggVsWVK4Xdi5DKRw+f32qr/8GysDhIPrTt43iReBKNbyuYWmC5Ec85ep
|
||||
JU4JzCNZjJ07kixS5Y9BhaJbpEr47lCXE/KtJUvm3VAxS9IwfUn7KHHdFWynbExi
|
||||
F898j3zOR/kgYmeA0oTiexRD3Y2LCvjXIHQZ3MobbZ/PBrjWxe78Sw2vy2t5JLtB
|
||||
gFG0K8M1z8DT6a8TtvXEgg
|
||||
-> ssh-ed25519 /vwQcQ kUM21TO9iSa8oVXMlNxR7Kc+8TV4C/uTzyQ+t3xnARA
|
||||
oXt+egWWONsKT48H4vZ2CPdy3Zfb2QeQVe9l7dDyO/w
|
||||
-> ssh-ed25519 0R97PA e/piqf2RD5QgPaQs6jsJdzJgfZR9n1JDIWpbvLZErSs
|
||||
UTJH8POFdZ4+N9WkLoNESl1pvcVD0MS1qn7AdS/mg34
|
||||
--- 9aYEP0eHDKMacIf09h+OJqIYw+N99+FrW/x/do8Lbo4
|
||||
$ ÖëWÛ\zú—¾=s/à@.Ç,?ƒW6n^ù#–i!§Ã–ï¶1]±Nvù±Ž'Ï¥¹6?‚'mµpPÒqýŸº
|
7
secrets/lix/buildbot-oauth-secret.age
Normal file
7
secrets/lix/buildbot-oauth-secret.age
Normal file
|
@ -0,0 +1,7 @@
|
|||
age-encryption.org/v1
|
||||
-> ssh-ed25519 Ao+7Wg q7oRHUUlAvD8OUbpPT7d6eLMPWU0YS/verYTDE5BCkY
|
||||
/87/1uqOvRYeqel9IjnFmGyF9SXUQD8MTgfcj91b/Fs
|
||||
--- ulIeB91NJ7z/64h9BCLSD9/RW/zwv3m1Zo2ovNuInv8
|
||||
Îœç}³Óš#épÇ o>ä·*vµ÷ÄåŽs?[¦º´L
|
||||
<EFBFBD>þz™rý‰?R±Ñó7<Ê
|
||||
æi!€{X„¾òÓ
|
BIN
secrets/lix/buildbot-remote-builder-key.age
Normal file
BIN
secrets/lix/buildbot-remote-builder-key.age
Normal file
Binary file not shown.
BIN
secrets/lix/buildbot-service-key.age
Normal file
BIN
secrets/lix/buildbot-service-key.age
Normal file
Binary file not shown.
6
secrets/lix/buildbot-signing-key.age
Normal file
6
secrets/lix/buildbot-signing-key.age
Normal file
|
@ -0,0 +1,6 @@
|
|||
age-encryption.org/v1
|
||||
-> ssh-ed25519 Ao+7Wg EMpfs0EpWwaIKAoUBfEkyAHLIwi6JnGG6RvUWM5LjnU
|
||||
LKiwUBNc791U/GVRNlRPZE/TEMJjcFFrLruFJhiyiOI
|
||||
--- 0khp8u+4vHgGyQqP05m473Eo09eyOUZLI5+EK4olzoc
|
||||
N3(
|
||||
ª•ûxRq°<71>f<EFBFBD>Ó;ͼ3¬~RˆÓC^ñ+fœš1”®˜xˆ÷ÅëñSØ—hâ
£ÖË°GˆÓn–YIûµ:7¾!°u×Hþy/‰Øð‰™.¯¤á^¹lC™ôUÈËþ5cž:]ÿNž&'MÎè¶É-˜–ÆHF¦D0‘ cjô ‹Ð~
|
BIN
secrets/lix/buildbot-worker-password.age
Normal file
BIN
secrets/lix/buildbot-worker-password.age
Normal file
Binary file not shown.
6
secrets/lix/buildbot-workers.age
Normal file
6
secrets/lix/buildbot-workers.age
Normal file
|
@ -0,0 +1,6 @@
|
|||
age-encryption.org/v1
|
||||
-> ssh-ed25519 Ao+7Wg RPKKoI5l5cYVdSvOxTHCUtwceac4vSi3+vlaqHr8kQg
|
||||
qbgTHCeQDNM30IJNZ/BU6wgryJwB316H5GWWaYB/wng
|
||||
--- GuFi3GSRdlBJ5YRjfAVyFDZ+4TH575kFJLnFp5847N0
|
||||
-èƒÞHÖÜ*x´M7¼t<,4ˆŠÑ^<5E>5@v°<>£€º,Z•MÒg=M
|
||||
» 3výJÄ«ÐÖê¿Nz8'<^'4&WÂf"Êõ´À›ë\©º»ëêwmzúlAl|+„‘ÆKš~68ñEÝîk•8ø?S&òaM‹Ý~ž¹ê¿]Vfø ÝJxaõDù¥x
|
Binary file not shown.
40
services/block-crawlers/blocked-ua.txt
Normal file
40
services/block-crawlers/blocked-ua.txt
Normal file
|
@ -0,0 +1,40 @@
|
|||
AI2Bot
|
||||
Ai2Bot-Dolma
|
||||
Amazonbot
|
||||
anthropic-ai
|
||||
Applebot
|
||||
Applebot-Extended
|
||||
Bytespider
|
||||
CCBot
|
||||
ChatGPT-User
|
||||
Claude-Web
|
||||
ClaudeBot
|
||||
cohere-ai
|
||||
Diffbot
|
||||
FacebookBot
|
||||
facebookexternalhit
|
||||
FriendlyCrawler
|
||||
Google-Extended
|
||||
GoogleOther
|
||||
GoogleOther-Image
|
||||
GoogleOther-Video
|
||||
GPTBot
|
||||
iaskspider/2.0
|
||||
ICC-Crawler
|
||||
ImagesiftBot
|
||||
img2dataset
|
||||
ISSCyberRiskCrawler
|
||||
Kangaroo Bot
|
||||
Meta-ExternalAgent
|
||||
Meta-ExternalFetcher
|
||||
OAI-SearchBot
|
||||
omgili
|
||||
omgilibot
|
||||
PerplexityBot
|
||||
PetalBot
|
||||
Scrapy
|
||||
Sidetrade indexer bot
|
||||
Timpibot
|
||||
VelenPublicWebCrawler
|
||||
Webzio-Extended
|
||||
YouBot
|
32
services/block-crawlers/default.nix
Normal file
32
services/block-crawlers/default.nix
Normal file
|
@ -0,0 +1,32 @@
|
|||
{ pkgs, config, lib, ... }:
|
||||
let
|
||||
inherit (lib) mkEnableOption mkIf mkOption types concatStringsSep mkDefault splitString;
|
||||
cfg = config.bagel.services.nginx.crawler-blocker;
|
||||
mkRobotsFile = blockedUAs: pkgs.writeText "robots.txt" ''
|
||||
${concatStringsSep "\n" (map (ua: "User-agent: ${ua}") blockedUAs)}
|
||||
Disallow: /
|
||||
'';
|
||||
in
|
||||
{
|
||||
options = {
|
||||
bagel.services.nginx.crawler-blocker = {
|
||||
enable = mkEnableOption "the crawler blocker";
|
||||
|
||||
userAgents = mkOption {
|
||||
type = types.listOf types.str;
|
||||
default = splitString "\n" (builtins.readFile ./blocked-ua.txt);
|
||||
};
|
||||
};
|
||||
|
||||
services.nginx.virtualHosts = mkOption {
|
||||
type = types.attrsOf (types.submodule {
|
||||
config = {
|
||||
locations."= /robots.txt" = mkIf cfg.enable (mkDefault {
|
||||
alias = mkRobotsFile cfg.userAgents;
|
||||
});
|
||||
};
|
||||
});
|
||||
};
|
||||
};
|
||||
}
|
||||
|
|
@ -7,15 +7,69 @@
|
|||
}:
|
||||
let
|
||||
cfg = config.bagel.services.buildbot;
|
||||
cfgGerrit = nodes.gerrit01.config.bagel.services.gerrit;
|
||||
ssh-keys = import ../../common/ssh-keys.nix;
|
||||
freeGbDiskSpace = 20;
|
||||
extraTenantSpecificBuilders = {
|
||||
lix = import ./lix.nix {
|
||||
inherit config nodes;
|
||||
};
|
||||
floral = [ ];
|
||||
}.${cfg.tenant or (throw "${cfg.tenant} is not a known tenant")};
|
||||
clientId = {
|
||||
lix = "buildbot";
|
||||
floral = "forkos-buildbot";
|
||||
}.${cfg.tenant or (throw "${cfg.tenant} is not a known tenant")};
|
||||
inherit (lib) mkEnableOption mkOption mkIf types;
|
||||
in
|
||||
{
|
||||
options.bagel.services.buildbot = {
|
||||
enable = mkEnableOption "Buildbot";
|
||||
|
||||
tenant = mkOption {
|
||||
type = types.enum [ "lix" "floral" ];
|
||||
description = "Which buildbot tenant to enable";
|
||||
};
|
||||
|
||||
domain = mkOption {
|
||||
type = types.str;
|
||||
description = "Domain name for this Buildbot";
|
||||
};
|
||||
|
||||
gerrit = {
|
||||
domain = mkOption {
|
||||
type = types.str;
|
||||
description = "Canonical domain of the Gerrit associated to this Buildbot";
|
||||
example = [ "cl.forkos.org" ];
|
||||
};
|
||||
|
||||
port = mkOption {
|
||||
type = types.port;
|
||||
description = "Gerrit SSH port for this Buildbot";
|
||||
};
|
||||
|
||||
username = mkOption {
|
||||
type = types.str;
|
||||
description = "Gerrit service username for this Buildbot";
|
||||
};
|
||||
};
|
||||
|
||||
cors.allowedOrigins = mkOption {
|
||||
type = types.listOf types.str;
|
||||
example = [ "*.forkos.org" ];
|
||||
description = "Allowed origin for Buildbot and NGINX for CORS without the protocol";
|
||||
};
|
||||
|
||||
buildSystems = mkOption {
|
||||
type = types.listOf (types.enum [ "x86_64-linux" "aarch64-linux" "x86_64-darwin" "aarch64-darwin" ]);
|
||||
default = [ "x86_64-linux" ];
|
||||
example = [ "x86_64-linux" "aarch64-linux" ];
|
||||
description = "Supported build systems for this buildbot instance.";
|
||||
};
|
||||
|
||||
projects = mkOption {
|
||||
type = types.listOf types.str;
|
||||
example = [ "nixpkgs" ];
|
||||
description = "Static list of projects enabled for Buildbot CI";
|
||||
};
|
||||
|
||||
builders = mkOption {
|
||||
|
@ -27,28 +81,39 @@ in
|
|||
|
||||
config = mkIf cfg.enable {
|
||||
networking.firewall.allowedTCPPorts = [ 80 443 ];
|
||||
age.secrets.buildbot-worker-password.file = ../../secrets/buildbot-worker-password.age;
|
||||
age.secrets.buildbot-oauth-secret.file = ../../secrets/buildbot-oauth-secret.age;
|
||||
age.secrets.buildbot-workers.file = ../../secrets/buildbot-workers.age;
|
||||
age.secrets.buildbot-service-key.file = ../../secrets/buildbot-service-key.age;
|
||||
bagel.secrets.files = [
|
||||
"buildbot-worker-password"
|
||||
"buildbot-oauth-secret"
|
||||
"buildbot-workers"
|
||||
"buildbot-service-key"
|
||||
"buildbot-signing-key"
|
||||
"buildbot-remote-builder-key"
|
||||
];
|
||||
age.secrets.buildbot-signing-key = {
|
||||
file = ../../secrets/buildbot-signing-key.age;
|
||||
owner = "buildbot-worker";
|
||||
group = "buildbot-worker";
|
||||
};
|
||||
age.secrets.buildbot-remote-builder-key = {
|
||||
file = ../../secrets/buildbot-remote-builder-key.age;
|
||||
file = ../../secrets/${cfg.tenant}/buildbot-remote-builder-key.age;
|
||||
owner = "buildbot-worker";
|
||||
group = "buildbot-worker";
|
||||
};
|
||||
|
||||
services.nginx.virtualHosts.${cfg.domain} = {
|
||||
forceSSL = true;
|
||||
enableACME = true;
|
||||
extraConfig = ''
|
||||
add_header Access-Control-Allow-Credentials 'true' always;
|
||||
add_header Access-Control-Allow-Origin 'https://cl.forkos.org' always;
|
||||
services.nginx = {
|
||||
recommendedProxySettings = true;
|
||||
appendHttpConfig = ''
|
||||
# Our session stuff is too big with the TWISTED_COOKIE in addition.
|
||||
# Default is usually 4k or 8k.
|
||||
large_client_header_buffers 4 16k;
|
||||
'';
|
||||
virtualHosts.${cfg.domain} = {
|
||||
forceSSL = true;
|
||||
enableACME = true;
|
||||
extraConfig = ''
|
||||
# This is needed so that logged-in users in Buildbot can include their credentials in their requests.
|
||||
add_header Access-Control-Allow-Credentials 'true' always;
|
||||
'';
|
||||
};
|
||||
};
|
||||
|
||||
services.buildbot-nix.worker = {
|
||||
|
@ -74,30 +139,25 @@ in
|
|||
enable = true;
|
||||
|
||||
inherit (cfg) domain;
|
||||
|
||||
debugging.enable = true;
|
||||
# TODO(raito): is that really necessary when we can just collect buildMachines' systems?
|
||||
inherit (cfg) buildSystems;
|
||||
|
||||
oauth2 = {
|
||||
name = "Lix";
|
||||
clientId = "forkos-buildbot";
|
||||
inherit clientId;
|
||||
clientSecretFile = config.age.secrets.buildbot-oauth-secret.path;
|
||||
resourceEndpoint = "https://identity.lix.systems";
|
||||
authUri = "https://identity.lix.systems/realms/lix-project/protocol/openid-connect/auth";
|
||||
tokenUri = "https://identity.lix.systems/realms/lix-project/protocol/openid-connect/token";
|
||||
userinfoUri = "https://identity.lix.systems/realms/lix-project/protocol/openid-connect/userinfo";
|
||||
};
|
||||
|
||||
# TODO(raito): this is not really necessary, we never have remote buildbot workers.
|
||||
# we can replace all of this with automatic localworker generation on buildbot-nix side.
|
||||
workersFile = config.age.secrets.buildbot-workers.path;
|
||||
|
||||
allowedOrigins = [
|
||||
"*.forkos.org"
|
||||
];
|
||||
|
||||
# TODO(raito): is that really necessary when we can just collect buildMachines' systems?
|
||||
buildSystems = [
|
||||
"x86_64-linux"
|
||||
];
|
||||
# We rely on NGINX to do the CORS dance.
|
||||
allowedOrigins = cfg.cors.allowedOrigins;
|
||||
|
||||
buildMachines = map (n: {
|
||||
hostName = nodes.${n}.config.networking.fqdn;
|
||||
|
@ -111,20 +171,14 @@ in
|
|||
# Contrary to how Nix works, here we can specify non-base64 public host keys.
|
||||
publicHostKey = ssh-keys.machines.${n};
|
||||
}
|
||||
) cfg.builders;
|
||||
) cfg.builders ++ extraTenantSpecificBuilders;
|
||||
|
||||
gerrit = {
|
||||
domain = cfgGerrit.canonicalDomain;
|
||||
# Manually managed account…
|
||||
# TODO: https://git.lix.systems/the-distro/infra/issues/69
|
||||
username = "buildbot";
|
||||
port = cfgGerrit.port;
|
||||
inherit (cfg.gerrit) domain port username;
|
||||
privateKeyFile = config.age.secrets.buildbot-service-key.path;
|
||||
projects = [
|
||||
"buildbot-test"
|
||||
"nixpkgs"
|
||||
"infra"
|
||||
];
|
||||
inherit (cfg) projects;
|
||||
};
|
||||
|
||||
evalWorkerCount = 6;
|
||||
|
@ -133,10 +187,21 @@ in
|
|||
signingKeyFile = config.age.secrets.buildbot-signing-key.path;
|
||||
};
|
||||
|
||||
# Make PostgreSQL restart smoother.
|
||||
systemd.services.postgresql.serviceConfig = {
|
||||
Restart = "always";
|
||||
RestartMaxDelaySec = "5m";
|
||||
RestartSteps = 10;
|
||||
};
|
||||
|
||||
|
||||
nix.settings.keep-derivations = true;
|
||||
nix.gc = {
|
||||
automatic = true;
|
||||
dates = "hourly";
|
||||
options = ''
|
||||
--max-freed "$((${toString freeGbDiskSpace} * 1024**3 - 1024 * $(df -P -k /nix/store | tail -n 1 | ${pkgs.gawk}/bin/awk '{ print $4 }')))"
|
||||
'';
|
||||
};
|
||||
};
|
||||
}
|
||||
|
|
50
services/buildbot/lix.nix
Normal file
50
services/buildbot/lix.nix
Normal file
|
@ -0,0 +1,50 @@
|
|||
{ config, nodes, ... }:
|
||||
let
|
||||
ssh-keys = import ../../common/ssh-keys.nix;
|
||||
in
|
||||
[
|
||||
{
|
||||
hostName = "build01.aarch64.lix.systems";
|
||||
maxJobs = 2;
|
||||
protocol = "ssh-ng";
|
||||
sshKey = config.age.secrets.buildbot-remote-builder-key.path;
|
||||
sshUser = "nix";
|
||||
systems = [ "aarch64-linux" ];
|
||||
publicHostKey = ssh-keys.machines.build01-aarch64-lix;
|
||||
supportedFeatures = nodes.build01-aarch64-lix.config.nix.settings.system-features;
|
||||
}
|
||||
{
|
||||
hostName = "build02.aarch64.lix.systems";
|
||||
maxJobs = 4;
|
||||
protocol = "ssh-ng";
|
||||
sshKey = config.age.secrets.buildbot-remote-builder-key.path;
|
||||
sshUser = "nix";
|
||||
systems = [ "aarch64-linux" ];
|
||||
publicHostKey = ssh-keys.machines.build02-aarch64-lix;
|
||||
# TODO: use build02 features.
|
||||
supportedFeatures = nodes.build01-aarch64-lix.config.nix.settings.system-features;
|
||||
}
|
||||
{
|
||||
hostName = "build01.aarch64-darwin.lix.systems";
|
||||
maxJobs = 2;
|
||||
protocol = "ssh-ng";
|
||||
sshKey = config.age.secrets.buildbot-remote-builder-key.path;
|
||||
sshUser = "m1";
|
||||
systems = [ "aarch64-darwin" "x86_64-darwin" ];
|
||||
publicHostKey = ssh-keys.machines.build01-aarch64-darwin-lix;
|
||||
supportedFeatures = [ "big-parallel" ];
|
||||
}
|
||||
# a.k.a. https://git.newtype.fr/newtype/newtype-org-configurations/src/branch/main/docs/epyc.md
|
||||
{
|
||||
hostName = "epyc.infra.newtype.fr";
|
||||
# at 256G this could run 64 builds but the machine is shared
|
||||
# (and historically we used no more than 16 concurrent jobs)
|
||||
maxJobs = 16;
|
||||
protocol = "ssh-ng";
|
||||
sshKey = config.age.secrets.buildbot-remote-builder-key.path;
|
||||
sshUser = "nix";
|
||||
systems = [ "x86_64-linux" "i686-linux" ];
|
||||
publicHostKey = ssh-keys.machines.epyc-newtype-fr;
|
||||
supportedFeatures = [ "benchmark" "big-parallel" "nixos-test" "kvm" ];
|
||||
}
|
||||
]
|
|
@ -1,10 +1,12 @@
|
|||
{
|
||||
imports = [
|
||||
./block-crawlers
|
||||
./gerrit
|
||||
./channel-scripts
|
||||
./hydra
|
||||
./matrix
|
||||
./monitoring
|
||||
./uptime-kuma
|
||||
./netbox
|
||||
./ofborg
|
||||
./postgres
|
||||
|
@ -13,5 +15,6 @@
|
|||
./buildbot
|
||||
./newsletter
|
||||
./s3-revproxy
|
||||
./extra-builders
|
||||
];
|
||||
}
|
||||
|
|
6
services/extra-builders/default.nix
Normal file
6
services/extra-builders/default.nix
Normal file
|
@ -0,0 +1,6 @@
|
|||
{
|
||||
imports = [
|
||||
# Remote builders
|
||||
./provider.nix
|
||||
];
|
||||
}
|
46
services/extra-builders/provider.nix
Normal file
46
services/extra-builders/provider.nix
Normal file
|
@ -0,0 +1,46 @@
|
|||
## Tenant-specific build capacity.
|
||||
## This can come from anywhere and is not held to the same level of responsibility as our build-infra.
|
||||
{ pkgs, config, lib, nodes, ... }:
|
||||
let
|
||||
inherit (lib) mkIf types mkEnableOption mkOption;
|
||||
freeGbDiskSpace = 10;
|
||||
cfg = config.bagel.builders.extra-build-capacity.provider;
|
||||
in
|
||||
{
|
||||
options.bagel.builders.extra-build-capacity.provider = {
|
||||
enable = mkEnableOption "providing of extra build capacity to other systems";
|
||||
|
||||
buildfarmPublicKeys = mkOption {
|
||||
type = types.listOf types.str;
|
||||
description = "SSH public keys to allow to connect for remote builds";
|
||||
};
|
||||
|
||||
# TODO: register tenant in some deployment wide module
|
||||
# so that the consumer side can just automatically generate buildMachines entries.
|
||||
tenant = mkOption {
|
||||
type = types.enum [ "lix" ];
|
||||
};
|
||||
};
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
users.groups.builders = {};
|
||||
users.users.nix = {
|
||||
openssh.authorizedKeys.keys = cfg.buildfarmPublicKeys;
|
||||
extraGroups = [ "builders" ];
|
||||
isNormalUser = true;
|
||||
};
|
||||
|
||||
nix.settings.allowed-users = [ "@wheel" "@builders" ];
|
||||
nix.settings.trusted-users = [ "@builders" ];
|
||||
|
||||
nix.gc.automatic = true;
|
||||
nix.gc.dates = "hourly";
|
||||
nix.gc.options = ''
|
||||
--max-freed "$((${toString freeGbDiskSpace} * 1024**3 - 1024 * $(df -P -k /nix/store | tail -n 1 | ${pkgs.gawk}/bin/awk '{ print $4 }')))"
|
||||
'';
|
||||
# Bump the open files limit so that non-root users can run NixOS VM tests, if supported at all.
|
||||
security.pam.loginLimits = [
|
||||
{ domain = "*"; item = "nofile"; type = "-"; value = "20480"; }
|
||||
];
|
||||
};
|
||||
}
|
|
@ -41,11 +41,12 @@ in
|
|||
imports = [
|
||||
./www.nix
|
||||
./one-way-sync.nix
|
||||
./git-gc-preserve.nix
|
||||
];
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
networking.firewall.allowedTCPPorts = [ cfg.port ];
|
||||
age.secrets.alloy-push-password.file = ../../secrets/metrics-push-password.age;
|
||||
age.secrets.alloy-push-password.file = ../../secrets/floral/metrics-push-password.age;
|
||||
|
||||
environment.systemPackages = [ jdk
|
||||
pkgs.git
|
||||
|
@ -112,6 +113,7 @@ in
|
|||
pyroscope.java "java" {
|
||||
targets = discovery.relabel.java.output
|
||||
forward_to = [pyroscope.write.production.receiver]
|
||||
tmp_dir = "/run/alloy" // The default "/tmp" can be cleaned up and provoke failure, see #108
|
||||
profiling_config {
|
||||
interval = "60s"
|
||||
alloc = "512k"
|
||||
|
@ -140,7 +142,9 @@ in
|
|||
plugins = with pkgs.gerritPlugins; [
|
||||
oauth
|
||||
metrics-reporter-prometheus
|
||||
# Buildbot checks plugin (writeText because services.gerrit.plugins expects packages)
|
||||
# Theme plugin
|
||||
(pkgs.concatText "theme.js" [ ./theme.js ])
|
||||
# Buildbot checks plugin
|
||||
(pkgs.runCommand "checks.js" {
|
||||
BASE_URI = builtins.toJSON "https://buildbot.forkos.org";
|
||||
SUPPORTED_PROJECTS = builtins.toJSON [
|
||||
|
@ -312,13 +316,21 @@ in
|
|||
# setting for the Gerrit service to be disabled and reuse the
|
||||
# existing 'git' user.
|
||||
DynamicUser = lib.mkForce false;
|
||||
RuntimeDirectory = "alloy";
|
||||
User = "git";
|
||||
Group = "git";
|
||||
};
|
||||
environment.REVWALK_USE_PRIORITY_QUEUE = "true";
|
||||
};
|
||||
|
||||
age.secrets.gerrit-prometheus-bearer-token.file = ../../secrets/gerrit-prometheus-bearer-token.age;
|
||||
bagel.services.git-gc-preserve = {
|
||||
nixpkgs = {
|
||||
enable = true;
|
||||
repoPath = "/var/lib/gerrit/git/nixpkgs.git";
|
||||
};
|
||||
};
|
||||
|
||||
age.secrets.gerrit-prometheus-bearer-token.file = ../../secrets/floral/gerrit-prometheus-bearer-token.age;
|
||||
bagel.monitoring.grafana-agent.exporters.gerrit = {
|
||||
port = 4778; # grrt
|
||||
bearerTokenFile = config.age.secrets.gerrit-prometheus-bearer-token.path;
|
||||
|
|
86
services/gerrit/git-gc-preserve.nix
Normal file
86
services/gerrit/git-gc-preserve.nix
Normal file
|
@ -0,0 +1,86 @@
|
|||
{ lib, utils, config, pkgs, ... }: let
|
||||
inherit (lib) mkOption mkEnableOption types;
|
||||
cfg = config.bagel.services.git-gc-preserve;
|
||||
enabledServices = lib.filterAttrs (_: gcConfig: gcConfig.enable) cfg;
|
||||
in
|
||||
{
|
||||
options.bagel.services.git-gc-preserve = mkOption {
|
||||
default = { };
|
||||
description = "Repositories that should be garbage collected";
|
||||
type = types.attrsOf (types.submodule {
|
||||
options = {
|
||||
enable = mkEnableOption "git-gc-preserve";
|
||||
user = mkOption {
|
||||
type = types.str;
|
||||
default = "git";
|
||||
description = "The user which will run the garbage collection script";
|
||||
example = "forgejo";
|
||||
};
|
||||
group = mkOption {
|
||||
type = types.str;
|
||||
default = "git";
|
||||
description = "The group which will run the garbage collection script";
|
||||
example = "forgejo";
|
||||
};
|
||||
repoPath = mkOption {
|
||||
type = types.path;
|
||||
description = "The path to the git repository that should be garbage collected";
|
||||
example = "/var/lib/gerrit/git/nixpkgs";
|
||||
};
|
||||
timeoutSec = mkOption {
|
||||
type = types.str;
|
||||
default = "1h";
|
||||
description = "Garbage collection Systemd unit timeout";
|
||||
example = "infinity";
|
||||
};
|
||||
timerConfig = mkOption {
|
||||
type = types.attrsOf utils.systemdUtils.unitOptions.unitOption;
|
||||
default = {
|
||||
OnCalendar = "daily";
|
||||
};
|
||||
description = ''
|
||||
When to run the git-gc-preserve. See {manpage}`systemd.timer(5)` for details.
|
||||
'';
|
||||
example = {
|
||||
OnCalendar = "00:05";
|
||||
RandomizedDelaySec = "5h";
|
||||
Persistent = true;
|
||||
};
|
||||
};
|
||||
};
|
||||
});
|
||||
};
|
||||
config = {
|
||||
systemd.services =
|
||||
let
|
||||
mkGCService = name: gcConfig: {
|
||||
name = "git-gc-preserve-${name}";
|
||||
value = {
|
||||
description = "Git-GC-Preserve Service - ${name}";
|
||||
serviceConfig = {
|
||||
WorkingDirectory = gcConfig.repoPath;
|
||||
Type = "oneshot";
|
||||
User = gcConfig.user;
|
||||
Group = gcConfig.group;
|
||||
ExecStart = lib.getExe pkgs.git-gc-preserve;
|
||||
TimeoutSec = gcConfig.timeoutSec;
|
||||
};
|
||||
};
|
||||
};
|
||||
mkServices = lib.mapAttrs' mkGCService;
|
||||
in
|
||||
mkServices enabledServices;
|
||||
|
||||
systemd.timers = let
|
||||
mkGCTimer = name: gcConfig: {
|
||||
name = "git-gc-preserve-${name}";
|
||||
value = {
|
||||
wantedBy = [ "timers.target" ];
|
||||
after = [ "multi-user.target" ];
|
||||
timerConfig = gcConfig.timerConfig;
|
||||
};
|
||||
};
|
||||
mkTimer = lib.mapAttrs' mkGCTimer;
|
||||
in mkTimer enabledServices;
|
||||
};
|
||||
}
|
69
services/gerrit/theme.js
Normal file
69
services/gerrit/theme.js
Normal file
|
@ -0,0 +1,69 @@
|
|||
/* Set up theming for Floral.
|
||||
* vim: set et ts=2 sw=2:
|
||||
*/
|
||||
Gerrit.install((_plugin) => {
  // Stylesheet text: the blue/coral/teal/zinc palettes as CSS custom
  // properties on <html>, followed by the header colours for the light and
  // dark theme classes.
  const floralCss = `
    html {
      --header-title-content: 'floral.systems';

      --blue-50: #f3f4fb;
      --blue-100: #e3e6f6;
      --blue-200: #ced5ef;
      --blue-300: #acb8e4;
      --blue-400: #8495d6;
      --blue-500: #6775ca;
      --blue-600: #5158bb;
      --blue-700: #494bac;
      --blue-800: #41408d;
      --blue-900: #383870;
      --blue-950: #252546;

      --coral-50: #fff1f1;
      --coral-100: #ffe0e0;
      --coral-200: #ffc5c5;
      --coral-300: #ff9e9d;
      --coral-400: #ff6665;
      --coral-500: #fe4a49;
      --coral-600: #ec1716;
      --coral-700: #c70f0e;
      --coral-800: #a41110;
      --coral-900: #881514;
      --coral-950: #4a0505;

      --teal-50: #eefbf5;
      --teal-100: #d6f5e5;
      --teal-200: #b1e9d0;
      --teal-300: #7ed7b5;
      --teal-400: #49be95;
      --teal-500: #27a27b;
      --teal-600: #188162;
      --teal-700: #136951;
      --teal-800: #125342;
      --teal-900: #104437;
      --teal-950: #08261f;

      --zinc-50: #fafafa;
      --zinc-100: #f4f4f5;
      --zinc-200: #e4e4e7;
      --zinc-300: #d4d4d8;
      --zinc-400: #a1a1aa;
      --zinc-500: #71717a;
      --zinc-600: #52525b;
      --zinc-700: #3f3f46;
      --zinc-800: #27272a;
      --zinc-900: #18181b;
      --zinc-950: #09090b;
    }
    html.lightTheme {
      --header-background-color: var(--teal-700);
      --header-text-color: var(--coral-50);
    }
    html.darkTheme {
      --header-background-color: var(--teal-900);
      --header-text-color: var(--coral-50);
    }
  `;

  const sheet = new CSSStyleSheet();

  // replace() resolves once the text has been parsed; only then is the sheet
  // appended to the ones the document has already adopted.
  sheet.replace(floralCss).then(() => {
    document.adoptedStyleSheets = [...document.adoptedStyleSheets, sheet];
  });
});
|
|
@ -29,10 +29,6 @@ in
|
|||
# NGINX should not give up super fast. Things can take time.
|
||||
proxy_read_timeout 3600;
|
||||
}
|
||||
|
||||
location = /robots.txt {
|
||||
return 200 'User-agent: *\nAllow: /';
|
||||
}
|
||||
'';
|
||||
};
|
||||
|
||||
|
|
|
@ -66,17 +66,19 @@ in {
|
|||
# does indeed have our public SSH key and are *builders*
|
||||
# as a simple evaluation preflight check.
|
||||
|
||||
age.secrets.hydra-s3-credentials.file = ../../secrets/hydra-s3-credentials.age;
|
||||
bagel.secrets.files = [
|
||||
"hydra-s3-credentials"
|
||||
"hydra-postgres-key"
|
||||
"hydra-signing-priv"
|
||||
"hydra-ssh-key-priv"
|
||||
];
|
||||
|
||||
age.secrets.hydra-postgres-key.group = "hydra";
|
||||
age.secrets.hydra-postgres-key.mode = "0440";
|
||||
age.secrets.hydra-postgres-key.file = ../../secrets/hydra-postgres-key.age;
|
||||
|
||||
age.secrets.hydra-signing-priv.owner = "hydra-queue-runner";
|
||||
age.secrets.hydra-signing-priv.file = ../../secrets/hydra-signing-priv.age;
|
||||
|
||||
age.secrets.hydra-ssh-key-priv.owner = "hydra-queue-runner";
|
||||
age.secrets.hydra-ssh-key-priv.file = ../../secrets/hydra-ssh-key-priv.age;
|
||||
|
||||
systemd.tmpfiles.rules = [
|
||||
"d /var/cache/hydra 0755 hydra hydra - -"
|
||||
|
|
|
@ -12,6 +12,14 @@ in
|
|||
options.bagel.monitoring.grafana-agent = {
|
||||
enable = (mkEnableOption "Grafana Agent") // { default = true; };
|
||||
|
||||
tenant = mkOption {
|
||||
description = ''
|
||||
Which tenant are we enabling Grafana Agent for.
|
||||
'';
|
||||
example = "lix";
|
||||
type = types.enum [ "lix" "floral" ];
|
||||
};
|
||||
|
||||
exporters = mkOption {
|
||||
description = ''
|
||||
Set of additional exporters to scrape.
|
||||
|
@ -59,7 +67,7 @@ in
|
|||
};
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
age.secrets.grafana-agent-password.file = ../../secrets/metrics-push-password.age;
|
||||
age.secrets.grafana-agent-password.file = ../../secrets/floral/metrics-push-password.age;
|
||||
|
||||
services.grafana-agent = {
|
||||
enable = true;
|
||||
|
@ -76,7 +84,10 @@ in
|
|||
};
|
||||
}
|
||||
];
|
||||
global.external_labels.hostname = config.networking.hostName;
|
||||
global.external_labels = {
|
||||
hostname = config.networking.hostName;
|
||||
inherit (cfg) tenant;
|
||||
};
|
||||
configs = [
|
||||
{
|
||||
name = config.networking.hostName;
|
||||
|
|
|
@ -1,7 +1,8 @@
|
|||
{
|
||||
imports = [
|
||||
./cadvisor.nix
|
||||
./hydra
|
||||
./nginx.nix
|
||||
./postgres.nix
|
||||
];
|
||||
}
|
||||
}
|
||||
|
|
39
services/monitoring/exporters/hydra/default.nix
Normal file
39
services/monitoring/exporters/hydra/default.nix
Normal file
|
@ -0,0 +1,39 @@
|
|||
# NixOS module: runs hydra-exporter.py as a systemd service and registers it
# with the bagel Grafana Agent exporter set.
{
  config,
  lib,
  pkgs,
  ...
}:
let
  cfg = config.bagel.monitoring.exporters.hydra;

  # Interpreter carrying exactly the libraries hydra-exporter.py imports.
  python = pkgs.python3.withPackages (ps: [
    ps.aioprometheus
    ps.click
    ps.httpx
    ps.starlette
    ps.uvicorn
  ]);

  inherit (lib) escapeShellArg getExe mkEnableOption mkIf mkOption types;
in
{
  options.bagel.monitoring.exporters.hydra = {
    enable = mkEnableOption "bagel flavored Hydra exporter";

    hydraUrl = mkOption {
      type = types.str;
      default = "https://hydra.forkos.org/";
      description = "URL to the Hydra to monitor";
    };

    port = mkOption {
      type = types.port;
      default = 9105;
      description = "TCP port the exporter serves /metrics on";
    };
  };

  config = mkIf cfg.enable {
    systemd.services.hydra-exporter = {
      wantedBy = [ "multi-user.target" ];
      description = "Hydra exporter";
      script = "${getExe python} ${./hydra-exporter.py} --hydra-url=${escapeShellArg cfg.hydraUrl} --port=${toString cfg.port}";
      serviceConfig = {
        # The exporter is a long-running daemon: bring it back if it dies
        # instead of silently losing the metrics.
        Restart = "on-failure";
        RestartSec = "5s";
        # It only talks HTTP and needs no persistent state, so do not run it
        # as root.
        DynamicUser = true;
      };
    };

    # Single source of truth for the port: the scrape target follows the
    # port the service is started with.
    bagel.monitoring.grafana-agent.exporters.hydra.port = cfg.port;
  };
}
|
364
services/monitoring/exporters/hydra/hydra-exporter.py
Executable file
364
services/monitoring/exporters/hydra/hydra-exporter.py
Executable file
|
@ -0,0 +1,364 @@
|
|||
#!/usr/bin/env nix-shell
|
||||
#!nix-shell -i python3 -p "python3.withPackages(ps: [ps.aioprometheus ps.click ps.httpx ps.starlette ps.uvicorn])"
|
||||
import asyncio
|
||||
from contextlib import asynccontextmanager
|
||||
import logging
|
||||
from aioprometheus import Counter, Gauge
|
||||
from aioprometheus.asgi.starlette import metrics
|
||||
import click
|
||||
import httpx
|
||||
from starlette.applications import Starlette
|
||||
from starlette.routing import Route
|
||||
import uvicorn
|
||||
|
||||
|
||||
up = Gauge("hydra_up", "Is Hydra running")
|
||||
time = Gauge("hydra_time", "Hydra's current time")
|
||||
uptime = Gauge("hydra_uptime", "Hydra's uptime")
|
||||
|
||||
builds_queued = Gauge("hydra_builds_queued", "Number of jobs in build queue")
|
||||
steps_active = Gauge("hydra_steps_active", "Number of active steps in build queue")
|
||||
steps_building = Gauge("hydra_steps_building", "Number of steps currently building")
|
||||
steps_copying_to = Gauge(
|
||||
"hydra_steps_copying_to", "Number of steps copying inputs to a worker"
|
||||
)
|
||||
steps_waiting_for_download_slot = Gauge(
|
||||
"hydra_steps_waiting_for_download_slot", "Number of steps waiting for download slot"
|
||||
)
|
||||
steps_copying_from = Gauge(
|
||||
"hydra_steps_copying_from", "Number of steps copying outputs from a worker"
|
||||
)
|
||||
steps_waiting = Gauge(
|
||||
"hydra_steps_waiting", "Number of steps currently waiting for a worker slot"
|
||||
)
|
||||
steps_unsupported = Gauge(
|
||||
"hydra_steps_unsupported", "Number of unsupported steps in build queue"
|
||||
)
|
||||
|
||||
bytes_sent = Counter(
|
||||
"hydra_build_inputs_sent_bytes_total",
|
||||
"Total number of bytes copied to workers as build inputs",
|
||||
)
|
||||
bytes_received = Counter(
|
||||
"hydra_build_outputs_received_bytes_total",
|
||||
"Total number of bytes copied from workers as build outputs",
|
||||
)
|
||||
|
||||
builds_read = Counter(
|
||||
"hydra_builds_read_total",
|
||||
"Total number of builds whose outputs have been copied from workers",
|
||||
)
|
||||
builds_read_seconds = Counter(
|
||||
"hydra_builds_read_seconds_total",
|
||||
"Total time spent copying build outputs, in seconds",
|
||||
)
|
||||
|
||||
builds_done = Counter("hydra_builds_done_total", "Total number of builds completed")
|
||||
steps_started = Counter("hydra_steps_started_total", "Total number of steps started")
|
||||
steps_done = Counter("hydra_steps_done_total", "Total number of steps completed")
|
||||
|
||||
retries = Counter("hydra_retries_total", "Total number of retries")
|
||||
max_retries = Gauge(
|
||||
"hydra_max_retries", "Maximum observed number of retries for a single step"
|
||||
)
|
||||
|
||||
queue_wakeups = Counter(
|
||||
"hydra_queue_wakeup_total",
|
||||
"Count of the times the queue runner has been notified of queue changes",
|
||||
)
|
||||
dispatcher_wakeups = Counter(
|
||||
"hydra_dispatcher_wakeup_total",
|
||||
"Count of the times the queue runner work dispatcher woke up due to new runnable builds and completed builds.",
|
||||
)
|
||||
|
||||
dispatch_time = Counter(
|
||||
"hydra_dispatch_execution_seconds_total",
|
||||
"Total time the dispatcher has spent working, in seconds",
|
||||
)
|
||||
|
||||
db_connections = Gauge("hydra_db_connections", "Number of connections to the database")
|
||||
active_db_updates = Gauge("hydra_db_updates", "Number of in-progress database updates")
|
||||
|
||||
steps_queued = Gauge("hydra_steps_queued", "Number of steps in build queue")
|
||||
steps_runnable = Gauge(
|
||||
"hydra_steps_runnable", "Number of runnable steps in build queue"
|
||||
)
|
||||
|
||||
step_time = Counter(
|
||||
"hydra_step_time_total", "Total time spent executing steps, in seconds"
|
||||
)
|
||||
step_build_time = Counter(
|
||||
"hydra_step_build_time_total", "Total time spent executing build steps, in seconds"
|
||||
)
|
||||
|
||||
machine_enabled = Gauge("hydra_machine_enabled", "Whether machine is enabled")
|
||||
machine_steps_done = Counter(
|
||||
"hydra_machine_steps_done_total", "Total number of steps completed by this worker"
|
||||
)
|
||||
machine_current_jobs = Gauge(
|
||||
"hydra_machine_current_jobs", "Number of jobs currently running on this worker"
|
||||
)
|
||||
machine_disabled_until = Gauge(
|
||||
"hydra_machine_disabled_until",
|
||||
"Timestamp of when this worker will next become active",
|
||||
)
|
||||
machine_last_failure = Gauge(
|
||||
"hydra_machine_last_failure", "Timestamp of when a build last failed on this worker"
|
||||
)
|
||||
machine_consecutive_failures = Gauge(
|
||||
"hydra_machine_consecutive_failures",
|
||||
"Number of consecutive failed builds on this worker",
|
||||
)
|
||||
|
||||
machine_idle_since = Gauge(
|
||||
"hydra_machine_idle_since", "Timestamp of when this worker last had jobs running"
|
||||
)
|
||||
machine_step_time = Counter(
|
||||
"hydra_machine_step_time_total",
|
||||
"Total time this worker spent executing steps, in seconds",
|
||||
)
|
||||
machine_step_build_time = Counter(
|
||||
"hydra_machine_step_build_time_total",
|
||||
"Total time this worker spent executing build steps, in seconds",
|
||||
)
|
||||
|
||||
jobset_time = Counter(
|
||||
"hydra_jobset_seconds_total",
|
||||
"Total time this jobset has been building for, in seconds",
|
||||
)
|
||||
jobset_shares_used = Gauge(
|
||||
"hydra_jobset_shares_used", "Number of shares currently consumed by this jobset"
|
||||
)
|
||||
|
||||
machine_type_runnable = Gauge(
|
||||
"hydra_machine_type_runnable",
|
||||
"Number of steps currently runnable on this machine type",
|
||||
)
|
||||
machine_type_running = Gauge(
|
||||
"hydra_machine_type_running",
|
||||
"Number of steps currently running on this machine type",
|
||||
)
|
||||
machine_type_wait_time = Counter(
|
||||
"hydra_machine_type_wait_time_total",
|
||||
"Total time spent waiting for a build slot of this machine type",
|
||||
)
|
||||
machine_type_last_active = Gauge(
|
||||
"hydra_machine_type_last_active",
|
||||
"Timestamp of when a machine of this type was last active",
|
||||
)
|
||||
|
||||
store_nar_info_read = Counter(
|
||||
"hydra_store_nar_info_read_total",
|
||||
"Total number of narinfo files read from the remote store",
|
||||
)
|
||||
store_nar_info_read_averted = Counter(
|
||||
"hydra_store_nar_info_read_averted_total",
|
||||
"Total number of narinfo file reads averted (already loaded)",
|
||||
)
|
||||
store_nar_info_missing = Counter(
|
||||
"hydra_store_nar_info_missing_total",
|
||||
"Total number of narinfo files found to be missing",
|
||||
)
|
||||
store_nar_info_write = Counter(
|
||||
"hydra_store_nar_info_write_total",
|
||||
"Total number of narinfo files written to the remote store",
|
||||
)
|
||||
store_nar_info_cache_size = Gauge(
|
||||
"hydra_store_nar_info_cache_size",
|
||||
"Size of the in-memory store path information cache",
|
||||
)
|
||||
store_nar_read = Counter(
|
||||
"hydra_store_nar_read_total", "Total number of NAR files read from the remote store"
|
||||
)
|
||||
store_nar_read_bytes = Counter(
|
||||
"hydra_store_nar_read_bytes_total",
|
||||
"Total number of NAR file bytes read from the remote store (uncompressed)",
|
||||
)
|
||||
store_nar_read_compressed_bytes = Counter(
|
||||
"hydra_store_nar_read_compressed_bytes_total",
|
||||
"Total number of NAR file bytes read from the remote store (compressed)",
|
||||
)
|
||||
store_nar_write = Counter(
|
||||
"hydra_store_nar_write_total",
|
||||
"Total number of NAR files written to the remote store",
|
||||
)
|
||||
store_nar_write_averted = Counter(
|
||||
"hydra_store_nar_write_averted_total",
|
||||
"Total number of NAR file writes averted (already exists on remote)",
|
||||
)
|
||||
store_nar_write_bytes = Counter(
|
||||
"hydra_store_nar_write_bytes_total",
|
||||
"Total number of NAR file bytes written to the remote store (uncompressed)",
|
||||
)
|
||||
store_nar_write_compressed_bytes = Counter(
|
||||
"hydra_store_nar_write_compressed_bytes_total",
|
||||
"Total number of NAR file bytes written to the remote store (compressed)",
|
||||
)
|
||||
store_nar_write_compression_seconds = Counter(
|
||||
"hydra_store_nar_write_compression_seconds_total",
|
||||
"Total time spent compressing NAR files for writing to the remote store",
|
||||
)
|
||||
|
||||
store_s3_put = Counter(
|
||||
"hydra_store_s3_put_total", "Total number of PUT requests to S3 store"
|
||||
)
|
||||
store_s3_put_bytes = Counter(
|
||||
"hydra_store_s3_put_bytes_total", "Total number of bytes written to S3 store"
|
||||
)
|
||||
store_s3_put_seconds = Counter(
|
||||
"hydra_store_s3_put_seconds_total",
|
||||
"Total time spent writing to S3 store, in seconds",
|
||||
)
|
||||
store_s3_get = Counter(
|
||||
"hydra_store_s3_get_total", "Total number of GET requests to S3 store"
|
||||
)
|
||||
store_s3_get_bytes = Counter(
|
||||
"hydra_store_s3_get_bytes_total", "Total number of bytes read from S3 store"
|
||||
)
|
||||
store_s3_get_seconds = Counter(
|
||||
"hydra_store_s3_get_seconds_total",
|
||||
"Total time spent reading from S3 store, in seconds",
|
||||
)
|
||||
store_s3_head = Counter(
|
||||
"hydra_store_s3_head_total", "Total number of HEAD requests to S3 store"
|
||||
)
|
||||
|
||||
|
||||
def update_metrics(status):
    """Mirror one decoded queue-runner status document into the module metrics.

    Required fields are indexed directly, so a missing one raises ``KeyError``
    after the metrics preceding it have already been updated. Optional fields
    are read with ``.get()`` and skipped when absent or falsy. Fields whose key
    ends in ``Ms`` are divided by 1000 before being exported.
    """

    def set_if_truthy(metric, labels, value):
        # Optional fields: leave the metric untouched when the value is
        # missing or falsy (this includes 0).
        if value:
            metric.set(labels, value)

    up.set({}, int(status["status"] == "up"))

    # (metric, status key, divisor) in export order; divisor None = as-is.
    for metric, key, divisor in (
        (time, "time", None),
        (uptime, "uptime", None),
        (builds_queued, "nrQueuedBuilds", None),
        (steps_active, "nrActiveSteps", None),
        (steps_building, "nrStepsBuilding", None),
        (steps_copying_to, "nrStepsCopyingTo", None),
        (steps_waiting_for_download_slot, "nrStepsWaitingForDownloadSlot", None),
        (steps_copying_from, "nrStepsCopyingFrom", None),
        (steps_waiting, "nrStepsWaiting", None),
        (steps_unsupported, "nrUnsupportedSteps", None),
        (bytes_sent, "bytesSent", None),
        (bytes_received, "bytesReceived", None),
        (builds_read, "nrBuildsRead", None),
        (builds_read_seconds, "buildReadTimeMs", 1000),
        (builds_done, "nrBuildsDone", None),
        (steps_started, "nrStepsStarted", None),
        (steps_done, "nrStepsDone", None),
        (retries, "nrRetries", None),
        (max_retries, "maxNrRetries", None),
        (queue_wakeups, "nrQueueWakeups", None),
        (dispatcher_wakeups, "nrDispatcherWakeups", None),
        (dispatch_time, "dispatchTimeMs", 1000),
        (db_connections, "nrDbConnections", None),
        (active_db_updates, "nrActiveDbUpdates", None),
        (steps_queued, "nrUnfinishedSteps", None),
        (steps_runnable, "nrRunnableSteps", None),
    ):
        raw = status[key]
        metric.set({}, raw if divisor is None else raw / divisor)

    set_if_truthy(step_time, {}, status.get("totalStepTime"))
    set_if_truthy(step_build_time, {}, status.get("totalStepBuildTime"))

    # Per-worker series, labelled with the machine name.
    for host, machine in status["machines"].items():
        host_labels = {"host": host}
        machine_enabled.set(host_labels, int(machine["enabled"]))
        for metric, key in (
            (machine_steps_done, "nrStepsDone"),
            (machine_current_jobs, "currentJobs"),
            (machine_disabled_until, "disabledUntil"),
            (machine_last_failure, "lastFailure"),
            (machine_consecutive_failures, "consecutiveFailures"),
        ):
            metric.set(host_labels, machine[key])
        for metric, key in (
            (machine_idle_since, "idleSince"),
            (machine_step_time, "totalStepTime"),
            (machine_step_build_time, "totalStepBuildTime"),
        ):
            set_if_truthy(metric, host_labels, machine.get(key))

    # Per-jobset series.
    for jobset, jobset_info in status["jobsets"].items():
        jobset_labels = {"name": jobset}
        jobset_time.set(jobset_labels, jobset_info["seconds"])
        jobset_shares_used.set(jobset_labels, jobset_info["shareUsed"])

    # Per-machine-type series.
    for machine_type, type_info in status["machineTypes"].items():
        type_labels = {"machineType": machine_type}
        machine_type_runnable.set(type_labels, type_info["runnable"])
        machine_type_running.set(type_labels, type_info["running"])
        set_if_truthy(machine_type_wait_time, type_labels, type_info.get("waitTime"))
        set_if_truthy(
            machine_type_last_active, type_labels, type_info.get("lastActive")
        )

    # Destination store statistics.
    store = status["store"]
    for metric, key, divisor in (
        (store_nar_info_read, "narInfoRead", None),
        (store_nar_info_read_averted, "narInfoReadAverted", None),
        (store_nar_info_missing, "narInfoMissing", None),
        (store_nar_info_write, "narInfoWrite", None),
        (store_nar_info_cache_size, "narInfoCacheSize", None),
        (store_nar_read, "narRead", None),
        (store_nar_read_bytes, "narReadBytes", None),
        (store_nar_read_compressed_bytes, "narReadCompressedBytes", None),
        (store_nar_write, "narWrite", None),
        (store_nar_write_averted, "narWriteAverted", None),
        (store_nar_write_bytes, "narWriteBytes", None),
        (store_nar_write_compressed_bytes, "narWriteCompressedBytes", None),
        (store_nar_write_compression_seconds, "narWriteCompressionTimeMs", 1000),
    ):
        raw = store[key]
        metric.set({}, raw if divisor is None else raw / divisor)

    # NOTE(review): the S3 section is looked up on the top-level document, not
    # inside `store` -- confirm this matches the Hydra version being scraped.
    s3 = status.get("s3")
    if s3:
        for metric, key, divisor in (
            (store_s3_put, "put", None),
            (store_s3_put_bytes, "putBytes", None),
            (store_s3_put_seconds, "putTimeMs", 1000),
            (store_s3_get, "get", None),
            (store_s3_get_bytes, "getBytes", None),
            (store_s3_get_seconds, "getTimeMs", 1000),
            (store_s3_head, "head", None),
        ):
            raw = s3[key]
            metric.set({}, raw if divisor is None else raw / divisor)
|
||||
|
||||
|
||||
async def update_metrics_loop(hydra_url, scrape_interval):
    """Poll Hydra's queue-runner status forever and refresh the metrics.

    Args:
        hydra_url: Base URL of the Hydra instance; ``/queue-runner-status`` is
            requested relative to it.
        scrape_interval: Seconds to sleep between two polls.

    The loop never returns on its own. A failed poll is logged and retried on
    the next iteration; when the status document cannot be fetched at all,
    ``hydra_up`` is forced to 0 so an outage does not keep reporting the last
    successful value.
    """
    async with httpx.AsyncClient(base_url=hydra_url) as client:
        while True:
            try:
                response = await client.get(
                    "/queue-runner-status",
                    headers={"Content-Type": "application/json"},
                )
                # An HTTP error page is a failed scrape, not a status document.
                response.raise_for_status()
                status = response.json()
            except Exception as ex:
                # Hydra is unreachable or did not answer with JSON: report it
                # as down instead of leaving the previous value in place.
                up.set({}, 0)
                logging.exception("Failed to update metrics", exc_info=ex)
            else:
                try:
                    update_metrics(status)
                except Exception as ex:
                    # Hydra answered, but the document could not be mapped
                    # completely; keep whatever `up` value it reported.
                    logging.exception("Failed to update metrics", exc_info=ex)

            await asyncio.sleep(scrape_interval)
|
||||
|
||||
|
||||
@click.command()
@click.option("--hydra-url", default="https://hydra.forkos.org/")
@click.option("--port", default=9200)
@click.option("--scrape-interval", default=15)
def main(hydra_url, port, scrape_interval):
    """Serve Hydra queue-runner metrics on /metrics.

    Starts a background task that polls HYDRA_URL every SCRAPE_INTERVAL
    seconds and runs a uvicorn server on PORT until it is stopped.
    """

    @asynccontextmanager
    async def lifespan(_):
        # Hold on to the task: the event loop keeps only weak references, so
        # an unreferenced task may be garbage-collected while still running.
        poller = asyncio.create_task(update_metrics_loop(hydra_url, scrape_interval))
        try:
            yield
        finally:
            # Stop polling when the application shuts down and wait for the
            # cancellation to be delivered, so no pending task is left behind.
            poller.cancel()
            try:
                await poller
            except asyncio.CancelledError:
                pass

    app = Starlette(routes=[Route("/metrics", metrics)], lifespan=lifespan)

    uvicorn.run(app, port=port, log_level="info")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
|
@ -1,11 +1,12 @@
|
|||
namespace: forkos
|
||||
groups:
|
||||
- name: ForkOS automation
|
||||
rules:
|
||||
- alert: SyncFailedTooOften
|
||||
expr: 'changes(node_systemd_unit_state{name=~"ows.*.service",state="failed"}[24h]) > 2'
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Synchronization job {{ $labels.name }} has failed more than twice in the last 24 hours"
|
||||
description: "On {{ $labels.instance }}, the synchronization job has failed more than twice in the last 24 hours, check if there's a conflict or a stdenv change."
|
||||
- name: ForkOS automation
|
||||
rules:
|
||||
- alert: SyncFailedTooOften
|
||||
expr: 'changes(node_systemd_unit_state{name=~"ows.*.service",state="failed"}[1d]) > 2'
|
||||
for: 30m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: On {{ $labels.instance }}, the synchronization job has failed more than twice in the last 24 hours, check if there's a conflict or a stdenv change.
|
||||
summary: Synchronization job {{ $labels.name }} has failed more than twice in the last 24 hours
|
||||
|
|
|
@ -1,102 +1,119 @@
|
|||
namespace: postgres
|
||||
groups:
|
||||
- name: PostgreSQL
|
||||
rules:
|
||||
|
||||
- alert: PostgresqlTableNotAutoVacuumed
|
||||
expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
|
||||
description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTableNotAutoAnalyzed
|
||||
expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
|
||||
description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlDeadLocks
|
||||
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql dead locks (instance {{ $labels.instance }})
|
||||
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlHighRollbackRate
|
||||
expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql high rollback rate (instance {{ $labels.instance }})
|
||||
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlHighRateStatementTimeout
|
||||
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
|
||||
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlHighRateDeadlock
|
||||
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
|
||||
for: 0m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
|
||||
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTooManyDeadTuples
|
||||
expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
|
||||
description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlTooManyLocksAcquired
|
||||
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
|
||||
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlBloatIndexHigh(>80%)
|
||||
expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
|
||||
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlBloatTableHigh(>80%)
|
||||
expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
|
||||
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- alert: PostgresqlInvalidIndex
|
||||
expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
|
||||
for: 6h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Postgresql invalid index (instance {{ $labels.instance }})
|
||||
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- name: PostgreSQL
|
||||
rules:
|
||||
- alert: PostgresqlTableNotAutoVacuumed
|
||||
expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
Table {{ $labels.relname }} has not been auto vacuumed for 10 days
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
|
||||
- alert: PostgresqlTableNotAutoAnalyzed
|
||||
expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
Table {{ $labels.relname }} has not been auto analyzed for 10 days
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
|
||||
- alert: PostgresqlDeadLocks
|
||||
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
PostgreSQL has dead-locks
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Postgresql dead locks (instance {{ $labels.instance }})
|
||||
- alert: PostgresqlHighRollbackRate
|
||||
expr: 'sum by (namespace, datname) ((rate(pg_stat_database_xact_rollback{datid!="0",datname!~"template.*|postgres"}[3m])) / ((rate(pg_stat_database_xact_rollback{datid!="0",datname!~"template.*|postgres"}[3m])) + (rate(pg_stat_database_xact_commit{datid!="0",datname!~"template.*|postgres"}[3m])))) > 0.02'
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
Ratio of transactions being aborted compared to committed is > 2 %
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Postgresql high rollback rate (instance {{ $labels.instance }})
|
||||
- alert: PostgresqlHighRateStatementTimeout
|
||||
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: |-
|
||||
Postgres transactions showing high rate of statement timeouts
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
|
||||
- alert: PostgresqlHighRateDeadlock
|
||||
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: |-
|
||||
Postgres detected deadlocks
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
|
||||
- alert: PostgresqlTooManyDeadTuples
|
||||
expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
PostgreSQL dead tuples is too large
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
|
||||
- alert: PostgresqlTooManyLocksAcquired
|
||||
expr: '((sum(pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.2'
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
description: |-
|
||||
Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
|
||||
- alert: PostgresqlBloatIndexHigh(>80%)
|
||||
expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 1e+08)'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
|
||||
- alert: PostgresqlBloatTableHigh(>80%)
|
||||
expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 2e+08)'
|
||||
for: 1h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
|
||||
- alert: PostgresqlInvalidIndex
|
||||
expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
|
||||
for: 6h
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Postgresql invalid index (instance {{ $labels.instance }})
|
||||
|
|
|
@ -1,76 +1,101 @@
|
|||
namespace: resources
|
||||
groups:
|
||||
- name: Host & hardware
|
||||
rules:
|
||||
- alert: HostOutOfMemory
|
||||
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostMemoryIsUnderutilized
|
||||
expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host Memory is underutilized (instance {{ $labels.instance }})
|
||||
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
||||
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostCpuIsUnderutilized
|
||||
expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostOomKillDetected
|
||||
expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 0m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
- alert: HostNetworkInterfaceSaturated
|
||||
expr: ((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
||||
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
|
||||
|
||||
- name: Host & hardware
|
||||
rules:
|
||||
- alert: HostOutOfMemory
|
||||
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
Node memory is filling up (< 10% left)
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Host out of memory (instance {{ $labels.instance }})
|
||||
- alert: HostMemoryUnderMemoryPressure
|
||||
expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
The node is under heavy memory pressure. High rate of major page faults
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
||||
- alert: HostMemoryIsUnderutilized
|
||||
expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
description: |-
|
||||
Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Host Memory is underutilized (instance {{ $labels.instance }})
|
||||
- alert: HostOutOfDiskSpace
|
||||
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and on (instance, device, mountpoint) node_filesystem_readonly == 0) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
Disk is almost full (< 10% left)
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Host out of disk space (instance {{ $labels.instance }})
|
||||
- alert: HostDiskWillFillIn24Hours
|
||||
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and on (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and on (instance, device, mountpoint) node_filesystem_readonly == 0) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 2m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
Filesystem is predicted to run out of space within the next 24 hours at current write rate
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
|
||||
- alert: HostCpuIsUnderutilized
|
||||
expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 1w
|
||||
labels:
|
||||
severity: info
|
||||
annotations:
|
||||
description: |-
|
||||
CPU load is < 20% for 1 week. Consider reducing the number of CPUs.
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Host CPU is underutilized (instance {{ $labels.instance }})
|
||||
- alert: HostCpuStealNoisyNeighbor
|
||||
expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
||||
- alert: HostOomKillDetected
|
||||
expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
OOM kill detected
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Host OOM kill detected (instance {{ $labels.instance }})
|
||||
- alert: HostNetworkInterfaceSaturated
|
||||
expr: ((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
|
||||
for: 1m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
description: |-
|
||||
The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.
|
||||
VALUE = {{ $value }}
|
||||
LABELS = {{ $labels }}
|
||||
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
||||
|
|
|
@ -17,7 +17,7 @@ in
|
|||
|
||||
config = mkIf cfg.enable {
|
||||
age.secrets.grafana-oauth-secret = {
|
||||
file = ../../../secrets/grafana-oauth-secret.age;
|
||||
file = ../../../secrets/floral/grafana-oauth-secret.age;
|
||||
owner = "grafana";
|
||||
};
|
||||
|
||||
|
|
|
@ -13,10 +13,10 @@ in
|
|||
config = mkIf cfg.enable {
|
||||
age.secrets = {
|
||||
metrics-push-htpasswd = {
|
||||
file = ../../../secrets/metrics-push-htpasswd.age;
|
||||
file = ../../../secrets/floral/metrics-push-htpasswd.age;
|
||||
owner = "nginx";
|
||||
};
|
||||
loki-environment.file = ../../../secrets/loki-environment.age;
|
||||
loki-environment.file = ../../../secrets/floral/loki-environment.age;
|
||||
};
|
||||
|
||||
services.loki = {
|
||||
|
|
|
@ -9,6 +9,18 @@ let
|
|||
inherit (lib) mkEnableOption mkIf;
|
||||
|
||||
mimirPort = config.services.mimir.configuration.server.http_listen_port;
|
||||
|
||||
alerts = pkgs.runCommand "mimir-alerts-checked" {
|
||||
src = ./alerts;
|
||||
nativeBuildInputs = with pkgs; [ mimir ];
|
||||
} ''
|
||||
mkdir $out
|
||||
cp -R $src $out/anonymous/
|
||||
chmod -R +w $out
|
||||
mimirtool rules check --rule-dirs=$out/anonymous
|
||||
mimirtool rules lint --rule-dirs=$out/anonymous
|
||||
diff -r $src $out/anonymous
|
||||
'';
|
||||
in
|
||||
{
|
||||
options.bagel.services.prometheus.enable = mkEnableOption "Prometheus scraper";
|
||||
|
@ -16,11 +28,11 @@ in
|
|||
config = mkIf cfg.enable {
|
||||
age.secrets = {
|
||||
metrics-push-htpasswd = {
|
||||
file = ../../../secrets/metrics-push-htpasswd.age;
|
||||
file = ../../../secrets/floral/metrics-push-htpasswd.age;
|
||||
owner = "nginx";
|
||||
};
|
||||
mimir-environment.file = ../../../secrets/mimir-environment.age;
|
||||
mimir-webhook-url.file = ../../../secrets/mimir-webhook-url.age;
|
||||
mimir-environment.file = ../../../secrets/floral/mimir-environment.age;
|
||||
mimir-webhook-url.file = ../../../secrets/floral/mimir-webhook-url.age;
|
||||
};
|
||||
|
||||
services.mimir = {
|
||||
|
@ -60,10 +72,7 @@ in
|
|||
blocks_storage.backend = "s3";
|
||||
ruler_storage = {
|
||||
backend = "local";
|
||||
local.directory = pkgs.runCommand "mimir-rules" {} ''
|
||||
mkdir -p $out
|
||||
ln -s ${./alerts} $out/anonymous
|
||||
'';
|
||||
local.directory = alerts;
|
||||
};
|
||||
|
||||
alertmanager = {
|
||||
|
|
|
@ -13,10 +13,10 @@ in
|
|||
config = mkIf cfg.enable {
|
||||
age.secrets = {
|
||||
metrics-push-htpasswd = {
|
||||
file = ../../../secrets/metrics-push-htpasswd.age;
|
||||
file = ../../../secrets/floral/metrics-push-htpasswd.age;
|
||||
owner = "nginx";
|
||||
};
|
||||
tempo-environment.file = ../../../secrets/tempo-environment.age;
|
||||
tempo-environment.file = ../../../secrets/floral/tempo-environment.age;
|
||||
};
|
||||
|
||||
services.tempo = {
|
||||
|
|
|
@ -15,7 +15,7 @@ in
|
|||
];
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
age.secrets.pyroscope-secrets.file = ../../../secrets/pyroscope-secrets.age;
|
||||
age.secrets.pyroscope-secrets.file = ../../../secrets/floral/pyroscope-secrets.age;
|
||||
services.nginx = {
|
||||
upstreams.pyroscope = {
|
||||
servers."127.0.0.1:${toString pyroscopePort}" = {};
|
||||
|
|
|
@ -20,7 +20,7 @@ in
|
|||
};
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
age.secrets.netbox-environment.file = ../../secrets/netbox-environment.age;
|
||||
age.secrets.netbox-environment.file = ../../secrets/floral/netbox-environment.age;
|
||||
services = {
|
||||
netbox = {
|
||||
enable = true;
|
||||
|
|
|
@ -14,7 +14,7 @@ in
|
|||
};
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
age.secrets.newsletter-secrets.file = ../../secrets/newsletter-secrets.age;
|
||||
age.secrets.newsletter-secrets.file = ../../secrets/floral/newsletter-secrets.age;
|
||||
services.listmonk = {
|
||||
enable = true;
|
||||
secretFile = config.age.secrets.newsletter-secrets.path;
|
||||
|
|
|
@ -11,7 +11,7 @@ in {
|
|||
|
||||
config = lib.mkIf cfg.enable {
|
||||
age.secrets.postgresql-tls-priv.owner = "postgres";
|
||||
age.secrets.postgresql-tls-priv.file = ../../secrets/postgres-tls-priv.age;
|
||||
age.secrets.postgresql-tls-priv.file = ../../secrets/floral/postgres-tls-priv.age;
|
||||
|
||||
systemd.tmpfiles.rules = [
|
||||
"d /var/db 0755 root root - -"
|
||||
|
@ -67,7 +67,7 @@ in {
|
|||
|
||||
# Provisioned on the server so that CA operations can be done there.
|
||||
age.secrets.postgresql-ca-priv.owner = "postgres";
|
||||
age.secrets.postgresql-ca-priv.file = ../../secrets/postgres-ca-priv.age;
|
||||
age.secrets.postgresql-ca-priv.file = ../../secrets/floral/postgres-ca-priv.age;
|
||||
|
||||
users.users.postgres.packages = [
|
||||
(pkgs.writeShellScriptBin "postgres-mint-new-client" ''
|
||||
|
|
|
@ -70,7 +70,7 @@ in
|
|||
];
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
age.secrets.s3-revproxy-api-keys.file = ../../secrets/s3-revproxy-api-keys.age;
|
||||
age.secrets.s3-revproxy-api-keys.file = ../../secrets/floral/s3-revproxy-api-keys.age;
|
||||
# For each target, generate an entry that passes it to the s3-revproxy.
|
||||
services.nginx.virtualHosts = mapAttrs' (subdomain: _: nameValuePair "${subdomain}.${cfg.domain}" (mkProxiedSubdomain subdomain)) cfg.targets;
|
||||
# this solves garage supporting neither anonymous access nor automatic
|
||||
|
|
97
services/uptime-kuma/default.nix
Normal file
97
services/uptime-kuma/default.nix
Normal file
|
@ -0,0 +1,97 @@
|
|||
{
|
||||
inputs,
|
||||
lib,
|
||||
config,
|
||||
...
|
||||
}:
|
||||
let
|
||||
cfg = config.bagel.status;
|
||||
# TODO: pull domains from a central place
|
||||
subdomains = [
|
||||
"cl"
|
||||
"netbox"
|
||||
"cache"
|
||||
"grafana"
|
||||
"hydra"
|
||||
"loki"
|
||||
"mimir"
|
||||
"pyroscope"
|
||||
"matrix"
|
||||
"tempo"
|
||||
"amqp"
|
||||
"fodwatch"
|
||||
"git"
|
||||
"alerts"
|
||||
"buildbot"
|
||||
"b"
|
||||
"postgres"
|
||||
"news"
|
||||
];
|
||||
port = 3001;
|
||||
in
|
||||
{
|
||||
imports = [ "${inputs.stateless-uptime-kuma}/nixos/module.nix" ];
|
||||
|
||||
options.bagel.status = {
|
||||
enable = lib.mkEnableOption "the status page service (uptime-kuma)";
|
||||
domain = lib.mkOption {
|
||||
type = lib.types.str;
|
||||
};
|
||||
};
|
||||
|
||||
config = lib.mkIf cfg.enable {
|
||||
services.uptime-kuma.enable = true;
|
||||
|
||||
services.nginx = {
|
||||
enable = true;
|
||||
virtualHosts.${cfg.domain} = {
|
||||
enableACME = true;
|
||||
forceSSL = true;
|
||||
locations."/" = {
|
||||
proxyPass = "http://127.0.0.1:${builtins.toString port}";
|
||||
proxyWebsockets = true;
|
||||
};
|
||||
};
|
||||
};
|
||||
|
||||
networking.firewall.allowedTCPPorts = [
|
||||
80
|
||||
443
|
||||
];
|
||||
|
||||
age.secrets.stateless-uptime-kuma-password.file = ../../secrets/floral/stateless-uptime-kuma-password.age;
|
||||
statelessUptimeKuma = {
|
||||
probesConfig = {
|
||||
monitors = lib.genAttrs subdomains (name: {
|
||||
type = "http";
|
||||
url = "https://${name}.forkos.org/";
|
||||
tags = [];
|
||||
});
|
||||
status_pages = {
|
||||
"forkos" = {
|
||||
title = "ForkOS";
|
||||
description = "health of the ForkOS infra";
|
||||
showTags = true;
|
||||
publicGroupList = [
|
||||
{
|
||||
name = "Services";
|
||||
weight = 1;
|
||||
monitorList = lib.genAttrs subdomains (id: {
|
||||
inherit id;
|
||||
});
|
||||
}
|
||||
];
|
||||
};
|
||||
};
|
||||
settings = {
|
||||
entryPage = "statusPage-forkos";
|
||||
};
|
||||
};
|
||||
extraFlags = [ "-s" ];
|
||||
host = "http://localhost:${builtins.toString port}/";
|
||||
username = "forkos";
|
||||
passwordFile = config.age.secrets."stateless-uptime-kuma-password".path;
|
||||
enableService = true;
|
||||
};
|
||||
};
|
||||
}
|
|
@ -1,7 +1,6 @@
|
|||
{
|
||||
imports = [
|
||||
./common.nix
|
||||
./gandi.nix
|
||||
./dnsimple.nix
|
||||
./hydra.nix
|
||||
./state.nix
|
||||
|
|
|
@ -114,6 +114,7 @@ in
|
|||
(record "b" 300 "CNAME" "public01.infra.p.forkos.org")
|
||||
(record "postgres" 300 "CNAME" "bagel-box.infra.p.forkos.org")
|
||||
(record "news" 3600 "CNAME" "public01.infra.p.forkos.org")
|
||||
(record "status" 3600 "CNAME" "public01.infra.p.forkos.org")
|
||||
|
||||
# S3 in delroth's basement
|
||||
(record "cache" 300 "AAAA" "2a02:168:6426::12") # smol.delroth.net
|
||||
|
|
|
@ -1,115 +0,0 @@
|
|||
{ lib, config, ... }:
|
||||
let
|
||||
inherit (lib) mkEnableOption mkIf tf genList;
|
||||
cfg = config.bagel.gandi;
|
||||
in
|
||||
{
|
||||
options.bagel.gandi = {
|
||||
enable = mkEnableOption "the Gandi DNS configuration";
|
||||
};
|
||||
|
||||
config = mkIf cfg.enable {
|
||||
terraform.required_providers.gandi = {
|
||||
version = "~> 2.3.0";
|
||||
source = "go-gandi/gandi";
|
||||
};
|
||||
|
||||
resource.secret_resource.gandi_pat.lifecycle.prevent_destroy = true;
|
||||
|
||||
provider.gandi = {
|
||||
personal_access_token = tf.ref "resource.secret_resource.gandi_pat.value";
|
||||
};
|
||||
|
||||
resource.gandi_livedns_domain.forkos_org = {
|
||||
name = "forkos.org";
|
||||
};
|
||||
|
||||
resource.gandi_livedns_record = let
|
||||
record = name: ttl: type: values: {
|
||||
inherit name ttl type values;
|
||||
};
|
||||
|
||||
proxyRecords = name: ttl: type: values: [
|
||||
# kurisu.lahfa.xyz running a sniproxy:
|
||||
(record name ttl "A" ["163.172.69.160"])
|
||||
(record name ttl type values)
|
||||
];
|
||||
|
||||
# Creates an extra *.p record pointing to the sniproxy
|
||||
dualProxyRecords = name: ttl: type: values: lib.flatten [
|
||||
(record name ttl type values)
|
||||
(proxyRecords "${name}.p" ttl type values)
|
||||
];
|
||||
|
||||
# TODO: make less fragile and have actual unique and stable names
|
||||
canonicalName = record: let
|
||||
name = builtins.replaceStrings ["." "@"] ["_" "_root_"] record.name;
|
||||
in
|
||||
"forkos_org_${record.type}_${name}";
|
||||
|
||||
forkosRecords = records:
|
||||
builtins.listToAttrs (map (record: {
|
||||
name = canonicalName record;
|
||||
value = record // {
|
||||
zone = tf.ref "resource.gandi_livedns_domain.forkos_org.id";
|
||||
};
|
||||
}) (lib.flatten records));
|
||||
|
||||
in forkosRecords ([
|
||||
# (record "@" 300 "A" ["163.172.69.160"])
|
||||
(record "@" 300 "AAAA" ["2001:bc8:38ee:100:1000::20"])
|
||||
|
||||
(dualProxyRecords "bagel-box.infra" 300 "AAAA" ["2001:bc8:38ee:100:100::1"])
|
||||
(dualProxyRecords "gerrit01.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::10"])
|
||||
(dualProxyRecords "meta01.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::20"])
|
||||
(dualProxyRecords "fodwatch.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::30"])
|
||||
# git.infra.forkos.org exposes opensshd
|
||||
(dualProxyRecords "git.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::41"])
|
||||
# git.p.forkos.org exposes forgejo ssh server.
|
||||
(proxyRecords "git.p" 300 "AAAA" ["2001:bc8:38ee:100:1000::40"])
|
||||
(dualProxyRecords "buildbot.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::50"])
|
||||
(dualProxyRecords "public01.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::60"])
|
||||
|
||||
(record "cl" 300 "CNAME" ["gerrit01.infra.p"])
|
||||
(record "fodwatch" 300 "CNAME" ["fodwatch.infra.p"])
|
||||
# git.p.forkos.org is the proxy variant of the Forgejo server.
|
||||
(record "git" 300 "CNAME" ["git.p"])
|
||||
(record "netbox" 300 "CNAME" ["meta01.infra.p"])
|
||||
(record "amqp" 300 "CNAME" ["bagel-box.infra.p"])
|
||||
(record "grafana" 300 "CNAME" ["meta01.infra.p"])
|
||||
(record "hydra" 300 "CNAME" ["build-coord.wob01.infra.p"])
|
||||
(record "loki" 300 "CNAME" ["meta01.infra.p"])
|
||||
(record "mimir" 300 "CNAME" ["meta01.infra.p"])
|
||||
(record "pyroscope" 300 "CNAME" ["meta01.infra.p"])
|
||||
(record "tempo" 300 "CNAME" ["meta01.infra.p"])
|
||||
(record "matrix" 300 "CNAME" ["meta01.infra.p"])
|
||||
(record "alerts" 300 "CNAME" ["meta01.infra.p"])
|
||||
(record "buildbot" 300 "CNAME" ["buildbot.infra.p"])
|
||||
(record "b" 300 "CNAME" ["public01.infra.p"])
|
||||
(record "postgres" 300 "CNAME" ["bagel-box.infra.p"])
|
||||
(record "news" 3600 "CNAME" ["public01.infra.p"])
|
||||
|
||||
# S3 in delroth's basement
|
||||
(record "cache" 300 "AAAA" ["2a02:168:6426::12"]) # smol.delroth.net
|
||||
(record "cache" 300 "A" ["195.39.247.161"]) # sni proxy
|
||||
|
||||
(record "vpn-gw.wob01.infra" 300 "AAAA" [ "2a01:584:11::2" ])
|
||||
|
||||
(dualProxyRecords "build-coord.wob01.infra" 300 "AAAA" [ "2a01:584:11::1:11" ])
|
||||
# TODO: do not hardcode, just reuse the Colmena hive module outputs to generate all the required details.
|
||||
]
|
||||
++ (map (index: record "builder-${toString index}.wob01.infra" 300 "AAAA" [ "2a01:584:11::1:${toString index}" ]) (genList lib.id 11))
|
||||
++ (
|
||||
let
|
||||
# FIXME: figure out a way to poke `config.services.s3-revproxy` and
|
||||
# automate the DNS part away?
|
||||
buckets = [
|
||||
"channels"
|
||||
"releases"
|
||||
"channel-scripts-test"
|
||||
];
|
||||
in
|
||||
map (bucket: record "${bucket}" 300 "CNAME" [ "public01.infra.p" ]) buckets
|
||||
));
|
||||
};
|
||||
}
|
Loading…
Reference in a new issue