Compare commits

..

1 commit

Author SHA1 Message Date
Janik Haag 464a726664
fix(builders/netboot): make "normal" evaluation pass
Without this patch, running `colmena build` runs into a few assertion
errors for machines that have `config.bagel.baremetal.builders.netboot == true`.
This is due to an assertion in the initrd module that makes sure
there is a mount point for `/`. It can be trivially fixed by setting
the mount point to its real-world value: a tmpfs with 64GB assigned.

We also set `deployment.targetHost` to a domain that will
never resolve on the public internet, to make sure nobody applies these
machines by hand. It would have been nice to throw an error whenever
`colmena apply` gets executed for one of these hosts, but doing so would
defeat the purpose of this patch: the colmena `build` and `apply`
subcommands both evaluate the exact same code paths, so `colmena build`
would error again.

The motivation is to be able to run `colmena build` in CI in the
future, and to not scare off new contributors with random build
failures when they first try to build the machines.

The proper solution would be to exclude all the network-booted builders
from the regular colmena hive that is exposed to the CLI, but that is
too many yaks to shave for now.
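
For reference, the change described above boils down to the sketch below; the exact hunk appears in the netboot builder module further down in this compare.

```
{
  # Machines with the netboot module enabled should only be updated by
  # applying wob-vpn-gw and rebooting, so point Colmena at a host that
  # will never resolve on the public internet.
  deployment.targetHost = "invalid.example.com";

  # Satisfy the initrd assertion that `/` has a mount point by declaring
  # its real-world value: a tmpfs with 64GB assigned.
  fileSystems."/" = {
    device = "none";
    fsType = "tmpfs";
    options = [ "defaults" "size=64G" "mode=755" ];
  };
}
```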
2024-09-23 23:13:33 +02:00
92 changed files with 473 additions and 2149 deletions

View file

@ -1,32 +1 @@
# Infrastructure for the donut shaped thing that is absolutely not a donut.
## Quick start
### Build the infrastructure
```
$ colmena build --on @localboot
```
Note that `@localboot` is load-bearing: some machines _cannot be_ deployed with vanilla Colmena. Fixing this is welcome.
### Recommended deploy process
```
$ colmena apply dry-activate $machine # Verify that the nvd log is reasonable.
$ colmena apply $machine
```
### Recommended upgrade process
```
$ nix flake update
$ colmena apply dry-activate --on @localboot # Verify that the nvd log is reasonable. Run it twice so that only the nvd logs are shown.
$ colmena apply --on @localboot
```
## Troubleshooting
### I failed to deploy `gerrit01`
Our Gerrit source build is known to have hiccups sometimes; we are always interested in build logs, so feel free to attach them to a new issue to help us make it more reliable.
Infrastructure for the donut shaped thing that is absolutely not a donut.

View file

@ -1,45 +1,16 @@
{ lib, ... }:
let
inherit (lib) genAttrs;
in
# Note: to add someone new to this list, ensure their SSH key is
# already in common/ssh-keys.nix under the same username, so that
# the key is picked up automatically.
{
bagel.groups = {
floral-infra.members = [
"delroth"
"emilylange"
"hexchen"
"jade"
"janik"
"k900"
"maxine"
"raito"
"thubrecht"
"winter"
"yuka"
"ckie"
];
lix-infra.members = [
"raito"
"hexchen"
"jade"
];
};
bagel.users = genAttrs [
"delroth"
"emilylange"
"hexchen"
"jade"
"janik"
"k900"
"maxine"
"raito"
"thubrecht"
"winter"
"yuka"
"ckie"
] (name: {});
keys = import ./ssh-keys.nix;
in {
users.users.root.openssh.authorizedKeys.keys =
keys.users.delroth ++
keys.users.emilylange ++
keys.users.hexchen ++
keys.users.jade ++
keys.users.janik ++
keys.users.k900 ++
keys.users.lukegb ++
keys.users.maxine ++
keys.users.raito ++
keys.users.thubrecht ++
keys.users.yuka;
}

View file

@ -1,7 +1,6 @@
{ lib, pkgs, ... }: {
imports = [
./known-ssh-keys.nix
./cgroups.nix
];
nixpkgs.overlays = import ../overlays;
@ -31,7 +30,7 @@
automatic = true;
persistent = true;
dates = lib.mkDefault "daily";
options = lib.mkDefault "--delete-older-than 30d";
options = "--delete-older-than 30d";
};
services.journald.extraConfig = "SystemMaxUse=512M";

View file

@ -1,83 +0,0 @@
# Relatively inspired by fbtax2:
# https://facebookmicrosites.github.io/cgroup2/docs/fbtax-results.html
#
# See also the Chris Down talk at LISA'21:
# https://www.usenix.org/conference/lisa21/presentation/down
{ ... }:
let
systemCriticalSliceConfig = {
ManagedOOMMemoryPressure = "kill";
# guarantee availability of memory
MemoryMin = "192M";
# default 100
IOWeight = 1000;
# default 100
CPUWeight = 1000;
};
in
{
systemd.oomd = {
enable = true;
# why not, we have cgroups at user level now so it'll just kill the
# terminal
enableRootSlice = true;
enableSystemSlice = true;
enableUserSlices = true;
};
systemd.enableCgroupAccounting = true;
systemd.services.nix-daemon = {
serviceConfig = {
# FIXME: how do i deprioritize this for memory
CPUWeight = 10;
IOWeight = 10;
};
};
systemd.slices.hostcritical = {
description = "Ensures that services to keep the system alive remain alive";
unitConfig = {
# required to avoid a dependency cycle on systemd-oomd. systemd will
# actually guess this right but we should fix it anyway.
DefaultDependencies = false;
};
sliceConfig = systemCriticalSliceConfig;
};
# make root logins higher priority for resources
systemd.slices."user-0" = {
sliceConfig = systemCriticalSliceConfig;
};
systemd.slices.system = {
sliceConfig = {
ManagedOOMMemoryPressure = "kill";
ManagedOOMMemoryPressureLimit = "50%";
IOWeight = 100;
};
};
systemd.services.sshd = {
serviceConfig = {
Slice = "hostcritical.slice";
};
};
systemd.services.systemd-oomd = {
serviceConfig = {
Slice = "hostcritical.slice";
};
};
systemd.services.systemd-journald = {
serviceConfig = {
Slice = "hostcritical.slice";
};
};
}

View file

@ -1,14 +1,12 @@
{
imports = [
./admins.nix
./server-acl.nix
./base-server.nix
./hardening.nix
./nix.nix
./raito-proxy-aware-nginx.nix
./raito-vm.nix
./sysadmin
./hardware
./zsh.nix
./secrets.nix
];
}

View file

@ -1,7 +0,0 @@
{ ... }: {
imports = [
./raito-vm.nix
./oracle-vm.nix
./hetzner.nix
];
}

View file

@ -1,76 +0,0 @@
{ lib, config, ... }:
let
cfg = config.bagel.hardware.hetzner;
inherit (lib) mkEnableOption mkIf mkOption types;
in
{
options.bagel.hardware.hetzner = {
enable = mkEnableOption "Hetzner's hardware defaults";
platformType = mkOption {
# Only VMs are supported.
type = types.enum [ "virtual-machine" ];
};
system = mkOption {
# Only Hetzner's aarch64-linux VMs are supported.
type = types.enum [ "aarch64-linux" ];
};
networking.wan = {
mac = mkOption {
type = types.str;
description = "MAC address of the WAN interface in the Hetzner machine";
};
address = mkOption {
type = types.listOf types.str;
description = "List of static addresses attached to the WAN interface";
};
};
};
config = mkIf cfg.enable {
# A bunch of stuff is virtio.
boot.initrd.availableKernelModules = [
"xhci_pci"
"usbhid"
"sr_mod"
"virtio_gpu"
"virtio_scsi"
"virtio_rng"
"virtio_pci"
];
boot.loader.systemd-boot.enable = true;
boot.loader.efi.canTouchEfiVariables = true;
networking.useDHCP = lib.mkDefault false;
# Stolen from the netplan provided by aarch64 Ubuntu images.
systemd.network.enable = true;
systemd.network.links."10-wan" = {
linkConfig.Name = "wan";
matchConfig.MACAddress = cfg.networking.mac;
};
systemd.network.networks."10-wan" = {
matchConfig.Name = "wan";
networkingConfig.Address = cfg.networking.address;
linkConfig.RequiredForOnline = true;
DHCP = "ipv4";
routes = [
{
routeConfig = {
Destination = "::/0";
GatewayOnLink = true;
Gateway = "fe80::1";
};
}
];
dhcpV4Config = {
RouteMetric = 100;
UseMTU = true;
};
};
};
}

View file

@ -1,52 +0,0 @@
{ lib, config, modulesPath, ... }:
let
cfg = config.bagel.hardware.oracle-vm;
inherit (lib) mkEnableOption mkIf mkOption types;
in
{
options.bagel.hardware.oracle-vm = {
enable = mkEnableOption "Oracle's VM hardware defaults";
system = mkOption {
# Only the free Oracle VMs are supported.
type = types.enum [ "aarch64-linux" ];
};
};
# Imports a bunch of virtio modules.
imports = [
"${modulesPath}/profiles/qemu-guest.nix"
];
config = mkIf cfg.enable {
boot.loader.systemd-boot.enable = true;
boot.loader.efi.canTouchEfiVariables = true;
boot.initrd.systemd.enable = true;
boot.initrd.availableKernelModules = [
"xhci_pci" "virtio_pci" "usbhid" "sr_mod"
];
boot.initrd.kernelModules = [ ];
boot.kernelModules = [ ];
boot.extraModulePackages = [ ];
nixpkgs.hostPlatform = cfg.system;
# Enables DHCP on each ethernet and wireless interface. In case of scripted networking
# (the default) this is the recommended approach. When using systemd-networkd it's
# still possible to use this option, but it's recommended to use it in conjunction
# with explicit per-interface declarations with `networking.interfaces.<interface>.useDHCP`.
networking.useDHCP = lib.mkDefault false;
# Examples:
# 2: enp0s3: <BROADCAST,MULTICAST,UP,LOWER_UP> mtu 1500 qdisc mq state UP group default qlen 1000
# link/ether 02:00:17:00:91:6e brd ff:ff:ff:ff:ff:ff
# inet 10.0.0.94/24 brd 10.0.0.255 scope global dynamic noprefixroute enp0s3
# valid_lft 44162sec preferred_lft 33362sec
# inet6 fe80::17ff:fe00:916e/64 scope link
# valid_lft forever preferred_lft forever
# [root@build02-aarch64-lahfa:~]# ip r
# default via 10.0.0.1 dev enp0s3 proto dhcp src 10.0.0.94 metric 1002 mtu 9000
networking.interfaces.enp0s3.useDHCP = lib.mkDefault true;
};
}

View file

@ -2,6 +2,5 @@
{
programs.ssh.knownHosts = {
"[cl.forkos.org]:29418".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIM82mJ259C8Nc+BHHNBeRWXWhL3dfirQhmFbDAwHMle3";
"[gerrit.lix.systems]:2022".publicKey = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICC/S6Z56uhv7zBMutkV0nU8eDuRcl3trykGWBch4L/l";
};
}

View file

@ -1,22 +0,0 @@
## This is a simple secret abstraction with multi-tenancy awareness.
{ config, lib, ... }:
let
cfg = config.bagel.secrets;
inherit (lib) mkOption types genAttrs;
in
{
options.bagel.secrets = {
tenant = mkOption {
type = types.enum [ "lix" "floral" ];
};
files = mkOption {
type = types.listOf types.str;
default = [ ];
};
};
config.age.secrets = genAttrs cfg.files (secretFile: {
file = ../secrets/${cfg.tenant}/${secretFile}.age;
});
}
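
For context, a host opts into this abstraction roughly as in the sketch below; the secret names are illustrative examples taken from other hunks in this compare.

```
{
  # Pick the tenant namespace, then list the secrets this host needs.
  # Each entry is decrypted from ../secrets/<tenant>/<name>.age via age.secrets.
  bagel.secrets.tenant = "floral";
  bagel.secrets.files = [
    "buildbot-worker-password"
    "buildbot-workers"
  ];
}
```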

View file

@ -1,69 +0,0 @@
{ lib, config, ... }:
let
keys = import ./ssh-keys.nix;
inherit (lib) mkOption types length concatMap listToAttrs catAttrs attrValues;
cfgAdmins = config.bagel.admins;
cfgGroups = config.bagel.groups;
cfgUsers = config.bagel.users;
userOpts = { name, ... }: {
options = {
sshKeys = mkOption {
type = types.listOf types.str;
description = "List of SSH keys associated to this user, defaults to `ssh-keys.nix` entries.";
default = keys.users.${name} or [ ];
};
};
};
groupOpts = { name, ... }: {
options = {
members = mkOption {
type = types.listOf types.str;
description = "List of users member of this group";
example = [ "raito" ];
default = [ ];
};
};
};
# There might be duplicates in that list. We will turn it into an attribute set.
allowedMembers = listToAttrs (
map (member: {
name = member;
value = cfgUsers.${member};
}) (concatMap (allowedGroup: cfgGroups.${allowedGroup}.members) cfgAdmins.allowedGroups));
rootKeys = concatMap ({ sshKeys, ... }: sshKeys) (attrValues allowedMembers);
in
{
options.bagel.users = mkOption {
type = types.attrsOf (types.submodule userOpts);
description = "User configuration for server ACLs";
};
options.bagel.groups = mkOption {
type = types.attrsOf (types.submodule groupOpts);
description = "Group configuration for server ACLs";
};
options.bagel.admins = {
allowedGroups = mkOption {
type = types.listOf types.str;
default = [ "catch-all" ];
description = "List of groups which are allowed to admin this machine.";
example = [ "lix" "build-infra" ];
};
};
config = {
assertions = [
{ assertion = length config.users.users.root.openssh.authorizedKeys.keys > 0;
# TODO: printing `concatStringsSep ", " cfg.allowedGroups` here would help diagnose
# which groups are allowed and which admins exist.
message = "root@${config.networking.fqdnOrHostName} has no SSH key attached; you will lose access to this machine if you deploy it! Set a valid `bagel.admins.allowedGroups` or ensure at least one administrator of the relevant group is registered";
}
];
users.users.root.openssh.authorizedKeys.keys = rootKeys;
};
}
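
For context, the rest of this compare consumes the module roughly as sketched below (an abridged illustration; the full group, user, and allowlist definitions live in the admins.nix and flake.nix hunks).

```
{ lib, ... }: {
  # Declare groups and users centrally (see the admins.nix hunk above).
  bagel.groups.floral-infra.members = [ "delroth" "raito" ];
  bagel.users = lib.genAttrs [ "delroth" "raito" ] (name: { });

  # Per machine: anyone in the floral-infra group may SSH as root.
  bagel.admins.allowedGroups = [ "floral-infra" ];
}
```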

View file

@ -1,6 +1,5 @@
{
machines = {
# Floral
bagel-box = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIAsO4bNqY04uG13Pg3ubHfRDssTphDLzZ4YUniE5/p+M";
meta01 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIM5t9gYorOWgpCFDJgb24pyCKIabGpeI2H/UfdvXODcT";
public01 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPBy8G8rfLA6E9i+t5kjVafxU1c2NXATXKxoXTH4Kgtm";
@ -21,16 +20,6 @@
builder-9 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOhws9zGgocVY36dMtOL+CXadpvRMffxoWMkfEcTBJm7";
builder-10 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE7sgIuTSqZiZhp8TvObSbIEhcHHsL5hcmYA22uzwxth";
wob-vpn-gw = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINVytPPW8XnXf/rD5TFzsw//CZc2lBjQLmDzlVGPZsjh";
# Lix
build01-aarch64-lix = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAICC69NZD/zhIB/wUb5odg46bss5g8hH2fDl22bk4qeSW";
build02-aarch64-lix = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGdJE375pe58RJbhKwXRp3D//+SJ3ssiVZrLsM9CLHn0";
build01-aarch64-darwin-lix = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMVf1uO0lv5UBti/naW/+amqLxvWZg+StXk9aM+lJ7e4";
buildbot-lix = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFoVSh35UqNQZ6ZZ1c6CzqERC40ovQ/KDXz8pC7nNlkR";
# Raito infrastructure
epyc-newtype-fr = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOXT9Init1MhKt4rjBANLq0t0bPww/WQZ96uB4AEDrml";
};
users = {
@ -61,8 +50,6 @@
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKiXXYkhRh+s7ixZ8rvG8ntIqd6FELQ9hh7HoaHQJRPU"
];
thubrecht = [ "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIPM1jpXR7BWQa7Sed7ii3SbvIPRRlKb3G91qC0vOwfJn" ];
yuka = [ "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIxQ3NYBi8v1f/vhxLKDcA6upmX0pctRDbnK6SER5OUR yureka" ];
winter = [ "sk-ssh-ed25519@openssh.com AAAAGnNrLXNzaC1lZDI1NTE5QG9wZW5zc2guY29tAAAAIH/LDRUG+U+++UmlxvA2kspioTjktQZ8taDcHq8gVlkfAAAABHNzaDo=" ];
ckie = [ "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIH3uTwzSSMAPg84fwbNp2cq9+BdLFeA1VzDGth4zCAbz https://mei.puppycat.house" ];
yuka = [ "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKath4/fDnlv/4fzxkPrQN1ttmoPRNu/m9bEtdPJBDfY cardno:16_933_242" ];
};
}

View file

@ -13,11 +13,7 @@ in
tmux
rsync
fd
eza
grc
ripgrep
delta
tshark
pv
kitty.terminfo
config.boot.kernelPackages.perf

View file

@ -87,16 +87,16 @@
"treefmt-nix": "treefmt-nix"
},
"locked": {
"lastModified": 1728837991,
"narHash": "sha256-+jXVHPmX9eUtH2JhMKye0Tm2KMQTmD8FlHHfbcaXMOI=",
"ref": "refs/heads/bring-back-old-gerrit-reporting",
"rev": "879e9cdcdf2d7e6566ee512d015acc4d23f35517",
"revCount": 302,
"lastModified": 1722939563,
"narHash": "sha256-lMe8aXgF550iQLRaoU+yn8yYQ4x2qiyqANgsFyjfWwA=",
"ref": "refs/heads/non-flakes",
"rev": "4a162a8aa5dad6cecdb33bd8534e67e0bdaeb13f",
"revCount": 295,
"type": "git",
"url": "https://git.lix.systems/lix-project/buildbot-nix.git"
},
"original": {
"ref": "refs/heads/bring-back-old-gerrit-reporting",
"ref": "refs/heads/non-flakes",
"type": "git",
"url": "https://git.lix.systems/lix-project/buildbot-nix.git"
}
@ -436,11 +436,11 @@
},
"locked": {
"host": "gitlab.computer.surgery",
"lastModified": 1727994504,
"narHash": "sha256-FC6M1KKX58HbU9LG+cG6EJRr02J9lE/o0iiDi6m1gv8=",
"lastModified": 1723576377,
"narHash": "sha256-sTa4XT5xMQkhhLknOfVd433YS1TvkMrE45qAsI1ZB6U=",
"owner": "matrix",
"repo": "grapevine-fork",
"rev": "5a490a4397f0c6a36dab1cb631dadc67a849deab",
"rev": "3b99032456700d06dd937db6a85976a8be9d4fa7",
"type": "gitlab"
},
"original": {
@ -480,11 +480,11 @@
]
},
"locked": {
"lastModified": 1728321752,
"narHash": "sha256-GbBAoBF7ZObz0IP+g0LZKxMafpMvNKjTEu9haiZbV54=",
"lastModified": 1724616313,
"narHash": "sha256-9syppf9Gm/6F4wQQAbsf7rGY1DooMsprnsEY/0eaewg=",
"ref": "refs/heads/main",
"rev": "ee1234c15cdcb427dbd4828e0add09d02cd606c9",
"revCount": 4220,
"rev": "44b9a7b95d23e7a8587cb963f00382046707f2db",
"revCount": 4202,
"type": "git",
"url": "https://git.lix.systems/lix-project/hydra.git"
},
@ -505,11 +505,11 @@
"pre-commit-hooks": "pre-commit-hooks"
},
"locked": {
"lastModified": 1728163191,
"narHash": "sha256-SW0IEBsPN1EysqzvfDT+8Kimtzy03O1BxQQm7ZB6fRY=",
"lastModified": 1723919517,
"narHash": "sha256-D6+zmRXzr85p7riphuIrJQqangoJe70XM5jHhMWwXws=",
"ref": "refs/heads/main",
"rev": "ed9b7f4f84fd60ad8618645cc1bae2d686ff0db6",
"revCount": 16323,
"rev": "278fddc317cf0cf4d3602d0ec0f24d1dd281fadb",
"revCount": 16138,
"type": "git",
"url": "https://git.lix.systems/lix-project/lix"
},
@ -670,11 +670,11 @@
},
"nixpkgs_2": {
"locked": {
"lastModified": 1728093190,
"narHash": "sha256-CAZF2NRuHmqTtRTNAruWpHA43Gg2UvuCNEIzabP0l6M=",
"lastModified": 1723221148,
"narHash": "sha256-7pjpeQlZUNQ4eeVntytU3jkw9dFK3k1Htgk2iuXjaD8=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "e2f08f4d8b3ecb5cf5c9fd9cb2d53bb3c71807da",
"rev": "154bcb95ad51bc257c2ce4043a725de6ca700ef6",
"type": "github"
},
"original": {
@ -715,7 +715,6 @@
],
"nix-gerrit": "nix-gerrit",
"nixpkgs": "nixpkgs_2",
"stateless-uptime-kuma": "stateless-uptime-kuma",
"terranix": "terranix"
}
},
@ -739,13 +738,13 @@
"rust-manifest": {
"flake": false,
"locked": {
"narHash": "sha256-tB9BZB6nRHDk5ELIVlGYlIjViLKBjQl52nC1avhcCwA=",
"narHash": "sha256-aZFye4UrtlcvLHrISldx4g9uGt3thDbVlLMK5keBSj0=",
"type": "file",
"url": "https://static.rust-lang.org/dist/channel-rust-1.81.0.toml"
"url": "https://static.rust-lang.org/dist/channel-rust-1.78.0.toml"
},
"original": {
"type": "file",
"url": "https://static.rust-lang.org/dist/channel-rust-1.81.0.toml"
"url": "https://static.rust-lang.org/dist/channel-rust-1.78.0.toml"
}
},
"stable": {
@ -764,22 +763,6 @@
"type": "github"
}
},
"stateless-uptime-kuma": {
"flake": false,
"locked": {
"lastModified": 1728243069,
"narHash": "sha256-l9fgwesnmFxasCaYUCD7L9bGGJXytLuwtx3CZMgpwJg=",
"ref": "refs/heads/master",
"rev": "880f444ff7862d6127b051cf1a993ad1585b1652",
"revCount": 25,
"type": "git",
"url": "https://git.dgnum.eu/DGNum/stateless-uptime-kuma.git"
},
"original": {
"type": "git",
"url": "https://git.dgnum.eu/DGNum/stateless-uptime-kuma.git"
}
},
"systems": {
"locked": {
"lastModified": 1681028828,

View file

@ -22,15 +22,12 @@
gerrit-dashboard.url = "git+https://git.lix.systems/the-distro/gerrit-monitoring.git";
gerrit-dashboard.flake = false;
buildbot-nix.url = "git+https://git.lix.systems/lix-project/buildbot-nix.git?ref=refs/heads/bring-back-old-gerrit-reporting";
buildbot-nix.url = "git+https://git.lix.systems/lix-project/buildbot-nix.git?ref=refs/heads/non-flakes";
buildbot-nix.inputs.nixpkgs.follows = "nixpkgs";
channel-scripts.url = "git+https://git.lix.systems/the-distro/channel-scripts.git";
channel-scripts.inputs.nixpkgs.follows = "nixpkgs";
stateless-uptime-kuma.url = "git+https://git.dgnum.eu/DGNum/stateless-uptime-kuma.git";
stateless-uptime-kuma.flake = false;
lix.follows = "hydra/lix";
grapevine = {
@ -58,7 +55,6 @@
inputs.lix.overlays.default
inputs.nix-gerrit.overlays.default
inputs.channel-scripts.overlays.default
(import "${inputs.stateless-uptime-kuma}/overlay.nix")
];
};
terraform = pkgs.opentofu;
@ -68,6 +64,7 @@
./terraform
{
bagel.dnsimple.enable = true;
bagel.gandi.enable = true;
bagel.hydra.enable = true;
}
];
@ -115,90 +112,25 @@
./common
];
floralInfraModules = commonModules ++ [
({ config, lib, ... }: {
# This means that anyone with @floral-infra permissions
# can SSH as root on every machine handled here.
bagel.admins.allowedGroups = [
"floral-infra"
];
# Tag all machines that boot locally as localboot.
deployment.tags = lib.mkMerge [
[ "floral" ]
(lib.mkIf (config.bagel.baremetal.builders.enable -> !config.bagel.baremetal.builders.netboot)
[ "localboot" ]
)
];
bagel.monitoring.grafana-agent.tenant = "floral";
bagel.secrets.tenant = "floral";
bagel.builders.extra-build-capacity.provider.tenant = "floral";
bagel.services.buildbot.tenant = "floral";
})
];
# These are Floral baremetal builders.
makeBuilder = i:
let
enableNetboot = i >= 6;
in
lib.nameValuePair "builder-${toString i}" {
imports = floralInfraModules;
bagel.baremetal.builders = { enable = true; num = i; netboot = enableNetboot; };
makeBuilder = i: lib.nameValuePair "builder-${toString i}" {
imports = commonModules;
bagel.baremetal.builders = { enable = true; num = i; netboot = i >= 6; };
};
lixInfraModules = commonModules ++ [
{
# This means that anyone with @lix-infra permissions
# can SSH as root on every machine handled here.
bagel.admins.allowedGroups = [
"lix-infra"
];
# Tag all machines that boot locally as localboot.
# Lix has no netbootable machines.
deployment.tags = [ "localboot" "lix" ];
bagel.monitoring.grafana-agent.tenant = "lix";
bagel.secrets.tenant = "lix";
bagel.builders.extra-build-capacity.provider = {
tenant = "lix";
buildfarmPublicKeys = [
# buildbot.lix.systems SSH key
"ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIDu4cEqZzAI/1vZjSQkTJ4ijIg9nuloOuSKUrnkJIOFn"
];
};
bagel.services.buildbot.tenant = "lix";
}
];
builders = lib.listToAttrs (lib.genList makeBuilder 11);
in {
meta.nixpkgs = systemBits.x86_64-linux.pkgs;
# Add any non-x86_64 native systems here.
# Cross compilation is not supported yet.
meta.nodeNixpkgs =
let
aarch64-systems = systems: lib.genAttrs systems (system: systemBits.aarch64-linux.pkgs);
in
aarch64-systems [
"build01-aarch64-lix"
];
meta.specialArgs.inputs = inputs;
bagel-box.imports = floralInfraModules ++ [ ./hosts/bagel-box ];
meta01.imports = floralInfraModules ++ [ ./hosts/meta01 ];
gerrit01.imports = floralInfraModules ++ [ ./hosts/gerrit01 ];
fodwatch.imports = floralInfraModules ++ [ ./hosts/fodwatch ];
git.imports = floralInfraModules ++ [ ./hosts/git ];
wob-vpn-gw.imports = floralInfraModules ++ [ ./hosts/wob-vpn-gw ];
buildbot.imports = floralInfraModules ++ [ ./hosts/buildbot ];
public01.imports = floralInfraModules ++ [ ./hosts/public01 ];
build-coord.imports = floralInfraModules ++ [ ./hosts/build-coord ];
build01-aarch64-lix.imports = lixInfraModules ++ [ ./hosts/build01-aarch64-lix ];
buildbot-lix.imports = lixInfraModules ++ [ ./hosts/buildbot-lix ];
bagel-box.imports = commonModules ++ [ ./hosts/bagel-box ];
meta01.imports = commonModules ++ [ ./hosts/meta01 ];
gerrit01.imports = commonModules ++ [ ./hosts/gerrit01 ];
fodwatch.imports = commonModules ++ [ ./hosts/fodwatch ];
git.imports = commonModules ++ [ ./hosts/git ];
wob-vpn-gw.imports = commonModules ++ [ ./hosts/wob-vpn-gw ];
buildbot.imports = commonModules ++ [ ./hosts/buildbot ];
public01.imports = commonModules ++ [ ./hosts/public01 ];
build-coord.imports = commonModules ++ [ ./hosts/build-coord ];
} // builders;
hydraJobs = builtins.mapAttrs (n: v: v.config.system.build.netbootDir or v.config.system.build.toplevel) self.nixosConfigurations;

View file

@ -13,8 +13,6 @@
hydra.builders = lib.genList (i: "builder-${builtins.toString i}") 10;
};
bagel.monitoring.exporters.hydra.enable = true;
# Hydra is proxied.
bagel.raito.v6-proxy-awareness.enable = true;

View file

@ -1,27 +0,0 @@
{ ... }: {
networking.hostName = "build01";
networking.domain = "aarch64.lix.systems";
# Those free sweet VMs.
bagel.hardware.oracle-vm = {
enable = true;
system = "aarch64-linux";
};
fileSystems."/" =
{ device = "/dev/disk/by-uuid/a333323c-99f0-4258-8f68-496858d56f71";
fsType = "ext4";
};
fileSystems."/boot" =
{ device = "/dev/disk/by-uuid/3E74-C937";
fsType = "vfat";
};
swapDevices = [ ];
bagel.builders.extra-build-capacity.provider.enable = true;
i18n.defaultLocale = "en_US.UTF-8";
system.stateVersion = "24.05";
deployment.targetHost = "build01.aarch64.lix.systems";
}

View file

@ -1,71 +0,0 @@
# Configuration for a virtual machine in Raito's micro-DC basement.
# 32 vCPU (2014 grade Xeon though)
# 32GB RAM
# 30GB SSD
# 500GB HDD
# All specifications can be upgraded to a certain extent, just ask Raito.
# Hosts the coordinator for Buildbot.
#
# vim: et:ts=2:sw=2:
#
{ lib, modulesPath, ... }: {
networking.hostName = "buildbot";
networking.domain = "lix.systems";
zramSwap.enable = true;
bagel.sysadmin.enable = true;
# Buildbot is proxied.
bagel.raito.v6-proxy-awareness.enable = true;
bagel.hardware.raito-vm = {
enable = true;
networking = {
nat-lan-mac = "BC:24:11:75:62:42";
wan = {
mac = "BC:24:11:B2:5F:2E";
address = "2001:bc8:38ee:100::200/56";
};
};
};
i18n.defaultLocale = "en_US.UTF-8";
bagel.services.buildbot = {
enable = true;
domain = "buildbot.lix.systems";
gerrit =
{
domain = "gerrit.lix.systems";
port = 2022;
username = "buildbot";
};
cors.allowedOrigins = [
"https://*.lix.systems"
];
projects = [
"lix"
"lix-installer"
];
buildSystems = [
"x86_64-linux"
"aarch64-linux"
"aarch64-darwin"
# Too slow.
/* "x86_64-darwin" */
];
# Lix is not yet allowed to use Floral's x86_64 builders.
builders = [ ];
};
# This machine does not use /nix from btrfs, and instead uses a store on a bigger disk.
fileSystems."/nix" =
lib.mkForce
{ device = "/dev/disk/by-uuid/1815ca49-d0b0-4b99-8aec-0d790498ba6f";
fsType = "xfs";
neededForBoot = true;
options = [ "relatime" ];
};
nixpkgs.hostPlatform = lib.mkDefault "x86_64-linux";
system.stateVersion = "24.05";
deployment.targetHost = "buildbot.lix.systems";
}

View file

@ -2,7 +2,6 @@
config,
lib,
pkgs,
nodes,
...
}:
{
@ -27,24 +26,7 @@
bagel.services.buildbot = {
enable = true;
domain = "buildbot.forkos.org";
gerrit =
let
cfgGerrit = nodes.gerrit01.config.bagel.services.gerrit;
in
{
domain = cfgGerrit.canonicalDomain;
port = cfgGerrit.port;
username = "buildbot";
};
cors.allowedOrigins = [
"https://*.forkos.org"
];
projects = [
"buildbot-test"
"nixpkgs"
"infra"
];
builders = [ "builder-9" ];
builders = [ "builder-10" ];
};
i18n.defaultLocale = "en_US.UTF-8";

View file

@ -23,9 +23,6 @@
};
};
# Block all these crawlers!!
bagel.services.nginx.crawler-blocker.enable = true;
fileSystems."/gerrit-data" = {
device = "/dev/disk/by-uuid/d1062305-0dea-4740-9a27-b6b1691862a4";
fsType = "ext4";
@ -42,7 +39,7 @@
};
age.secrets.ows-deploy-key = {
file = ../../secrets/floral/ows-deploy-key.age;
file = ../../secrets/ows-deploy-key.age;
mode = "0600";
owner = "git";
group = "git";
@ -124,7 +121,7 @@
};
};
age.secrets.s3-channel-staging-keys.file = ../../secrets/floral/s3-channel-staging-keys.age;
age.secrets.s3-channel-staging-keys.file = ../../secrets/s3-channel-staging-keys.age;
bagel.nixpkgs.channel-scripts = {
enable = true;
otlp.enable = true;

View file

@ -9,11 +9,6 @@
# TODO: make it the default
networking.domain = "infra.forkos.org";
bagel.status = {
enable = true;
domain = "status.forkos.org";
};
bagel.sysadmin.enable = true;
# Newsletter is proxied.
bagel.raito.v6-proxy-awareness.enable = true;

View file

@ -1,76 +1,51 @@
let
keys = import common/ssh-keys.nix;
commonKeys = {
# WARNING: `keys.users.*` are *lists*, so you need to concatenate them; don't nest them into further lists!
# Otherwise, agenix will be confused!
global = keys.users.raito;
lix = keys.users.hexchen ++ keys.users.jade;
floral = keys.users.delroth;
};
commonKeys = keys.users.delroth ++ keys.users.raito;
secrets = with keys; {
floral = {
hydra-postgres-key = [ machines.build-coord ];
hydra-s3-credentials = [ machines.build-coord ];
hydra-signing-priv = [ machines.build-coord ];
hydra-ssh-key-priv = [ machines.build-coord ];
hydra-postgres-key = [ machines.build-coord ];
hydra-s3-credentials = [ machines.build-coord ];
hydra-signing-priv = [ machines.build-coord ];
hydra-ssh-key-priv = [ machines.build-coord ];
netbox-environment = [ machines.meta01 ];
mimir-environment = [ machines.meta01 ];
mimir-webhook-url = [ machines.meta01 ];
grafana-oauth-secret = [ machines.meta01 ];
loki-environment = [ machines.meta01 ];
gerrit-prometheus-bearer-token = [ machines.gerrit01 machines.meta01 ];
pyroscope-secrets = [ machines.meta01 ];
tempo-environment = [ machines.meta01 ];
netbox-environment = [ machines.meta01 ];
mimir-environment = [ machines.meta01 ];
mimir-webhook-url = [ machines.meta01 ];
grafana-oauth-secret = [ machines.meta01 ];
loki-environment = [ machines.meta01 ];
gerrit-prometheus-bearer-token = [ machines.gerrit01 machines.meta01 ];
pyroscope-secrets = [ machines.meta01 ];
tempo-environment = [ machines.meta01 ];
buildbot-worker-password = [ machines.buildbot ];
buildbot-oauth-secret = [ machines.buildbot ];
buildbot-workers = [ machines.buildbot ];
# Private SSH key to Gerrit
# ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHx52RUPWzTa2rBA96xcnGjjzAboNN/hm6gW+Q6JiSos
buildbot-service-key = [ machines.buildbot ];
# Signing key for Buildbot's specific cache
buildbot-signing-key = [ machines.buildbot ];
buildbot-remote-builder-key = [ machines.buildbot ];
buildbot-worker-password = [ machines.buildbot ];
buildbot-oauth-secret = [ machines.buildbot ];
buildbot-workers = [ machines.buildbot ];
# Private SSH key to Gerrit
# ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHx52RUPWzTa2rBA96xcnGjjzAboNN/hm6gW+Q6JiSos
buildbot-service-key = [ machines.buildbot ];
# Signing key for Buildbot's specific cache
buildbot-signing-key = [ machines.buildbot ];
buildbot-remote-builder-key = [ machines.buildbot ];
# These are the same password, but nginx wants it in htpasswd format
metrics-push-htpasswd = [ machines.meta01 ];
# Yes, even Lix machines are included in this monitoring infrastructure.
metrics-push-password = builtins.attrValues machines;
# These are the same password, but nginx wants it in htpasswd format
metrics-push-htpasswd = [ machines.meta01 ];
metrics-push-password = builtins.attrValues machines;
ows-deploy-key = [ machines.gerrit01 ];
s3-channel-staging-keys = [ machines.gerrit01 ];
s3-channel-keys = [ machines.gerrit01 ];
ows-deploy-key = [ machines.gerrit01 ];
s3-channel-staging-keys = [ machines.gerrit01 ];
s3-channel-keys = [ machines.gerrit01 ];
postgres-ca-priv = [ machines.bagel-box ];
postgres-tls-priv = [ machines.bagel-box ];
postgres-ca-priv = [ machines.bagel-box ];
postgres-tls-priv = [ machines.bagel-box ];
newsletter-secrets = [ machines.public01 ];
s3-revproxy-api-keys = [ machines.public01 ];
stateless-uptime-kuma-password = [ machines.public01 ];
};
lix = {
buildbot-worker-password = [ machines.buildbot-lix ];
buildbot-oauth-secret = [ machines.buildbot-lix ];
buildbot-workers = [ machines.buildbot-lix ];
# Private SSH key to Gerrit
# ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIHx52RUPWzTa2rBA96xcnGjjzAboNN/hm6gW+Q6JiSos
buildbot-service-key = [ machines.buildbot-lix ];
# Signing key for Buildbot's specific cache
buildbot-signing-key = [ machines.buildbot-lix ];
buildbot-remote-builder-key = [ machines.buildbot-lix ];
};
newsletter-secrets = [ machines.public01 ];
s3-revproxy-api-keys = [ machines.public01 ];
};
mkSecretListFor = tenant:
map (secretName: {
name = "secrets/${tenant}/${secretName}.age";
value.publicKeys = secrets.${tenant}."${secretName}" ++ commonKeys.global ++ commonKeys.${tenant};
}) (builtins.attrNames secrets.${tenant});
in
builtins.listToAttrs (
(mkSecretListFor "floral") ++ (mkSecretListFor "lix")
map (secretName: {
name = "secrets/${secretName}.age";
value.publicKeys = secrets."${secretName}" ++ commonKeys;
}) (builtins.attrNames secrets)
)

View file

@ -1,68 +0,0 @@
age-encryption.org/v1
-> ssh-ed25519 +HUDfA d5f2ESneC0wsoc9rwTjNfNXMBjCbjAQ7euthH2Buq1E
5CynaQ8zhDRBvcmifhCsiDtllztCVAqs8rU36DOxgPw
-> ssh-ed25519 +uvEmw EtYRis2LP0jv1W8mx8vFYNzkgi8OoqnA8cM2huS6NBk
ll1csFIO+hVYk+I0uSVJmlDKj9aTWvf4kaYI5LJcm7w
-> ssh-ed25519 DMaM1w ex4QJN8CG99J15i+yvqGEiEZn9OlGIC+cmLHL4u8ZEI
VXnOv4CGK68q5t6hUV3oKAtxGZ+4FVbrmE1yMn16A0Q
-> ssh-ed25519 sixKXw drXN6+q1y7L7ZU4chTfHfelu5GcTdff+i/UMFV0+3RQ
+8jmgnMh2OpQ3vhAuyQYWslfx7KO84a8KsCpoRD3Yl8
-> ssh-ed25519 aHbF7w Af7NgjZ/Nvh5FHrX2VlF5riTIhJ+fdxTo6OR+8PcNwA
ktKpm/HnOnw2Ym7xee3N1rneEX7+/xDhcp71N1NNHAA
-> ssh-ed25519 87T2Ig 8mEUxJ/5NUvV+qQCDQH2Tm6Ryr5hf4xgsQlqXGf03Fw
EavMcnsg/3EYBLQEBHX+0oTDKq5ZL4vj+mZntPM8UMU
-> ssh-ed25519 Ao+7Wg UphWbatIaa+R1oZbfHazFhrawf0vax/3ZZS7YuX03Hs
dwBbwoV0jpjiKr+nj+CRfUDgDl7ISpsCintVAzHnIFQ
-> ssh-ed25519 wIR2ZA ZM58Nq7eJX9JVeYkoJf+mw8hxhYGoTx042ow1u3mJkw
UtEaf7e4xsPO0ISlIF9LF+GcwTBqw4AXdMO4MASfgLQ
-> ssh-ed25519 oGiV/Q G5KX/Eox+9md0yFRUZvGIsio2gWM17soHsL6H6zEX2g
vI8jPjBAoFF0xhvRRLPzCMSiQOQ0fKuRb3CYVu3KUUo
-> ssh-ed25519 gO3aog p9nZtjzoA0zJM+7Y6R16mpdub3dhu67yOYTUNKRytgI
YL9vAp1+CK7jgmXkB47ufZMz+/swngkdUvEGR1zFZwc
-> ssh-ed25519 N/+Clw 6LzFdtNsWewuJK2r97ZXJbRazvK3raN78UGanR/zWVU
WT0y+sfDP3ffVwRcbYw51ArFR3OzXnoyi9IXwZZKEL8
-> ssh-ed25519 CtkSZw CV0jQ5dIbgFtMxGK1X9b1qJOKmske8VgIPW5NW9mAwc
clv7P3de61nZmXrvbOgL7Llw8ZqBMm2WFqgpznDwKv8
-> ssh-ed25519 keg2lg 3Nk40ByQj8RThj4QDY2BdAkw55mXAJprXQRGjQqGvz0
f8OFszJ8p90crFd+awEE12CNd7b22zgpH2XRKmH/Hf0
-> ssh-ed25519 H885DA GDiJYH+gaC++TSqfoPDOTFcsCZRhEl0EeTeab7tgcWU
kMILmwNMnMS7rgC3kKsAksu4Txn5owPU2y09h4aHKY8
-> ssh-ed25519 Rq7K4Q VCNxGtCSCD2OYSWWwl0+yf189xV3QwRiwo80h4NPTkE
hHkgYHLbISdl/RRdlInp9ub854M9ZKFSXpLgKW2YkmQ
-> ssh-ed25519 vvyRpw XSCCrqEOtvzQRssI0U1DHirKoPgbOUKJxNKnioHhT2Y
HGey1j0Kxae5Qs0aw6eqFziQGiRmNA+lEwbRdf5hhbM
-> ssh-ed25519 aSEktQ mXY70Lgl76J4O5dPdDcIqmJ40EinigDuZrUghpGOq2I
U2qeVFxGCYCEFWU+7vHc5Mu9EuzScowrjnwUyoqPj5U
-> ssh-ed25519 cD6JxA at89poimBZyeeM8CQrxDxN0yCNDT2k04++py1fFycj8
cQV/K5zc5x/oYnJ4N0MX3sTboT4G4ZNvVUVdHuJRzbA
-> ssh-ed25519 ec6XRQ spJtb/xy4k4dmwKz8R2CPhC1WcuNV/rnDT978GkjHHk
KrGEVGts/AhzbRNreqQ/CVanXL3l/9oMWxnpBLj23qU
-> ssh-ed25519 1qYEfw KRkTYlvvnsCIExKQNmCyU7YxnGZsiI03kzecXNpLzUQ
h2YagV7BzlsF7banzwXbOudTdlFzT7LC8PvtxAsX36U
-> ssh-ed25519 2D+APA 4hdYlOnNIT9Q6tyKwXzy+u66Ti2EJopK43Sipebd0As
tuesc9/QcEu4q9bTFJ5zJr0qvgLcmpn4at4cYtHrtbE
-> ssh-ed25519 eTSU6g i1qT6PtepHXnoLCqDbhk86QG+SR9luQaw34a34gy5mw
YE9VBAT5SLW2ECHRU+dMg9na6OQNVRVGuhY8vOdmE/Q
-> ssh-ed25519 j2r2qQ TTTbSB/8UIDmmI3C9+u24PYZNfjl9jGADKHNWIwLfGE
SNDforwii/GFp82TpyOcVIVrZWCe2QQKrjzPA6XA7Jc
-> ssh-ed25519 CyxfgQ P5EiJ54v65Sz1gHuI0s170Z7c1WjcZLlb7NYigElfVs
iYJUGpoE9LBIlv+O1navSSsy3EJ8tusXXX+/QAQvjNI
-> ssh-ed25519 C/bBAQ hlBDpQRkcVCr3B6TCrBjxauhUX6Ndpm0s6x8W4hU6gM
OFG3EuGJkSoEEXhbJ/Tp2DBdnBcs+hzxjNRdvcOSpQs
-> ssh-ed25519 +qVung cGEGpO8NJfpj9ixAH9lhYkPKPDdQWryVxSOhMGQdnWM
+MycbIEab3P/AOS9i/YmPBDXB76hp3xUcWI4VMihV2w
-> ssh-rsa krWCLQ
Zv3dPYERlX1MaVaJTBDwIcjt1yLmu4Z7MovPgjGg01p+XsdBXeepTyOl+gRBwGgo
AW5CIuaChYxtSNJ6nOgSaUpqzILycUF1xE1jROe3MIX2MZ4KGD1qoqcHbiCAng+a
RqYrwAKnNea9FQMVfhYZBkRoYE6ne1R+0G6BoFM/okz24pAAFPBx+sMMhfTkt0uV
kHVx0dgRw1pxa7Na98WH/7E0zp9VuBvVHGXfk1rfW/UQlbIO5RP3nldFoa6OmOWS
JZ022UvjyC1re0KCurka4y+qmaiRKnTBmpIXxJFMwNCAQ8O8SeAQ3DHKHmXNMOIL
ZVICtRRk0uX36AVU8DWDog
-> ssh-ed25519 /vwQcQ kF8+hsA+0Msjd3q0SL52cae5RDqx4ls5kPKnc3UZyms
Q33kIKJL3Vjxu7LQ5l4M3tlEuj+OW4uGh1x+JxthW8A
-> ssh-ed25519 0R97PA gWBH71l6w9upTE0DwqOMSvWXc5VyJiKFAQLaSpWQ43E
IrOrvzEa0bABw6UOpP8pM8WhuRNMaWJ2khljJIKwOS8
-> ssh-ed25519 K3b7BA oS14iav9pSioLecMkOanJz89OJygLugvrnnTs5pKzz8
akupMSiqXussXJyHwFm/f0imKALjowJVqd8/LFcC/58
--- bCJXTEDaKg4FF2dQp2zFMU4Zm0zACekCuDvthcAyncM

View file

@ -1,20 +0,0 @@
age-encryption.org/v1
-> ssh-ed25519 CyxfgQ D2o8bUccO13DKF4COLBQ9mJbACsE2XsRa5S+N71WnTk
ZaldT7HhQxbxf2ptIwdMYkC60eGtzihc7uwcAkq7s00
-> ssh-ed25519 K3b7BA AiUCG5CnNyv1DPu+iEwEgW9GqZ8zgpgxKJTAp350ADc
cUVaDv7F1haQIF11/UhhDAR5DrfJlPttGfDjkv+z9vY
-> ssh-ed25519 +qVung 1JXeXyea+2Pcwoln/NLRiR8IPPIiB3gaFCP4imyv4DA
JWmAY6ZnyU46KxzhRrQigGmUPba9lJDDyRQ2GjQShqc
-> ssh-rsa krWCLQ
ciLu/+cXfQrB1ms8oTv+xi4eADyL4j0qwnY/6TE0wAXkQHuNXDmpF6ccWZoS2DqN
NcnGXL6+WyWxmwlyBEq/rsBPvi1g0M6Md7Z4gXn2UvjJ+S7WyA8QEwkxoTDkJS7x
k/NvtunmggVsWVK4Xdi5DKRw+f32qr/8GysDhIPrTt43iReBKNbyuYWmC5Ec85ep
JU4JzCNZjJ07kixS5Y9BhaJbpEr47lCXE/KtJUvm3VAxS9IwfUn7KHHdFWynbExi
F898j3zOR/kgYmeA0oTiexRD3Y2LCvjXIHQZ3MobbZ/PBrjWxe78Sw2vy2t5JLtB
gFG0K8M1z8DT6a8TtvXEgg
-> ssh-ed25519 /vwQcQ kUM21TO9iSa8oVXMlNxR7Kc+8TV4C/uTzyQ+t3xnARA
oXt+egWWONsKT48H4vZ2CPdy3Zfb2QeQVe9l7dDyO/w
-> ssh-ed25519 0R97PA e/piqf2RD5QgPaQs6jsJdzJgfZR9n1JDIWpbvLZErSs
UTJH8POFdZ4+N9WkLoNESl1pvcVD0MS1qn7AdS/mg34
--- 9aYEP0eHDKMacIf09h+OJqIYw+N99+FrW/x/do8Lbo4

View file

@ -1,7 +0,0 @@
age-encryption.org/v1
-> ssh-ed25519 Ao+7Wg q7oRHUUlAvD8OUbpPT7d6eLMPWU0YS/verYTDE5BCkY
/87/1uqOvRYeqel9IjnFmGyF9SXUQD8MTgfcj91b/Fs
--- ulIeB91NJ7z/64h9BCLSD9/RW/zwv3m1Zo2ovNuInv8

Binary file not shown.

View file

@ -1,6 +0,0 @@
age-encryption.org/v1
-> ssh-ed25519 Ao+7Wg EMpfs0EpWwaIKAoUBfEkyAHLIwi6JnGG6RvUWM5LjnU
LKiwUBNc791U/GVRNlRPZE/TEMJjcFFrLruFJhiyiOI
--- 0khp8u+4vHgGyQqP05m473Eo09eyOUZLI5+EK4olzoc

View file

@ -1,6 +0,0 @@
age-encryption.org/v1
-> ssh-ed25519 Ao+7Wg RPKKoI5l5cYVdSvOxTHCUtwceac4vSi3+vlaqHr8kQg
qbgTHCeQDNM30IJNZ/BU6wgryJwB316H5GWWaYB/wng
--- GuFi3GSRdlBJ5YRjfAVyFDZ+4TH575kFJLnFp5847N0

Binary file not shown.

View file

@ -135,7 +135,7 @@ in
{ address = "2a01:584:11::1:${toString cfg.num}"; prefixLength = 64; }
];
networking.defaultGateway6 = { interface = "uplink"; address = "2a01:584:11::1"; };
deployment.targetHost = "2a01:584:11::1:${toString cfg.num}";
deployment.targetHost = lib.mkIf (!cfg.netboot) "2a01:584:11::1:${toString cfg.num}";
deployment.tags = [ "builders" ];
# Why can't we have nice things? https://bugs.openjdk.org/browse/JDK-8170568

View file

@ -21,13 +21,22 @@ in
'';
};
# machines with the netboot module enabled should only be updated by applying wob-vpn-gw and rebooting
deployment.targetHost = "invalid.example.com";
# fixes the initrd eval warning and allows `colmena build` to succeed
fileSystems."/" = {
device = "none";
fsType = "tmpfs";
options = [ "defaults" "size=64G" "mode=755" ];
};
system.build = {
# Build a kernel and initramfs which will download the IPXE script from hydra using
# u-root pxeboot tool and kexec into the final netbooted system.
notipxe = import (modulesPath + "/..") {
system = "x86_64-linux";
configuration =
configuration =
{ pkgs, config, ... }:
{
@ -57,7 +66,7 @@ in
script = ''
ln -sf /dev/console /dev/tty
until ${pkgs.iputils}/bin/ping -c 1 hydra.forkos.org; do sleep 1; done
${pkgs.u-root}/bin/pxeboot -v -ipv4=false -file https://hydra.forkos.org/job/infra/main/${node.config.networking.hostName}/latest/download-by-type/file/ipxe
${pkgs.u-root}/bin/pxeboot -v -ipv4=false -file https://hydra.forkos.org/job/infra/main/${node.config.networking.hostName}/latest/download-by-type/file/ipxe
'';
};
boot.initrd.systemd.contents."/etc/ssl/certs/ca-certificates.crt".source = "${pkgs.cacert}/etc/ssl/certs/ca-bundle.crt";

View file

@ -1,40 +0,0 @@
AI2Bot
Ai2Bot-Dolma
Amazonbot
anthropic-ai
Applebot
Applebot-Extended
Bytespider
CCBot
ChatGPT-User
Claude-Web
ClaudeBot
cohere-ai
Diffbot
FacebookBot
facebookexternalhit
FriendlyCrawler
Google-Extended
GoogleOther
GoogleOther-Image
GoogleOther-Video
GPTBot
iaskspider/2.0
ICC-Crawler
ImagesiftBot
img2dataset
ISSCyberRiskCrawler
Kangaroo Bot
Meta-ExternalAgent
Meta-ExternalFetcher
OAI-SearchBot
omgili
omgilibot
PerplexityBot
PetalBot
Scrapy
Sidetrade indexer bot
Timpibot
VelenPublicWebCrawler
Webzio-Extended
YouBot

View file

@ -1,32 +0,0 @@
{ pkgs, config, lib, ... }:
let
inherit (lib) mkEnableOption mkIf mkOption types concatStringsSep mkDefault splitString;
cfg = config.bagel.services.nginx.crawler-blocker;
mkRobotsFile = blockedUAs: pkgs.writeText "robots.txt" ''
${concatStringsSep "\n" (map (ua: "User-agent: ${ua}") blockedUAs)}
Disallow: /
'';
in
{
options = {
bagel.services.nginx.crawler-blocker = {
enable = mkEnableOption "the crawler blocker";
userAgents = mkOption {
type = types.listOf types.str;
default = splitString "\n" (builtins.readFile ./blocked-ua.txt);
};
};
services.nginx.virtualHosts = mkOption {
type = types.attrsOf (types.submodule {
config = {
locations."= /robots.txt" = mkIf cfg.enable (mkDefault {
alias = mkRobotsFile cfg.userAgents;
});
};
});
};
};
}

View file

@ -7,69 +7,15 @@
}:
let
cfg = config.bagel.services.buildbot;
cfgGerrit = nodes.gerrit01.config.bagel.services.gerrit;
ssh-keys = import ../../common/ssh-keys.nix;
freeGbDiskSpace = 20;
extraTenantSpecificBuilders = {
lix = import ./lix.nix {
inherit config nodes;
};
floral = [ ];
}.${cfg.tenant or (throw "${cfg.tenant} is not a known tenant")};
clientId = {
lix = "buildbot";
floral = "forkos-buildbot";
}.${cfg.tenant or (throw "${cfg.tenant} is not a known tenant")};
inherit (lib) mkEnableOption mkOption mkIf types;
in
{
options.bagel.services.buildbot = {
enable = mkEnableOption "Buildbot";
tenant = mkOption {
type = types.enum [ "lix" "floral" ];
description = "Which buildbot tenant to enable";
};
domain = mkOption {
type = types.str;
description = "Domain name for this Buildbot";
};
gerrit = {
domain = mkOption {
type = types.str;
description = "Canonical domain of the Gerrit associated to this Buildbot";
example = [ "cl.forkos.org" ];
};
port = mkOption {
type = types.port;
description = "Gerrit SSH port for this Buildbot";
};
username = mkOption {
type = types.str;
description = "Gerrit service username for this Buildbot";
};
};
cors.allowedOrigins = mkOption {
type = types.listOf types.str;
example = [ "*.forkos.org" ];
description = "Allowed origin for Buildbot and NGINX for CORS without the protocol";
};
buildSystems = mkOption {
type = types.listOf (types.enum [ "x86_64-linux" "aarch64-linux" "x86_64-darwin" "aarch64-darwin" ]);
default = [ "x86_64-linux" ];
example = [ "x86_64-linux" "aarch64-linux" ];
description = "Supported build systems for this buildbot instance.";
};
projects = mkOption {
type = types.listOf types.str;
example = [ "nixpkgs" ];
description = "Static list of projects enabled for Buildbot CI";
};
builders = mkOption {
@ -81,39 +27,28 @@ in
config = mkIf cfg.enable {
networking.firewall.allowedTCPPorts = [ 80 443 ];
bagel.secrets.files = [
"buildbot-worker-password"
"buildbot-oauth-secret"
"buildbot-workers"
"buildbot-service-key"
"buildbot-signing-key"
"buildbot-remote-builder-key"
];
age.secrets.buildbot-worker-password.file = ../../secrets/buildbot-worker-password.age;
age.secrets.buildbot-oauth-secret.file = ../../secrets/buildbot-oauth-secret.age;
age.secrets.buildbot-workers.file = ../../secrets/buildbot-workers.age;
age.secrets.buildbot-service-key.file = ../../secrets/buildbot-service-key.age;
age.secrets.buildbot-signing-key = {
file = ../../secrets/buildbot-signing-key.age;
owner = "buildbot-worker";
group = "buildbot-worker";
};
age.secrets.buildbot-remote-builder-key = {
file = ../../secrets/${cfg.tenant}/buildbot-remote-builder-key.age;
file = ../../secrets/buildbot-remote-builder-key.age;
owner = "buildbot-worker";
group = "buildbot-worker";
};
services.nginx = {
recommendedProxySettings = true;
appendHttpConfig = ''
# Our session headers are too big once the TWISTED_COOKIE is added.
# The default is usually 4k or 8k.
large_client_header_buffers 4 16k;
services.nginx.virtualHosts.${cfg.domain} = {
forceSSL = true;
enableACME = true;
extraConfig = ''
add_header Access-Control-Allow-Credentials 'true' always;
add_header Access-Control-Allow-Origin 'https://cl.forkos.org' always;
'';
virtualHosts.${cfg.domain} = {
forceSSL = true;
enableACME = true;
extraConfig = ''
# This is needed so that logged-in users in Buildbot can include their credentials in their requests.
add_header Access-Control-Allow-Credentials 'true' always;
'';
};
};
services.buildbot-nix.worker = {
@ -139,25 +74,30 @@ in
enable = true;
inherit (cfg) domain;
# TODO(raito): is that really necessary when we can just collect buildMachines' systems?
inherit (cfg) buildSystems;
debugging.enable = true;
oauth2 = {
name = "Lix";
inherit clientId;
clientId = "forkos-buildbot";
clientSecretFile = config.age.secrets.buildbot-oauth-secret.path;
resourceEndpoint = "https://identity.lix.systems";
authUri = "https://identity.lix.systems/realms/lix-project/protocol/openid-connect/auth";
tokenUri = "https://identity.lix.systems/realms/lix-project/protocol/openid-connect/token";
userinfoUri = "https://identity.lix.systems/realms/lix-project/protocol/openid-connect/userinfo";
};
# TODO(raito): this is not really necessary, we never have remote buildbot workers.
# we can replace all of this with automatic localworker generation on buildbot-nix side.
workersFile = config.age.secrets.buildbot-workers.path;
# We rely on NGINX to do the CORS dance.
allowedOrigins = cfg.cors.allowedOrigins;
allowedOrigins = [
"*.forkos.org"
];
# TODO(raito): is that really necessary when we can just collect buildMachines' systems?
buildSystems = [
"x86_64-linux"
];
buildMachines = map (n: {
hostName = nodes.${n}.config.networking.fqdn;
@ -171,14 +111,20 @@ in
# Contrary to how Nix works, here we can specify non-base64 public host keys.
publicHostKey = ssh-keys.machines.${n};
}
) cfg.builders ++ extraTenantSpecificBuilders;
) cfg.builders;
gerrit = {
domain = cfgGerrit.canonicalDomain;
# Manually managed account…
# TODO: https://git.lix.systems/the-distro/infra/issues/69
inherit (cfg.gerrit) domain port username;
username = "buildbot";
port = cfgGerrit.port;
privateKeyFile = config.age.secrets.buildbot-service-key.path;
inherit (cfg) projects;
projects = [
"buildbot-test"
"nixpkgs"
"infra"
];
};
evalWorkerCount = 6;
@ -187,21 +133,10 @@ in
signingKeyFile = config.age.secrets.buildbot-signing-key.path;
};
# Make PostgreSQL restart smoother.
systemd.services.postgresql.serviceConfig = {
Restart = "always";
RestartMaxDelaySec = "5m";
RestartSteps = 10;
};
nix.settings.keep-derivations = true;
nix.gc = {
automatic = true;
dates = "hourly";
options = ''
--max-freed "$((${toString freeGbDiskSpace} * 1024**3 - 1024 * $(df -P -k /nix/store | tail -n 1 | ${pkgs.gawk}/bin/awk '{ print $4 }')))"
'';
};
};
}

View file

@ -1,50 +0,0 @@
{ config, nodes, ... }:
let
ssh-keys = import ../../common/ssh-keys.nix;
in
[
{
hostName = "build01.aarch64.lix.systems";
maxJobs = 2;
protocol = "ssh-ng";
sshKey = config.age.secrets.buildbot-remote-builder-key.path;
sshUser = "nix";
systems = [ "aarch64-linux" ];
publicHostKey = ssh-keys.machines.build01-aarch64-lix;
supportedFeatures = nodes.build01-aarch64-lix.config.nix.settings.system-features;
}
{
hostName = "build02.aarch64.lix.systems";
maxJobs = 4;
protocol = "ssh-ng";
sshKey = config.age.secrets.buildbot-remote-builder-key.path;
sshUser = "nix";
systems = [ "aarch64-linux" ];
publicHostKey = ssh-keys.machines.build02-aarch64-lix;
# TODO: use build02 features.
supportedFeatures = nodes.build01-aarch64-lix.config.nix.settings.system-features;
}
{
hostName = "build01.aarch64-darwin.lix.systems";
maxJobs = 2;
protocol = "ssh-ng";
sshKey = config.age.secrets.buildbot-remote-builder-key.path;
sshUser = "m1";
systems = [ "aarch64-darwin" "x86_64-darwin" ];
publicHostKey = ssh-keys.machines.build01-aarch64-darwin-lix;
supportedFeatures = [ "big-parallel" ];
}
# a.k.a. https://git.newtype.fr/newtype/newtype-org-configurations/src/branch/main/docs/epyc.md
{
hostName = "epyc.infra.newtype.fr";
# at 256G this could run 64 builds but the machine is shared
# (and historically we used no more than 16 concurrent jobs)
maxJobs = 16;
protocol = "ssh-ng";
sshKey = config.age.secrets.buildbot-remote-builder-key.path;
sshUser = "nix";
systems = [ "x86_64-linux" "i686-linux" ];
publicHostKey = ssh-keys.machines.epyc-newtype-fr;
supportedFeatures = [ "benchmark" "big-parallel" "nixos-test" "kvm" ];
}
]

View file

@ -1,12 +1,10 @@
{
imports = [
./block-crawlers
./gerrit
./channel-scripts
./hydra
./matrix
./monitoring
./uptime-kuma
./netbox
./ofborg
./postgres
@ -15,6 +13,5 @@
./buildbot
./newsletter
./s3-revproxy
./extra-builders
];
}

View file

@ -1,6 +0,0 @@
{
imports = [
# Remote builders
./provider.nix
];
}

View file

@ -1,46 +0,0 @@
## Tenant-specific build capacity.
## This can come from anywhere and is not held to the same level of responsibility as our build infra.
{ pkgs, config, lib, nodes, ... }:
let
inherit (lib) mkIf types mkEnableOption mkOption;
freeGbDiskSpace = 10;
cfg = config.bagel.builders.extra-build-capacity.provider;
in
{
options.bagel.builders.extra-build-capacity.provider = {
enable = mkEnableOption "providing of extra build capacity to other systems";
buildfarmPublicKeys = mkOption {
type = types.listOf types.str;
description = "SSH public keys to allow to connect for remote builds";
};
# TODO: register tenant in some deployment wide module
# so that the consumer side can just automatically generate buildMachines entries.
tenant = mkOption {
type = types.enum [ "lix" ];
};
};
config = mkIf cfg.enable {
users.groups.builders = {};
users.users.nix = {
openssh.authorizedKeys.keys = cfg.buildfarmPublicKeys;
extraGroups = [ "builders" ];
isNormalUser = true;
};
nix.settings.allowed-users = [ "@wheel" "@builders" ];
nix.settings.trusted-users = [ "@builders" ];
nix.gc.automatic = true;
nix.gc.dates = "hourly";
nix.gc.options = ''
--max-freed "$((${toString freeGbDiskSpace} * 1024**3 - 1024 * $(df -P -k /nix/store | tail -n 1 | ${pkgs.gawk}/bin/awk '{ print $4 }')))"
'';
# Bump the open files limit so that non-root users can run NixOS VM tests, if supported at all.
security.pam.loginLimits = [
{ domain = "*"; item = "nofile"; type = "-"; value = "20480"; }
];
};
}

View file

@ -41,12 +41,11 @@ in
imports = [
./www.nix
./one-way-sync.nix
./git-gc-preserve.nix
];
config = mkIf cfg.enable {
networking.firewall.allowedTCPPorts = [ cfg.port ];
age.secrets.alloy-push-password.file = ../../secrets/floral/metrics-push-password.age;
age.secrets.alloy-push-password.file = ../../secrets/metrics-push-password.age;
environment.systemPackages = [ jdk
pkgs.git
@ -141,9 +140,7 @@ in
plugins = with pkgs.gerritPlugins; [
oauth
metrics-reporter-prometheus
# Theme plugin
(pkgs.concatText "theme.js" [ ./theme.js ])
# Buildbot checks plugin
# Buildbot checks plugin (writeText because services.gerrit.plugins expects packages)
(pkgs.runCommand "checks.js" {
BASE_URI = builtins.toJSON "https://buildbot.forkos.org";
SUPPORTED_PROJECTS = builtins.toJSON [
@ -321,14 +318,7 @@ in
environment.REVWALK_USE_PRIORITY_QUEUE = "true";
};
bagel.services.git-gc-preserve = {
nixpkgs = {
enable = true;
repoPath = "/var/lib/gerrit/git/nixpkgs.git";
};
};
age.secrets.gerrit-prometheus-bearer-token.file = ../../secrets/floral/gerrit-prometheus-bearer-token.age;
age.secrets.gerrit-prometheus-bearer-token.file = ../../secrets/gerrit-prometheus-bearer-token.age;
bagel.monitoring.grafana-agent.exporters.gerrit = {
port = 4778; # grrt
bearerTokenFile = config.age.secrets.gerrit-prometheus-bearer-token.path;

View file

@ -1,86 +0,0 @@
{ lib, utils, config, pkgs, ... }: let
inherit (lib) mkOption mkEnableOption types;
cfg = config.bagel.services.git-gc-preserve;
enabledServices = lib.filterAttrs (_: gcConfig: gcConfig.enable) cfg;
in
{
options.bagel.services.git-gc-preserve = mkOption {
default = { };
description = "Repositories that should be garbage collected";
type = types.attrsOf (types.submodule {
options = {
enable = mkEnableOption "git-gc-preserve";
user = mkOption {
type = types.str;
default = "git";
description = "The user which will run the garbage collection script";
example = "forgejo";
};
group = mkOption {
type = types.str;
default = "git";
description = "The group which will run the garbage collection script";
example = "forgejo";
};
repoPath = mkOption {
type = types.path;
description = "The path to the git repository that should be garbage collected";
example = "/var/lib/gerrit/git/nixpkgs";
};
timeoutSec = mkOption {
type = types.str;
default = "1h";
description = "Garbage collection Systemd unit timeout";
example = "infinity";
};
timerConfig = mkOption {
type = types.attrsOf utils.systemdUtils.unitOptions.unitOption;
default = {
OnCalendar = "daily";
};
description = ''
When to run the git-gc-preserve. See {manpage}`systemd.timer(5)` for details.
'';
example = {
OnCalendar = "00:05";
RandomizedDelaySec = "5h";
Persistent = true;
};
};
};
});
};
config = {
systemd.services =
let
mkGCService = name: gcConfig: {
name = "git-gc-preserve-${name}";
value = {
description = "Git-GC-Preserve Service - ${name}";
serviceConfig = {
WorkingDirectory = gcConfig.repoPath;
Type = "oneshot";
User = gcConfig.user;
Group = gcConfig.group;
ExecStart = lib.getExe pkgs.git-gc-preserve;
TimeoutSec = gcConfig.timeoutSec;
};
};
};
mkServices = lib.mapAttrs' mkGCService;
in
mkServices enabledServices;
systemd.timers = let
mkGCTimer = name: gcConfig: {
name = "git-gc-preserve-${name}";
value = {
wantedBy = [ "timers.target" ];
after = [ "multi-user.target" ];
timerConfig = gcConfig.timerConfig;
};
};
mkTimer = lib.mapAttrs' mkGCTimer;
in mkTimer enabledServices;
};
}

View file

@ -1,69 +0,0 @@
/* Set up theming for Floral.
* vim: set et ts=2 sw=2:
*/
Gerrit.install((plugin) => {
const stylesheet = new CSSStyleSheet();
stylesheet.replace(`
html {
--header-title-content: 'floral.systems';
--blue-50: #f3f4fb;
--blue-100: #e3e6f6;
--blue-200: #ced5ef;
--blue-300: #acb8e4;
--blue-400: #8495d6;
--blue-500: #6775ca;
--blue-600: #5158bb;
--blue-700: #494bac;
--blue-800: #41408d;
--blue-900: #383870;
--blue-950: #252546;
--coral-50: #fff1f1;
--coral-100: #ffe0e0;
--coral-200: #ffc5c5;
--coral-300: #ff9e9d;
--coral-400: #ff6665;
--coral-500: #fe4a49;
--coral-600: #ec1716;
--coral-700: #c70f0e;
--coral-800: #a41110;
--coral-900: #881514;
--coral-950: #4a0505;
--teal-50: #eefbf5;
--teal-100: #d6f5e5;
--teal-200: #b1e9d0;
--teal-300: #7ed7b5;
--teal-400: #49be95;
--teal-500: #27a27b;
--teal-600: #188162;
--teal-700: #136951;
--teal-800: #125342;
--teal-900: #104437;
--teal-950: #08261f;
--zinc-50: #fafafa;
--zinc-100: #f4f4f5;
--zinc-200: #e4e4e7;
--zinc-300: #d4d4d8;
--zinc-400: #a1a1aa;
--zinc-500: #71717a;
--zinc-600: #52525b;
--zinc-700: #3f3f46;
--zinc-800: #27272a;
--zinc-900: #18181b;
--zinc-950: #09090b;
}
html.lightTheme {
--header-background-color: var(--teal-700);
--header-text-color: var(--coral-50);
}
html.darkTheme {
--header-background-color: var(--teal-900);
--header-text-color: var(--coral-50);
}
`).then(() => {
document.adoptedStyleSheets = [...document.adoptedStyleSheets, stylesheet];
});
});

View file

@ -29,6 +29,10 @@ in
# NGINX should not give up super fast. Things can take time.
proxy_read_timeout 3600;
}
location = /robots.txt {
return 200 'User-agent: *\nAllow: /';
}
'';
};

View file

@ -66,19 +66,17 @@ in {
# does indeed have our public SSH key and are *builders*
# as a simple evaluation preflight check.
bagel.secrets.files = [
"hydra-s3-credentials"
"hydra-postgres-key"
"hydra-signing-priv"
"hydra-ssh-key-priv"
];
age.secrets.hydra-s3-credentials.file = ../../secrets/hydra-s3-credentials.age;
age.secrets.hydra-postgres-key.group = "hydra";
age.secrets.hydra-postgres-key.mode = "0440";
age.secrets.hydra-postgres-key.file = ../../secrets/hydra-postgres-key.age;
age.secrets.hydra-signing-priv.owner = "hydra-queue-runner";
age.secrets.hydra-signing-priv.file = ../../secrets/hydra-signing-priv.age;
age.secrets.hydra-ssh-key-priv.owner = "hydra-queue-runner";
age.secrets.hydra-ssh-key-priv.file = ../../secrets/hydra-ssh-key-priv.age;
systemd.tmpfiles.rules = [
"d /var/cache/hydra 0755 hydra hydra - -"

View file

@ -12,14 +12,6 @@ in
options.bagel.monitoring.grafana-agent = {
enable = (mkEnableOption "Grafana Agent") // { default = true; };
tenant = mkOption {
description = ''
Which tenant are we enabling Grafana Agent for.
'';
example = "lix";
type = types.enum [ "lix" "floral" ];
};
exporters = mkOption {
description = ''
Set of additional exporters to scrape.
@ -67,7 +59,7 @@ in
};
config = mkIf cfg.enable {
age.secrets.grafana-agent-password.file = ../../secrets/floral/metrics-push-password.age;
age.secrets.grafana-agent-password.file = ../../secrets/metrics-push-password.age;
services.grafana-agent = {
enable = true;
@ -84,10 +76,7 @@ in
};
}
];
global.external_labels = {
hostname = config.networking.hostName;
inherit (cfg) tenant;
};
global.external_labels.hostname = config.networking.hostName;
configs = [
{
name = config.networking.hostName;

View file

@ -1,8 +1,7 @@
{
imports = [
./cadvisor.nix
./hydra
./nginx.nix
./postgres.nix
];
}
}

View file

@ -1,39 +0,0 @@
{
config,
lib,
pkgs,
...
}:
let
cfg = config.bagel.monitoring.exporters.hydra;
python = pkgs.python3.withPackages(ps: [
ps.aioprometheus
ps.click
ps.httpx
ps.starlette
ps.uvicorn
]);
inherit (lib) escapeShellArg getExe mkEnableOption mkIf mkOption types;
in
{
options.bagel.monitoring.exporters.hydra = {
enable = mkEnableOption "bagel flavored Hydra exporter";
hydraUrl = mkOption {
type = types.str;
default = "https://hydra.forkos.org/";
description = "URL to the Hydra to monitor";
};
};
config = mkIf cfg.enable {
systemd.services.hydra-exporter = {
wantedBy = [ "multi-user.target" ];
description = "Hydra exporter";
script = "${getExe python} ${./hydra-exporter.py} --hydra-url=${escapeShellArg cfg.hydraUrl} --port=9105";
};
bagel.monitoring.grafana-agent.exporters.hydra.port = 9105;
};
}

View file

@ -1,364 +0,0 @@
#!/usr/bin/env nix-shell
#!nix-shell -i python3 -p "python3.withPackages(ps: [ps.aioprometheus ps.click ps.httpx ps.starlette ps.uvicorn])"
import asyncio
from contextlib import asynccontextmanager
import logging
from aioprometheus import Counter, Gauge
from aioprometheus.asgi.starlette import metrics
import click
import httpx
from starlette.applications import Starlette
from starlette.routing import Route
import uvicorn
up = Gauge("hydra_up", "Is Hydra running")
time = Gauge("hydra_time", "Hydra's current time")
uptime = Gauge("hydra_uptime", "Hydra's uptime")
builds_queued = Gauge("hydra_builds_queued", "Number of jobs in build queue")
steps_active = Gauge("hydra_steps_active", "Number of active steps in build queue")
steps_building = Gauge("hydra_steps_building", "Number of steps currently building")
steps_copying_to = Gauge(
"hydra_steps_copying_to", "Number of steps copying inputs to a worker"
)
steps_waiting_for_download_slot = Gauge(
"hydra_steps_waiting_for_download_slot", "Number of steps waiting for download slot"
)
steps_copying_from = Gauge(
"hydra_steps_copying_from", "Number of steps copying outputs from a worker"
)
steps_waiting = Gauge(
"hydra_steps_waiting", "Number of steps currently waiting for a worker slot"
)
steps_unsupported = Gauge(
"hydra_steps_unsupported", "Number of unsupported steps in build queue"
)
bytes_sent = Counter(
"hydra_build_inputs_sent_bytes_total",
"Total number of bytes copied to workers as build inputs",
)
bytes_received = Counter(
"hydra_build_outputs_received_bytes_total",
"Total number of bytes copied from workers as build outputs",
)
builds_read = Counter(
"hydra_builds_read_total",
"Total number of builds whose outputs have been copied from workers",
)
builds_read_seconds = Counter(
"hydra_builds_read_seconds_total",
"Total time spent copying build outputs, in seconds",
)
builds_done = Counter("hydra_builds_done_total", "Total number of builds completed")
steps_started = Counter("hydra_steps_started_total", "Total number of steps started")
steps_done = Counter("hydra_steps_done_total", "Total number of steps completed")
retries = Counter("hydra_retries_total", "Total number of retries")
max_retries = Gauge(
"hydra_max_retries", "Maximum observed number of retries for a single step"
)
queue_wakeups = Counter(
"hydra_queue_wakeup_total",
"Count of the times the queue runner has been notified of queue changes",
)
dispatcher_wakeups = Counter(
"hydra_dispatcher_wakeup_total",
"Count of the times the queue runner work dispatcher woke up due to new runnable builds and completed builds.",
)
dispatch_time = Counter(
"hydra_dispatch_execution_seconds_total",
"Total time the dispatcher has spent working, in seconds",
)
db_connections = Gauge("hydra_db_connections", "Number of connections to the database")
active_db_updates = Gauge("hydra_db_updates", "Number of in-progress database updates")
steps_queued = Gauge("hydra_steps_queued", "Number of steps in build queue")
steps_runnable = Gauge(
"hydra_steps_runnable", "Number of runnable steps in build queue"
)
step_time = Counter(
"hydra_step_time_total", "Total time spent executing steps, in seconds"
)
step_build_time = Counter(
"hydra_step_build_time_total", "Total time spent executing build steps, in seconds"
)
machine_enabled = Gauge("hydra_machine_enabled", "Whether machine is enabled")
machine_steps_done = Counter(
"hydra_machine_steps_done_total", "Total number of steps completed by this worker"
)
machine_current_jobs = Gauge(
"hydra_machine_current_jobs", "Number of jobs currently running on this worker"
)
machine_disabled_until = Gauge(
"hydra_machine_disabled_until",
"Timestamp of when this worker will next become active",
)
machine_last_failure = Gauge(
"hydra_machine_last_failure", "Timestamp of when a build last failed on this worker"
)
machine_consecutive_failures = Gauge(
"hydra_machine_consecutive_failures",
"Number of consecutive failed builds on this worker",
)
machine_idle_since = Gauge(
"hydra_machine_idle_since", "Timestamp of when this worker last had jobs running"
)
machine_step_time = Counter(
"hydra_machine_step_time_total",
"Total time this worker spent executing steps, in seconds",
)
machine_step_build_time = Counter(
"hydra_machine_step_build_time_total",
"Total time this worker spent executing build steps, in seconds",
)
jobset_time = Counter(
"hydra_jobset_seconds_total",
"Total time this jobset has been building for, in seconds",
)
jobset_shares_used = Gauge(
"hydra_jobset_shares_used", "Number of shares currently consumed by this jobset"
)
machine_type_runnable = Gauge(
"hydra_machine_type_runnable",
"Number of steps currently runnable on this machine type",
)
machine_type_running = Gauge(
"hydra_machine_type_running",
"Number of steps currently running on this machine type",
)
machine_type_wait_time = Counter(
"hydra_machine_type_wait_time_total",
"Total time spent waiting for a build slot of this machine type",
)
machine_type_last_active = Gauge(
"hydra_machine_type_last_active",
"Timestamp of when a machine of this type was last active",
)
store_nar_info_read = Counter(
"hydra_store_nar_info_read_total",
"Total number of narinfo files read from the remote store",
)
store_nar_info_read_averted = Counter(
"hydra_store_nar_info_read_averted_total",
"Total number of narinfo file reads averted (already loaded)",
)
store_nar_info_missing = Counter(
"hydra_store_nar_info_missing_total",
"Total number of narinfo files found to be missing",
)
store_nar_info_write = Counter(
"hydra_store_nar_info_write_total",
"Total number of narinfo files written to the remote store",
)
store_nar_info_cache_size = Gauge(
"hydra_store_nar_info_cache_size",
"Size of the in-memory store path information cache",
)
store_nar_read = Counter(
"hydra_store_nar_read_total", "Total number of NAR files read from the remote store"
)
store_nar_read_bytes = Counter(
"hydra_store_nar_read_bytes_total",
"Total number of NAR file bytes read from the remote store (uncompressed)",
)
store_nar_read_compressed_bytes = Counter(
"hydra_store_nar_read_compressed_bytes_total",
"Total number of NAR file bytes read from the remote store (compressed)",
)
store_nar_write = Counter(
"hydra_store_nar_write_total",
"Total number of NAR files written to the remote store",
)
store_nar_write_averted = Counter(
"hydra_store_nar_write_averted_total",
"Total number of NAR file writes averted (already exists on remote)",
)
store_nar_write_bytes = Counter(
"hydra_store_nar_write_bytes_total",
"Total number of NAR file bytes written to the remote store (uncompressed)",
)
store_nar_write_compressed_bytes = Counter(
"hydra_store_nar_write_compressed_bytes_total",
"Total number of NAR file bytes written to the remote store (compressed)",
)
store_nar_write_compression_seconds = Counter(
"hydra_store_nar_write_compression_seconds_total",
"Total time spent compressing NAR files for writing to the remote store",
)
store_s3_put = Counter(
"hydra_store_s3_put_total", "Total number of PUT requests to S3 store"
)
store_s3_put_bytes = Counter(
"hydra_store_s3_put_bytes_total", "Total number of bytes written to S3 store"
)
store_s3_put_seconds = Counter(
"hydra_store_s3_put_seconds_total",
"Total time spent writing to S3 store, in seconds",
)
store_s3_get = Counter(
"hydra_store_s3_get_total", "Total number of GET requests to S3 store"
)
store_s3_get_bytes = Counter(
"hydra_store_s3_get_bytes_total", "Total number of bytes read from S3 store"
)
store_s3_get_seconds = Counter(
"hydra_store_s3_get_seconds_total",
"Total time spent reading from S3 store, in seconds",
)
store_s3_head = Counter(
"hydra_store_s3_head_total", "Total number of HEAD requests to S3 store"
)
def update_metrics(status):
up.set({}, int(status["status"] == "up"))
time.set({}, status["time"])
uptime.set({}, status["uptime"])
builds_queued.set({}, status["nrQueuedBuilds"])
steps_active.set({}, status["nrActiveSteps"])
steps_building.set({}, status["nrStepsBuilding"])
steps_copying_to.set({}, status["nrStepsCopyingTo"])
steps_waiting_for_download_slot.set({}, status["nrStepsWaitingForDownloadSlot"])
steps_copying_from.set({}, status["nrStepsCopyingFrom"])
steps_waiting.set({}, status["nrStepsWaiting"])
steps_unsupported.set({}, status["nrUnsupportedSteps"])
bytes_sent.set({}, status["bytesSent"])
bytes_received.set({}, status["bytesReceived"])
builds_read.set({}, status["nrBuildsRead"])
builds_read_seconds.set({}, status["buildReadTimeMs"] / 1000)
builds_done.set({}, status["nrBuildsDone"])
steps_started.set({}, status["nrStepsStarted"])
steps_done.set({}, status["nrStepsDone"])
retries.set({}, status["nrRetries"])
max_retries.set({}, status["maxNrRetries"])
queue_wakeups.set({}, status["nrQueueWakeups"])
dispatcher_wakeups.set({}, status["nrDispatcherWakeups"])
dispatch_time.set({}, status["dispatchTimeMs"] / 1000)
db_connections.set({}, status["nrDbConnections"])
active_db_updates.set({}, status["nrActiveDbUpdates"])
steps_queued.set({}, status["nrUnfinishedSteps"])
steps_runnable.set({}, status["nrRunnableSteps"])
if st := status.get("totalStepTime"):
step_time.set({}, st)
if sbt := status.get("totalStepBuildTime"):
step_build_time.set({}, sbt)
for machine_name, machine_status in status["machines"].items():
labels = {"host": machine_name}
machine_enabled.set(labels, int(machine_status["enabled"]))
machine_steps_done.set(labels, machine_status["nrStepsDone"])
machine_current_jobs.set(labels, machine_status["currentJobs"])
machine_disabled_until.set(labels, machine_status["disabledUntil"])
machine_last_failure.set(labels, machine_status["lastFailure"])
machine_consecutive_failures.set(labels, machine_status["consecutiveFailures"])
if isn := machine_status.get("idleSince"):
machine_idle_since.set(labels, isn)
if st := machine_status.get("totalStepTime"):
machine_step_time.set(labels, st)
if sbt := machine_status.get("totalStepBuildTime"):
machine_step_build_time.set(labels, sbt)
for jobset_name, jobset_status in status["jobsets"].items():
labels = {"name": jobset_name}
jobset_time.set(labels, jobset_status["seconds"])
jobset_shares_used.set(labels, jobset_status["shareUsed"])
for type_name, type_status in status["machineTypes"].items():
labels = {"machineType": type_name}
machine_type_runnable.set(labels, type_status["runnable"])
machine_type_running.set(labels, type_status["running"])
if wt := type_status.get("waitTime"):
machine_type_wait_time.set(labels, wt)
if la := type_status.get("lastActive"):
machine_type_last_active.set(labels, la)
store = status["store"]
store_nar_info_read.set({}, store["narInfoRead"])
store_nar_info_read_averted.set({}, store["narInfoReadAverted"])
store_nar_info_missing.set({}, store["narInfoMissing"])
store_nar_info_write.set({}, store["narInfoWrite"])
store_nar_info_cache_size.set({}, store["narInfoCacheSize"])
store_nar_read.set({}, store["narRead"])
store_nar_read_bytes.set({}, store["narReadBytes"])
store_nar_read_compressed_bytes.set({}, store["narReadCompressedBytes"])
store_nar_write.set({}, store["narWrite"])
store_nar_write_averted.set({}, store["narWriteAverted"])
store_nar_write_bytes.set({}, store["narWriteBytes"])
store_nar_write_compressed_bytes.set({}, store["narWriteCompressedBytes"])
store_nar_write_compression_seconds.set(
{}, store["narWriteCompressionTimeMs"] / 1000
)
if s3 := status.get("s3"):
store_s3_put.set({}, s3["put"])
store_s3_put_bytes.set({}, s3["putBytes"])
store_s3_put_seconds.set({}, s3["putTimeMs"] / 1000)
store_s3_get.set({}, s3["get"])
store_s3_get_bytes.set({}, s3["getBytes"])
store_s3_get_seconds.set({}, s3["getTimeMs"] / 1000)
store_s3_head.set({}, s3["head"])
async def update_metrics_loop(hydra_url, scrape_interval):
async with httpx.AsyncClient(base_url=hydra_url) as client:
while True:
try:
response = await client.get(
"/queue-runner-status",
headers={"Content-Type": "application/json"},
)
update_metrics(response.json())
except Exception as ex:
logging.exception("Failed to update metrics", exc_info=ex)
await asyncio.sleep(scrape_interval)
@click.command()
@click.option("--hydra-url", default="https://hydra.forkos.org/")
@click.option("--port", default=9200)
@click.option("--scrape-interval", default=15)
def main(hydra_url, port, scrape_interval):
@asynccontextmanager
async def lifespan(_):
loop = asyncio.get_event_loop()
loop.create_task(update_metrics_loop(hydra_url, scrape_interval))
yield
app = Starlette(routes=[Route("/metrics", metrics)], lifespan=lifespan)
uvicorn.run(app, port=port, log_level="info")
if __name__ == "__main__":
main()

View file

@ -1,12 +1,11 @@
namespace: forkos
groups:
- name: ForkOS automation
rules:
- alert: SyncFailedTooOften
expr: 'changes(node_systemd_unit_state{name=~"ows.*.service",state="failed"}[1d]) > 2'
for: 30m
labels:
severity: critical
annotations:
description: On {{ $labels.instance }}, the synchronization job has failed more than twice in the last 24 hours, check if there's a conflict or a stdenv change.
summary: Synchronization job {{ $labels.name }} has failed more than twice in the last 24 hours
- name: ForkOS automation
rules:
- alert: SyncFailedTooOften
expr: 'changes(node_systemd_unit_state{name=~"ows.*.service",state="failed"}[24h]) > 2'
for: 30m
labels:
severity: critical
annotations:
summary: "Synchronization job {{ $labels.name }} has failed more than twice in the last 24 hours"
description: "On {{ $labels.instance }}, the synchronization job has failed more than twice in the last 24 hours, check if there's a conflict or a stdenv change."

View file

@ -1,119 +1,102 @@
namespace: postgres
groups:
- name: PostgreSQL
rules:
- alert: PostgresqlTableNotAutoVacuumed
expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
labels:
severity: warning
annotations:
description: |-
Table {{ $labels.relname }} has not been auto vacuumed for 10 days
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
- alert: PostgresqlTableNotAutoAnalyzed
expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
labels:
severity: warning
annotations:
description: |-
Table {{ $labels.relname }} has not been auto analyzed for 10 days
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
- alert: PostgresqlDeadLocks
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
labels:
severity: warning
annotations:
description: |-
PostgreSQL has dead-locks
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Postgresql dead locks (instance {{ $labels.instance }})
- alert: PostgresqlHighRollbackRate
expr: 'sum by (namespace, datname) ((rate(pg_stat_database_xact_rollback{datid!="0",datname!~"template.*|postgres"}[3m])) / ((rate(pg_stat_database_xact_rollback{datid!="0",datname!~"template.*|postgres"}[3m])) + (rate(pg_stat_database_xact_commit{datid!="0",datname!~"template.*|postgres"}[3m])))) > 0.02'
labels:
severity: warning
annotations:
description: |-
Ratio of transactions being aborted compared to committed is > 2 %
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Postgresql high rollback rate (instance {{ $labels.instance }})
- alert: PostgresqlHighRateStatementTimeout
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
labels:
severity: critical
annotations:
description: |-
Postgres transactions showing high rate of statement timeouts
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
- alert: PostgresqlHighRateDeadlock
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
labels:
severity: critical
annotations:
description: |-
Postgres detected deadlocks
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
- alert: PostgresqlTooManyDeadTuples
expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1'
for: 2m
labels:
severity: warning
annotations:
description: |-
PostgreSQL dead tuples is too large
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
- alert: PostgresqlTooManyLocksAcquired
expr: '((sum(pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.2'
for: 2m
labels:
severity: critical
annotations:
description: |-
Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
- alert: PostgresqlBloatIndexHigh(>80%)
expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 1e+08)'
for: 1h
labels:
severity: warning
annotations:
description: |-
The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
- alert: PostgresqlBloatTableHigh(>80%)
expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 2e+08)'
for: 1h
labels:
severity: warning
annotations:
description: |-
The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
- alert: PostgresqlInvalidIndex
expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
for: 6h
labels:
severity: warning
annotations:
description: |-
The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Postgresql invalid index (instance {{ $labels.instance }})
- name: PostgreSQL
rules:
- alert: PostgresqlTableNotAutoVacuumed
expr: '(pg_stat_user_tables_last_autovacuum > 0) and (time() - pg_stat_user_tables_last_autovacuum) > 60 * 60 * 24 * 10'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql table not auto vacuumed (instance {{ $labels.instance }})
description: "Table {{ $labels.relname }} has not been auto vacuumed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTableNotAutoAnalyzed
expr: '(pg_stat_user_tables_last_autoanalyze > 0) and (time() - pg_stat_user_tables_last_autoanalyze) > 24 * 60 * 60 * 10'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql table not auto analyzed (instance {{ $labels.instance }})
description: "Table {{ $labels.relname }} has not been auto analyzed for 10 days\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlDeadLocks
expr: 'increase(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 5'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql dead locks (instance {{ $labels.instance }})
description: "PostgreSQL has dead-locks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRollbackRate
expr: 'sum by (namespace,datname) ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) / ((rate(pg_stat_database_xact_rollback{datname!~"template.*|postgres",datid!="0"}[3m])) + (rate(pg_stat_database_xact_commit{datname!~"template.*|postgres",datid!="0"}[3m])))) > 0.02'
for: 0m
labels:
severity: warning
annotations:
summary: Postgresql high rollback rate (instance {{ $labels.instance }})
description: "Ratio of transactions being aborted compared to committed is > 2 %\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRateStatementTimeout
expr: 'rate(postgresql_errors_total{type="statement_timeout"}[1m]) > 3'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate statement timeout (instance {{ $labels.instance }})
description: "Postgres transactions showing high rate of statement timeouts\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlHighRateDeadlock
expr: 'increase(postgresql_errors_total{type="deadlock_detected"}[1m]) > 1'
for: 0m
labels:
severity: critical
annotations:
summary: Postgresql high rate deadlock (instance {{ $labels.instance }})
description: "Postgres detected deadlocks\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyDeadTuples
expr: '((pg_stat_user_tables_n_dead_tup > 10000) / (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup)) >= 0.1'
for: 2m
labels:
severity: warning
annotations:
summary: Postgresql too many dead tuples (instance {{ $labels.instance }})
description: "PostgreSQL dead tuples is too large\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlTooManyLocksAcquired
expr: '((sum (pg_locks_count)) / (pg_settings_max_locks_per_transaction * pg_settings_max_connections)) > 0.20'
for: 2m
labels:
severity: critical
annotations:
summary: Postgresql too many locks acquired (instance {{ $labels.instance }})
description: "Too many locks acquired on the database. If this alert happens frequently, we may need to increase the postgres setting max_locks_per_transaction.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlBloatIndexHigh(>80%)
expr: 'pg_bloat_btree_bloat_pct > 80 and on (idxname) (pg_bloat_btree_real_size > 100000000)'
for: 1h
labels:
severity: warning
annotations:
summary: Postgresql bloat index high (> 80%) (instance {{ $labels.instance }})
description: "The index {{ $labels.idxname }} is bloated. You should execute `REINDEX INDEX CONCURRENTLY {{ $labels.idxname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlBloatTableHigh(>80%)
expr: 'pg_bloat_table_bloat_pct > 80 and on (relname) (pg_bloat_table_real_size > 200000000)'
for: 1h
labels:
severity: warning
annotations:
summary: Postgresql bloat table high (> 80%) (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} is bloated. You should execute `VACUUM {{ $labels.relname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: PostgresqlInvalidIndex
expr: 'pg_genaral_index_info_pg_relation_size{indexrelname=~".*ccnew.*"}'
for: 6h
labels:
severity: warning
annotations:
summary: Postgresql invalid index (instance {{ $labels.instance }})
description: "The table {{ $labels.relname }} has an invalid index: {{ $labels.indexrelname }}. You should execute `DROP INDEX {{ $labels.indexrelname }};`\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -1,101 +1,76 @@
namespace: resources
groups:
- name: Host & hardware
rules:
- alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
description: |-
Node memory is filling up (< 10% left)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host out of memory (instance {{ $labels.instance }})
- alert: HostMemoryUnderMemoryPressure
expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
description: |-
The node is under heavy memory pressure. High rate of major page faults
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host memory under memory pressure (instance {{ $labels.instance }})
- alert: HostMemoryIsUnderutilized
expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 1w
labels:
severity: info
annotations:
description: |-
Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host Memory is underutilized (instance {{ $labels.instance }})
- alert: HostOutOfDiskSpace
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and on (instance, device, mountpoint) node_filesystem_readonly == 0) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
description: |-
Disk is almost full (< 10% left)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host out of disk space (instance {{ $labels.instance }})
- alert: HostDiskWillFillIn24Hours
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and on (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and on (instance, device, mountpoint) node_filesystem_readonly == 0) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
description: |-
Filesystem is predicted to run out of space within the next 24 hours at current write rate
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
- alert: HostCpuIsUnderutilized
expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 1w
labels:
severity: info
annotations:
description: |-
CPU load is < 20% for 1 week. Consider reducing the number of CPUs.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host CPU is underutilized (instance {{ $labels.instance }})
- alert: HostCpuStealNoisyNeighbor
expr: (avg by (instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
labels:
severity: warning
annotations:
description: |-
CPU steal is > 10%. A noisy neighbor is killing VM performance or a spot instance may be out of credit.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
- alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
labels:
severity: warning
annotations:
description: |-
OOM kill detected
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host OOM kill detected (instance {{ $labels.instance }})
- alert: HostNetworkInterfaceSaturated
expr: ((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on (instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 1m
labels:
severity: warning
annotations:
description: |-
The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
- name: Host & hardware
rules:
- alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of memory (instance {{ $labels.instance }})
description: "Node memory is filling up (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryUnderMemoryPressure
expr: (rate(node_vmstat_pgmajfault[1m]) > 1000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host memory under memory pressure (instance {{ $labels.instance }})
description: "The node is under heavy memory pressure. High rate of major page faults\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostMemoryIsUnderutilized
expr: (100 - (avg_over_time(node_memory_MemAvailable_bytes[30m]) / node_memory_MemTotal_bytes * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 1w
labels:
severity: info
annotations:
summary: Host Memory is underutilized (instance {{ $labels.instance }})
description: "Node memory is < 20% for 1 week. Consider reducing memory space. (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOutOfDiskSpace
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host out of disk space (instance {{ $labels.instance }})
description: "Disk is almost full (< 10% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostDiskWillFillIn24Hours
expr: ((node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 2m
labels:
severity: warning
annotations:
summary: Host disk will fill in 24 hours (instance {{ $labels.instance }})
description: "Filesystem is predicted to run out of space within the next 24 hours at current write rate\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuIsUnderutilized
expr: (100 - (rate(node_cpu_seconds_total{mode="idle"}[30m]) * 100) < 20) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 1w
labels:
severity: info
annotations:
summary: Host CPU is underutilized (instance {{ $labels.instance }})
description: "CPU load is < 20% for 1 week. Consider reducing the number of CPUs.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostCpuStealNoisyNeighbor
expr: (avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
description: "CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m]) > 0) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 0m
labels:
severity: warning
annotations:
summary: Host OOM kill detected (instance {{ $labels.instance }})
description: "OOM kill detected\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
- alert: HostNetworkInterfaceSaturated
expr: ((rate(node_network_receive_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*|^vnet.*|^veth.*|^tun.*"} > 0.8 < 10000) * on(instance) group_left (nodename) node_uname_info{nodename=~".+"}
for: 1m
labels:
severity: warning
annotations:
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
description: "The network interface \"{{ $labels.device }}\" on \"{{ $labels.instance }}\" is getting overloaded.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"

View file

@ -17,7 +17,7 @@ in
config = mkIf cfg.enable {
age.secrets.grafana-oauth-secret = {
file = ../../../secrets/floral/grafana-oauth-secret.age;
file = ../../../secrets/grafana-oauth-secret.age;
owner = "grafana";
};

View file

@ -13,10 +13,10 @@ in
config = mkIf cfg.enable {
age.secrets = {
metrics-push-htpasswd = {
file = ../../../secrets/floral/metrics-push-htpasswd.age;
file = ../../../secrets/metrics-push-htpasswd.age;
owner = "nginx";
};
loki-environment.file = ../../../secrets/floral/loki-environment.age;
loki-environment.file = ../../../secrets/loki-environment.age;
};
services.loki = {

View file

@ -9,18 +9,6 @@ let
inherit (lib) mkEnableOption mkIf;
mimirPort = config.services.mimir.configuration.server.http_listen_port;
alerts = pkgs.runCommand "mimir-alerts-checked" {
src = ./alerts;
nativeBuildInputs = with pkgs; [ mimir ];
} ''
mkdir $out
cp -R $src $out/anonymous/
chmod -R +w $out
mimirtool rules check --rule-dirs=$out/anonymous
mimirtool rules lint --rule-dirs=$out/anonymous
diff -r $src $out/anonymous
'';
in
{
options.bagel.services.prometheus.enable = mkEnableOption "Prometheus scraper";
@ -28,11 +16,11 @@ in
config = mkIf cfg.enable {
age.secrets = {
metrics-push-htpasswd = {
file = ../../../secrets/floral/metrics-push-htpasswd.age;
file = ../../../secrets/metrics-push-htpasswd.age;
owner = "nginx";
};
mimir-environment.file = ../../../secrets/floral/mimir-environment.age;
mimir-webhook-url.file = ../../../secrets/floral/mimir-webhook-url.age;
mimir-environment.file = ../../../secrets/mimir-environment.age;
mimir-webhook-url.file = ../../../secrets/mimir-webhook-url.age;
};
services.mimir = {
@ -72,7 +60,10 @@ in
blocks_storage.backend = "s3";
ruler_storage = {
backend = "local";
local.directory = alerts;
local.directory = pkgs.runCommand "mimir-rules" {} ''
mkdir -p $out
ln -s ${./alerts} $out/anonymous
'';
};
alertmanager = {

View file

@ -13,10 +13,10 @@ in
config = mkIf cfg.enable {
age.secrets = {
metrics-push-htpasswd = {
file = ../../../secrets/floral/metrics-push-htpasswd.age;
file = ../../../secrets/metrics-push-htpasswd.age;
owner = "nginx";
};
tempo-environment.file = ../../../secrets/floral/tempo-environment.age;
tempo-environment.file = ../../../secrets/tempo-environment.age;
};
services.tempo = {

View file

@ -15,7 +15,7 @@ in
];
config = mkIf cfg.enable {
age.secrets.pyroscope-secrets.file = ../../../secrets/floral/pyroscope-secrets.age;
age.secrets.pyroscope-secrets.file = ../../../secrets/pyroscope-secrets.age;
services.nginx = {
upstreams.pyroscope = {
servers."127.0.0.1:${toString pyroscopePort}" = {};

View file

@ -20,7 +20,7 @@ in
};
config = mkIf cfg.enable {
age.secrets.netbox-environment.file = ../../secrets/floral/netbox-environment.age;
age.secrets.netbox-environment.file = ../../secrets/netbox-environment.age;
services = {
netbox = {
enable = true;

View file

@ -14,7 +14,7 @@ in
};
config = mkIf cfg.enable {
age.secrets.newsletter-secrets.file = ../../secrets/floral/newsletter-secrets.age;
age.secrets.newsletter-secrets.file = ../../secrets/newsletter-secrets.age;
services.listmonk = {
enable = true;
secretFile = config.age.secrets.newsletter-secrets.path;

View file

@ -11,7 +11,7 @@ in {
config = lib.mkIf cfg.enable {
age.secrets.postgresql-tls-priv.owner = "postgres";
age.secrets.postgresql-tls-priv.file = ../../secrets/floral/postgres-tls-priv.age;
age.secrets.postgresql-tls-priv.file = ../../secrets/postgres-tls-priv.age;
systemd.tmpfiles.rules = [
"d /var/db 0755 root root - -"
@ -67,7 +67,7 @@ in {
# Provisioned on the server so that CA operations can be done there.
age.secrets.postgresql-ca-priv.owner = "postgres";
age.secrets.postgresql-ca-priv.file = ../../secrets/floral/postgres-ca-priv.age;
age.secrets.postgresql-ca-priv.file = ../../secrets/postgres-ca-priv.age;
users.users.postgres.packages = [
(pkgs.writeShellScriptBin "postgres-mint-new-client" ''

View file

@ -70,7 +70,7 @@ in
];
config = mkIf cfg.enable {
age.secrets.s3-revproxy-api-keys.file = ../../secrets/floral/s3-revproxy-api-keys.age;
age.secrets.s3-revproxy-api-keys.file = ../../secrets/s3-revproxy-api-keys.age;
# For each target, generate an entry that passes it to the s3-revproxy.
services.nginx.virtualHosts = mapAttrs' (subdomain: _: nameValuePair "${subdomain}.${cfg.domain}" (mkProxiedSubdomain subdomain)) cfg.targets;
# this solves garage supporting neither anonymous access nor automatic
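The `mapAttrs'` line above derives one nginx virtual host per configured bucket, named `<subdomain>.<domain>`. A minimal sketch of that mapping, using hypothetical `cfg.domain` and `cfg.targets` values and a stand-in for `mkProxiedSubdomain`:

```
# Sketch only: domain/targets are hypothetical, the vhost body stands in for
# (mkProxiedSubdomain subdomain). Evaluate with `nix-instantiate --eval --strict`.
let
  lib = (import <nixpkgs> { }).lib;
  inherit (lib) mapAttrs' nameValuePair;

  domain  = "forkos.org";                         # assumed cfg.domain
  targets = { channels = { }; releases = { }; };  # assumed cfg.targets
in
  mapAttrs' (subdomain: _: nameValuePair "${subdomain}.${domain}" {
    locations."/".proxyPass = "http://127.0.0.1:10000";
  }) targets
# => { "channels.forkos.org" = { ... }; "releases.forkos.org" = { ... }; }
```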

View file

@ -1,97 +0,0 @@
{
inputs,
lib,
config,
...
}:
let
cfg = config.bagel.status;
# TODO: pull domains from a central place
subdomains = [
"cl"
"netbox"
"cache"
"grafana"
"hydra"
"loki"
"mimir"
"pyroscope"
"matrix"
"tempo"
"amqp"
"fodwatch"
"git"
"alerts"
"buildbot"
"b"
"postgres"
"news"
];
port = 3001;
in
{
imports = [ "${inputs.stateless-uptime-kuma}/nixos/module.nix" ];
options.bagel.status = {
enable = lib.mkEnableOption "the status page service (uptime-kuma)";
domain = lib.mkOption {
type = lib.types.str;
};
};
config = lib.mkIf cfg.enable {
services.uptime-kuma.enable = true;
services.nginx = {
enable = true;
virtualHosts.${cfg.domain} = {
enableACME = true;
forceSSL = true;
locations."/" = {
proxyPass = "http://127.0.0.1:${builtins.toString port}";
proxyWebsockets = true;
};
};
};
networking.firewall.allowedTCPPorts = [
80
443
];
age.secrets.stateless-uptime-kuma-password.file = ../../secrets/floral/stateless-uptime-kuma-password.age;
statelessUptimeKuma = {
probesConfig = {
monitors = lib.genAttrs subdomains (name: {
type = "http";
url = "https://${name}.forkos.org/";
tags = [];
});
status_pages = {
"forkos" = {
title = "ForkOS";
description = "health of the ForkOS infra";
showTags = true;
publicGroupList = [
{
name = "Services";
weight = 1;
monitorList = lib.genAttrs subdomains (id: {
inherit id;
});
}
];
};
};
settings = {
entryPage = "statusPage-forkos";
};
};
extraFlags = [ "-s" ];
host = "http://localhost:${builtins.toString port}/";
username = "forkos";
passwordFile = config.age.secrets."stateless-uptime-kuma-password".path;
enableService = true;
};
};
}
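The uptime-kuma probes in the file above are generated with `lib.genAttrs`, one HTTP monitor per subdomain. A small sketch of what that call evaluates to for two of the listed subdomains:

```
# Sketch of the monitors attrset produced by the lib.genAttrs call above,
# restricted to two subdomains for brevity.
let
  lib = (import <nixpkgs> { }).lib;
in
  lib.genAttrs [ "cl" "netbox" ] (name: {
    type = "http";
    url = "https://${name}.forkos.org/";
    tags = [];
  })
# => { cl     = { type = "http"; url = "https://cl.forkos.org/";     tags = []; };
#      netbox = { type = "http"; url = "https://netbox.forkos.org/"; tags = []; }; }
```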

View file

@ -1,6 +1,7 @@
{
imports = [
./common.nix
./gandi.nix
./dnsimple.nix
./hydra.nix
./state.nix

View file

@ -45,104 +45,5 @@ in
resource.dnsimple_zone.vzfdfp_de = {
name = "vzfdfp.de";
};
resource.dnsimple_zone_record = let
# https://registry.terraform.io/providers/dnsimple/dnsimple/latest/docs/resources/zone_record
canonicalName = zoneName: record: let
# TODO: make less fragile and have actual unique and stable names
normalize = builtins.replaceStrings ["." "@"] ["_" "_root_"];
zone = normalize zoneName;
name = normalize record.name;
in "${zone}_${record.type}_${name}";
record = name: ttl: type: value: {
inherit name ttl type value;
};
proxyRecords = name: ttl: type: value: [
# kurisu.lahfa.xyz running a sniproxy:
(record name ttl "A" "163.172.69.160")
(record name ttl type value)
];
# Creates an extra *.p record pointing to the sniproxy
dualProxyRecords = name: ttl: type: value: lib.flatten [
(record name ttl type value)
(proxyRecords "${name}.p" ttl type value)
];
domain = zoneName: records:
builtins.listToAttrs (map (record: {
name = canonicalName zoneName record;
value = record // {
zone_name = zoneName;
};
}
) (lib.flatten records));
zones = domains: lib.zipAttrs (lib.mapAttrsToList (zoneName: records: domain zoneName records) domains);
in zones {
"forkos.org" = ([
# (record "@" 300 "A" "163.172.69.160")
(record "@" 300 "AAAA" "2001:bc8:38ee:100:1000::20")
(dualProxyRecords "bagel-box.infra" 300 "AAAA" "2001:bc8:38ee:100:100::1")
(dualProxyRecords "gerrit01.infra" 300 "AAAA" "2001:bc8:38ee:100:1000::10")
(dualProxyRecords "meta01.infra" 300 "AAAA" "2001:bc8:38ee:100:1000::20")
(dualProxyRecords "fodwatch.infra" 300 "AAAA" "2001:bc8:38ee:100:1000::30")
# git.infra.forkos.org exposes opensshd
(dualProxyRecords "git.infra" 300 "AAAA" "2001:bc8:38ee:100:1000::41")
# git.p.forkos.org exposes forgejo ssh server.
(proxyRecords "git.p" 300 "AAAA" "2001:bc8:38ee:100:1000::40")
(dualProxyRecords "buildbot.infra" 300 "AAAA" "2001:bc8:38ee:100:1000::50")
(dualProxyRecords "public01.infra" 300 "AAAA" "2001:bc8:38ee:100:1000::60")
(record "cl" 300 "CNAME" "gerrit01.infra.p.forkos.org")
(record "fodwatch" 300 "CNAME" "fodwatch.infra.p.forkos.org")
# git.p.forkos.org is the proxy variant of the Forgejo server.
(record "git" 300 "CNAME" "git.p.forkos.org")
(record "netbox" 300 "CNAME" "meta01.infra.p.forkos.org")
(record "amqp" 300 "CNAME" "bagel-box.infra.p.forkos.org")
(record "grafana" 300 "CNAME" "meta01.infra.p.forkos.org")
(record "hydra" 300 "CNAME" "build-coord.wob01.infra.p.forkos.org")
(record "loki" 300 "CNAME" "meta01.infra.p.forkos.org")
(record "mimir" 300 "CNAME" "meta01.infra.p.forkos.org")
(record "pyroscope" 300 "CNAME" "meta01.infra.p.forkos.org")
(record "tempo" 300 "CNAME" "meta01.infra.p.forkos.org")
(record "matrix" 300 "CNAME" "meta01.infra.p.forkos.org")
(record "alerts" 300 "CNAME" "meta01.infra.p.forkos.org")
(record "buildbot" 300 "CNAME" "buildbot.infra.p.forkos.org")
(record "b" 300 "CNAME" "public01.infra.p.forkos.org")
(record "postgres" 300 "CNAME" "bagel-box.infra.p.forkos.org")
(record "news" 3600 "CNAME" "public01.infra.p.forkos.org")
(record "status" 3600 "CNAME" "public01.infra.p.forkos.org")
# S3 in delroth's basement
(record "cache" 300 "AAAA" "2a02:168:6426::12") # smol.delroth.net
(record "cache" 300 "A" "195.39.247.161") # sni proxy
(record "vpn-gw.wob01.infra" 300 "AAAA" "2a01:584:11::2")
(dualProxyRecords "build-coord.wob01.infra" 300 "AAAA" "2a01:584:11::1:11")
# TODO: do not hardcode, just reuse the Colmena hive module outputs to generate all the required details.
]
++ (map (index: record "builder-${toString index}.wob01.infra" 300 "AAAA" "2a01:584:11::1:${toString index}") (genList lib.id 11))
++ (
let
# FIXME: figure out a way to poke `config.services.s3-revproxy` and
# automate the DNS part away?
buckets = [
"channels"
"releases"
"channel-scripts-test"
];
in
map (bucket: record "${bucket}" 300 "CNAME" "public01.infra.p.forkos.org") buckets
));
"flowery.systems" = [
(record "" 300 "ALIAS" "news.forkos.org")
];
"vzfdfp.de" = [
];
};
};
}
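The `canonicalName` helper above builds the Terraform resource name from the zone and record by replacing `.` and `@`. A sketch of the names it yields for two records taken from the list above:

```
# Sketch reproducing only the naming logic shown above; no zone data is touched.
let
  normalize = builtins.replaceStrings [ "." "@" ] [ "_" "_root_" ];
  canonicalName = zoneName: record:
    "${normalize zoneName}_${record.type}_${normalize record.name}";
in {
  root  = canonicalName "forkos.org" { name = "@";  type = "AAAA";  };  # "forkos_org_AAAA__root_"
  cname = canonicalName "forkos.org" { name = "cl"; type = "CNAME"; };  # "forkos_org_CNAME_cl"
}
```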

115
terraform/gandi.nix Normal file
View file

@ -0,0 +1,115 @@
{ lib, config, ... }:
let
inherit (lib) mkEnableOption mkIf tf genList;
cfg = config.bagel.gandi;
in
{
options.bagel.gandi = {
enable = mkEnableOption "the Gandi DNS configuration";
};
config = mkIf cfg.enable {
terraform.required_providers.gandi = {
version = "~> 2.3.0";
source = "go-gandi/gandi";
};
resource.secret_resource.gandi_pat.lifecycle.prevent_destroy = true;
provider.gandi = {
personal_access_token = tf.ref "resource.secret_resource.gandi_pat.value";
};
resource.gandi_livedns_domain.forkos_org = {
name = "forkos.org";
};
resource.gandi_livedns_record = let
record = name: ttl: type: values: {
inherit name ttl type values;
};
proxyRecords = name: ttl: type: values: [
# kurisu.lahfa.xyz running a sniproxy:
(record name ttl "A" ["163.172.69.160"])
(record name ttl type values)
];
# Creates an extra *.p record pointing to the sniproxy
dualProxyRecords = name: ttl: type: values: lib.flatten [
(record name ttl type values)
(proxyRecords "${name}.p" ttl type values)
];
# TODO: make less fragile and have actual unique and stable names
canonicalName = record: let
name = builtins.replaceStrings ["." "@"] ["_" "_root_"] record.name;
in
"forkos_org_${record.type}_${name}";
forkosRecords = records:
builtins.listToAttrs (map (record: {
name = canonicalName record;
value = record // {
zone = tf.ref "resource.gandi_livedns_domain.forkos_org.id";
};
}) (lib.flatten records));
in forkosRecords ([
# (record "@" 300 "A" ["163.172.69.160"])
(record "@" 300 "AAAA" ["2001:bc8:38ee:100:1000::20"])
(dualProxyRecords "bagel-box.infra" 300 "AAAA" ["2001:bc8:38ee:100:100::1"])
(dualProxyRecords "gerrit01.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::10"])
(dualProxyRecords "meta01.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::20"])
(dualProxyRecords "fodwatch.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::30"])
# git.infra.forkos.org exposes opensshd
(dualProxyRecords "git.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::41"])
# git.p.forkos.org exposes forgejo ssh server.
(proxyRecords "git.p" 300 "AAAA" ["2001:bc8:38ee:100:1000::40"])
(dualProxyRecords "buildbot.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::50"])
(dualProxyRecords "public01.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::60"])
(record "cl" 300 "CNAME" ["gerrit01.infra.p"])
(record "fodwatch" 300 "CNAME" ["fodwatch.infra.p"])
# git.p.forkos.org is the proxy variant of the Forgejo server.
(record "git" 300 "CNAME" ["git.p"])
(record "netbox" 300 "CNAME" ["meta01.infra.p"])
(record "amqp" 300 "CNAME" ["bagel-box.infra.p"])
(record "grafana" 300 "CNAME" ["meta01.infra.p"])
(record "hydra" 300 "CNAME" ["build-coord.wob01.infra.p"])
(record "loki" 300 "CNAME" ["meta01.infra.p"])
(record "mimir" 300 "CNAME" ["meta01.infra.p"])
(record "pyroscope" 300 "CNAME" ["meta01.infra.p"])
(record "tempo" 300 "CNAME" ["meta01.infra.p"])
(record "matrix" 300 "CNAME" ["meta01.infra.p"])
(record "alerts" 300 "CNAME" ["meta01.infra.p"])
(record "buildbot" 300 "CNAME" ["buildbot.infra.p"])
(record "b" 300 "CNAME" ["public01.infra.p"])
(record "postgres" 300 "CNAME" ["bagel-box.infra.p"])
(record "news" 3600 "CNAME" ["public01.infra.p"])
# S3 in delroth's basement
(record "cache" 300 "AAAA" ["2a02:168:6426::12"]) # smol.delroth.net
(record "cache" 300 "A" ["195.39.247.161"]) # sni proxy
(record "vpn-gw.wob01.infra" 300 "AAAA" [ "2a01:584:11::2" ])
(dualProxyRecords "build-coord.wob01.infra" 300 "AAAA" [ "2a01:584:11::1:11" ])
# TODO: do not hardcode, just reuse the Colmena hive module outputs to generate all the required details.
]
++ (map (index: record "builder-${toString index}.wob01.infra" 300 "AAAA" [ "2a01:584:11::1:${toString index}" ]) (genList lib.id 11))
++ (
let
# FIXME: figure out a way to poke `config.services.s3-revproxy` and
# automate the DNS part away?
buckets = [
"channels"
"releases"
"channel-scripts-test"
];
in
map (bucket: record "${bucket}" 300 "CNAME" [ "public01.infra.p" ]) buckets
));
};
}
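`dualProxyRecords` above fans one record out into the direct entry plus a `*.p` pair pointing at the sniproxy. A sketch of what it flattens to for one of the hosts listed above:

```
# What `dualProxyRecords "meta01.infra" 300 "AAAA" ["2001:bc8:38ee:100:1000::20"]`
# flattens to, following the record/proxyRecords/dualProxyRecords helpers above:
[
  { name = "meta01.infra";   ttl = 300; type = "AAAA"; values = [ "2001:bc8:38ee:100:1000::20" ]; }
  { name = "meta01.infra.p"; ttl = 300; type = "A";    values = [ "163.172.69.160" ]; }  # sniproxy
  { name = "meta01.infra.p"; ttl = 300; type = "AAAA"; values = [ "2001:bc8:38ee:100:1000::20" ]; }
]
```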