chore: introduce finer-grained baremetal management

for multiple roles such as storage or builders.

Signed-off-by: Raito Bezarius <masterancpp@gmail.com>
This commit is contained in:
raito 2024-10-27 20:22:49 +01:00
parent f593645cde
commit 1b68df7229
12 changed files with 317 additions and 111 deletions

View file

@ -8,18 +8,19 @@
fodwatch = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFRyTNfvKl5FcSyzGzw+h+bNFNOxdhvI67WdUZ2iIJ1L";
buildbot = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIJgIu6ouagYqBeMLfmn1CbaDJMuZcPH9bnUhkht8GfuB";
git = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIEQJcpkCUOx8+5oukMX6lxrYcIX8FyHu8Mc/3+ieKMUn";
bm-0 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBHSNcDGctvlG6BHcJuYIzW9WsBJsts2vpwSketsbXoL";
bm-1 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIQOGUjERK7Mx8UPM/rbOdMqVyn1sbWqYOG6CbOzH2wm";
bm-2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMKzXIqCoYElEKIYgjbSpqEcDeOvV+Wo3Agq3jba83cB";
bm-3 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGq0A5233XGt34T097KaEKBUqFvaa7a6nYZRsSO0166l";
bm-4 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIB9dVo2xZhgIMDgB1rUj5ApmppL39BtYu/+OFHeduvXr";
bm-5 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE7vZTBxrVHmHpv7slQ8A8XwjjbfN+ZJA0V5C3k0wNBD";
bm-6 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOt1qR/2BRtc6PABuSBulowwJVO6wBNDyEFzh0qsTeOF";
bm-7 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFinAAw1v8TJB8/wcmTVBbHHc4LCYh6z4TO6ViwUPkoh";
bm-8 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKGSWHNeqT0kF/e4yVy2ieW98X5QMyCYIYZh9WTmQDs1";
bm-9 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOhws9zGgocVY36dMtOL+CXadpvRMffxoWMkfEcTBJm7";
bm-10 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE7sgIuTSqZiZhp8TvObSbIEhcHHsL5hcmYA22uzwxth";
# bm-11 actually?
build-coord = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINpAEJP7F+XtJBpQP1jTzwXwQgJrFxwEJjPf/rnCXkJA";
builder-0 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIBHSNcDGctvlG6BHcJuYIzW9WsBJsts2vpwSketsbXoL";
builder-1 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIIQOGUjERK7Mx8UPM/rbOdMqVyn1sbWqYOG6CbOzH2wm";
builder-2 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIMKzXIqCoYElEKIYgjbSpqEcDeOvV+Wo3Agq3jba83cB";
builder-3 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGq0A5233XGt34T097KaEKBUqFvaa7a6nYZRsSO0166l";
builder-4 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIB9dVo2xZhgIMDgB1rUj5ApmppL39BtYu/+OFHeduvXr";
builder-5 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE7vZTBxrVHmHpv7slQ8A8XwjjbfN+ZJA0V5C3k0wNBD";
builder-6 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOt1qR/2BRtc6PABuSBulowwJVO6wBNDyEFzh0qsTeOF";
builder-7 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFinAAw1v8TJB8/wcmTVBbHHc4LCYh6z4TO6ViwUPkoh";
builder-8 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKGSWHNeqT0kF/e4yVy2ieW98X5QMyCYIYZh9WTmQDs1";
builder-9 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOhws9zGgocVY36dMtOL+CXadpvRMffxoWMkfEcTBJm7";
builder-10 = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIE7sgIuTSqZiZhp8TvObSbIEhcHHsL5hcmYA22uzwxth";
wob-vpn-gw = "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAINVytPPW8XnXf/rD5TFzsw//CZc2lBjQLmDzlVGPZsjh";
# Lix

View file

@ -84,6 +84,9 @@
});
forEachSystem' = f: forEachSystem (system: (f systemBits.${system}));
inherit (nixpkgs) lib;
# ForkOS' library functions.
flib = import ./lib { inherit (nixpkgs) lib; };
inherit (flib) singleton;
in
{
apps = forEachSystem' ({ system, pkgs, terraformCfg, terraform, ... }: {
@ -135,9 +138,14 @@
# Tag all machines which have local boot as local bootables.
deployment.tags = lib.mkMerge [
[ "floral" ]
(lib.mkIf (config.bagel.baremetal.builders.enable -> !config.bagel.baremetal.builders.netboot)
# All nodes that can be local booted, including baremetal nodes.
(lib.mkIf (config.bagel.baremetal.enable -> !config.bagel.baremetal.netboot)
[ "localboot" ]
)
# Only baremetal nodes that can be local booted.
(lib.mkIf (config.bagel.baremetal.enable && !config.bagel.baremetal.netboot)
[ "bm-localboot" ]
)
];
bagel.monitoring.grafana-agent.tenant = "floral";
@ -148,15 +156,74 @@
];
# These are Floral baremetal builders.
makeBuilder = i:
makeColoBaremetal = i:
let
enableNetboot = i >= 6;
in
lib.nameValuePair "builder-${toString i}" {
# bm for baremetal.
lib.nameValuePair "bm-${toString i}" {
imports = floralInfraModules;
bagel.baremetal.builders = { enable = true; num = i; netboot = enableNetboot; };
bagel.baremetal = { enable = true; num = i; netboot = enableNetboot; };
};
# Build a role projector out of:
# - `selector`: a predicate over the baremetal index (`bagel.baremetal.num`) of a node;
# - `mkSystem`: a module factory extending a selected node,
#   `mkSystem :: { renumberedIndex: int, node: NixOS configuration } → NixOS configuration`.
# The resulting function takes a set of nodes, keeps those whose baremetal index
# is accepted by `selector`, renumbers them with `flib.renumber` and replaces
# each of them by the result of `mkSystem`.
# Design intent: applying a projector twice should have no further effect (not enforced here).
mkProjector = { selector, mkSystem }: nodes:
  let
    # Baremetal index a node was declared with.
    baremetalIndexOf = node: node.bagel.baremetal.num;
    # Keep only the nodes whose index is accepted by the selector.
    isSelected = _name: node: selector (baremetalIndexOf node);
    # Hand the renumbered index and the original node over to the module factory.
    applyRole = renumberedIndex: node: mkSystem { inherit renumberedIndex node; };
  in
    flib.renumber baremetalIndexOf applyRole (lib.filterAttrs isSelected nodes);
# Current map:
# builders: [4, 10].
# storage: [5]
# build-coord: [11].

# Set of projectors, one per role: each of them takes a generic baremetal node
# and reconfigures it for that role.
projectors =
  let
    # Projector description for `role`, applied to the baremetal indexes listed in `indexes`.
    # A selector is merely a predicate over the baremetal index. Pointwise selection
    # is enough while we have very few machines; a collection of intervals
    # (e.g. 0→4 & 6→8 & 12→15) could be expressed with an interval filter instead.
    mkRoleProjector = role: indexes: {
      selector = flib.mkPointwiseFilter indexes;
      # Wrap the generic node and enable `bagel.baremetal.<role>` with the renumbered index.
      mkSystem = { renumberedIndex, node }: {
        imports = [ node ];
        bagel.baremetal.${role} = {
          enable = true;
          num = renumberedIndex;
        };
      };
    };
  in
  {
    storage = mkRoleProjector "storage" [ 5 ];
    builders = mkRoleProjector "builders" [ 4 10 ];
  };

# Look up the projector of `role`; the result still expects the set of nodes to project.
project = role: mkProjector projectors.${role};
lixInfraModules = commonModules ++ [
{
# This means that anyone with @lix-infra permissions
@ -182,7 +249,25 @@
}
];
builders = lib.listToAttrs (map makeBuilder [4 5 10 11]);
# Baremetal nodes that have a role, keyed by node name.
baremetalNodes =
  let
    # Every candidate baremetal system: `genList` calls `makeColoBaremetal` with
    # the indexes 0 to 10. Only the nodes picked by a role projector end up in
    # the result; to change which machine gets which role, edit the selectors
    # in `projectors`.
    # NOTE(review): the "Current map" comment lists index 11 for build-coord,
    # which is outside of this range; confirm this is intended.
    allNodes = lib.listToAttrs (lib.genList makeColoBaremetal 11);

    perRoles = {
      # "Project" in the sense of linear algebra projectors: allNodes is
      # projected on the set of nodes of each role
      # (remember, a projector is a linear function such that p^2 = p).
      storageNodes = project "storage" allNodes;
      builderNodes = project "builders" allNodes;
      # buildCoordinatorNodes = setBuildCoordinators allNodes;
    };
  in
  # A node may hold at most one role: the per-role sets must be pairwise disjoint.
  # TODO: compute what are the offender nodes and their simultaneous roles.
  assert (lib.assertMsg (flib.isValidPartition perRoles) "A baremetal node has more than one role assigned, please review the role selectors.");
  # Merge all roles together into one big attribute set of nodes.
  flib.chainAttrs perRoles;
in {
meta.nixpkgs = systemBits.x86_64-linux.pkgs;
# Add any non-x86_64 native systems here.
@ -208,7 +293,7 @@
build01-aarch64-lix.imports = lixInfraModules ++ [ ./hosts/build01-aarch64-lix ];
buildbot-lix.imports = lixInfraModules ++ [ ./hosts/buildbot-lix ];
} // builders;
} // baremetalNodes;
hydraJobs = builtins.mapAttrs (n: v: v.config.system.build.netbootDir or v.config.system.build.toplevel) self.nixosConfigurations;
buildbotJobs = builtins.mapAttrs (_: v: v.config.system.build.toplevel) self.nixosConfigurations;

View file

@ -9,7 +9,8 @@
bagel.services = {
hydra.enable = true;
hydra.builders = map (i: "builder-${builtins.toString i}") [4 5 10];
# TODO: use the roles to avoid setting up builders which are not… builders!
hydra.builders = map (i: "bm-${builtins.toString i}") [4 10];
# Arguably, the build-coordinator is the most sensitive piece of our own infrastructure.
# Hence, it can also run another sensitive piece of the system: the Vault.

65
lib/default.nix Normal file
View file

@ -0,0 +1,65 @@
# Some useful utilities to do things that depends on the nixpkgs library.
{ lib }:
let
inherit (lib) listToAttrs zipListsWith nameValuePair length range foldl any mapAttrs;
in
rec {
# Half-open integer interval [start, end): `start` belongs to it, `end` does not.
closedOpenInterval = start: end: { inherit start end; };
# Default interval constructor: an alias of `closedOpenInterval`.
interval = closedOpenInterval;
# Interval holding exactly one index, i.e. [x, x + 1).
singleton = x: interval x (x + 1);
# Does the index `i` fall within `bounds` (start included, end excluded)?
inRange = i: bounds: bounds.start <= i && i < bounds.end;

# Build a selector function that accepts exactly the indexes listed in `xs`.
# Use it to pick individual indexes; to select contiguous runs of indexes,
# `mkIntervalFilter` is the better fit.
mkPointwiseFilter = xs: index: builtins.elem index xs;

# Build a selector function that accepts an index as soon as it belongs,
# according to `inRange`, to at least one of the given `intervals`.
mkIntervalFilter = intervals: index: builtins.any (inRange index) intervals;
# Map every value of `list` to its position in the list.
# e.g. reversedEnumerate [ "a" "b" ] == { "a" = 0; "b" = 1; }.
# The values are used as attribute names, hence they must be strings.
reversedEnumerate = list:
  builtins.listToAttrs (builtins.genList
    (index: { name = builtins.elemAt list index; value = index; })
    (builtins.length list));
# Merge all the attribute sets found as values of `attrs` into a single attribute set.
# The sets are merged left to right in `builtins.attrValues` order, hence on a
# name conflict the set coming last wins.
chainAttrs = attrs:
  builtins.foldl' (merged: next: merged // next) { } (builtins.attrValues attrs);
# Given an attribute set of attribute sets of items (one inner set per subset),
# tell whether the subsets are pairwise disjoint, i.e. whether no item name
# belongs to more than one subset.
# This does not check for completeness.
# Only the item names are looked at: the item values are neither compared nor
# forced, so two distinct subsets having the same contents are properly
# reported as overlapping.
isValidPartition = attrs:
  let
    subsets = builtins.attrValues attrs;
    sizeOf = subset: builtins.length (builtins.attrNames subset);
    # Number of items over all the subsets, counted with repetitions.
    totalItems = builtins.foldl' (count: subset: count + sizeOf subset) 0 subsets;
    # Union of all the subsets: an item shared by several subsets appears once.
    union = builtins.foldl' (merged: subset: merged // subset) { } subsets;
  in
    # Names are unique within one subset, so the union is smaller than the total
    # if and only if at least two subsets share an item.
    sizeOf union == totalItems;
# Renumber an attribute set of items.
# Items are ranked by ascending `indexFn item`: the item with the smallest
# index gets 0, the next one gets 1, and so on.
# Every value is then replaced by `renumberingFn renumberedIndex value`; the
# attribute names are left untouched.
# It's a form of imap for attribute sets.
# The ranking deliberately does not follow `builtins.attrValues` order, which
# is the lexicographic order of attribute names and would rank e.g. "bm-10"
# before "bm-4".
renumber = indexFn: renumberingFn: attrs:
  let
    # Original indexes in ascending order.
    sortedIndexes = builtins.sort builtins.lessThan (map indexFn (builtins.attrValues attrs));
    # Original index (as a string) → renumbered index.
    indexes = reversedEnumerate (map toString sortedIndexes);
  in
    mapAttrs (_name: value: renumberingFn indexes.${toString (indexFn value)} value) attrs;
}

View file

@ -3,7 +3,7 @@
let
genBuilders = { offset ? 0, count, f }: builtins.genList (x: rec { name = "builder-${toString (offset + x)}"; value = f name; }) count;
in builtins.listToAttrs (
genBuilders { offset = 4; count = 2; f = name: {
genBuilders { offset = 0; count = 2; f = name: {
cores = 8;
max-jobs = 8;
supported-features = [ "kvm" "nixos-test" ];
@ -11,7 +11,7 @@ in builtins.listToAttrs (
}; }
++
# This builder is exclusively for big-parallel
genBuilders { offset = 10; count = 1; f = name: {
genBuilders { offset = 2; count = 1; f = name: {
cores = 20;
max-jobs = 1;
supported-features = [ "kvm" "nixos-test" "big-parallel" ];

View file

@ -1,15 +1,12 @@
{ pkgs, lib, config, ... }:
let
cfgParent = config.bagel.baremetal;
cfg = config.bagel.baremetal.builders;
in
{
imports = [ ./netboot.nix ];
options = {
bagel.baremetal.builders = {
enable = lib.mkEnableOption "baremetal bagel oven";
netboot = lib.mkEnableOption "netboot";
enable = lib.mkEnableOption "builder role";
num = lib.mkOption {
type = lib.types.int;
};
@ -17,9 +14,6 @@ in
};
config = lib.mkIf cfg.enable {
boot.initrd.availableKernelModules = [ "ahci" "ehci_pci" "usb_storage" "usbhid" "sd_mod" ];
boot.initrd.kernelModules = [ "dm-snapshot" ];
users.users.builder = {
isSystemUser = true;
group = "nogroup";
@ -48,45 +42,21 @@ in
inherit ((import ./assignments.nix).${config.networking.hostName}) max-jobs cores;
};
nixpkgs.hostPlatform = "x86_64-linux";
hardware.cpu.intel.updateMicrocode = true;
fileSystems = {
"/mnt" = {
device = "/dev/disk/by-label/hydra";
fsType = "xfs";
options = ["logbsize=256k"];
};
boot.loader.systemd-boot.enable = true;
boot.loader.efi.canTouchEfiVariables = true;
boot.initrd.systemd.enable = true;
# We want the tmp filesystem on the same filesystem as the hydra store, so that builds can use reflinks
"/tmp" = {
device = "/mnt/tmp";
options = [ "bind" ];
};
};
boot.initrd.services.lvm.enable = true;
boot.kernel.sysctl."fs.xfs.xfssyncd_centisecs" = "12000";
fileSystems = lib.mkMerge [
(lib.mkIf (!cfg.netboot) {
"/" = {
device = "/dev/disk/by-label/root";
fsType = "xfs";
};
"/boot" = {
device = "/dev/disk/by-label/BOOT";
fsType = "vfat";
options = [ "fmask=0022" "dmask=0022" ];
};
})
{
"/mnt" = {
device = "/dev/disk/by-label/hydra";
fsType = "xfs";
options = ["logbsize=256k"];
};
# We want the tmp filesystem on the same filesystem as the hydra store, so that builds can use reflinks
"/tmp" = {
device = "/mnt/tmp";
options = [ "bind" ];
};
}
];
swapDevices = lib.optionals (!cfg.netboot) [
swapDevices = lib.optionals (!cfgParent.netboot) [
{
device = "/swapfile";
size = 50 * 1024; # 50GiB
@ -103,44 +73,7 @@ in
"console=ttyS0,115200"
];
networking.useNetworkd = true;
networking.hostName = "builder-${toString cfg.num}";
networking.domain = "wob01.infra.forkos.org";
systemd.network = {
netdevs = {
"40-uplink" = {
netdevConfig = {
Kind = "bond";
Name = "uplink";
};
bondConfig = {
Mode = "802.3ad";
TransmitHashPolicy = "layer3+4";
};
};
};
networks = {
"40-eno1" = {
name = "eno1";
bond = [ "uplink" ];
};
"40-eno2" = {
name = "eno2";
bond = [ "uplink" ];
};
};
};
networking.interfaces.uplink.ipv6.addresses = [
{ address = "2a01:584:11::1:${toString cfg.num}"; prefixLength = 64; }
];
networking.defaultGateway6 = { interface = "uplink"; address = "2a01:584:11::1"; };
bagel.infra.self.wan = {
family = "inet6";
address = "2a01:584:11::1:${toString cfg.num}";
prefixLength = 64;
};
deployment.targetHost = "2a01:584:11::1:${toString cfg.num}";
deployment.tags = [ "builders" ];
# Why can't we have nice things? https://bugs.openjdk.org/browse/JDK-8170568
@ -184,11 +117,5 @@ in
wantedBy = [ "timers.target" ];
};
systemd.timers.hydra-gc.timerConfig.Persistent = true;
bagel.sysadmin.enable = true;
environment.systemPackages = [ pkgs.ipmitool ];
system.stateVersion = "24.05";
};
}

View file

@ -0,0 +1,12 @@
# Entry point of the baremetal modules: it only aggregates the per-role modules
# and the base modules they build upon.
{
imports = [
# Compute nodes
./builders
# Storage nodes
./storage
# Bases
./netboot.nix
./hardware.nix
];
}

View file

@ -0,0 +1,96 @@
# Base configuration shared by the baremetal machines: boot, local filesystems,
# bonded uplink and addressing. Everything is keyed on `bagel.baremetal`.
{ pkgs, lib, config, ... }:
let
  cfg = config.bagel.baremetal;

  # IPv6 address of the machine: its last group is the decimal rendering of `num`.
  wanAddress = "2a01:584:11::1:${toString cfg.num}";
  wanPrefixLength = 64;

  # Physical interfaces enslaved to the `uplink` bond.
  bondMembers = [ "eno1" "eno2" ];
in
{
  options.bagel.baremetal = {
    enable = lib.mkEnableOption "baremetal bagel oven";
    netboot = lib.mkEnableOption "netboot";
    num = lib.mkOption {
      type = lib.types.int;
    };
  };

  config = lib.mkIf cfg.enable {
    nixpkgs.hostPlatform = "x86_64-linux";
    hardware.cpu.intel.updateMicrocode = true;
    system.stateVersion = "24.05";

    boot = {
      initrd = {
        availableKernelModules = [ "ahci" "ehci_pci" "usb_storage" "usbhid" "sd_mod" ];
        kernelModules = [ "dm-snapshot" ];
        systemd.enable = true;
        services.lvm.enable = true;
      };
      loader = {
        systemd-boot.enable = true;
        efi.canTouchEfiVariables = true;
      };
      kernel.sysctl."fs.xfs.xfssyncd_centisecs" = "12000";
      kernelParams = [
        "console=tty1"
        "console=ttyS0,115200"
      ];
    };

    # Local disks are only declared when the machine does not netboot.
    fileSystems = lib.mkIf (!cfg.netboot) {
      "/" = {
        device = "/dev/disk/by-label/root";
        fsType = "xfs";
      };
      "/boot" = {
        device = "/dev/disk/by-label/BOOT";
        fsType = "vfat";
        options = [ "fmask=0022" "dmask=0022" ];
      };
    };

    zramSwap = {
      enable = true;
      memoryPercent = 25;
    };

    systemd.network = {
      # 802.3ad bond named `uplink`.
      netdevs."40-uplink" = {
        netdevConfig = {
          Kind = "bond";
          Name = "uplink";
        };
        bondConfig = {
          Mode = "802.3ad";
          TransmitHashPolicy = "layer3+4";
        };
      };
      # One "40-<interface>" network per bond member, attaching it to `uplink`.
      networks = lib.listToAttrs (map
        (nic: lib.nameValuePair "40-${nic}" {
          name = nic;
          bond = [ "uplink" ];
        })
        bondMembers);
    };

    networking = {
      useNetworkd = true;
      domain = "wob01.infra.forkos.org";
      interfaces.uplink.ipv6.addresses = [
        { address = wanAddress; prefixLength = wanPrefixLength; }
      ];
      defaultGateway6 = { interface = "uplink"; address = "2a01:584:11::1"; };
    };

    # The same address is advertised as WAN address and used as deployment target.
    bagel.infra.self.wan = {
      family = "inet6";
      address = wanAddress;
      prefixLength = wanPrefixLength;
    };
    deployment.targetHost = wanAddress;

    bagel.sysadmin.enable = true;
    environment.systemPackages = [ pkgs.ipmitool ];
  };
}

View file

@ -1,6 +1,6 @@
{ modulesPath, pkgs, lib, config, extendModules, ... }@node:
let
cfg = config.bagel.baremetal.builders;
cfg = config.bagel.baremetal;
in
{
config = lib.mkIf (cfg.enable && cfg.netboot) {

View file

@ -0,0 +1,19 @@
# Storage role of a baremetal machine: when enabled, it names the host
# "storage-<num>" and tags the deployment with "storage".
{ lib, config, ... }:
let
  inherit (lib) mkEnableOption mkIf mkOption types;
  cfg = config.bagel.baremetal.storage;
in
{
  options.bagel.baremetal.storage = {
    enable = mkEnableOption "storage role";
    # Index of the machine among the storage nodes; used in the host name.
    num = mkOption {
      type = types.int;
    };
  };

  config = mkIf cfg.enable {
    networking.hostName = "storage-${toString cfg.num}";
    deployment.tags = [ "storage" ];
  };
}

View file

@ -12,7 +12,7 @@
./ofborg
./postgres
./forgejo
./baremetal-builder
./baremetal
./buildbot
./newsletter
./s3-revproxy

View file

@ -38,7 +38,7 @@ let
# - generalize to new features
baremetalBuilders = lib.concatStringsSep "\n"
(map (n: let
assignments = (import ../baremetal-builder/assignments.nix).${n} or {
assignments = (import ../baremetal/builders/assignments.nix).${n} or {
inherit (nodes.${n}.config.nix.settings) max-jobs;
supported-features = [ "big-parallel" "kvm" "nixos-test" ];
required-features = [];