epyc: hack in better lix logging for debugging our CI problems #5

Merged
raito merged 1 commit from jade/raito-shared-public-infra:jade/logs-pls into main 2024-08-30 06:54:05 +00:00
4 changed files with 65 additions and 3 deletions

View file

@ -320,11 +320,11 @@
},
"nixpkgs_2": {
"locked": {
"lastModified": 1718983919,
"narHash": "sha256-+1xgeIow4gJeiwo4ETvMRvWoircnvb0JOt7NS9kUhoM=",
"lastModified": 1724932487,
"narHash": "sha256-zzbqHmY1mt21omyk1+14QbAkII1B7OHlwKLcczVq22w=",
"owner": "NixOS",
"repo": "nixpkgs",
"rev": "90338afd6177fc683a04d934199d693708c85a3b",
"rev": "b4f7fb71438d00539b21f1b1e6968c0eac060127",
"type": "github"
},
"original": {

View file

@ -13,6 +13,7 @@ in
../modules/garage.nix
../modules/users/friends.nix
../modules/bagel-container.nix
../modules/lix-bug-details-pls
];
networking.hostName = "epyc";

View file

@ -0,0 +1,40 @@
From 96937c58232ad6eaa11d1370220101c3ce2d00c3 Mon Sep 17 00:00:00 2001
From: Jade Lovelace <lix@jade.fyi>
Date: Thu, 29 Aug 2024 23:04:39 -0700
Subject: [PATCH] wip: complain about failing goals at warn level
I want to fix the bug that appears here:
error: build of '/nix/store/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-nixos-test-driver-nix-copy-closure.drv' on 'ssh-ng://nix@epyc.infra.newtype.fr' failed: error: some dependencies of '/nix/store/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-nixos-test-driver-nix-copy-closure.drv' are missing
error: builder for '/nix/store/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-nixos-test-driver-nix-copy-closure.drv' failed with exit code 1
error: 1 dependencies of derivation '/nix/store/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-vm-test-run-nix-copy-closure.drv' failed to build
However, this is conditional on nrFailed, and I cannot for the life of
me figure out *who* is failing and *why*.
Hopefully with these data I can narrow down why this bug is happening.
Change-Id: I7dca71b1c8ac92e7cc40c47ab37c952a7673cf42
---
src/libstore/build/worker.cc | 5 ++++-
1 file changed, 4 insertions(+), 1 deletion(-)
diff --git a/src/libstore/build/worker.cc b/src/libstore/build/worker.cc
index 1b4633e64..a93be28a6 100644
--- a/src/libstore/build/worker.cc
+++ b/src/libstore/build/worker.cc
@@ -160,7 +160,10 @@ void Worker::goalFinished(GoalPtr goal, Goal::Finished & f)
waiting->trace(fmt("waitee '%s' done; %d left", goal->name, waiting->waitees.size()));
- if (f.result != Goal::ecSuccess) ++waiting->nrFailed;
+ if (f.result != Goal::ecSuccess) {
+ ++waiting->nrFailed;
+ warn("Waiter %s experienced non-success of waitee %s with result %d", waiting->getName(), goal->getName(), f.result);
+ }
if (f.result == Goal::ecNoSubstituters) ++waiting->nrNoSubstituters;
if (f.result == Goal::ecIncompleteClosure) ++waiting->nrIncompleteClosure;
--
2.44.1

View file

@ -0,0 +1,21 @@
{ ... }:
{
  # jade: this exists because of a Lix bug that has me losing my damn mind and we really cannot debug it without either:
  # * debug logs (infeasible. they are way too spammy)
  # * patching lix (well look where we are)
  #
  # I don't really think it's necessarily appropriate to log at info level when
  # a derivation fails on `main`, so here we have a yolopatch to get the damn
  # thing in the log.
  #
  # I suspect it is a race condition with the garbage collector.
  #
  # The overlay below overrides the `lix` package so that it is built with the
  # extra patch file that lives next to this module.
  nixpkgs.overlays = [
    (final: prev: {
      lix = prev.lix.overrideAttrs (old: {
        # `or [ ]` keeps evaluation working even when the underlying lix
        # derivation does not define a `patches` attribute at all.
        patches = (old.patches or [ ]) ++ [
          ./0001-wip-complain-about-failing-goals-at-warn-level.patch
        ];
      });
    })
  ];
}