From fdc980208da283a51a23f43f441634fc15fb810a Mon Sep 17 00:00:00 2001 From: Jade Lovelace Date: Thu, 29 Aug 2024 23:33:54 -0700 Subject: [PATCH] epyc: hack in better lix logging for debugging our CI problems Sample broken thing: https://buildbot.lix.systems/#/builders/40/builds/3400/steps/1/logs/stdio I have a suspicion it's that the path vanished due to GC or something, and I want to catch the next occurrence. The error: error: build of '/nix/store/f02iygmil30yfy92xf8cwp224lcr2nay-nixos-test-driver-nix-copy-closure.drv' on 'ssh-ng://nix@epyc.infra.newtype.fr' failed: error: some dependencies of '/nix/store/f02iygmil30yfy92xf8cwp224lcr2nay-nixos-test-driver-nix-copy-closure.drv' are missing This is because nrFailed is not zero, which we don't have any god damned idea why that is because there is NO LOG for those. So we don't even know what goal type failed to begin with. ---- WARNING: this updates nixpkgs by two months!! You will potentially experience other breakage. --- flake.lock | 6 +-- hosts/epyc.nix | 1 + ...in-about-failing-goals-at-warn-level.patch | 40 +++++++++++++++++++ modules/lix-bug-details-pls/default.nix | 21 ++++++++++ 4 files changed, 65 insertions(+), 3 deletions(-) create mode 100644 modules/lix-bug-details-pls/0001-wip-complain-about-failing-goals-at-warn-level.patch create mode 100644 modules/lix-bug-details-pls/default.nix diff --git a/flake.lock b/flake.lock index 1c6e92c..ab271e3 100644 --- a/flake.lock +++ b/flake.lock @@ -320,11 +320,11 @@ }, "nixpkgs_2": { "locked": { - "lastModified": 1718983919, - "narHash": "sha256-+1xgeIow4gJeiwo4ETvMRvWoircnvb0JOt7NS9kUhoM=", + "lastModified": 1724932487, + "narHash": "sha256-zzbqHmY1mt21omyk1+14QbAkII1B7OHlwKLcczVq22w=", "owner": "NixOS", "repo": "nixpkgs", - "rev": "90338afd6177fc683a04d934199d693708c85a3b", + "rev": "b4f7fb71438d00539b21f1b1e6968c0eac060127", "type": "github" }, "original": { diff --git a/hosts/epyc.nix b/hosts/epyc.nix index 593bebc..c7acff8 100644 --- a/hosts/epyc.nix +++ b/hosts/epyc.nix @@ -13,6 +13,7 @@ in ../modules/garage.nix ../modules/users/friends.nix ../modules/bagel-container.nix + ../modules/lix-bug-details-pls ]; networking.hostName = "epyc"; diff --git a/modules/lix-bug-details-pls/0001-wip-complain-about-failing-goals-at-warn-level.patch b/modules/lix-bug-details-pls/0001-wip-complain-about-failing-goals-at-warn-level.patch new file mode 100644 index 0000000..8a90427 --- /dev/null +++ b/modules/lix-bug-details-pls/0001-wip-complain-about-failing-goals-at-warn-level.patch @@ -0,0 +1,40 @@ +From 96937c58232ad6eaa11d1370220101c3ce2d00c3 Mon Sep 17 00:00:00 2001 +From: Jade Lovelace +Date: Thu, 29 Aug 2024 23:04:39 -0700 +Subject: [PATCH] wip: complain about failing goals at warn level + +I want to fix the bug that appears here: + +error: build of '/nix/store/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-nixos-test-driver-nix-copy-closure.drv' on 'ssh-ng://nix@epyc.infra.newtype.fr' failed: error: some dependencies of '/nix/store/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-nixos-test-driver-nix-copy-closure.drv' are missing +error: builder for '/nix/store/aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa-nixos-test-driver-nix-copy-closure.drv' failed with exit code 1 +error: 1 dependencies of derivation '/nix/store/bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb-vm-test-run-nix-copy-closure.drv' failed to build + +However, this is conditional on nrFailed, and I cannot for the life of +me figure out *who* is failing and *why*. + +Hopefully with these data I can narrow down why this bug is happening + +Change-Id: I7dca71b1c8ac92e7cc40c47ab37c952a7673cf42 +--- + src/libstore/build/worker.cc | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/src/libstore/build/worker.cc b/src/libstore/build/worker.cc +index 1b4633e64..a93be28a6 100644 +--- a/src/libstore/build/worker.cc ++++ b/src/libstore/build/worker.cc +@@ -160,7 +160,10 @@ void Worker::goalFinished(GoalPtr goal, Goal::Finished & f) + + waiting->trace(fmt("waitee '%s' done; %d left", goal->name, waiting->waitees.size())); + +- if (f.result != Goal::ecSuccess) ++waiting->nrFailed; ++ if (f.result != Goal::ecSuccess) { ++ ++waiting->nrFailed; ++ warn("Waiter %s experienced non-success of waitee %s with result %d", waiting->getName(), goal->getName(), f.result); ++ } + if (f.result == Goal::ecNoSubstituters) ++waiting->nrNoSubstituters; + if (f.result == Goal::ecIncompleteClosure) ++waiting->nrIncompleteClosure; + +-- +2.44.1 + diff --git a/modules/lix-bug-details-pls/default.nix b/modules/lix-bug-details-pls/default.nix new file mode 100644 index 0000000..eca78a5 --- /dev/null +++ b/modules/lix-bug-details-pls/default.nix @@ -0,0 +1,21 @@ +{ ... }: +{ + # jade: this exists because of a Lix bug that has me losing my damn mind and we really cannot debug it without either: + # * debug logs (infeasible. they are way too spammy) + # * patching lix (well look where we are) + # + # I don't really think it's necessarily appropriate to log at info level when + # a derivation fails on `main`, so here we have a yolopatch to get the damn + # thing in the log. + # + # I suspect it is a race condition with the garbage collector. + nixpkgs.overlays = [ + (final: prev: { + lix = prev.lix.overrideAttrs (old: { + patches = old.patches ++ [ + ./0001-wip-complain-about-failing-goals-at-warn-level.patch + ]; + }); + }) + ]; +}