From b3fb8d9822419a9836ec48701bf3ec09c58541e3 Mon Sep 17 00:00:00 2001 From: Jade Lovelace Date: Fri, 12 Jul 2024 23:24:34 +0200 Subject: [PATCH 1/3] daemon: fix a crash bug "FATAL: exception not rethrown" This is caused by pthread_cancel effectively throwing a not-specifically-identifiable C++ exception into the targeted thread, which, if it is not rethrown, terminates the process entirely. This is rather "impolite" behaviour, we would say. But thread cancellation is *always* busted, and we should simply not use it where unnecessary. It's particularly unnecessary when what we *actually* need it for is, err, interrupting a poll(2). That can in turn be achieved by simply listening to more stuff in the poll, namely, a pipe, which we send a character to when needing to stop the thread. While looking at this code, we also investigated whether any of the poll() madness is required, or was even *ever* required. Curiously we found in the XNU kernel source code that the thing about needing to listen to POLLHUP is probably *correct*, but switching it to POLLRDNORM should not have made any difference at all. We've left a FIXME to look into that further because what's written here is super janky. https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 This is the crash on some Hydra machines: Thread 1 (Thread 0x7f56b77776c0 (LWP 955542) (Exiting)): 0 0x00007f56b8e9b7dc in __pthread_kill_implementation () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 1 0x00007f56b8e49516 in raise () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 2 0x00007f56b8e31935 in abort () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 3 0x00007f56b8e327f3 in __libc_message_impl.cold () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 4 0x00007f56b8e8e8e9 in __libc_fatal () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 5 0x00007f56b8ea23c4 in unwind_cleanup () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 6 0x00007f56b9d2a1b8 in nix::triggerInterrupt() [clone .cold] () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixutil.so 7 0x00007f56b990ac9d in std::thread::_State_impl > >::_M_run() () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixstore.so 8 0x00007f56b90e86d3 in execute_native_thread_routine () from /nix/store/c6r62m84hywf4i6qq1h28f13zv38yqyp-gcc-13.3.0-lib/lib/libstdc++.so.6 9 0x00007f56b8e99a42 in start_thread () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 10 0x00007f56b8f1905c in clone3 () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 As for testing, we've started a daemon with this change and verified it deals with HUPs correctly on x86_64-linux, but I don't think we can easily test the destructor behaviour without whatever Hydra was doing that broke. Change-Id: I29c7de0425674494b6e43c075810126c3ff77363 --- src/libutil/monitor-fd.hh | 48 ++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/src/libutil/monitor-fd.hh b/src/libutil/monitor-fd.hh index 228fb13f8..233b73796 100644 --- a/src/libutil/monitor-fd.hh +++ b/src/libutil/monitor-fd.hh @@ -10,6 +10,8 @@ #include #include +#include "error.hh" +#include "file-descriptor.hh" #include "signals.hh" namespace nix { @@ -19,19 +21,36 @@ class MonitorFdHup { private: std::thread thread; + /** + * Pipe used to interrupt the poll()ing in the monitoring thread. + */ + Pipe terminatePipe; + std::atomic_bool quit = false; public: MonitorFdHup(int fd) { - thread = std::thread([fd]() { - while (true) { + terminatePipe.create(); + auto &quit_ = this->quit; + int terminateFd = terminatePipe.readSide.get(); + thread = std::thread([fd, terminateFd, &quit_]() { + while (!quit_) { /* Wait indefinitely until a POLLHUP occurs. */ - struct pollfd fds[1]; + struct pollfd fds[2]; fds[0].fd = fd; /* Polling for no specific events (i.e. just waiting for an error/hangup) doesn't work on macOS anymore. So wait for read events and ignore them. */ + // FIXME(jade): we have looked at the XNU kernel code and as + // far as we can tell, the above is bogus. It should be the + // case that the previous version of this and the current + // version are identical: waiting for POLLHUP and POLLRDNORM in + // the kernel *should* be identical. + // https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 + // + // So, this needs actual testing and we need to figure out if + // this is actually bogus. fds[0].events = #ifdef __APPLE__ POLLRDNORM @@ -39,8 +58,18 @@ public: 0 #endif ; - auto count = poll(fds, 1, -1); - if (count == -1) abort(); // can't happen + fds[1].fd = terminateFd; + fds[1].events = POLLIN; + + auto count = poll(fds, 2, -1); + if (count == -1) { + if (errno == EINTR || errno == EAGAIN) { + // These are best dealt with by just trying again. + continue; + } else { + throw SysError("in MonitorFdHup poll()"); + } + } /* This shouldn't happen, but can on macOS due to a bug. See rdar://37550628. @@ -53,6 +82,11 @@ public: triggerInterrupt(); break; } + // No reason to actually look at the pipe FD, we just need it + // to be able to get woken. + if (quit_) { + break; + } /* This will only happen on macOS. We sleep a bit to avoid waking up too often if the client is sending input. */ @@ -63,7 +97,9 @@ public: ~MonitorFdHup() { - pthread_cancel(thread.native_handle()); + quit = true; + // Poke the thread out of its poll wait + writeFull(terminatePipe.writeSide.get(), "*", false); thread.join(); } }; From 69e2ee5b25752ba5fd8644cef56fb9d627ca4a64 Mon Sep 17 00:00:00 2001 From: Jade Lovelace Date: Sat, 13 Jul 2024 00:27:09 +0200 Subject: [PATCH 2/3] daemon: remove workaround for macOS kernel bug that seems fixed This was filed as https://github.com/nixos/nix/issues/7584, but as far as I can tell, the previous solution of POLLHUP works just fine on macOS 14. I've also tested on an ancient machine with macOS 10.15.7, which also has POLLHUP work correctly. It's possible this might regress some older versions of macOS that have a kernel bug, but I went looking through the history on the sources and didn't find anything that looked terribly convincingly like a bug fix between 2020 and today. If such a broken version exists, it seems pretty reasonable to suggest simply updating the OS. Change-Id: I178a038baa000f927ea2cbc4587d69d8ab786843 --- src/libutil/monitor-fd.hh | 45 +++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/src/libutil/monitor-fd.hh b/src/libutil/monitor-fd.hh index 233b73796..ea6e89017 100644 --- a/src/libutil/monitor-fd.hh +++ b/src/libutil/monitor-fd.hh @@ -38,26 +38,17 @@ public: /* Wait indefinitely until a POLLHUP occurs. */ struct pollfd fds[2]; fds[0].fd = fd; - /* Polling for no specific events (i.e. just waiting - for an error/hangup) doesn't work on macOS - anymore. So wait for read events and ignore - them. */ - // FIXME(jade): we have looked at the XNU kernel code and as - // far as we can tell, the above is bogus. It should be the - // case that the previous version of this and the current - // version are identical: waiting for POLLHUP and POLLRDNORM in - // the kernel *should* be identical. - // https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 + // There is a POSIX violation on macOS: you have to listen for + // at least POLLHUP to receive HUP events for a FD. POSIX says + // this is not so, and you should just receive them regardless, + // however, as of our testing on macOS 14.5, the events do not + // get delivered in such a case. // - // So, this needs actual testing and we need to figure out if - // this is actually bogus. - fds[0].events = - #ifdef __APPLE__ - POLLRDNORM - #else - 0 - #endif - ; + // This is allegedly filed as rdar://37537852. + // + // Relevant code, which backs this up: + // https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 + fds[0].events = POLLHUP; fds[1].fd = terminateFd; fds[1].events = POLLIN; @@ -82,14 +73,16 @@ public: triggerInterrupt(); break; } - // No reason to actually look at the pipe FD, we just need it - // to be able to get woken. + // No reason to actually look at the pipe FD if that's what + // woke us, the only thing that actually matters is the quit + // flag. if (quit_) { break; } - /* This will only happen on macOS. We sleep a bit to - avoid waking up too often if the client is sending - input. */ + // On macOS, it is possible (although not observed on macOS + // 14.5) that in some limited cases on buggy kernel versions, + // all the non-POLLHUP events for the socket get delivered. + // Sleeping avoids pointlessly spinning a thread on those. sleep(1); } }); @@ -100,7 +93,9 @@ public: quit = true; // Poke the thread out of its poll wait writeFull(terminatePipe.writeSide.get(), "*", false); - thread.join(); + if (thread.joinable()) { + thread.join(); + } } }; From 702f02c31f80536ce3b5953a8501677f0bf4977b Mon Sep 17 00:00:00 2001 From: Jade Lovelace Date: Fri, 12 Jul 2024 18:20:29 +0200 Subject: [PATCH 3/3] docs: document the actual comparison rules instead of lies Although the comparison rules are ugly and we do not like various parts of them, we must not hide them away for only catgirls to know about, so the documentation should actually say how they work. Change-Id: Ib20e9aa0e7b6486ade4f401035aafd85fbb08c91 --- doc/manual/src/language/operators.md | 60 ++++++++++++++++++++++++++-- 1 file changed, 56 insertions(+), 4 deletions(-) diff --git a/doc/manual/src/language/operators.md b/doc/manual/src/language/operators.md index ff09e739f..6dcdc6eb0 100644 --- a/doc/manual/src/language/operators.md +++ b/doc/manual/src/language/operators.md @@ -145,19 +145,71 @@ All comparison operators are implemented in terms of `<`, and the following equi | *a* `>` *b* | *b* `<` *a* | | *a* `>=` *b* | `! (` *a* `<` *b* `)` | +Note that the above behaviour violates IEEE 754 for floating point numbers with respect to NaN, for instance. +This may be fixed in a future major language revision. + [Comparison]: #comparison-operators ## Equality -- [Attribute sets][attribute set] and [list]s are compared recursively, and therefore are fully evaluated. -- Comparison of [function]s always returns `false`. -- Numbers are type-compatible, see [arithmetic] operators. -- Floating point numbers only differ up to a limited precision. +The following equality comparison rules are followed in order: + +- Comparisons are first, sometimes, performed by identity (pointer value), and whether or not this occurs varies depending on the context in which the comparison is performed; for example, through `builtins.elem`, comparison of lists, or other cases. + The exact instances in which this occurs, aside from direct list and attribute set comparisons as discussed below, are too dependent on implementation details to meaningfully document. + + See [note on identity comparison](#identity-comparison) below. +- Comparisons between a combination of integers and floating point numbers are first converted to floating point then compared as floating point. +- Comparisons between values of differing types, besides the ones mentioned in the above rule, are unequal. +- Strings are compared as their string values, disregarding string contexts. +- Paths are compared as their absolute form (since they are stored as such). +- [Functions][function] are always considered unequal, including with themselves. +- The following are compared in the typical manner: + - Integers + - Floating point numbers have equality comparison per IEEE 754. + + Note that this means that just like in most languages, floating point arithmetic results are not typically equality comparable, and should instead be compared by checking that the absolute difference is less than some error margin. + - Booleans + - Null +- [Attribute sets][attribute set] are compared following these rules in order: + - If both attribute sets have the same identity (via pointer equality), they are considered equal, regardless of whether the contents have reflexive equality (e.g. even if there are functions contained within). + + See [note on identity comparison](#identity-comparison) below. + - If both attribute sets have `type = "derivation"` and have an attribute `outPath` that is equal, they are considered equal. + + This means that two results of `builtins.derivation`, regardless of other things added to their attributes via `//` afterwards (or `passthru` in nixpkgs), will compare equal if they passed the same arguments to `builtins.derivation`. + - Otherwise, they are compared element-wise in an unspecified order. + Although this order *may* be deterministic in some cases, this is not guaranteed, and correct code must not rely on this ordering behaviour. + + The order determines which elements are evaluated first and thus, if there are throwing values in the attribute set, which of those get evaluated, if any, before the comparison returns an unequal result. +- Lists are compared following these rules in order: + - If both lists have the same identity (via pointer equality), they are considered equal, regardless of whether the contents have reflexive equality (e.g. even if there are functions contained within). + + See [note on identity comparison](#identity-comparison) below. + - Otherwise, they are compared element-wise in list order. [function]: ./constructs.md#functions [Equality]: #equality +### Identity comparison + +In the current revision of the Nix language, values are first compared by identity (pointer equality). +This means that values that are not reflexively equal (that is, they do not satisfy `a == a`), such as functions, are nonetheless sometimes compared as equal with themselves if they are placed in attribute sets or lists, or are compared through other indirect means. + +Whether identity comparison applies to a given usage of the language aside from direct list and attribute set comparison is strongly dependent on implementation details to the point it is not feasible to document the exact instances. + +This is rather unfortunate behaviour which is regrettably load-bearing on nixpkgs (such as with the `type` attribute of NixOS options) and cannot be changed for the time being. +It may be changed in a future major language revision. + +Correct code must not rely on this behaviour. + +For example: + +``` +nix-repl> let f = x: 1; s = { func = f; }; in [ (f == f) (s == s) ] +[ false true ] +``` + ## Logical implication Equivalent to `!`*b1* `||` *b2*.