From b3fb8d9822419a9836ec48701bf3ec09c58541e3 Mon Sep 17 00:00:00 2001 From: Jade Lovelace Date: Fri, 12 Jul 2024 23:24:34 +0200 Subject: [PATCH] daemon: fix a crash bug "FATAL: exception not rethrown" This is caused by pthread_cancel effectively throwing a not-specifically-identifiable C++ exception into the targeted thread, which, if it is not rethrown, terminates the process entirely. This is rather "impolite" behaviour, we would say. But thread cancellation is *always* busted, and we should simply not use it where unnecessary. It's particularly unnecessary when what we *actually* need it for is, err, interrupting a poll(2). That can in turn be achieved by simply listening to more stuff in the poll, namely, a pipe, which we send a character to when needing to stop the thread. While looking at this code, we also investigated whether any of the poll() madness is required, or was even *ever* required. Curiously we found in the XNU kernel source code that the thing about needing to listen to POLLHUP is probably *correct*, but switching it to POLLRDNORM should not have made any difference at all. We've left a FIXME to look into that further because what's written here is super janky. https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 This is the crash on some Hydra machines: Thread 1 (Thread 0x7f56b77776c0 (LWP 955542) (Exiting)): 0 0x00007f56b8e9b7dc in __pthread_kill_implementation () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 1 0x00007f56b8e49516 in raise () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 2 0x00007f56b8e31935 in abort () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 3 0x00007f56b8e327f3 in __libc_message_impl.cold () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 4 0x00007f56b8e8e8e9 in __libc_fatal () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 5 0x00007f56b8ea23c4 in unwind_cleanup () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 6 0x00007f56b9d2a1b8 in nix::triggerInterrupt() [clone .cold] () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixutil.so 7 0x00007f56b990ac9d in std::thread::_State_impl > >::_M_run() () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixstore.so 8 0x00007f56b90e86d3 in execute_native_thread_routine () from /nix/store/c6r62m84hywf4i6qq1h28f13zv38yqyp-gcc-13.3.0-lib/lib/libstdc++.so.6 9 0x00007f56b8e99a42 in start_thread () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 10 0x00007f56b8f1905c in clone3 () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 As for testing, we've started a daemon with this change and verified it deals with HUPs correctly on x86_64-linux, but I don't think we can easily test the destructor behaviour without whatever Hydra was doing that broke. Change-Id: I29c7de0425674494b6e43c075810126c3ff77363 --- src/libutil/monitor-fd.hh | 48 ++++++++++++++++++++++++++++++++++----- 1 file changed, 42 insertions(+), 6 deletions(-) diff --git a/src/libutil/monitor-fd.hh b/src/libutil/monitor-fd.hh index 228fb13f8..233b73796 100644 --- a/src/libutil/monitor-fd.hh +++ b/src/libutil/monitor-fd.hh @@ -10,6 +10,8 @@ #include #include +#include "error.hh" +#include "file-descriptor.hh" #include "signals.hh" namespace nix { @@ -19,19 +21,36 @@ class MonitorFdHup { private: std::thread thread; + /** + * Pipe used to interrupt the poll()ing in the monitoring thread. + */ + Pipe terminatePipe; + std::atomic_bool quit = false; public: MonitorFdHup(int fd) { - thread = std::thread([fd]() { - while (true) { + terminatePipe.create(); + auto &quit_ = this->quit; + int terminateFd = terminatePipe.readSide.get(); + thread = std::thread([fd, terminateFd, &quit_]() { + while (!quit_) { /* Wait indefinitely until a POLLHUP occurs. */ - struct pollfd fds[1]; + struct pollfd fds[2]; fds[0].fd = fd; /* Polling for no specific events (i.e. just waiting for an error/hangup) doesn't work on macOS anymore. So wait for read events and ignore them. */ + // FIXME(jade): we have looked at the XNU kernel code and as + // far as we can tell, the above is bogus. It should be the + // case that the previous version of this and the current + // version are identical: waiting for POLLHUP and POLLRDNORM in + // the kernel *should* be identical. + // https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 + // + // So, this needs actual testing and we need to figure out if + // this is actually bogus. fds[0].events = #ifdef __APPLE__ POLLRDNORM @@ -39,8 +58,18 @@ public: 0 #endif ; - auto count = poll(fds, 1, -1); - if (count == -1) abort(); // can't happen + fds[1].fd = terminateFd; + fds[1].events = POLLIN; + + auto count = poll(fds, 2, -1); + if (count == -1) { + if (errno == EINTR || errno == EAGAIN) { + // These are best dealt with by just trying again. + continue; + } else { + throw SysError("in MonitorFdHup poll()"); + } + } /* This shouldn't happen, but can on macOS due to a bug. See rdar://37550628. @@ -53,6 +82,11 @@ public: triggerInterrupt(); break; } + // No reason to actually look at the pipe FD, we just need it + // to be able to get woken. + if (quit_) { + break; + } /* This will only happen on macOS. We sleep a bit to avoid waking up too often if the client is sending input. */ @@ -63,7 +97,9 @@ public: ~MonitorFdHup() { - pthread_cancel(thread.native_handle()); + quit = true; + // Poke the thread out of its poll wait + writeFull(terminatePipe.writeSide.get(), "*", false); thread.join(); } };