lix/src/libutil/monitor-fd.hh

104 lines
3.3 KiB
C++
Raw Normal View History

#pragma once
///@file
#include <thread>
2014-07-23 22:16:06 +00:00
#include <atomic>
2014-12-14 00:51:14 +00:00
#include <cstdlib>
#include <poll.h>
#include <sys/types.h>
#include <unistd.h>
#include <signal.h>
daemon: fix a crash bug "FATAL: exception not rethrown" This is caused by pthread_cancel effectively throwing a not-specifically-identifiable C++ exception into the targeted thread, which, if it is not rethrown, terminates the process entirely. This is rather "impolite" behaviour, we would say. But thread cancellation is *always* busted, and we should simply not use it where unnecessary. It's particularly unnecessary when what we *actually* need it for is, err, interrupting a poll(2). That can in turn be achieved by simply listening to more stuff in the poll, namely, a pipe, which we send a character to when needing to stop the thread. While looking at this code, we also investigated whether any of the poll() madness is required, or was even *ever* required. Curiously we found in the XNU kernel source code that the thing about needing to listen to POLLHUP is probably *correct*, but switching it to POLLRDNORM should not have made any difference at all. We've left a FIXME to look into that further because what's written here is super janky. https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 This is the crash on some Hydra machines: Thread 1 (Thread 0x7f56b77776c0 (LWP 955542) (Exiting)): 0 0x00007f56b8e9b7dc in __pthread_kill_implementation () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 1 0x00007f56b8e49516 in raise () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 2 0x00007f56b8e31935 in abort () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 3 0x00007f56b8e327f3 in __libc_message_impl.cold () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 4 0x00007f56b8e8e8e9 in __libc_fatal () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 5 0x00007f56b8ea23c4 in unwind_cleanup () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 6 0x00007f56b9d2a1b8 in nix::triggerInterrupt() [clone .cold] () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixutil.so 7 0x00007f56b990ac9d in std::thread::_State_impl<std::thread::_Invoker<std::tuple<nix::MonitorFdHup::MonitorFdHup(int)::{lambda()#1}> > >::_M_run() () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixstore.so 8 0x00007f56b90e86d3 in execute_native_thread_routine () from /nix/store/c6r62m84hywf4i6qq1h28f13zv38yqyp-gcc-13.3.0-lib/lib/libstdc++.so.6 9 0x00007f56b8e99a42 in start_thread () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 10 0x00007f56b8f1905c in clone3 () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 As for testing, we've started a daemon with this change and verified it deals with HUPs correctly on x86_64-linux, but I don't think we can easily test the destructor behaviour without whatever Hydra was doing that broke. Change-Id: I29c7de0425674494b6e43c075810126c3ff77363
2024-07-12 21:24:34 +00:00
#include "error.hh"
#include "file-descriptor.hh"
#include "signals.hh"
namespace nix {
class MonitorFdHup
{
private:
std::thread thread;
daemon: fix a crash bug "FATAL: exception not rethrown" This is caused by pthread_cancel effectively throwing a not-specifically-identifiable C++ exception into the targeted thread, which, if it is not rethrown, terminates the process entirely. This is rather "impolite" behaviour, we would say. But thread cancellation is *always* busted, and we should simply not use it where unnecessary. It's particularly unnecessary when what we *actually* need it for is, err, interrupting a poll(2). That can in turn be achieved by simply listening to more stuff in the poll, namely, a pipe, which we send a character to when needing to stop the thread. While looking at this code, we also investigated whether any of the poll() madness is required, or was even *ever* required. Curiously we found in the XNU kernel source code that the thing about needing to listen to POLLHUP is probably *correct*, but switching it to POLLRDNORM should not have made any difference at all. We've left a FIXME to look into that further because what's written here is super janky. https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 This is the crash on some Hydra machines: Thread 1 (Thread 0x7f56b77776c0 (LWP 955542) (Exiting)): 0 0x00007f56b8e9b7dc in __pthread_kill_implementation () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 1 0x00007f56b8e49516 in raise () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 2 0x00007f56b8e31935 in abort () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 3 0x00007f56b8e327f3 in __libc_message_impl.cold () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 4 0x00007f56b8e8e8e9 in __libc_fatal () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 5 0x00007f56b8ea23c4 in unwind_cleanup () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 6 0x00007f56b9d2a1b8 in nix::triggerInterrupt() [clone .cold] () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixutil.so 7 0x00007f56b990ac9d in std::thread::_State_impl<std::thread::_Invoker<std::tuple<nix::MonitorFdHup::MonitorFdHup(int)::{lambda()#1}> > >::_M_run() () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixstore.so 8 0x00007f56b90e86d3 in execute_native_thread_routine () from /nix/store/c6r62m84hywf4i6qq1h28f13zv38yqyp-gcc-13.3.0-lib/lib/libstdc++.so.6 9 0x00007f56b8e99a42 in start_thread () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 10 0x00007f56b8f1905c in clone3 () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 As for testing, we've started a daemon with this change and verified it deals with HUPs correctly on x86_64-linux, but I don't think we can easily test the destructor behaviour without whatever Hydra was doing that broke. Change-Id: I29c7de0425674494b6e43c075810126c3ff77363
2024-07-12 21:24:34 +00:00
/**
* Pipe used to interrupt the poll()ing in the monitoring thread.
*/
Pipe terminatePipe;
std::atomic_bool quit = false;
public:
MonitorFdHup(int fd)
{
daemon: fix a crash bug "FATAL: exception not rethrown" This is caused by pthread_cancel effectively throwing a not-specifically-identifiable C++ exception into the targeted thread, which, if it is not rethrown, terminates the process entirely. This is rather "impolite" behaviour, we would say. But thread cancellation is *always* busted, and we should simply not use it where unnecessary. It's particularly unnecessary when what we *actually* need it for is, err, interrupting a poll(2). That can in turn be achieved by simply listening to more stuff in the poll, namely, a pipe, which we send a character to when needing to stop the thread. While looking at this code, we also investigated whether any of the poll() madness is required, or was even *ever* required. Curiously we found in the XNU kernel source code that the thing about needing to listen to POLLHUP is probably *correct*, but switching it to POLLRDNORM should not have made any difference at all. We've left a FIXME to look into that further because what's written here is super janky. https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 This is the crash on some Hydra machines: Thread 1 (Thread 0x7f56b77776c0 (LWP 955542) (Exiting)): 0 0x00007f56b8e9b7dc in __pthread_kill_implementation () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 1 0x00007f56b8e49516 in raise () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 2 0x00007f56b8e31935 in abort () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 3 0x00007f56b8e327f3 in __libc_message_impl.cold () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 4 0x00007f56b8e8e8e9 in __libc_fatal () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 5 0x00007f56b8ea23c4 in unwind_cleanup () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 6 0x00007f56b9d2a1b8 in nix::triggerInterrupt() [clone .cold] () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixutil.so 7 0x00007f56b990ac9d in std::thread::_State_impl<std::thread::_Invoker<std::tuple<nix::MonitorFdHup::MonitorFdHup(int)::{lambda()#1}> > >::_M_run() () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixstore.so 8 0x00007f56b90e86d3 in execute_native_thread_routine () from /nix/store/c6r62m84hywf4i6qq1h28f13zv38yqyp-gcc-13.3.0-lib/lib/libstdc++.so.6 9 0x00007f56b8e99a42 in start_thread () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 10 0x00007f56b8f1905c in clone3 () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 As for testing, we've started a daemon with this change and verified it deals with HUPs correctly on x86_64-linux, but I don't think we can easily test the destructor behaviour without whatever Hydra was doing that broke. Change-Id: I29c7de0425674494b6e43c075810126c3ff77363
2024-07-12 21:24:34 +00:00
terminatePipe.create();
auto &quit_ = this->quit;
int terminateFd = terminatePipe.readSide.get();
thread = std::thread([fd, terminateFd, &quit_]() {
while (!quit_) {
/* Wait indefinitely until a POLLHUP occurs. */
daemon: fix a crash bug "FATAL: exception not rethrown" This is caused by pthread_cancel effectively throwing a not-specifically-identifiable C++ exception into the targeted thread, which, if it is not rethrown, terminates the process entirely. This is rather "impolite" behaviour, we would say. But thread cancellation is *always* busted, and we should simply not use it where unnecessary. It's particularly unnecessary when what we *actually* need it for is, err, interrupting a poll(2). That can in turn be achieved by simply listening to more stuff in the poll, namely, a pipe, which we send a character to when needing to stop the thread. While looking at this code, we also investigated whether any of the poll() madness is required, or was even *ever* required. Curiously we found in the XNU kernel source code that the thing about needing to listen to POLLHUP is probably *correct*, but switching it to POLLRDNORM should not have made any difference at all. We've left a FIXME to look into that further because what's written here is super janky. https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 This is the crash on some Hydra machines: Thread 1 (Thread 0x7f56b77776c0 (LWP 955542) (Exiting)): 0 0x00007f56b8e9b7dc in __pthread_kill_implementation () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 1 0x00007f56b8e49516 in raise () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 2 0x00007f56b8e31935 in abort () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 3 0x00007f56b8e327f3 in __libc_message_impl.cold () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 4 0x00007f56b8e8e8e9 in __libc_fatal () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 5 0x00007f56b8ea23c4 in unwind_cleanup () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 6 0x00007f56b9d2a1b8 in nix::triggerInterrupt() [clone .cold] () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixutil.so 7 0x00007f56b990ac9d in std::thread::_State_impl<std::thread::_Invoker<std::tuple<nix::MonitorFdHup::MonitorFdHup(int)::{lambda()#1}> > >::_M_run() () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixstore.so 8 0x00007f56b90e86d3 in execute_native_thread_routine () from /nix/store/c6r62m84hywf4i6qq1h28f13zv38yqyp-gcc-13.3.0-lib/lib/libstdc++.so.6 9 0x00007f56b8e99a42 in start_thread () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 10 0x00007f56b8f1905c in clone3 () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 As for testing, we've started a daemon with this change and verified it deals with HUPs correctly on x86_64-linux, but I don't think we can easily test the destructor behaviour without whatever Hydra was doing that broke. Change-Id: I29c7de0425674494b6e43c075810126c3ff77363
2024-07-12 21:24:34 +00:00
struct pollfd fds[2];
fds[0].fd = fd;
// There is a POSIX violation on macOS: you have to listen for
// at least POLLHUP to receive HUP events for a FD. POSIX says
// this is not so, and you should just receive them regardless,
// however, as of our testing on macOS 14.5, the events do not
// get delivered in such a case.
//
// This is allegedly filed as rdar://37537852.
daemon: fix a crash bug "FATAL: exception not rethrown" This is caused by pthread_cancel effectively throwing a not-specifically-identifiable C++ exception into the targeted thread, which, if it is not rethrown, terminates the process entirely. This is rather "impolite" behaviour, we would say. But thread cancellation is *always* busted, and we should simply not use it where unnecessary. It's particularly unnecessary when what we *actually* need it for is, err, interrupting a poll(2). That can in turn be achieved by simply listening to more stuff in the poll, namely, a pipe, which we send a character to when needing to stop the thread. While looking at this code, we also investigated whether any of the poll() madness is required, or was even *ever* required. Curiously we found in the XNU kernel source code that the thing about needing to listen to POLLHUP is probably *correct*, but switching it to POLLRDNORM should not have made any difference at all. We've left a FIXME to look into that further because what's written here is super janky. https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 This is the crash on some Hydra machines: Thread 1 (Thread 0x7f56b77776c0 (LWP 955542) (Exiting)): 0 0x00007f56b8e9b7dc in __pthread_kill_implementation () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 1 0x00007f56b8e49516 in raise () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 2 0x00007f56b8e31935 in abort () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 3 0x00007f56b8e327f3 in __libc_message_impl.cold () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 4 0x00007f56b8e8e8e9 in __libc_fatal () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 5 0x00007f56b8ea23c4 in unwind_cleanup () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 6 0x00007f56b9d2a1b8 in nix::triggerInterrupt() [clone .cold] () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixutil.so 7 0x00007f56b990ac9d in std::thread::_State_impl<std::thread::_Invoker<std::tuple<nix::MonitorFdHup::MonitorFdHup(int)::{lambda()#1}> > >::_M_run() () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixstore.so 8 0x00007f56b90e86d3 in execute_native_thread_routine () from /nix/store/c6r62m84hywf4i6qq1h28f13zv38yqyp-gcc-13.3.0-lib/lib/libstdc++.so.6 9 0x00007f56b8e99a42 in start_thread () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 10 0x00007f56b8f1905c in clone3 () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 As for testing, we've started a daemon with this change and verified it deals with HUPs correctly on x86_64-linux, but I don't think we can easily test the destructor behaviour without whatever Hydra was doing that broke. Change-Id: I29c7de0425674494b6e43c075810126c3ff77363
2024-07-12 21:24:34 +00:00
//
// Relevant code, which backs this up:
// https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758
fds[0].events = POLLHUP;
daemon: fix a crash bug "FATAL: exception not rethrown" This is caused by pthread_cancel effectively throwing a not-specifically-identifiable C++ exception into the targeted thread, which, if it is not rethrown, terminates the process entirely. This is rather "impolite" behaviour, we would say. But thread cancellation is *always* busted, and we should simply not use it where unnecessary. It's particularly unnecessary when what we *actually* need it for is, err, interrupting a poll(2). That can in turn be achieved by simply listening to more stuff in the poll, namely, a pipe, which we send a character to when needing to stop the thread. While looking at this code, we also investigated whether any of the poll() madness is required, or was even *ever* required. Curiously we found in the XNU kernel source code that the thing about needing to listen to POLLHUP is probably *correct*, but switching it to POLLRDNORM should not have made any difference at all. We've left a FIXME to look into that further because what's written here is super janky. https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 This is the crash on some Hydra machines: Thread 1 (Thread 0x7f56b77776c0 (LWP 955542) (Exiting)): 0 0x00007f56b8e9b7dc in __pthread_kill_implementation () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 1 0x00007f56b8e49516 in raise () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 2 0x00007f56b8e31935 in abort () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 3 0x00007f56b8e327f3 in __libc_message_impl.cold () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 4 0x00007f56b8e8e8e9 in __libc_fatal () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 5 0x00007f56b8ea23c4 in unwind_cleanup () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 6 0x00007f56b9d2a1b8 in nix::triggerInterrupt() [clone .cold] () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixutil.so 7 0x00007f56b990ac9d in std::thread::_State_impl<std::thread::_Invoker<std::tuple<nix::MonitorFdHup::MonitorFdHup(int)::{lambda()#1}> > >::_M_run() () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixstore.so 8 0x00007f56b90e86d3 in execute_native_thread_routine () from /nix/store/c6r62m84hywf4i6qq1h28f13zv38yqyp-gcc-13.3.0-lib/lib/libstdc++.so.6 9 0x00007f56b8e99a42 in start_thread () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 10 0x00007f56b8f1905c in clone3 () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 As for testing, we've started a daemon with this change and verified it deals with HUPs correctly on x86_64-linux, but I don't think we can easily test the destructor behaviour without whatever Hydra was doing that broke. Change-Id: I29c7de0425674494b6e43c075810126c3ff77363
2024-07-12 21:24:34 +00:00
fds[1].fd = terminateFd;
fds[1].events = POLLIN;
auto count = poll(fds, 2, -1);
if (count == -1) {
if (errno == EINTR || errno == EAGAIN) {
// These are best dealt with by just trying again.
continue;
} else {
throw SysError("in MonitorFdHup poll()");
}
}
/* This shouldn't happen, but can on macOS due to a bug.
See rdar://37550628.
This may eventually need a delay or further
coordination with the main thread if spinning proves
too harmful.
*/
if (count == 0) continue;
if (fds[0].revents & POLLHUP) {
triggerInterrupt();
break;
}
// No reason to actually look at the pipe FD if that's what
// woke us, the only thing that actually matters is the quit
// flag.
daemon: fix a crash bug "FATAL: exception not rethrown" This is caused by pthread_cancel effectively throwing a not-specifically-identifiable C++ exception into the targeted thread, which, if it is not rethrown, terminates the process entirely. This is rather "impolite" behaviour, we would say. But thread cancellation is *always* busted, and we should simply not use it where unnecessary. It's particularly unnecessary when what we *actually* need it for is, err, interrupting a poll(2). That can in turn be achieved by simply listening to more stuff in the poll, namely, a pipe, which we send a character to when needing to stop the thread. While looking at this code, we also investigated whether any of the poll() madness is required, or was even *ever* required. Curiously we found in the XNU kernel source code that the thing about needing to listen to POLLHUP is probably *correct*, but switching it to POLLRDNORM should not have made any difference at all. We've left a FIXME to look into that further because what's written here is super janky. https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 This is the crash on some Hydra machines: Thread 1 (Thread 0x7f56b77776c0 (LWP 955542) (Exiting)): 0 0x00007f56b8e9b7dc in __pthread_kill_implementation () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 1 0x00007f56b8e49516 in raise () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 2 0x00007f56b8e31935 in abort () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 3 0x00007f56b8e327f3 in __libc_message_impl.cold () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 4 0x00007f56b8e8e8e9 in __libc_fatal () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 5 0x00007f56b8ea23c4 in unwind_cleanup () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 6 0x00007f56b9d2a1b8 in nix::triggerInterrupt() [clone .cold] () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixutil.so 7 0x00007f56b990ac9d in std::thread::_State_impl<std::thread::_Invoker<std::tuple<nix::MonitorFdHup::MonitorFdHup(int)::{lambda()#1}> > >::_M_run() () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixstore.so 8 0x00007f56b90e86d3 in execute_native_thread_routine () from /nix/store/c6r62m84hywf4i6qq1h28f13zv38yqyp-gcc-13.3.0-lib/lib/libstdc++.so.6 9 0x00007f56b8e99a42 in start_thread () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 10 0x00007f56b8f1905c in clone3 () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 As for testing, we've started a daemon with this change and verified it deals with HUPs correctly on x86_64-linux, but I don't think we can easily test the destructor behaviour without whatever Hydra was doing that broke. Change-Id: I29c7de0425674494b6e43c075810126c3ff77363
2024-07-12 21:24:34 +00:00
if (quit_) {
break;
}
// On macOS, it is possible (although not observed on macOS
// 14.5) that in some limited cases on buggy kernel versions,
// all the non-POLLHUP events for the socket get delivered.
// Sleeping avoids pointlessly spinning a thread on those.
sleep(1);
}
});
};
~MonitorFdHup()
{
daemon: fix a crash bug "FATAL: exception not rethrown" This is caused by pthread_cancel effectively throwing a not-specifically-identifiable C++ exception into the targeted thread, which, if it is not rethrown, terminates the process entirely. This is rather "impolite" behaviour, we would say. But thread cancellation is *always* busted, and we should simply not use it where unnecessary. It's particularly unnecessary when what we *actually* need it for is, err, interrupting a poll(2). That can in turn be achieved by simply listening to more stuff in the poll, namely, a pipe, which we send a character to when needing to stop the thread. While looking at this code, we also investigated whether any of the poll() madness is required, or was even *ever* required. Curiously we found in the XNU kernel source code that the thing about needing to listen to POLLHUP is probably *correct*, but switching it to POLLRDNORM should not have made any difference at all. We've left a FIXME to look into that further because what's written here is super janky. https://github.com/apple-oss-distributions/xnu/blob/94d3b452840153a99b38a3a9659680b2a006908e/bsd/kern/sys_generic.c#L1751-L1758 This is the crash on some Hydra machines: Thread 1 (Thread 0x7f56b77776c0 (LWP 955542) (Exiting)): 0 0x00007f56b8e9b7dc in __pthread_kill_implementation () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 1 0x00007f56b8e49516 in raise () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 2 0x00007f56b8e31935 in abort () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 3 0x00007f56b8e327f3 in __libc_message_impl.cold () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 4 0x00007f56b8e8e8e9 in __libc_fatal () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 5 0x00007f56b8ea23c4 in unwind_cleanup () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 6 0x00007f56b9d2a1b8 in nix::triggerInterrupt() [clone .cold] () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixutil.so 7 0x00007f56b990ac9d in std::thread::_State_impl<std::thread::_Invoker<std::tuple<nix::MonitorFdHup::MonitorFdHup(int)::{lambda()#1}> > >::_M_run() () from /nix/store/sahgw550p621m9dy1pd7whl9c5g1g0p7-lix-2.90.0-rc1/lib/liblixstore.so 8 0x00007f56b90e86d3 in execute_native_thread_routine () from /nix/store/c6r62m84hywf4i6qq1h28f13zv38yqyp-gcc-13.3.0-lib/lib/libstdc++.so.6 9 0x00007f56b8e99a42 in start_thread () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 10 0x00007f56b8f1905c in clone3 () from /nix/store/m71p7f0nymb19yn1dascklyya2i96jfw-glibc-2.39-52/lib/libc.so.6 As for testing, we've started a daemon with this change and verified it deals with HUPs correctly on x86_64-linux, but I don't think we can easily test the destructor behaviour without whatever Hydra was doing that broke. Change-Id: I29c7de0425674494b6e43c075810126c3ff77363
2024-07-12 21:24:34 +00:00
quit = true;
// Poke the thread out of its poll wait
writeFull(terminatePipe.writeSide.get(), "*", false);
if (thread.joinable()) {
thread.join();
}
}
};
}