From bcaad1c934fd3bc5d93557c52697a9379acfbaea Mon Sep 17 00:00:00 2001
From: Eelco Dolstra
Date: Wed, 30 Mar 2022 22:39:48 +0200
Subject: [PATCH] openConnection(): Don't throw exceptions in forked child

On hydra.nixos.org the queue runner had child processes that were stuck handling an exception:

  Thread 1 (Thread 0x7f501f7fe640 (LWP 1413473) "bld~v54h5zkhmb3"):
  #0 futex_wait (private=0, expected=2, futex_word=0x7f50c27969b0 <_rtld_local+2480>) at ../sysdeps/nptl/futex-internal.h:146
  #1 __lll_lock_wait (futex=0x7f50c27969b0 <_rtld_local+2480>, private=0) at lowlevellock.c:52
  #2 0x00007f50c21eaee4 in __GI___pthread_mutex_lock (mutex=0x7f50c27969b0 <_rtld_local+2480>) at ../nptl/pthread_mutex_lock.c:115
  #3 0x00007f50c1854bef in __GI___dl_iterate_phdr (callback=0x7f50c190c020 <_Unwind_IteratePhdrCallback>, data=0x7f501f7fb040) at dl-iteratephdr.c:40
  #4 0x00007f50c190d2d1 in _Unwind_Find_FDE () from /nix/store/65hafbsx91127farbmyyv4r5ifgjdg43-glibc-2.33-117/lib/libgcc_s.so.1
  #5 0x00007f50c19099b3 in uw_frame_state_for () from /nix/store/65hafbsx91127farbmyyv4r5ifgjdg43-glibc-2.33-117/lib/libgcc_s.so.1
  #6 0x00007f50c190ab90 in uw_init_context_1 () from /nix/store/65hafbsx91127farbmyyv4r5ifgjdg43-glibc-2.33-117/lib/libgcc_s.so.1
  #7 0x00007f50c190b08e in _Unwind_RaiseException () from /nix/store/65hafbsx91127farbmyyv4r5ifgjdg43-glibc-2.33-117/lib/libgcc_s.so.1
  #8 0x00007f50c1b02ab7 in __cxa_throw () from /nix/store/dd8swlwhpdhn6bv219562vyxhi8278hs-gcc-10.3.0-lib/lib/libstdc++.so.6
  #9 0x00007f50c1d01abe in nix::parseURL (url="root@cb893012.packethost.net") at src/libutil/url.cc:53
  #10 0x0000000000484f55 in extraStoreArgs (machine="root@cb893012.packethost.net") at build-remote.cc:35
  #11 operator() (__closure=0x7f4fe9fe0420) at build-remote.cc:79
  ...

Maybe the fork happened while another thread was holding some global stack unwinding lock (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71744).
Anyway, since the hanging child inherits all file descriptors to SSH clients, shutting down remote builds (via 'child.to = -1' in State::buildRemote()) doesn't work and 'child.pid.wait()' hangs forever. So let's not do any significant work between fork and exec. --- src/hydra-queue-runner/build-remote.cc | 48 ++++++++++++-------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/src/hydra-queue-runner/build-remote.cc b/src/hydra-queue-runner/build-remote.cc index bdbd44b9..57a5f0df 100644 --- a/src/hydra-queue-runner/build-remote.cc +++ b/src/hydra-queue-runner/build-remote.cc @@ -55,8 +55,30 @@ static void openConnection(Machine::ptr machine, Path tmpDir, int stderrFD, Chil to.create(); from.create(); - child.pid = startProcess([&]() { + Strings argv; + if (machine->isLocalhost()) { + pgmName = "nix-store"; + argv = {"nix-store", "--builders", "", "--serve", "--write"}; + } else { + pgmName = "ssh"; + auto sshName = machine->sshName; + Strings extraArgs = extraStoreArgs(sshName); + argv = {"ssh", sshName}; + if (machine->sshKey != "") append(argv, {"-i", machine->sshKey}); + if (machine->sshPublicHostKey != "") { + Path fileName = tmpDir + "/host-key"; + auto p = machine->sshName.find("@"); + std::string host = p != std::string::npos ? 
std::string(machine->sshName, p + 1) : machine->sshName; + writeFile(fileName, host + " " + machine->sshPublicHostKey + "\n"); + append(argv, {"-oUserKnownHostsFile=" + fileName}); + } + append(argv, + { "-x", "-a", "-oBatchMode=yes", "-oConnectTimeout=60", "-oTCPKeepAlive=yes" + , "--", "nix-store", "--serve", "--write" }); + append(argv, extraArgs); + } + child.pid = startProcess([&]() { restoreProcessContext(); if (dup2(to.readSide.get(), STDIN_FILENO) == -1) @@ -68,30 +90,6 @@ static void openConnection(Machine::ptr machine, Path tmpDir, int stderrFD, Chil if (dup2(stderrFD, STDERR_FILENO) == -1) throw SysError("cannot dup stderr"); - Strings argv; - if (machine->isLocalhost()) { - pgmName = "nix-store"; - argv = {"nix-store", "--builders", "", "--serve", "--write"}; - } - else { - pgmName = "ssh"; - auto sshName = machine->sshName; - Strings extraArgs = extraStoreArgs(sshName); - argv = {"ssh", sshName}; - if (machine->sshKey != "") append(argv, {"-i", machine->sshKey}); - if (machine->sshPublicHostKey != "") { - Path fileName = tmpDir + "/host-key"; - auto p = machine->sshName.find("@"); - std::string host = p != std::string::npos ? std::string(machine->sshName, p + 1) : machine->sshName; - writeFile(fileName, host + " " + machine->sshPublicHostKey + "\n"); - append(argv, {"-oUserKnownHostsFile=" + fileName}); - } - append(argv, - { "-x", "-a", "-oBatchMode=yes", "-oConnectTimeout=60", "-oTCPKeepAlive=yes" - , "--", "nix-store", "--serve", "--write" }); - append(argv, extraArgs); - } - execvp(argv.front().c_str(), (char * *) stringsToCharPtrs(argv).data()); // FIXME: remove cast throw SysError("cannot start %s", pgmName);