forked from lix-project/hydra
openConnection(): Don't throw exceptions in forked child
On hydra.nixos.org the queue runner had child processes that were stuck handling an exception: Thread 1 (Thread 0x7f501f7fe640 (LWP 1413473) "bld~v54h5zkhmb3"): #0 futex_wait (private=0, expected=2, futex_word=0x7f50c27969b0 <_rtld_local+2480>) at ../sysdeps/nptl/futex-internal.h:146 #1 __lll_lock_wait (futex=0x7f50c27969b0 <_rtld_local+2480>, private=0) at lowlevellock.c:52 #2 0x00007f50c21eaee4 in __GI___pthread_mutex_lock (mutex=0x7f50c27969b0 <_rtld_local+2480>) at ../nptl/pthread_mutex_lock.c:115 #3 0x00007f50c1854bef in __GI___dl_iterate_phdr (callback=0x7f50c190c020 <_Unwind_IteratePhdrCallback>, data=0x7f501f7fb040) at dl-iteratephdr.c:40 #4 0x00007f50c190d2d1 in _Unwind_Find_FDE () from /nix/store/65hafbsx91127farbmyyv4r5ifgjdg43-glibc-2.33-117/lib/libgcc_s.so.1 #5 0x00007f50c19099b3 in uw_frame_state_for () from /nix/store/65hafbsx91127farbmyyv4r5ifgjdg43-glibc-2.33-117/lib/libgcc_s.so.1 #6 0x00007f50c190ab90 in uw_init_context_1 () from /nix/store/65hafbsx91127farbmyyv4r5ifgjdg43-glibc-2.33-117/lib/libgcc_s.so.1 #7 0x00007f50c190b08e in _Unwind_RaiseException () from /nix/store/65hafbsx91127farbmyyv4r5ifgjdg43-glibc-2.33-117/lib/libgcc_s.so.1 #8 0x00007f50c1b02ab7 in __cxa_throw () from /nix/store/dd8swlwhpdhn6bv219562vyxhi8278hs-gcc-10.3.0-lib/lib/libstdc++.so.6 #9 0x00007f50c1d01abe in nix::parseURL (url="root@cb893012.packethost.net") at src/libutil/url.cc:53 #10 0x0000000000484f55 in extraStoreArgs (machine="root@cb893012.packethost.net") at build-remote.cc:35 #11 operator() (__closure=0x7f4fe9fe0420) at build-remote.cc:79 ... Maybe the fork happened while another thread was holding some global stack unwinding lock (https://gcc.gnu.org/bugzilla/show_bug.cgi?id=71744). Anyway, since the hanging child inherits all file descriptors to SSH clients, shutting down remote builds (via 'child.to = -1' in State::buildRemote()) doesn't work and 'child.pid.wait()' hangs forever. So let's not do any significant work between fork and exec.
This commit is contained in:
parent
bbb0998699
commit
bcaad1c934
|
@ -55,25 +55,11 @@ static void openConnection(Machine::ptr machine, Path tmpDir, int stderrFD, Chil
|
|||
to.create();
|
||||
from.create();
|
||||
|
||||
child.pid = startProcess([&]() {
|
||||
|
||||
restoreProcessContext();
|
||||
|
||||
if (dup2(to.readSide.get(), STDIN_FILENO) == -1)
|
||||
throw SysError("cannot dup input pipe to stdin");
|
||||
|
||||
if (dup2(from.writeSide.get(), STDOUT_FILENO) == -1)
|
||||
throw SysError("cannot dup output pipe to stdout");
|
||||
|
||||
if (dup2(stderrFD, STDERR_FILENO) == -1)
|
||||
throw SysError("cannot dup stderr");
|
||||
|
||||
Strings argv;
|
||||
if (machine->isLocalhost()) {
|
||||
pgmName = "nix-store";
|
||||
argv = {"nix-store", "--builders", "", "--serve", "--write"};
|
||||
}
|
||||
else {
|
||||
} else {
|
||||
pgmName = "ssh";
|
||||
auto sshName = machine->sshName;
|
||||
Strings extraArgs = extraStoreArgs(sshName);
|
||||
|
@ -92,6 +78,18 @@ static void openConnection(Machine::ptr machine, Path tmpDir, int stderrFD, Chil
|
|||
append(argv, extraArgs);
|
||||
}
|
||||
|
||||
child.pid = startProcess([&]() {
|
||||
restoreProcessContext();
|
||||
|
||||
if (dup2(to.readSide.get(), STDIN_FILENO) == -1)
|
||||
throw SysError("cannot dup input pipe to stdin");
|
||||
|
||||
if (dup2(from.writeSide.get(), STDOUT_FILENO) == -1)
|
||||
throw SysError("cannot dup output pipe to stdout");
|
||||
|
||||
if (dup2(stderrFD, STDERR_FILENO) == -1)
|
||||
throw SysError("cannot dup stderr");
|
||||
|
||||
execvp(argv.front().c_str(), (char * *) stringsToCharPtrs(argv).data()); // FIXME: remove cast
|
||||
|
||||
throw SysError("cannot start %s", pgmName);
|
||||
|
|
Loading…
Reference in a new issue