forked from lix-project/hydra
Merge pull request #1203 from DeterminateSystems/ds-33/wait-child-on-eof
hydra-eval-jobs: Report forked worker process status information on exception
This commit is contained in:
commit
7c133a98f8
3 changed files with 90 additions and 1 deletions
|
@ -25,6 +25,28 @@
|
|||
|
||||
#include <nlohmann/json.hpp>
|
||||
|
||||
// Poll `check_pid` once, without blocking, and if waitpid() reports a status
// for it, write a one-line description of that status to std::cerr.
//
// Does nothing when `check_pid` is <= 0, or when waitpid() returns <= 0
// (error, or no status available yet). Nothing is returned to the caller;
// the only observable effect is the diagnostic output (and the child being
// reaped by waitpid() when it has terminated).
void check_pid_status_nonblocking(pid_t check_pid) {
|
||||
// Only check 'initialized' and known PID's
// (the worker handler in main() starts with `pid = -1` before a child exists).
|
||||
if (check_pid <= 0) { return; }
|
||||
|
||||
int wstatus = 0;
|
||||
// WNOHANG: return immediately instead of waiting for a state change.
pid_t pid = waitpid(check_pid, &wstatus, WNOHANG);
|
||||
// -1 = failure, WNOHANG: 0 = no change
|
||||
if (pid <= 0) { return; }
|
||||
|
||||
// Prefix only; the newline is emitted by whichever branch below matches.
// If none of the branches match, the line is left unterminated.
std::cerr << "child process (" << pid << ") ";
|
||||
|
||||
if (WIFEXITED(wstatus)) {
|
||||
std::cerr << "exited with status=" << WEXITSTATUS(wstatus) << std::endl;
|
||||
} else if (WIFSIGNALED(wstatus)) {
|
||||
std::cerr << "killed by signal=" << WTERMSIG(wstatus) << std::endl;
|
||||
// NOTE(review): waitpid() above is called with WNOHANG only. Per POSIX,
// stopped/continued children are reported only when WUNTRACED/WCONTINUED are
// passed, so the two branches below presumably never fire — confirm.
} else if (WIFSTOPPED(wstatus)) {
|
||||
std::cerr << "stopped by signal=" << WSTOPSIG(wstatus) << std::endl;
|
||||
} else if (WIFCONTINUED(wstatus)) {
|
||||
std::cerr << "continued" << std::endl;
|
||||
}
|
||||
}
|
||||
|
||||
using namespace nix;
|
||||
|
||||
static Path gcRootsDir;
|
||||
|
@ -311,8 +333,8 @@ int main(int argc, char * * argv)
|
|||
/* Start a handler thread per worker process. */
|
||||
auto handler = [&]()
|
||||
{
|
||||
try {
|
||||
pid_t pid = -1;
|
||||
try {
|
||||
AutoCloseFD from, to;
|
||||
|
||||
while (true) {
|
||||
|
@ -414,6 +436,7 @@ int main(int argc, char * * argv)
|
|||
}
|
||||
}
|
||||
} catch (...) {
|
||||
check_pid_status_nonblocking(pid);
|
||||
auto state(state_.lock());
|
||||
state->exc = std::current_exception();
|
||||
wakeup.notify_all();
|
||||
|
|
63
t/evaluator/evaluate-oom-job.t
Normal file
63
t/evaluator/evaluate-oom-job.t
Normal file
|
@ -0,0 +1,63 @@
|
|||
use strict;
|
||||
use warnings;
|
||||
use Setup;
|
||||
use Test2::V0;
|
||||
use Hydra::Helper::Exec;
|
||||
|
||||
# Ensure that `systemd-run` is
|
||||
# - Available in the PATH/environment
|
||||
# - Accessible to the user executing it
|
||||
# - Capable of using the command switches we use in our test
|
||||
my $sd_res;
|
||||
eval {
|
||||
($sd_res) = captureStdoutStderr(3, (
|
||||
"systemd-run",
|
||||
"--user",
|
||||
"--collect",
|
||||
"--scope",
|
||||
"--property",
|
||||
"MemoryMax=25M",
|
||||
"--",
|
||||
"true"
|
||||
));
|
||||
} or do {
|
||||
# The command failed to execute, likely because `systemd-run` is not present
|
||||
# in `PATH`
|
||||
skip_all("`systemd-run` failed when invoked in this environment");
|
||||
};
|
||||
if ($sd_res != 0) {
|
||||
# `systemd-run` executed but `systemd-run` failed to call `true` and return
|
||||
# successfully
|
||||
skip_all("`systemd-run` returned non-zero when executing `true` (expected 0)");
|
||||
}
|
||||
|
||||
my $ctx = test_context();
|
||||
|
||||
# Contain the memory usage to 25 MegaBytes using `systemd-run`
|
||||
# Run `hydra-eval-jobs` on test job that will purposefully consume all memory
|
||||
# available
|
||||
my ($res, $stdout, $stderr) = captureStdoutStderr(60, (
|
||||
"systemd-run",
|
||||
"--user",
|
||||
"--collect",
|
||||
"--scope",
|
||||
"--property",
|
||||
"MemoryMax=25M",
|
||||
"--",
|
||||
"hydra-eval-jobs",
|
||||
"-I", "/dev/zero",
|
||||
"-I", $ctx->jobsdir,
|
||||
($ctx->jobsdir . "/oom.nix")
|
||||
));
|
||||
|
||||
isnt($res, 0, "`hydra-eval-jobs` exits non-zero");
|
||||
ok(utf8::decode($stderr), "Stderr output is UTF8-clean");
|
||||
like(
|
||||
$stderr,
|
||||
# Assert error log contains messages added in PR
|
||||
# https://github.com/NixOS/hydra/pull/1203
|
||||
qr/^child process \(\d+?\) killed by signal=9$/m,
|
||||
"The stderr record includes a relevant error message"
|
||||
);
|
||||
|
||||
done_testing;
|
3
t/jobs/oom.nix
Normal file
3
t/jobs/oom.nix
Normal file
|
@ -0,0 +1,3 @@
|
|||
{
|
||||
oom = builtins.readFile "/dev/zero";
|
||||
}
|
Loading…
Reference in a new issue