forked from lix-project/hydra
Merge pull request #1203 from DeterminateSystems/ds-33/wait-child-on-eof
hydra-eval-jobs: Report forked worker process status information on exception
This commit is contained in:
commit
7c133a98f8
3 changed files with 90 additions and 1 deletions
|
@ -25,6 +25,28 @@
|
||||||
|
|
||||||
#include <nlohmann/json.hpp>
|
#include <nlohmann/json.hpp>
|
||||||
|
|
||||||
|
// Poll a worker process once, without blocking, and describe on stderr any
// state change that waitpid() reports for it.
//
// check_pid: PID of the child to inspect. Values <= 0 are treated as
//            "no known child" and ignored.
//
// Nothing is printed when waitpid() fails or reports no state change.
void check_pid_status_nonblocking(pid_t check_pid) {
    // Skip PIDs that were never assigned a real child.
    if (check_pid <= 0)
        return;

    int status = 0;
    const pid_t reported = waitpid(check_pid, &status, WNOHANG);

    // waitpid() returns -1 on failure and, because of WNOHANG, 0 when the
    // child has not changed state; either way there is nothing to report.
    if (reported <= 0)
        return;

    std::cerr << "child process (" << reported << ") ";

    // The checks are tried in this fixed order and at most one message is
    // emitted after the prefix.
    if (WIFEXITED(status)) {
        std::cerr << "exited with status=" << WEXITSTATUS(status) << std::endl;
        return;
    }

    if (WIFSIGNALED(status)) {
        std::cerr << "killed by signal=" << WTERMSIG(status) << std::endl;
        return;
    }

    if (WIFSTOPPED(status)) {
        std::cerr << "stopped by signal=" << WSTOPSIG(status) << std::endl;
        return;
    }

    if (WIFCONTINUED(status)) {
        std::cerr << "continued" << std::endl;
    }
}
|
||||||
|
|
||||||
using namespace nix;
|
using namespace nix;
|
||||||
|
|
||||||
static Path gcRootsDir;
|
static Path gcRootsDir;
|
||||||
|
@ -311,8 +333,8 @@ int main(int argc, char * * argv)
|
||||||
/* Start a handler thread per worker process. */
|
/* Start a handler thread per worker process. */
|
||||||
auto handler = [&]()
|
auto handler = [&]()
|
||||||
{
|
{
|
||||||
|
pid_t pid = -1;
|
||||||
try {
|
try {
|
||||||
pid_t pid = -1;
|
|
||||||
AutoCloseFD from, to;
|
AutoCloseFD from, to;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
|
@ -414,6 +436,7 @@ int main(int argc, char * * argv)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
|
check_pid_status_nonblocking(pid);
|
||||||
auto state(state_.lock());
|
auto state(state_.lock());
|
||||||
state->exc = std::current_exception();
|
state->exc = std::current_exception();
|
||||||
wakeup.notify_all();
|
wakeup.notify_all();
|
||||||
|
|
63
t/evaluator/evaluate-oom-job.t
Normal file
63
t/evaluator/evaluate-oom-job.t
Normal file
|
@ -0,0 +1,63 @@
|
||||||
|
use strict;
|
||||||
|
use warnings;
|
||||||
|
use Setup;
|
||||||
|
use Test2::V0;
|
||||||
|
use Hydra::Helper::Exec;
|
||||||
|
|
||||||
|
# Every command in this test runs inside a transient `systemd-run` user scope
# with a 25 MegaByte memory cap, so the shared argument prefix is defined once
# and the probe below exercises exactly the switches the real test uses.
my @memory_capped_scope = (
    "systemd-run",
    "--user",
    "--collect",
    "--scope",
    "--property",
    "MemoryMax=25M",
    "--",
);

# Ensure that `systemd-run` is
# - Available in the PATH/environment
# - Accessible to the user executing it
# - Capable of using the command switches we use in our test
my $sd_res;
eval {
    ($sd_res) = captureStdoutStderr(3, (@memory_capped_scope, "true"));

    # Explicit true value: whether the block "succeeded" must depend only on
    # it not dying, not on what the assignment above happens to evaluate to.
    1;
} or do {
    # The command failed to execute, likely because `systemd-run` is not present
    # in `PATH`
    skip_all("`systemd-run` failed when invoked in this environment");
};
if ($sd_res != 0) {
    # `systemd-run` executed but `systemd-run` failed to call `true` and return
    # successfully
    skip_all("`systemd-run` returned non-zero when executing `true` (expected 0)");
}

my $ctx = test_context();

# Contain the memory usage to 25 MegaBytes using `systemd-run`
# Run `hydra-eval-jobs` on test job that will purposefully consume all memory
# available
my ($res, $stdout, $stderr) = captureStdoutStderr(60, (
    @memory_capped_scope,
    "hydra-eval-jobs",
    "-I", "/dev/zero",
    "-I", $ctx->jobsdir,
    ($ctx->jobsdir . "/oom.nix")
));

isnt($res, 0, "`hydra-eval-jobs` exits non-zero");

# utf8::decode() converts $stderr in place and returns false when the bytes
# are not valid UTF-8, so this both validates and decodes the captured output.
ok(utf8::decode($stderr), "Stderr output is UTF8-clean");
like(
    $stderr,
    # Assert error log contains messages added in PR
    # https://github.com/NixOS/hydra/pull/1203
    qr/^child process \(\d+?\) killed by signal=9$/m,
    "The stderr record includes a relevant error message"
);

done_testing;
|
3
t/jobs/oom.nix
Normal file
3
t/jobs/oom.nix
Normal file
|
@ -0,0 +1,3 @@
|
||||||
|
# Single-attribute job set: evaluating `oom` makes Nix read "/dev/zero" as a
# file. NOTE(review): presumably meant to exhaust the evaluator's memory, since
# /dev/zero never reports end-of-file — confirm against the test that uses it.
{
|
||||||
|
oom = builtins.readFile "/dev/zero";
|
||||||
|
}
|
Loading…
Reference in a new issue