Merge pull request #7802 from edolstra/fix-7783

Fix PID namespace support check
This commit is contained in:
Eelco Dolstra 2023-02-10 20:41:13 +01:00 committed by GitHub
commit 67451d8ed7
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
6 changed files with 83 additions and 65 deletions

View file

@ -209,7 +209,7 @@ void LocalDerivationGoal::tryLocalBuild()
#if __linux__ #if __linux__
if (useChroot) { if (useChroot) {
if (!mountNamespacesSupported() || !pidNamespacesSupported()) { if (!mountAndPidNamespacesSupported()) {
if (!settings.sandboxFallback) if (!settings.sandboxFallback)
throw Error("this system does not support the kernel namespaces that are required for sandboxing; use '--no-sandbox' to disable sandboxing"); throw Error("this system does not support the kernel namespaces that are required for sandboxing; use '--no-sandbox' to disable sandboxing");
debug("auto-disabling sandboxing because the prerequisite namespaces are not available"); debug("auto-disabling sandboxing because the prerequisite namespaces are not available");
@ -385,12 +385,6 @@ void LocalDerivationGoal::cleanupPostOutputsRegisteredModeNonCheck()
} }
int childEntry(void * arg)
{
((LocalDerivationGoal *) arg)->runChild();
return 1;
}
#if __linux__ #if __linux__
static void linkOrCopy(const Path & from, const Path & to) static void linkOrCopy(const Path & from, const Path & to)
{ {
@ -916,21 +910,15 @@ void LocalDerivationGoal::startBuilder()
if (getuid() == 0 && setgroups(0, 0) == -1) if (getuid() == 0 && setgroups(0, 0) == -1)
throw SysError("setgroups failed"); throw SysError("setgroups failed");
size_t stackSize = 1 * 1024 * 1024; ProcessOptions options;
char * stack = (char *) mmap(0, stackSize, options.cloneFlags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_PARENT | SIGCHLD;
PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
if (stack == MAP_FAILED) throw SysError("allocating stack");
int flags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_PARENT | SIGCHLD;
if (privateNetwork) if (privateNetwork)
flags |= CLONE_NEWNET; options.cloneFlags |= CLONE_NEWNET;
if (usingUserNamespace) if (usingUserNamespace)
flags |= CLONE_NEWUSER; options.cloneFlags |= CLONE_NEWUSER;
pid_t child = clone(childEntry, stack + stackSize, flags, this); pid_t child = startProcess([&]() { runChild(); }, options);
if (child == -1)
throw SysError("creating sandboxed builder process using clone()");
writeFull(builderOut.writeSide.get(), writeFull(builderOut.writeSide.get(),
fmt("%d %d\n", usingUserNamespace, child)); fmt("%d %d\n", usingUserNamespace, child));
_exit(0); _exit(0);

View file

@ -4,7 +4,7 @@
#include "util.hh" #include "util.hh"
#include "finally.hh" #include "finally.hh"
#include <mntent.h> #include <sys/mount.h>
namespace nix { namespace nix {
@ -33,60 +33,57 @@ bool userNamespacesSupported()
return false; return false;
} }
try {
Pid pid = startProcess([&]() Pid pid = startProcess([&]()
{ {
auto res = unshare(CLONE_NEWUSER); _exit(0);
_exit(res ? 1 : 0); }, {
.cloneFlags = CLONE_NEWUSER
}); });
bool supported = pid.wait() == 0; auto r = pid.wait();
assert(!r);
} catch (SysError & e) {
debug("user namespaces do not work on this system: %s", e.msg());
return false;
}
if (!supported) return true;
debug("user namespaces do not work on this system");
return supported;
}(); }();
return res; return res;
} }
bool mountNamespacesSupported() bool mountAndPidNamespacesSupported()
{ {
static auto res = [&]() -> bool static auto res = [&]() -> bool
{ {
bool useUserNamespace = userNamespacesSupported(); try {
Pid pid = startProcess([&]() Pid pid = startProcess([&]()
{ {
auto res = unshare(CLONE_NEWNS | (useUserNamespace ? CLONE_NEWUSER : 0)); /* Make sure we don't remount the parent's /proc. */
_exit(res ? 1 : 0); if (mount(0, "/", 0, MS_PRIVATE | MS_REC, 0) == -1)
_exit(1);
/* Test whether we can remount /proc. The kernel disallows
this if /proc is not fully visible, i.e. if there are
filesystems mounted on top of files inside /proc. See
https://lore.kernel.org/lkml/87tvsrjai0.fsf@xmission.com/T/. */
if (mount("none", "/proc", "proc", 0, 0) == -1)
_exit(2);
_exit(0);
}, {
.cloneFlags = CLONE_NEWNS | CLONE_NEWPID | (userNamespacesSupported() ? CLONE_NEWUSER : 0)
}); });
bool supported = pid.wait() == 0; if (pid.wait()) {
debug("PID namespaces do not work on this system: cannot remount /proc");
if (!supported) return false;
debug("mount namespaces do not work on this system");
return supported;
}();
return res;
} }
bool pidNamespacesSupported() } catch (SysError & e) {
{ debug("mount namespaces do not work on this system: %s", e.msg());
static auto res = [&]() -> bool
{
/* Check whether /proc is fully visible, i.e. there are no
filesystems mounted on top of files inside /proc. If this
is not the case, then we cannot mount a new /proc inside
the sandbox that matches the sandbox's PID namespace.
See https://lore.kernel.org/lkml/87tvsrjai0.fsf@xmission.com/T/. */
auto fp = fopen("/proc/mounts", "r");
if (!fp) return false;
Finally delFP = [&]() { fclose(fp); };
while (auto ent = getmntent(fp))
if (hasPrefix(std::string_view(ent->mnt_dir), "/proc/")) {
debug("PID namespaces do not work because /proc is not fully visible; disabling sandboxing");
return false; return false;
} }

View file

@ -6,9 +6,7 @@ namespace nix {
bool userNamespacesSupported(); bool userNamespacesSupported();
bool mountNamespacesSupported(); bool mountAndPidNamespacesSupported();
bool pidNamespacesSupported();
#endif #endif

View file

@ -36,6 +36,7 @@
#ifdef __linux__ #ifdef __linux__
#include <sys/prctl.h> #include <sys/prctl.h>
#include <sys/resource.h> #include <sys/resource.h>
#include <sys/mman.h>
#include <cmath> #include <cmath>
#endif #endif
@ -1064,9 +1065,17 @@ static pid_t doFork(bool allowVfork, std::function<void()> fun)
} }
/* Entry point passed to clone(): `arg` points to the std::function<void()>
   that should be invoked. Returns 1 if that callable ever returns. */
static int childEntry(void * arg)
{
    auto & callback = *static_cast<std::function<void()> *>(arg);
    callback();
    return 1;
}
pid_t startProcess(std::function<void()> fun, const ProcessOptions & options) pid_t startProcess(std::function<void()> fun, const ProcessOptions & options)
{ {
auto wrapper = [&]() { std::function<void()> wrapper = [&]() {
if (!options.allowVfork) if (!options.allowVfork)
logger = makeSimpleLogger(); logger = makeSimpleLogger();
try { try {
@ -1086,7 +1095,27 @@ pid_t startProcess(std::function<void()> fun, const ProcessOptions & options)
_exit(1); _exit(1);
}; };
pid_t pid = doFork(options.allowVfork, wrapper); pid_t pid = -1;
if (options.cloneFlags) {
#ifdef __linux__
// Not supported, since then we don't know when to free the stack.
assert(!(options.cloneFlags & CLONE_VM));
size_t stackSize = 1 * 1024 * 1024;
auto stack = (char *) mmap(0, stackSize,
PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
if (stack == MAP_FAILED) throw SysError("allocating stack");
Finally freeStack([&]() { munmap(stack, stackSize); });
pid = clone(childEntry, stack + stackSize, options.cloneFlags | SIGCHLD, &wrapper);
#else
throw Error("clone flags are only supported on Linux");
#endif
} else
pid = doFork(options.allowVfork, wrapper);
if (pid == -1) throw SysError("unable to fork"); if (pid == -1) throw SysError("unable to fork");
return pid; return pid;

View file

@ -307,6 +307,7 @@ struct ProcessOptions
bool dieWithParent = true; bool dieWithParent = true;
bool runExitHandlers = false; bool runExitHandlers = false;
bool allowVfork = false; bool allowVfork = false;
int cloneFlags = 0; // use clone() with the specified flags (Linux only)
}; };
pid_t startProcess(std::function<void()> fun, const ProcessOptions & options = ProcessOptions()); pid_t startProcess(std::function<void()> fun, const ProcessOptions & options = ProcessOptions());

View file

@ -11,6 +11,11 @@ let
{ services.openssh.enable = true; { services.openssh.enable = true;
virtualisation.writableStore = true; virtualisation.writableStore = true;
nix.settings.sandbox = true; nix.settings.sandbox = true;
# Regression test for use of PID namespaces when /proc has
# filesystems mounted on top of it
# (i.e. /proc/sys/fs/binfmt_misc).
boot.binfmt.emulatedSystems = [ "aarch64-linux" ];
}; };
# Trivial Nix expression to build remotely. # Trivial Nix expression to build remotely.