Merge pull request #7802 from edolstra/fix-7783
Fix PID namespace support check
This commit is contained in:
commit
67451d8ed7
6 changed files with 83 additions and 65 deletions
|
@ -209,7 +209,7 @@ void LocalDerivationGoal::tryLocalBuild()
|
||||||
|
|
||||||
#if __linux__
|
#if __linux__
|
||||||
if (useChroot) {
|
if (useChroot) {
|
||||||
if (!mountNamespacesSupported() || !pidNamespacesSupported()) {
|
if (!mountAndPidNamespacesSupported()) {
|
||||||
if (!settings.sandboxFallback)
|
if (!settings.sandboxFallback)
|
||||||
throw Error("this system does not support the kernel namespaces that are required for sandboxing; use '--no-sandbox' to disable sandboxing");
|
throw Error("this system does not support the kernel namespaces that are required for sandboxing; use '--no-sandbox' to disable sandboxing");
|
||||||
debug("auto-disabling sandboxing because the prerequisite namespaces are not available");
|
debug("auto-disabling sandboxing because the prerequisite namespaces are not available");
|
||||||
|
@ -385,12 +385,6 @@ void LocalDerivationGoal::cleanupPostOutputsRegisteredModeNonCheck()
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
int childEntry(void * arg)
|
|
||||||
{
|
|
||||||
((LocalDerivationGoal *) arg)->runChild();
|
|
||||||
return 1;
|
|
||||||
}
|
|
||||||
|
|
||||||
#if __linux__
|
#if __linux__
|
||||||
static void linkOrCopy(const Path & from, const Path & to)
|
static void linkOrCopy(const Path & from, const Path & to)
|
||||||
{
|
{
|
||||||
|
@ -916,21 +910,15 @@ void LocalDerivationGoal::startBuilder()
|
||||||
if (getuid() == 0 && setgroups(0, 0) == -1)
|
if (getuid() == 0 && setgroups(0, 0) == -1)
|
||||||
throw SysError("setgroups failed");
|
throw SysError("setgroups failed");
|
||||||
|
|
||||||
size_t stackSize = 1 * 1024 * 1024;
|
ProcessOptions options;
|
||||||
char * stack = (char *) mmap(0, stackSize,
|
options.cloneFlags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_PARENT | SIGCHLD;
|
||||||
PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
|
|
||||||
if (stack == MAP_FAILED) throw SysError("allocating stack");
|
|
||||||
|
|
||||||
int flags = CLONE_NEWPID | CLONE_NEWNS | CLONE_NEWIPC | CLONE_NEWUTS | CLONE_PARENT | SIGCHLD;
|
|
||||||
if (privateNetwork)
|
if (privateNetwork)
|
||||||
flags |= CLONE_NEWNET;
|
options.cloneFlags |= CLONE_NEWNET;
|
||||||
if (usingUserNamespace)
|
if (usingUserNamespace)
|
||||||
flags |= CLONE_NEWUSER;
|
options.cloneFlags |= CLONE_NEWUSER;
|
||||||
|
|
||||||
pid_t child = clone(childEntry, stack + stackSize, flags, this);
|
pid_t child = startProcess([&]() { runChild(); }, options);
|
||||||
|
|
||||||
if (child == -1)
|
|
||||||
throw SysError("creating sandboxed builder process using clone()");
|
|
||||||
writeFull(builderOut.writeSide.get(),
|
writeFull(builderOut.writeSide.get(),
|
||||||
fmt("%d %d\n", usingUserNamespace, child));
|
fmt("%d %d\n", usingUserNamespace, child));
|
||||||
_exit(0);
|
_exit(0);
|
||||||
|
|
|
@ -4,7 +4,7 @@
|
||||||
#include "util.hh"
|
#include "util.hh"
|
||||||
#include "finally.hh"
|
#include "finally.hh"
|
||||||
|
|
||||||
#include <mntent.h>
|
#include <sys/mount.h>
|
||||||
|
|
||||||
namespace nix {
|
namespace nix {
|
||||||
|
|
||||||
|
@ -33,63 +33,60 @@ bool userNamespacesSupported()
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
Pid pid = startProcess([&]()
|
try {
|
||||||
{
|
Pid pid = startProcess([&]()
|
||||||
auto res = unshare(CLONE_NEWUSER);
|
{
|
||||||
_exit(res ? 1 : 0);
|
_exit(0);
|
||||||
});
|
}, {
|
||||||
|
.cloneFlags = CLONE_NEWUSER
|
||||||
|
});
|
||||||
|
|
||||||
bool supported = pid.wait() == 0;
|
auto r = pid.wait();
|
||||||
|
assert(!r);
|
||||||
|
} catch (SysError & e) {
|
||||||
|
debug("user namespaces do not work on this system: %s", e.msg());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
if (!supported)
|
return true;
|
||||||
debug("user namespaces do not work on this system");
|
|
||||||
|
|
||||||
return supported;
|
|
||||||
}();
|
}();
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool mountNamespacesSupported()
|
bool mountAndPidNamespacesSupported()
|
||||||
{
|
{
|
||||||
static auto res = [&]() -> bool
|
static auto res = [&]() -> bool
|
||||||
{
|
{
|
||||||
bool useUserNamespace = userNamespacesSupported();
|
try {
|
||||||
|
|
||||||
Pid pid = startProcess([&]()
|
Pid pid = startProcess([&]()
|
||||||
{
|
{
|
||||||
auto res = unshare(CLONE_NEWNS | (useUserNamespace ? CLONE_NEWUSER : 0));
|
/* Make sure we don't remount the parent's /proc. */
|
||||||
_exit(res ? 1 : 0);
|
if (mount(0, "/", 0, MS_PRIVATE | MS_REC, 0) == -1)
|
||||||
});
|
_exit(1);
|
||||||
|
|
||||||
bool supported = pid.wait() == 0;
|
/* Test whether we can remount /proc. The kernel disallows
|
||||||
|
this if /proc is not fully visible, i.e. if there are
|
||||||
|
filesystems mounted on top of files inside /proc. See
|
||||||
|
https://lore.kernel.org/lkml/87tvsrjai0.fsf@xmission.com/T/. */
|
||||||
|
if (mount("none", "/proc", "proc", 0, 0) == -1)
|
||||||
|
_exit(2);
|
||||||
|
|
||||||
if (!supported)
|
_exit(0);
|
||||||
debug("mount namespaces do not work on this system");
|
}, {
|
||||||
|
.cloneFlags = CLONE_NEWNS | CLONE_NEWPID | (userNamespacesSupported() ? CLONE_NEWUSER : 0)
|
||||||
|
});
|
||||||
|
|
||||||
return supported;
|
if (pid.wait()) {
|
||||||
}();
|
debug("PID namespaces do not work on this system: cannot remount /proc");
|
||||||
return res;
|
|
||||||
}
|
|
||||||
|
|
||||||
bool pidNamespacesSupported()
|
|
||||||
{
|
|
||||||
static auto res = [&]() -> bool
|
|
||||||
{
|
|
||||||
/* Check whether /proc is fully visible, i.e. there are no
|
|
||||||
filesystems mounted on top of files inside /proc. If this
|
|
||||||
is not the case, then we cannot mount a new /proc inside
|
|
||||||
the sandbox that matches the sandbox's PID namespace.
|
|
||||||
See https://lore.kernel.org/lkml/87tvsrjai0.fsf@xmission.com/T/. */
|
|
||||||
auto fp = fopen("/proc/mounts", "r");
|
|
||||||
if (!fp) return false;
|
|
||||||
Finally delFP = [&]() { fclose(fp); };
|
|
||||||
|
|
||||||
while (auto ent = getmntent(fp))
|
|
||||||
if (hasPrefix(std::string_view(ent->mnt_dir), "/proc/")) {
|
|
||||||
debug("PID namespaces do not work because /proc is not fully visible; disabling sandboxing");
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
} catch (SysError & e) {
|
||||||
|
debug("mount namespaces do not work on this system: %s", e.msg());
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
}();
|
}();
|
||||||
return res;
|
return res;
|
||||||
|
|
|
@ -6,9 +6,7 @@ namespace nix {
|
||||||
|
|
||||||
bool userNamespacesSupported();
|
bool userNamespacesSupported();
|
||||||
|
|
||||||
bool mountNamespacesSupported();
|
bool mountAndPidNamespacesSupported();
|
||||||
|
|
||||||
bool pidNamespacesSupported();
|
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -36,6 +36,7 @@
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
#include <sys/prctl.h>
|
#include <sys/prctl.h>
|
||||||
#include <sys/resource.h>
|
#include <sys/resource.h>
|
||||||
|
#include <sys/mman.h>
|
||||||
|
|
||||||
#include <cmath>
|
#include <cmath>
|
||||||
#endif
|
#endif
|
||||||
|
@ -1064,9 +1065,17 @@ static pid_t doFork(bool allowVfork, std::function<void()> fun)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static int childEntry(void * arg)
|
||||||
|
{
|
||||||
|
auto main = (std::function<void()> *) arg;
|
||||||
|
(*main)();
|
||||||
|
return 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
pid_t startProcess(std::function<void()> fun, const ProcessOptions & options)
|
pid_t startProcess(std::function<void()> fun, const ProcessOptions & options)
|
||||||
{
|
{
|
||||||
auto wrapper = [&]() {
|
std::function<void()> wrapper = [&]() {
|
||||||
if (!options.allowVfork)
|
if (!options.allowVfork)
|
||||||
logger = makeSimpleLogger();
|
logger = makeSimpleLogger();
|
||||||
try {
|
try {
|
||||||
|
@ -1086,7 +1095,27 @@ pid_t startProcess(std::function<void()> fun, const ProcessOptions & options)
|
||||||
_exit(1);
|
_exit(1);
|
||||||
};
|
};
|
||||||
|
|
||||||
pid_t pid = doFork(options.allowVfork, wrapper);
|
pid_t pid = -1;
|
||||||
|
|
||||||
|
if (options.cloneFlags) {
|
||||||
|
#ifdef __linux__
|
||||||
|
// Not supported, since then we don't know when to free the stack.
|
||||||
|
assert(!(options.cloneFlags & CLONE_VM));
|
||||||
|
|
||||||
|
size_t stackSize = 1 * 1024 * 1024;
|
||||||
|
auto stack = (char *) mmap(0, stackSize,
|
||||||
|
PROT_WRITE | PROT_READ, MAP_PRIVATE | MAP_ANONYMOUS | MAP_STACK, -1, 0);
|
||||||
|
if (stack == MAP_FAILED) throw SysError("allocating stack");
|
||||||
|
|
||||||
|
Finally freeStack([&]() { munmap(stack, stackSize); });
|
||||||
|
|
||||||
|
pid = clone(childEntry, stack + stackSize, options.cloneFlags | SIGCHLD, &wrapper);
|
||||||
|
#else
|
||||||
|
throw Error("clone flags are only supported on Linux");
|
||||||
|
#endif
|
||||||
|
} else
|
||||||
|
pid = doFork(options.allowVfork, wrapper);
|
||||||
|
|
||||||
if (pid == -1) throw SysError("unable to fork");
|
if (pid == -1) throw SysError("unable to fork");
|
||||||
|
|
||||||
return pid;
|
return pid;
|
||||||
|
|
|
@ -307,6 +307,7 @@ struct ProcessOptions
|
||||||
bool dieWithParent = true;
|
bool dieWithParent = true;
|
||||||
bool runExitHandlers = false;
|
bool runExitHandlers = false;
|
||||||
bool allowVfork = false;
|
bool allowVfork = false;
|
||||||
|
int cloneFlags = 0; // use clone() with the specified flags (Linux only)
|
||||||
};
|
};
|
||||||
|
|
||||||
pid_t startProcess(std::function<void()> fun, const ProcessOptions & options = ProcessOptions());
|
pid_t startProcess(std::function<void()> fun, const ProcessOptions & options = ProcessOptions());
|
||||||
|
|
|
@ -11,6 +11,11 @@ let
|
||||||
{ services.openssh.enable = true;
|
{ services.openssh.enable = true;
|
||||||
virtualisation.writableStore = true;
|
virtualisation.writableStore = true;
|
||||||
nix.settings.sandbox = true;
|
nix.settings.sandbox = true;
|
||||||
|
|
||||||
|
# Regression test for use of PID namespaces when /proc has
|
||||||
|
# filesystems mounted on top of it
|
||||||
|
# (i.e. /proc/sys/fs/binfmt_misc).
|
||||||
|
boot.binfmt.emulatedSystems = [ "aarch64-linux" ];
|
||||||
};
|
};
|
||||||
|
|
||||||
# Trivial Nix expression to build remotely.
|
# Trivial Nix expression to build remotely.
|
||||||
|
|
Loading…
Reference in a new issue