2015-06-09 12:21:21 +00:00
|
|
|
|
#include <algorithm>
|
|
|
|
|
|
|
|
|
|
#include <sys/types.h>
|
|
|
|
|
#include <sys/stat.h>
|
|
|
|
|
#include <fcntl.h>
|
|
|
|
|
|
|
|
|
|
#include "serve-protocol.hh"
|
2015-07-07 08:25:33 +00:00
|
|
|
|
#include "state.hh"
|
|
|
|
|
#include "util.hh"
|
2015-06-09 12:21:21 +00:00
|
|
|
|
#include "worker-protocol.hh"
|
|
|
|
|
|
|
|
|
|
using namespace nix;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
struct Child
|
|
|
|
|
{
|
|
|
|
|
Pid pid;
|
|
|
|
|
AutoCloseFD to, from;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
|
2015-06-25 13:29:22 +00:00
|
|
|
|
/* Add every element of ‘src’, in order, to the end of ‘dst’.  ‘src’
   is expected to be a different container than ‘dst’. */
static void append(Strings & dst, const Strings & src)
{
    for (const auto & item : src)
        dst.push_back(item);
}
|
|
|
|
|
|
|
|
|
|
|
2015-08-26 11:43:02 +00:00
|
|
|
|
/* Start the process that serves ‘nix-store --serve --write’ for
   ‘machine’ and hand the parent's ends of its stdin/stdout pipes to
   ‘child’.

   If the machine's SSH name is "localhost", ‘nix-store’ is executed
   directly; otherwise it is run on the machine through ‘ssh’, using
   the machine's key (if set) and, when a public host key is
   configured, a known-hosts file written to ‘tmpDir’/host-key.

   machine   the machine to connect to (sshName, sshKey and
             sshPublicHostKey are read)
   tmpDir    directory in which the temporary known-hosts file is created
   stderrFD  file descriptor that becomes the child's stderr
   child     receives the process ID and the pipe ends

   Errors while creating the pipes or starting the process are
   reported through whatever Pipe::create() / startProcess() throw;
   the SysErrors below are thrown inside the child process. */
static void openConnection(Machine::ptr machine, Path tmpDir, int stderrFD, Child & child)
{
    Pipe to, from;
    to.create();
    from.create();

    child.pid = startProcess([&]() {

        /* Wire the pipes and the log descriptor to the standard
           streams of the program we are about to execute. */
        if (dup2(to.readSide, STDIN_FILENO) == -1)
            throw SysError("cannot dup input pipe to stdin");

        if (dup2(from.writeSide, STDOUT_FILENO) == -1)
            throw SysError("cannot dup output pipe to stdout");

        if (dup2(stderrFD, STDERR_FILENO) == -1)
            throw SysError("cannot dup stderr");

        Strings argv;
        if (machine->sshName == "localhost")
            argv = {"nix-store", "--serve", "--write"};
        else {
            argv = {"ssh", machine->sshName};
            if (machine->sshKey != "") append(argv, {"-i", machine->sshKey});
            if (machine->sshPublicHostKey != "") {
                /* Write a one-line known-hosts file for this machine.
                   The host name is the part of sshName after the first
                   ‘@’, or all of sshName if there is none. */
                Path fileName = tmpDir + "/host-key";
                auto p = machine->sshName.find("@");
                string host = p != string::npos ? string(machine->sshName, p + 1) : machine->sshName;
                writeFile(fileName, host + " " + machine->sshPublicHostKey + "\n");
                append(argv, {"-oUserKnownHostsFile=" + fileName});
            }
            append(argv,
                { "-x", "-a", "-oBatchMode=yes", "-oConnectTimeout=60", "-oTCPKeepAlive=yes"
                , "--", "nix-store", "--serve", "--write" });
        }

        execvp(argv.front().c_str(), (char * *) stringsToCharPtrs(argv).data()); // FIXME: remove cast

        /* execvp() only returns on failure.  Name the program we
           actually tried to run: for "localhost" that is ‘nix-store’,
           not ‘ssh’. */
        throw SysError(format("cannot start ‘%1%’") % argv.front());
    });

    /* The parent does not use the child's ends of the pipes. */
    to.readSide.close();
    from.writeSide.close();

    /* NOTE(review): borrow() presumably releases the descriptor from
       the Pipe so that ‘child’ becomes its owner — confirm. */
    child.to = to.writeSide.borrow();
    child.from = from.readSide.borrow();
}
|
|
|
|
|
|
|
|
|
|
|
2016-02-15 20:10:29 +00:00
|
|
|
|
static void copyClosureTo(ref<Store> destStore,
|
2015-06-09 12:21:21 +00:00
|
|
|
|
FdSource & from, FdSink & to, const PathSet & paths,
|
2015-07-07 12:04:36 +00:00
|
|
|
|
counter & bytesSent,
|
2015-06-09 12:21:21 +00:00
|
|
|
|
bool useSubstitutes = false)
|
|
|
|
|
{
|
|
|
|
|
PathSet closure;
|
|
|
|
|
for (auto & path : paths)
|
2016-02-15 20:10:29 +00:00
|
|
|
|
destStore->computeFSClosure(path, closure);
|
2015-06-09 12:21:21 +00:00
|
|
|
|
|
|
|
|
|
/* Send the "query valid paths" command with the "lock" option
|
|
|
|
|
enabled. This prevents a race where the remote host
|
|
|
|
|
garbage-collect paths that are already there. Optionally, ask
|
|
|
|
|
the remote host to substitute missing paths. */
|
2015-07-20 23:45:00 +00:00
|
|
|
|
to << cmdQueryValidPaths << 1 << useSubstitutes << closure;
|
2015-06-09 12:21:21 +00:00
|
|
|
|
to.flush();
|
|
|
|
|
|
|
|
|
|
/* Get back the set of paths that are already valid on the remote
|
|
|
|
|
host. */
|
|
|
|
|
auto present = readStorePaths<PathSet>(from);
|
|
|
|
|
|
2015-06-09 14:03:41 +00:00
|
|
|
|
if (present.size() == closure.size()) return;
|
|
|
|
|
|
2016-02-15 20:10:29 +00:00
|
|
|
|
Paths sorted = destStore->topoSortPaths(closure);
|
2015-06-09 14:03:41 +00:00
|
|
|
|
|
|
|
|
|
Paths missing;
|
|
|
|
|
for (auto i = sorted.rbegin(); i != sorted.rend(); ++i)
|
|
|
|
|
if (present.find(*i) == present.end()) missing.push_back(*i);
|
2015-06-09 12:21:21 +00:00
|
|
|
|
|
2015-06-17 09:45:20 +00:00
|
|
|
|
printMsg(lvlDebug, format("sending %1% missing paths") % missing.size());
|
2015-06-09 12:21:21 +00:00
|
|
|
|
|
2015-06-25 14:46:59 +00:00
|
|
|
|
for (auto & p : missing)
|
2016-02-15 20:10:29 +00:00
|
|
|
|
bytesSent += destStore->queryPathInfo(p).narSize;
|
2015-06-25 14:46:59 +00:00
|
|
|
|
|
2015-07-20 23:45:00 +00:00
|
|
|
|
to << cmdImportPaths;
|
2016-02-15 20:10:29 +00:00
|
|
|
|
destStore->exportPaths(missing, false, to);
|
2015-06-09 14:03:41 +00:00
|
|
|
|
to.flush();
|
|
|
|
|
|
|
|
|
|
if (readInt(from) != 1)
|
|
|
|
|
throw Error("remote machine failed to import closure");
|
2015-06-09 12:21:21 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2016-02-15 20:10:29 +00:00
|
|
|
|
static void copyClosureFrom(ref<Store> destStore,
|
2016-02-26 14:21:44 +00:00
|
|
|
|
FdSource & from, FdSink & to, const PathSet & paths, counter & bytesReceived,
|
|
|
|
|
std::shared_ptr<FSAccessor> accessor)
|
2015-06-09 12:21:21 +00:00
|
|
|
|
{
|
2015-07-20 23:45:00 +00:00
|
|
|
|
to << cmdExportPaths << 0 << paths;
|
2015-06-09 12:21:21 +00:00
|
|
|
|
to.flush();
|
2016-02-26 14:21:44 +00:00
|
|
|
|
destStore->importPaths(false, from, accessor);
|
2015-06-25 14:46:59 +00:00
|
|
|
|
|
|
|
|
|
for (auto & p : paths)
|
2016-02-15 20:10:29 +00:00
|
|
|
|
bytesReceived += destStore->queryPathInfo(p).narSize;
|
2015-06-09 12:21:21 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
2016-02-15 20:10:29 +00:00
|
|
|
|
void State::buildRemote(ref<Store> destStore,
|
2015-07-07 08:25:33 +00:00
|
|
|
|
Machine::ptr machine, Step::ptr step,
|
|
|
|
|
unsigned int maxSilentTime, unsigned int buildTimeout,
|
|
|
|
|
RemoteResult & result)
|
2015-06-09 12:21:21 +00:00
|
|
|
|
{
|
2015-07-07 08:25:33 +00:00
|
|
|
|
string base = baseNameOf(step->drvPath);
|
2015-06-19 12:51:59 +00:00
|
|
|
|
result.logFile = logDir + "/" + string(base, 0, 2) + "/" + string(base, 2);
|
|
|
|
|
AutoDelete autoDelete(result.logFile, false);
|
2015-06-09 12:21:21 +00:00
|
|
|
|
|
2015-06-19 12:51:59 +00:00
|
|
|
|
createDirs(dirOf(result.logFile));
|
2015-06-09 12:21:21 +00:00
|
|
|
|
|
2015-06-19 12:51:59 +00:00
|
|
|
|
AutoCloseFD logFD(open(result.logFile.c_str(), O_CREAT | O_TRUNC | O_WRONLY, 0666));
|
|
|
|
|
if (logFD == -1) throw SysError(format("creating log file ‘%1%’") % result.logFile);
|
2015-06-09 12:21:21 +00:00
|
|
|
|
|
2015-08-26 11:43:02 +00:00
|
|
|
|
nix::Path tmpDir = createTempDir();
|
|
|
|
|
AutoDelete tmpDirDel(tmpDir, true);
|
|
|
|
|
|
2015-06-09 12:21:21 +00:00
|
|
|
|
Child child;
|
2015-08-26 11:43:02 +00:00
|
|
|
|
openConnection(machine, tmpDir, logFD, child);
|
2015-06-09 12:21:21 +00:00
|
|
|
|
|
|
|
|
|
logFD.close();
|
|
|
|
|
|
|
|
|
|
FdSource from(child.from);
|
|
|
|
|
FdSink to(child.to);
|
|
|
|
|
|
|
|
|
|
/* Handshake. */
|
2015-07-20 23:45:00 +00:00
|
|
|
|
bool sendDerivation = true;
|
2015-10-06 15:35:08 +00:00
|
|
|
|
unsigned int remoteVersion;
|
|
|
|
|
|
2015-06-17 09:45:20 +00:00
|
|
|
|
try {
|
2015-10-06 15:35:08 +00:00
|
|
|
|
to << SERVE_MAGIC_1 << 0x202;
|
2015-06-17 09:45:20 +00:00
|
|
|
|
to.flush();
|
|
|
|
|
|
|
|
|
|
unsigned int magic = readInt(from);
|
|
|
|
|
if (magic != SERVE_MAGIC_2)
|
2015-07-07 08:25:33 +00:00
|
|
|
|
throw Error(format("protocol mismatch with ‘nix-store --serve’ on ‘%1%’") % machine->sshName);
|
2015-10-06 15:35:08 +00:00
|
|
|
|
remoteVersion = readInt(from);
|
|
|
|
|
if (GET_PROTOCOL_MAJOR(remoteVersion) != 0x200)
|
2015-07-07 08:25:33 +00:00
|
|
|
|
throw Error(format("unsupported ‘nix-store --serve’ protocol version on ‘%1%’") % machine->sshName);
|
2015-10-06 15:35:08 +00:00
|
|
|
|
if (GET_PROTOCOL_MINOR(remoteVersion) >= 1)
|
2015-07-20 23:45:00 +00:00
|
|
|
|
sendDerivation = false;
|
2015-07-31 01:39:20 +00:00
|
|
|
|
|
2015-06-17 09:45:20 +00:00
|
|
|
|
} catch (EndOfFile & e) {
|
|
|
|
|
child.pid.wait(true);
|
2015-07-21 13:53:27 +00:00
|
|
|
|
|
|
|
|
|
{
|
|
|
|
|
/* Disable this machine until a certain period of time has
|
|
|
|
|
passed. This period increases on every consecutive
|
|
|
|
|
failure. However, don't count failures that occurred
|
|
|
|
|
soon after the last one (to take into account steps
|
|
|
|
|
started in parallel). */
|
|
|
|
|
auto info(machine->state->connectInfo.lock());
|
|
|
|
|
auto now = std::chrono::system_clock::now();
|
|
|
|
|
if (info->consecutiveFailures == 0 || info->lastFailure < now - std::chrono::seconds(30)) {
|
|
|
|
|
info->consecutiveFailures = std::min(info->consecutiveFailures + 1, (unsigned int) 4);
|
|
|
|
|
info->lastFailure = now;
|
|
|
|
|
int delta = retryInterval * powf(retryBackoff, info->consecutiveFailures - 1) + (rand() % 30);
|
|
|
|
|
printMsg(lvlInfo, format("will disable machine ‘%1%’ for %2%s") % machine->sshName % delta);
|
|
|
|
|
info->disabledUntil = now + std::chrono::seconds(delta);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2015-06-19 12:51:59 +00:00
|
|
|
|
string s = chomp(readFile(result.logFile));
|
2015-07-07 08:25:33 +00:00
|
|
|
|
throw Error(format("cannot connect to ‘%1%’: %2%") % machine->sshName % s);
|
2015-06-17 09:45:20 +00:00
|
|
|
|
}
|
2015-06-09 12:21:21 +00:00
|
|
|
|
|
2015-07-21 13:53:27 +00:00
|
|
|
|
{
|
|
|
|
|
auto info(machine->state->connectInfo.lock());
|
|
|
|
|
info->consecutiveFailures = 0;
|
|
|
|
|
}
|
|
|
|
|
|
2015-07-20 23:45:00 +00:00
|
|
|
|
/* Gather the inputs. If the remote side is Nix <= 1.9, we have to
|
2015-07-31 01:41:55 +00:00
|
|
|
|
copy the entire closure of ‘drvPath’, as well as the required
|
2015-07-20 23:45:00 +00:00
|
|
|
|
outputs of the input derivations. On Nix > 1.9, we only need to
|
|
|
|
|
copy the immediate sources of the derivation and the required
|
|
|
|
|
outputs of the input derivations. */
|
|
|
|
|
PathSet inputs;
|
2015-07-31 01:41:55 +00:00
|
|
|
|
BasicDerivation basicDrv(step->drv);
|
2015-07-20 23:45:00 +00:00
|
|
|
|
|
|
|
|
|
if (sendDerivation)
|
|
|
|
|
inputs.insert(step->drvPath);
|
|
|
|
|
else
|
|
|
|
|
for (auto & p : step->drv.inputSrcs)
|
|
|
|
|
inputs.insert(p);
|
|
|
|
|
|
2015-07-07 08:25:33 +00:00
|
|
|
|
for (auto & input : step->drv.inputDrvs) {
|
2015-06-17 15:28:59 +00:00
|
|
|
|
Derivation drv2 = readDerivation(input.first);
|
|
|
|
|
for (auto & name : input.second) {
|
|
|
|
|
auto i = drv2.outputs.find(name);
|
2015-07-20 23:45:00 +00:00
|
|
|
|
if (i == drv2.outputs.end()) continue;
|
|
|
|
|
inputs.insert(i->second.path);
|
2015-07-31 01:41:55 +00:00
|
|
|
|
basicDrv.inputSrcs.insert(i->second.path);
|
2015-06-17 15:28:59 +00:00
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2016-02-15 20:10:29 +00:00
|
|
|
|
/* Ensure that the inputs exist in the destination store. This is
|
|
|
|
|
a no-op for regular stores, but for the binary cache store,
|
|
|
|
|
this will copy the inputs to the binary cache from the local
|
|
|
|
|
store. */
|
|
|
|
|
destStore->buildPaths(basicDrv.inputSrcs);
|
|
|
|
|
|
2015-06-09 12:21:21 +00:00
|
|
|
|
/* Copy the input closure. */
|
2016-02-15 20:10:29 +00:00
|
|
|
|
if (/* machine->sshName != "localhost" */ true) {
|
2015-07-10 17:10:14 +00:00
|
|
|
|
auto mc1 = std::make_shared<MaintainCount>(nrStepsWaiting);
|
2015-07-07 12:04:36 +00:00
|
|
|
|
std::lock_guard<std::mutex> sendLock(machine->state->sendLock);
|
2015-07-10 17:10:14 +00:00
|
|
|
|
mc1.reset();
|
|
|
|
|
MaintainCount mc2(nrStepsCopyingTo);
|
|
|
|
|
printMsg(lvlDebug, format("sending closure of ‘%1%’ to ‘%2%’") % step->drvPath % machine->sshName);
|
2016-02-17 09:28:42 +00:00
|
|
|
|
|
|
|
|
|
auto now1 = std::chrono::steady_clock::now();
|
|
|
|
|
|
2016-02-18 15:42:05 +00:00
|
|
|
|
copyClosureTo(destStore, from, to, inputs, bytesSent, true);
|
2016-02-17 09:28:42 +00:00
|
|
|
|
|
|
|
|
|
auto now2 = std::chrono::steady_clock::now();
|
|
|
|
|
|
|
|
|
|
result.overhead += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();
|
2015-06-24 11:19:16 +00:00
|
|
|
|
}
|
2015-06-09 12:21:21 +00:00
|
|
|
|
|
2015-06-19 12:51:59 +00:00
|
|
|
|
autoDelete.cancel();
|
|
|
|
|
|
2015-06-09 12:21:21 +00:00
|
|
|
|
/* Do the build. */
|
2015-07-07 08:25:33 +00:00
|
|
|
|
printMsg(lvlDebug, format("building ‘%1%’ on ‘%2%’") % step->drvPath % machine->sshName);
|
2015-07-20 23:45:00 +00:00
|
|
|
|
|
|
|
|
|
if (sendDerivation)
|
2015-10-06 15:35:08 +00:00
|
|
|
|
to << cmdBuildPaths << PathSet({step->drvPath});
|
2015-07-20 23:45:00 +00:00
|
|
|
|
else
|
2015-10-06 15:35:08 +00:00
|
|
|
|
to << cmdBuildDerivation << step->drvPath << basicDrv;
|
|
|
|
|
to << maxSilentTime << buildTimeout;
|
|
|
|
|
if (GET_PROTOCOL_MINOR(remoteVersion) >= 2)
|
|
|
|
|
to << 64 * 1024 * 1024; // == maxLogSize
|
2015-06-09 12:21:21 +00:00
|
|
|
|
to.flush();
|
2015-07-20 23:45:00 +00:00
|
|
|
|
|
2015-06-09 12:21:21 +00:00
|
|
|
|
result.startTime = time(0);
|
2015-06-22 09:23:00 +00:00
|
|
|
|
int res;
|
|
|
|
|
{
|
|
|
|
|
MaintainCount mc(nrStepsBuilding);
|
|
|
|
|
res = readInt(from);
|
|
|
|
|
}
|
2015-06-09 12:21:21 +00:00
|
|
|
|
result.stopTime = time(0);
|
2015-07-20 23:45:00 +00:00
|
|
|
|
|
|
|
|
|
if (sendDerivation) {
|
|
|
|
|
if (res) {
|
|
|
|
|
result.errorMsg = (format("%1% on ‘%2%’") % readString(from) % machine->sshName).str();
|
|
|
|
|
if (res == 100) result.status = BuildResult::PermanentFailure;
|
|
|
|
|
else if (res == 101) result.status = BuildResult::TimedOut;
|
|
|
|
|
else result.status = BuildResult::MiscFailure;
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
result.status = BuildResult::Built;
|
|
|
|
|
} else {
|
|
|
|
|
result.status = (BuildResult::Status) res;
|
|
|
|
|
result.errorMsg = readString(from);
|
|
|
|
|
if (!result.success()) return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* If the path was substituted or already valid, then we didn't
|
|
|
|
|
get a build log. */
|
|
|
|
|
if (result.status == BuildResult::Substituted || result.status == BuildResult::AlreadyValid) {
|
|
|
|
|
unlink(result.logFile.c_str());
|
|
|
|
|
result.logFile = "";
|
2015-06-09 12:21:21 +00:00
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Copy the output paths. */
|
2016-02-15 20:10:29 +00:00
|
|
|
|
if (/* machine->sshName != "localhost" */ true) {
|
2015-07-07 08:25:33 +00:00
|
|
|
|
printMsg(lvlDebug, format("copying outputs of ‘%1%’ from ‘%2%’") % step->drvPath % machine->sshName);
|
2015-06-25 14:46:59 +00:00
|
|
|
|
PathSet outputs;
|
2015-07-07 08:25:33 +00:00
|
|
|
|
for (auto & output : step->drv.outputs)
|
2015-06-25 14:46:59 +00:00
|
|
|
|
outputs.insert(output.second.path);
|
2015-06-24 11:19:16 +00:00
|
|
|
|
MaintainCount mc(nrStepsCopyingFrom);
|
2016-02-17 09:28:42 +00:00
|
|
|
|
|
2016-02-26 14:21:44 +00:00
|
|
|
|
result.accessor = destStore->getFSAccessor();
|
|
|
|
|
|
2016-02-17 09:28:42 +00:00
|
|
|
|
auto now1 = std::chrono::steady_clock::now();
|
|
|
|
|
|
2016-02-26 14:21:44 +00:00
|
|
|
|
copyClosureFrom(destStore, from, to, outputs, bytesReceived, result.accessor);
|
2016-02-17 09:28:42 +00:00
|
|
|
|
|
|
|
|
|
auto now2 = std::chrono::steady_clock::now();
|
|
|
|
|
|
|
|
|
|
result.overhead += std::chrono::duration_cast<std::chrono::milliseconds>(now2 - now1).count();
|
2015-06-24 11:19:16 +00:00
|
|
|
|
}
|
2015-06-09 12:21:21 +00:00
|
|
|
|
|
|
|
|
|
/* Shut down the connection. */
|
|
|
|
|
child.to.close();
|
|
|
|
|
child.pid.wait(true);
|
|
|
|
|
}
|