Git object hashing in libutil

This is the core functionality but just unit-tested and not yet made
part of the store layer. This is because there is some tech debt around
(a) repeated boilerplate hashing objects (b) better integration of the
new `SourceAccessor` type that needs to be cleaned up first.

Part of RFC 133

Co-Authored-By: Matthew Bauer <mjbauer95@gmail.com>
Co-Authored-By: Carlo Nucera <carlo.nucera@protonmail.com>
Co-authored-by: Robert Hensing <roberth@users.noreply.github.com>
Co-authored-by: Florian Klink <flokli@flokli.de>
This commit is contained in:
John Ericson 2023-11-04 15:35:38 -04:00
parent 9afa697ab6
commit 20b95d6223
13 changed files with 667 additions and 38 deletions

View file

@ -96,6 +96,14 @@ constexpr std::array<ExperimentalFeatureDetails, numXpFeatures> xpFeatureDetails
[`nix`](@docroot@/command-ref/new-cli/nix.md) for details.
)",
},
{
.tag = Xp::GitHashing,
.name = "git-hashing",
.description = R"(
Allow creating (content-addressed) store objects which are hashed via Git's hashing algorithm.
These store objects will not be understandable by older versions of Nix.
)",
},
{
.tag = Xp::RecursiveNix,
.name = "recursive-nix",

View file

@ -22,6 +22,7 @@ enum struct ExperimentalFeature
Flakes,
FetchTree,
NixCommand,
GitHashing,
RecursiveNix,
NoUrlLiterals,
FetchClosure,

View file

@ -1,9 +1,263 @@
#include "git.hh"
#include <cerrno>
#include <algorithm>
#include <vector>
#include <map>
#include <regex>
#include <strings.h> // for strcasecmp
#include "signals.hh"
#include "config.hh"
#include "hash.hh"
#include "posix-source-accessor.hh"
#include "git.hh"
#include "serialise.hh"
namespace nix::git {
using namespace nix;
using namespace std::string_literals;
std::optional<Mode> decodeMode(RawMode m) {
switch (m) {
case (RawMode) Mode::Directory:
case (RawMode) Mode::Executable:
case (RawMode) Mode::Regular:
case (RawMode) Mode::Symlink:
return (Mode) m;
default:
return std::nullopt;
}
}
static std::string getStringUntil(Source & source, char byte)
{
std::string s;
char n[1];
source(std::string_view { n, 1 });
while (*n != byte) {
s += *n;
source(std::string_view { n, 1 });
}
return s;
}
static std::string getString(Source & source, int n)
{
std::string v;
v.resize(n);
source(v);
return v;
}
void parse(
ParseSink & sink,
const Path & sinkPath,
Source & source,
std::function<SinkHook> hook,
const ExperimentalFeatureSettings & xpSettings)
{
xpSettings.require(Xp::GitHashing);
auto type = getString(source, 5);
if (type == "blob ") {
sink.createRegularFile(sinkPath);
unsigned long long size = std::stoi(getStringUntil(source, 0));
sink.preallocateContents(size);
unsigned long long left = size;
std::string buf;
buf.reserve(65536);
while (left) {
checkInterrupt();
buf.resize(std::min((unsigned long long)buf.capacity(), left));
source(buf);
sink.receiveContents(buf);
left -= buf.size();
}
} else if (type == "tree ") {
unsigned long long size = std::stoi(getStringUntil(source, 0));
unsigned long long left = size;
sink.createDirectory(sinkPath);
while (left) {
std::string perms = getStringUntil(source, ' ');
left -= perms.size();
left -= 1;
RawMode rawMode = std::stoi(perms, 0, 8);
auto modeOpt = decodeMode(rawMode);
if (!modeOpt)
throw Error("Unknown Git permission: %o", perms);
auto mode = std::move(*modeOpt);
std::string name = getStringUntil(source, '\0');
left -= name.size();
left -= 1;
std::string hashs = getString(source, 20);
left -= 20;
Hash hash(htSHA1);
std::copy(hashs.begin(), hashs.end(), hash.hash);
hook(name, TreeEntry {
.mode = mode,
.hash = hash,
});
if (mode == Mode::Executable)
sink.isExecutable();
}
} else throw Error("input doesn't look like a Git object");
}
std::optional<Mode> convertMode(SourceAccessor::Type type)
{
switch (type) {
case SourceAccessor::tSymlink: return Mode::Symlink;
case SourceAccessor::tRegular: return Mode::Regular;
case SourceAccessor::tDirectory: return Mode::Directory;
case SourceAccessor::tMisc: return std::nullopt;
default: abort();
}
}
void restore(ParseSink & sink, Source & source, std::function<RestoreHook> hook)
{
parse(sink, "", source, [&](Path name, TreeEntry entry) {
auto [accessor, from] = hook(entry.hash);
auto stat = accessor->lstat(from);
auto gotOpt = convertMode(stat.type);
if (!gotOpt)
throw Error("file '%s' (git hash %s) has an unsupported type",
from,
entry.hash.to_string(HashFormat::Base16, false));
auto & got = *gotOpt;
if (got != entry.mode)
throw Error("git mode of file '%s' (git hash %s) is %o but expected %o",
from,
entry.hash.to_string(HashFormat::Base16, false),
(RawMode) got,
(RawMode) entry.mode);
copyRecursive(
*accessor, from,
sink, name);
});
}
void dumpBlobPrefix(
uint64_t size, Sink & sink,
const ExperimentalFeatureSettings & xpSettings)
{
xpSettings.require(Xp::GitHashing);
auto s = fmt("blob %d\0"s, std::to_string(size));
sink(s);
}
void dumpTree(const Tree & entries, Sink & sink,
const ExperimentalFeatureSettings & xpSettings)
{
xpSettings.require(Xp::GitHashing);
std::string v1;
for (auto & [name, entry] : entries) {
auto name2 = name;
if (entry.mode == Mode::Directory) {
assert(name2.back() == '/');
name2.pop_back();
}
v1 += fmt("%o %s\0"s, static_cast<RawMode>(entry.mode), name2);
std::copy(entry.hash.hash, entry.hash.hash + entry.hash.hashSize, std::back_inserter(v1));
}
{
auto s = fmt("tree %d\0"s, v1.size());
sink(s);
}
sink(v1);
}
Mode dump(
SourceAccessor & accessor, const CanonPath & path,
Sink & sink,
std::function<DumpHook> hook,
PathFilter & filter,
const ExperimentalFeatureSettings & xpSettings)
{
auto st = accessor.lstat(path);
switch (st.type) {
case SourceAccessor::tRegular:
{
accessor.readFile(path, sink, [&](uint64_t size) {
dumpBlobPrefix(size, sink, xpSettings);
});
return st.isExecutable
? Mode::Executable
: Mode::Regular;
}
case SourceAccessor::tDirectory:
{
Tree entries;
for (auto & [name, _] : accessor.readDirectory(path)) {
auto child = path + name;
if (!filter(child.abs())) continue;
auto entry = hook(child);
auto name2 = name;
if (entry.mode == Mode::Directory)
name2 += "/";
entries.insert_or_assign(std::move(name2), std::move(entry));
}
dumpTree(entries, sink, xpSettings);
return Mode::Directory;
}
case SourceAccessor::tSymlink:
case SourceAccessor::tMisc:
default:
throw Error("file '%1%' has an unsupported type", path);
}
}
TreeEntry dumpHash(
HashType ht,
SourceAccessor & accessor, const CanonPath & path, PathFilter & filter)
{
std::function<DumpHook> hook;
hook = [&](const CanonPath & path) -> TreeEntry {
auto hashSink = HashSink(ht);
auto mode = dump(accessor, path, hashSink, hook, filter);
auto hash = hashSink.finish().first;
return {
.mode = mode,
.hash = hash,
};
};
return hook(path);
}
namespace nix {
namespace git {
std::optional<LsRemoteRefLine> parseLsRemoteLine(std::string_view line)
{
@ -22,4 +276,3 @@ std::optional<LsRemoteRefLine> parseLsRemoteLine(std::string_view line)
}
}
}

View file

@ -5,9 +5,127 @@
#include <string_view>
#include <optional>
namespace nix {
#include "types.hh"
#include "serialise.hh"
#include "hash.hh"
#include "source-accessor.hh"
#include "fs-sink.hh"
namespace git {
namespace nix::git {
using RawMode = uint32_t;
enum struct Mode : RawMode {
Directory = 0040000,
Executable = 0100755,
Regular = 0100644,
Symlink = 0120000,
};
std::optional<Mode> decodeMode(RawMode m);
/**
* An anonymous Git tree object entry (no name part).
*/
struct TreeEntry
{
Mode mode;
Hash hash;
GENERATE_CMP(TreeEntry, me->mode, me->hash);
};
/**
* A Git tree object, fully decoded and stored in memory.
*
* Directory names must end in a `/` for sake of sorting. See
* https://github.com/mirage/irmin/issues/352
*/
using Tree = std::map<std::string, TreeEntry>;
/**
* Callback for processing a child hash with `parse`
*
* The function should
*
* 1. Obtain the file system objects denoted by `gitHash`
*
* 2. Ensure they match `mode`
*
* 3. Feed them into the same sink `parse` was called with
*
* Implementations may seek to memoize resources (bandwidth, storage,
* etc.) for the same Git hash.
*/
using SinkHook = void(const Path & name, TreeEntry entry);
void parse(
ParseSink & sink, const Path & sinkPath,
Source & source,
std::function<SinkHook> hook,
const ExperimentalFeatureSettings & xpSettings = experimentalFeatureSettings);
/**
* Assists with writing a `SinkHook` step (2).
*/
std::optional<Mode> convertMode(SourceAccessor::Type type);
/**
* Simplified version of `SinkHook` for `restore`.
*
* Given a `Hash`, return a `SourceAccessor` and `CanonPath` pointing to
* the file system object with that path.
*/
using RestoreHook = std::pair<SourceAccessor *, CanonPath>(Hash);
/**
* Wrapper around `parse` and `RestoreSink`
*/
void restore(ParseSink & sink, Source & source, std::function<RestoreHook> hook);
/**
* Dumps a single file to a sink
*
* @param xpSettings for testing purposes
*/
void dumpBlobPrefix(
uint64_t size, Sink & sink,
const ExperimentalFeatureSettings & xpSettings = experimentalFeatureSettings);
/**
* Dumps a representation of a git tree to a sink
*/
void dumpTree(
const Tree & entries, Sink & sink,
const ExperimentalFeatureSettings & xpSettings = experimentalFeatureSettings);
/**
* Callback for processing a child with `dump`
*
* The function should return the Git hash and mode of the file at the
* given path in the accessor passed to `dump`.
*
* Note that if the child is a directory, its child in must also be so
* processed in order to compute this information.
*/
using DumpHook = TreeEntry(const CanonPath & path);
Mode dump(
SourceAccessor & accessor, const CanonPath & path,
Sink & sink,
std::function<DumpHook> hook,
PathFilter & filter = defaultPathFilter,
const ExperimentalFeatureSettings & xpSettings = experimentalFeatureSettings);
/**
* Recursively dumps path, hashing as we go.
*
* A smaller wrapper around `dump`.
*/
TreeEntry dumpHash(
HashType ht,
SourceAccessor & accessor, const CanonPath & path,
PathFilter & filter = defaultPathFilter);
/**
* A line from the output of `git ls-remote --symref`.
@ -16,15 +134,17 @@ namespace git {
*
* - Symbolic references of the form
*
* ref: {target} {reference}
*
* where {target} is itself a reference and {reference} is optional
* ```
* ref: {target} {reference}
* ```
* where {target} is itself a reference and {reference} is optional
*
* - Object references of the form
*
* {target} {reference}
*
* where {target} is a commit id and {reference} is mandatory
* ```
* {target} {reference}
* ```
* where {target} is a commit id and {reference} is mandatory
*/
struct LsRemoteRefLine {
enum struct Kind {
@ -36,8 +156,9 @@ struct LsRemoteRefLine {
std::optional<std::string> reference;
};
/**
* Parse an `LsRemoteRefLine`
*/
std::optional<LsRemoteRefLine> parseLsRemoteLine(std::string_view line);
}
}

View file

@ -74,6 +74,10 @@ void Source::operator () (char * data, size_t len)
}
}
void Source::operator () (std::string_view data)
{
(*this)((char *)data.data(), data.size());
}
void Source::drainInto(Sink & sink)
{

View file

@ -73,6 +73,7 @@ struct Source
* an error if it is not going to be available.
*/
void operator () (char * data, size_t len);
void operator () (std::string_view data);
/**
* Store up to len in the buffer pointed to by data, and

View file

@ -1,33 +1,236 @@
#include "git.hh"
#include <gtest/gtest.h>
#include "git.hh"
#include "memory-source-accessor.hh"
#include "tests/characterization.hh"
namespace nix {
TEST(GitLsRemote, parseSymrefLineWithReference) {
auto line = "ref: refs/head/main HEAD";
auto res = git::parseLsRemoteLine(line);
ASSERT_TRUE(res.has_value());
ASSERT_EQ(res->kind, git::LsRemoteRefLine::Kind::Symbolic);
ASSERT_EQ(res->target, "refs/head/main");
ASSERT_EQ(res->reference, "HEAD");
using namespace git;
class GitTest : public CharacterizationTest
{
Path unitTestData = getUnitTestData() + "/libutil/git";
public:
Path goldenMaster(std::string_view testStem) const override {
return unitTestData + "/" + testStem;
}
TEST(GitLsRemote, parseSymrefLineWithNoReference) {
auto line = "ref: refs/head/main";
auto res = git::parseLsRemoteLine(line);
ASSERT_TRUE(res.has_value());
ASSERT_EQ(res->kind, git::LsRemoteRefLine::Kind::Symbolic);
ASSERT_EQ(res->target, "refs/head/main");
ASSERT_EQ(res->reference, std::nullopt);
}
/**
* We set these in tests rather than the regular globals so we don't have
* to worry about race conditions if the tests run concurrently.
*/
ExperimentalFeatureSettings mockXpSettings;
TEST(GitLsRemote, parseObjectRefLine) {
auto line = "abc123 refs/head/main";
auto res = git::parseLsRemoteLine(line);
ASSERT_TRUE(res.has_value());
ASSERT_EQ(res->kind, git::LsRemoteRefLine::Kind::Object);
ASSERT_EQ(res->target, "abc123");
ASSERT_EQ(res->reference, "refs/head/main");
private:
void SetUp() override
{
mockXpSettings.set("experimental-features", "git-hashing");
}
};
TEST(GitMode, gitMode_directory) {
Mode m = Mode::Directory;
RawMode r = 0040000;
ASSERT_EQ(static_cast<RawMode>(m), r);
ASSERT_EQ(decodeMode(r), std::optional { m });
};
TEST(GitMode, gitMode_executable) {
Mode m = Mode::Executable;
RawMode r = 0100755;
ASSERT_EQ(static_cast<RawMode>(m), r);
ASSERT_EQ(decodeMode(r), std::optional { m });
};
TEST(GitMode, gitMode_regular) {
Mode m = Mode::Regular;
RawMode r = 0100644;
ASSERT_EQ(static_cast<RawMode>(m), r);
ASSERT_EQ(decodeMode(r), std::optional { m });
};
TEST(GitMode, gitMode_symlink) {
Mode m = Mode::Symlink;
RawMode r = 0120000;
ASSERT_EQ(static_cast<RawMode>(m), r);
ASSERT_EQ(decodeMode(r), std::optional { m });
};
TEST_F(GitTest, blob_read) {
readTest("hello-world-blob.bin", [&](const auto & encoded) {
StringSource in { encoded };
StringSink out;
RegularFileSink out2 { out };
parse(out2, "", in, [](auto &, auto) {}, mockXpSettings);
auto expected = readFile(goldenMaster("hello-world.bin"));
ASSERT_EQ(out.s, expected);
});
}
TEST_F(GitTest, blob_write) {
writeTest("hello-world-blob.bin", [&]() {
auto decoded = readFile(goldenMaster("hello-world.bin"));
StringSink s;
dumpBlobPrefix(decoded.size(), s, mockXpSettings);
s(decoded);
return s.s;
});
}
/**
* This data is for "shallow" tree tests. However, we use "real" hashes
* so that we can check our test data in the corresponding functional
* test (`git-hashing/unit-test-data`).
*/
const static Tree tree = {
{
"Foo",
{
.mode = Mode::Regular,
// hello world with special chars from above
.hash = Hash::parseAny("63ddb340119baf8492d2da53af47e8c7cfcd5eb2", htSHA1),
},
},
{
"bAr",
{
.mode = Mode::Executable,
// ditto
.hash = Hash::parseAny("63ddb340119baf8492d2da53af47e8c7cfcd5eb2", htSHA1),
},
},
{
"baZ/",
{
.mode = Mode::Directory,
// Empty directory hash
.hash = Hash::parseAny("4b825dc642cb6eb9a060e54bf8d69288fbee4904", htSHA1),
},
},
};
TEST_F(GitTest, tree_read) {
readTest("tree.bin", [&](const auto & encoded) {
StringSource in { encoded };
NullParseSink out;
Tree got;
parse(out, "", in, [&](auto & name, auto entry) {
auto name2 = name;
if (entry.mode == Mode::Directory)
name2 += '/';
got.insert_or_assign(name2, std::move(entry));
}, mockXpSettings);
ASSERT_EQ(got, tree);
});
}
TEST_F(GitTest, tree_write) {
writeTest("tree.bin", [&]() {
StringSink s;
dumpTree(tree, s, mockXpSettings);
return s.s;
});
}
TEST_F(GitTest, both_roundrip) {
using File = MemorySourceAccessor::File;
MemorySourceAccessor files;
files.root = File::Directory {
.contents {
{
"foo",
File::Regular {
.contents = "hello\n\0\n\tworld!",
},
},
{
"bar",
File::Directory {
.contents = {
{
"baz",
File::Regular {
.executable = true,
.contents = "good day,\n\0\n\tworld!",
},
},
},
},
},
},
};
std::map<Hash, std::string> cas;
std::function<DumpHook> dumpHook;
dumpHook = [&](const CanonPath & path) {
StringSink s;
HashSink hashSink { htSHA1 };
TeeSink s2 { s, hashSink };
auto mode = dump(
files, path, s2, dumpHook,
defaultPathFilter, mockXpSettings);
auto hash = hashSink.finish().first;
cas.insert_or_assign(hash, std::move(s.s));
return TreeEntry {
.mode = mode,
.hash = hash,
};
};
auto root = dumpHook(CanonPath::root);
MemorySourceAccessor files2;
MemorySink sinkFiles2 { files2 };
std::function<void(const Path, const Hash &)> mkSinkHook;
mkSinkHook = [&](const Path prefix, const Hash & hash) {
StringSource in { cas[hash] };
parse(sinkFiles2, prefix, in, [&](const Path & name, const auto & entry) {
mkSinkHook(prefix + "/" + name, entry.hash);
}, mockXpSettings);
};
mkSinkHook("", root.hash);
ASSERT_EQ(files, files2);
}
TEST(GitLsRemote, parseSymrefLineWithReference) {
auto line = "ref: refs/head/main HEAD";
auto res = parseLsRemoteLine(line);
ASSERT_TRUE(res.has_value());
ASSERT_EQ(res->kind, LsRemoteRefLine::Kind::Symbolic);
ASSERT_EQ(res->target, "refs/head/main");
ASSERT_EQ(res->reference, "HEAD");
}
TEST(GitLsRemote, parseSymrefLineWithNoReference) {
auto line = "ref: refs/head/main";
auto res = parseLsRemoteLine(line);
ASSERT_TRUE(res.has_value());
ASSERT_EQ(res->kind, LsRemoteRefLine::Kind::Symbolic);
ASSERT_EQ(res->target, "refs/head/main");
ASSERT_EQ(res->reference, std::nullopt);
}
TEST(GitLsRemote, parseObjectRefLine) {
auto line = "abc123 refs/head/main";
auto res = parseLsRemoteLine(line);
ASSERT_TRUE(res.has_value());
ASSERT_EQ(res->kind, LsRemoteRefLine::Kind::Object);
ASSERT_EQ(res->target, "abc123");
ASSERT_EQ(res->reference, "refs/head/main");
}
}

View file

@ -27,3 +27,7 @@ libutil-tests_CXXFLAGS += -I src/libutil
libutil-tests_LIBS = libutil
libutil-tests_LDFLAGS := -lrapidcheck $(GTEST_LIBS)
check: unit-test-data/libutil/git/check-data.sh.test
$(eval $(call run-test,unit-test-data/libutil/git/check-data.sh))

View file

@ -0,0 +1,31 @@
#!/usr/bin/env bash
set -eu -o pipefail
export TEST_ROOT=$(realpath ${TMPDIR:-/tmp}/nix-test)/git-hashing/unit-test-data
mkdir -p $TEST_ROOT
repo="$TEST_ROOT/scratch"
git init "$repo"
git -C "$repo" config user.email "you@example.com"
git -C "$repo" config user.name "Your Name"
# `-w` to write for tree test
freshlyAddedHash=$(git -C "$repo" hash-object -w -t blob --stdin < "./hello-world.bin")
encodingHash=$(sha1sum -b < "./hello-world-blob.bin" | head -c 40)
# If the hashes match, then `hello-world-blob.bin` must be the encoding
# of `hello-world.bin`.
[[ "$encodingHash" == "$freshlyAddedHash" ]]
# Create empty directory object for tree test
echo -n | git -C "$repo" hash-object -w -t tree --stdin
# Relies on both child hashes already existing in the git store
freshlyAddedHash=$(git -C "$repo" mktree < "./tree.txt")
encodingHash=$(sha1sum -b < "./tree.bin" | head -c 40)
# If the hashes match, then `tree.bin` must be the encoding of the
# directory denoted by `tree.txt` interpreted as git directory listing.
[[ "$encodingHash" == "$freshlyAddedHash" ]]

Binary file not shown.

Binary file not shown.

Binary file not shown.

View file

@ -0,0 +1,3 @@
100644 blob 63ddb340119baf8492d2da53af47e8c7cfcd5eb2 Foo
100755 blob 63ddb340119baf8492d2da53af47e8c7cfcd5eb2 bAr
040000 tree 4b825dc642cb6eb9a060e54bf8d69288fbee4904 baZ