libutil: rewrite RewritingSink as source

the rewriting sink was just broken. when given a rewrite set that
contained a key that is also a proper infix of another key it was
possible to produce an incorrectly rewritten result if the writer
used the wrong block size. fixing this duplicates rewriteStrings,
to avoid this we'll rewrite rewriteStrings to use RewritingSource
in a new mode that'll allow rewrites we had previously forbidden.

Change-Id: I57fa0a9a994e654e11d07172b8e31d15f0b7e8c0
This commit is contained in:
eldritch horrors 2024-05-02 01:25:46 +02:00
parent 014410cbf0
commit df8851f286
6 changed files with 104 additions and 78 deletions

View file

@ -2150,14 +2150,10 @@ SingleDrvOutputs LocalDerivationGoal::registerOutputs()
if (!rewrites.empty()) { if (!rewrites.empty()) {
debug("rewriting hashes in '%1%'; cross fingers", actualPath); debug("rewriting hashes in '%1%'; cross fingers", actualPath);
/* FIXME: Is this actually streaming? */ GeneratorSource dump{dumpPath(actualPath)};
auto source = sinkToSource([&](Sink & nextSink) { RewritingSource rewritten(rewrites, dump);
RewritingSink rsink(rewrites, nextSink);
rsink << dumpPath(actualPath);
rsink.flush();
});
Path tmpPath = actualPath + ".tmp"; Path tmpPath = actualPath + ".tmp";
restorePath(tmpPath, *source); restorePath(tmpPath, rewritten);
deletePath(actualPath); deletePath(actualPath);
movePath(tmpPath, actualPath); movePath(tmpPath, actualPath);

View file

@ -61,15 +61,12 @@ std::map<StorePath, StorePath> makeContentAddressed(
printInfo("rewriting '%s' to '%s'", pathS, dstStore.printStorePath(info.path)); printInfo("rewriting '%s' to '%s'", pathS, dstStore.printStorePath(info.path));
StringSink sink2; const auto rewritten = rewriteStrings(sink.s, {{oldHashPart, std::string(info.path.hashPart())}});
RewritingSink rsink2(oldHashPart, std::string(info.path.hashPart()), sink2);
rsink2(sink.s);
rsink2.flush();
info.narHash = hashString(htSHA256, sink2.s); info.narHash = hashString(htSHA256, rewritten);
info.narSize = sink.s.size(); info.narSize = sink.s.size();
StringSource source(sink2.s); StringSource source(rewritten);
dstStore.addToStore(info, source); dstStore.addToStore(info, source);
remappings.insert_or_assign(std::move(path), std::move(info.path)); remappings.insert_or_assign(std::move(path), std::move(info.path));

View file

@ -65,56 +65,102 @@ void RefScanSink::operator () (std::string_view data)
} }
RewritingSink::RewritingSink(const std::string & from, const std::string & to, Sink & nextSink) RewritingSource::RewritingSource(const std::string & from, const std::string & to, Source & inner)
: RewritingSink({{from, to}}, nextSink) : RewritingSource({{from, to}}, inner)
{ {
} }
RewritingSink::RewritingSink(const StringMap & rewrites, Sink & nextSink) RewritingSource::RewritingSource(StringMap rewrites, Source & inner)
: rewrites(rewrites), nextSink(nextSink) : RewritingSource(may_change_size, std::move(rewrites), inner)
{ {
std::string::size_type maxRewriteSize = 0; for (auto & [from, to] : this->rewrites) {
for (auto & [from, to] : rewrites) {
assert(from.size() == to.size()); assert(from.size() == to.size());
maxRewriteSize = std::max(maxRewriteSize, from.size());
} }
this->maxRewriteSize = maxRewriteSize;
} }
void RewritingSink::operator () (std::string_view data) RewritingSource::RewritingSource(may_change_size_t, StringMap rewrites, Source & inner)
: maxRewriteSize([&, result = size_t(0)]() mutable {
for (auto & [k, v] : rewrites) {
result = std::max(result, k.size());
}
return result;
}())
, initials([&]() -> std::string {
std::string initials;
for (const auto & [k, v] : rewrites) {
assert(!k.empty());
initials.push_back(k[0]);
}
std::ranges::sort(initials);
auto [firstDupe, _end] = std::ranges::unique(initials);
return {initials.begin(), firstDupe};
}())
, rewrites(std::move(rewrites))
, inner(&inner)
{ {
std::string s(prev);
s.append(data);
s = rewriteStrings(s, rewrites);
prev = s.size() < maxRewriteSize
? s
: maxRewriteSize == 0
? ""
: std::string(s, s.size() - maxRewriteSize + 1, maxRewriteSize - 1);
auto consumed = s.size() - prev.size();
if (consumed) nextSink(s.substr(0, consumed));
} }
void RewritingSink::flush() size_t RewritingSource::read(char * data, size_t len)
{ {
if (prev.empty()) return; if (rewrites.empty()) {
nextSink(prev); return inner->read(data, len);
prev.clear(); }
if (unreturned.empty()) {
// always make sure to have at least *two* full rewrites in the buffer,
// otherwise we may end up incorrectly rewriting if the replacement map
// contains keys that are proper infixes of other keys in the map. take
// for example the set { ab -> cc, babb -> bbbb } on the input babb. if
// we feed the input bytewise without additional windowing we will miss
// the full babb match once the second b has been seen and bab has been
// rewritten to ccb, even though babb occurs first in the input string.
while (inner && buffered.size() < std::max(2 * maxRewriteSize, len)) {
try {
auto read = inner->read(data, std::min(2 * maxRewriteSize, len));
buffered.append(data, read);
} catch (EndOfFile &) {
inner = nullptr;
}
}
if (buffered.empty() && !inner) {
throw EndOfFile("rewritten source exhausted");
}
const size_t reserved = inner ? maxRewriteSize : 0;
size_t j = 0;
while ((j = buffered.find_first_of(initials, j)) < buffered.size() - reserved) {
size_t skip = 1;
for (const auto & [from, to] : rewrites) {
if (buffered.compare(j, from.size(), from) == 0) {
buffered.replace(j, from.size(), to);
skip = to.size();
break;
}
}
j += skip;
}
rewritten = std::move(buffered);
buffered = rewritten.substr(rewritten.size() - reserved);
unreturned = rewritten;
unreturned.remove_suffix(reserved);
}
len = std::min(len, unreturned.size());
memcpy(data, unreturned.data(), len);
unreturned.remove_prefix(len);
return len;
} }
HashResult computeHashModulo(HashType ht, const std::string & modulus, Source & source) HashResult computeHashModulo(HashType ht, const std::string & modulus, Source & source)
{ {
HashSink hashSink(ht); HashSink hashSink(ht);
LengthSink lengthSink; LengthSink lengthSink;
RewritingSink rewritingSink(modulus, std::string(modulus.size(), 0), hashSink); RewritingSource rewritingSource(modulus, std::string(modulus.size(), 0), source);
TeeSink tee{rewritingSink, lengthSink}; TeeSink tee{hashSink, lengthSink};
source.drainInto(tee); rewritingSource.drainInto(tee);
rewritingSink.flush();
/* Hash the positions of the self-references. This ensures that a /* Hash the positions of the self-references. This ensures that a
NAR with self-references and a NAR with some of the NAR with self-references and a NAR with some of the

View file

@ -23,19 +23,24 @@ public:
void operator () (std::string_view data) override; void operator () (std::string_view data) override;
}; };
struct RewritingSink : Sink struct RewritingSource : Source
{ {
const std::string::size_type maxRewriteSize;
const std::string initials;
const StringMap rewrites; const StringMap rewrites;
std::string::size_type maxRewriteSize; std::string rewritten, buffered;
std::string prev; std::string_view unreturned;
Sink & nextSink; Source * inner;
RewritingSink(const std::string & from, const std::string & to, Sink & nextSink); static constexpr struct may_change_size_t {
RewritingSink(const StringMap & rewrites, Sink & nextSink); explicit may_change_size_t() = default;
} may_change_size{};
void operator () (std::string_view data) override; RewritingSource(const std::string & from, const std::string & to, Source & inner);
RewritingSource(StringMap rewrites, Source & inner);
RewritingSource(may_change_size_t, StringMap rewrites, Source & inner);
void flush(); size_t read(char * data, size_t len) override;
}; };
HashResult computeHashModulo(HashType ht, const std::string & modulus, Source & source); HashResult computeHashModulo(HashType ht, const std::string & modulus, Source & source);

View file

@ -1,4 +1,5 @@
#include "strings.hh" #include "strings.hh"
#include "references.hh"
#include <boost/lexical_cast.hpp> #include <boost/lexical_cast.hpp>
#include <stdint.h> #include <stdint.h>
@ -65,30 +66,13 @@ std::string replaceStrings(
Rewriter::Rewriter(std::map<std::string, std::string> rewrites) Rewriter::Rewriter(std::map<std::string, std::string> rewrites)
: rewrites(std::move(rewrites)) : rewrites(std::move(rewrites))
{ {
for (const auto & [k, v] : this->rewrites) {
assert(!k.empty());
initials.push_back(k[0]);
}
std::ranges::sort(initials);
auto [firstDupe, end] = std::ranges::unique(initials);
initials.erase(firstDupe, end);
} }
std::string Rewriter::operator()(std::string s) std::string Rewriter::operator()(std::string s)
{ {
size_t j = 0; StringSource src{s};
while ((j = s.find_first_of(initials, j)) != std::string::npos) { RewritingSource inner{RewritingSource::may_change_size, rewrites, src};
size_t skip = 1; return inner.drain();
for (auto & [from, to] : rewrites) {
if (s.compare(j, from.size(), from) == 0) {
s.replace(j, from.size(), to);
skip = to.size();
break;
}
}
j += skip;
}
return s;
} }
template<class N> template<class N>

View file

@ -25,11 +25,8 @@ class RewriteTest : public ::testing::TestWithParam<RewriteParams> {
TEST_P(RewriteTest, IdentityRewriteIsIdentity) { TEST_P(RewriteTest, IdentityRewriteIsIdentity) {
RewriteParams param = GetParam(); RewriteParams param = GetParam();
StringSink rewritten; StringSource src{param.originalString};
auto rewriter = RewritingSink(param.rewrites, rewritten); ASSERT_EQ(RewritingSource(param.rewrites, src).drain(), param.finalString);
rewriter(param.originalString);
rewriter.flush();
ASSERT_EQ(rewritten.s, param.finalString);
} }
INSTANTIATE_TEST_CASE_P( INSTANTIATE_TEST_CASE_P(
@ -38,7 +35,8 @@ INSTANTIATE_TEST_CASE_P(
::testing::Values( ::testing::Values(
RewriteParams{ "foooo", "baroo", {{"foo", "bar"}, {"bar", "baz"}}}, RewriteParams{ "foooo", "baroo", {{"foo", "bar"}, {"bar", "baz"}}},
RewriteParams{ "foooo", "bazoo", {{"fou", "bar"}, {"foo", "baz"}}}, RewriteParams{ "foooo", "bazoo", {{"fou", "bar"}, {"foo", "baz"}}},
RewriteParams{ "foooo", "foooo", {}} RewriteParams{ "foooo", "foooo", {}},
RewriteParams{ "babb", "bbbb", {{"ab", "aa"}, {"babb", "bbbb"}}}
) )
); );