From df8851f286a407c46ea9107ca403888291f1afbe Mon Sep 17 00:00:00 2001 From: eldritch horrors Date: Thu, 2 May 2024 01:25:46 +0200 Subject: [PATCH] libutil: rewrite RewritingSink as source the rewriting sink was just broken. when given a rewrite set that contained a key that is also a proper infix of another key it was possible to produce an incorrectly rewritten result if the writer used the wrong block size. fixing this duplicates rewriteStrings, to avoid this we'll rewrite rewriteStrings to use RewritingSource in a new mode that'll allow rewrites we had previously forbidden. Change-Id: I57fa0a9a994e654e11d07172b8e31d15f0b7e8c0 --- src/libstore/build/local-derivation-goal.cc | 10 +- src/libstore/make-content-addressed.cc | 9 +- src/libutil/references.cc | 108 ++++++++++++++------ src/libutil/references.hh | 21 ++-- src/libutil/strings.cc | 24 +---- tests/unit/libutil/references.cc | 10 +- 6 files changed, 104 insertions(+), 78 deletions(-) diff --git a/src/libstore/build/local-derivation-goal.cc b/src/libstore/build/local-derivation-goal.cc index d523ef51f..a071883bb 100644 --- a/src/libstore/build/local-derivation-goal.cc +++ b/src/libstore/build/local-derivation-goal.cc @@ -2150,14 +2150,10 @@ SingleDrvOutputs LocalDerivationGoal::registerOutputs() if (!rewrites.empty()) { debug("rewriting hashes in '%1%'; cross fingers", actualPath); - /* FIXME: Is this actually streaming? */ - auto source = sinkToSource([&](Sink & nextSink) { - RewritingSink rsink(rewrites, nextSink); - rsink << dumpPath(actualPath); - rsink.flush(); - }); + GeneratorSource dump{dumpPath(actualPath)}; + RewritingSource rewritten(rewrites, dump); Path tmpPath = actualPath + ".tmp"; - restorePath(tmpPath, *source); + restorePath(tmpPath, rewritten); deletePath(actualPath); movePath(tmpPath, actualPath); diff --git a/src/libstore/make-content-addressed.cc b/src/libstore/make-content-addressed.cc index b5397541c..1aa139b38 100644 --- a/src/libstore/make-content-addressed.cc +++ b/src/libstore/make-content-addressed.cc @@ -61,15 +61,12 @@ std::map makeContentAddressed( printInfo("rewriting '%s' to '%s'", pathS, dstStore.printStorePath(info.path)); - StringSink sink2; - RewritingSink rsink2(oldHashPart, std::string(info.path.hashPart()), sink2); - rsink2(sink.s); - rsink2.flush(); + const auto rewritten = rewriteStrings(sink.s, {{oldHashPart, std::string(info.path.hashPart())}}); - info.narHash = hashString(htSHA256, sink2.s); + info.narHash = hashString(htSHA256, rewritten); info.narSize = sink.s.size(); - StringSource source(sink2.s); + StringSource source(rewritten); dstStore.addToStore(info, source); remappings.insert_or_assign(std::move(path), std::move(info.path)); diff --git a/src/libutil/references.cc b/src/libutil/references.cc index 8792578d4..9f4ab0678 100644 --- a/src/libutil/references.cc +++ b/src/libutil/references.cc @@ -65,56 +65,102 @@ void RefScanSink::operator () (std::string_view data) } -RewritingSink::RewritingSink(const std::string & from, const std::string & to, Sink & nextSink) - : RewritingSink({{from, to}}, nextSink) +RewritingSource::RewritingSource(const std::string & from, const std::string & to, Source & inner) + : RewritingSource({{from, to}}, inner) { } -RewritingSink::RewritingSink(const StringMap & rewrites, Sink & nextSink) - : rewrites(rewrites), nextSink(nextSink) +RewritingSource::RewritingSource(StringMap rewrites, Source & inner) + : RewritingSource(may_change_size, std::move(rewrites), inner) { - std::string::size_type maxRewriteSize = 0; - for (auto & [from, to] : rewrites) { + for (auto & [from, to] : this->rewrites) { assert(from.size() == to.size()); - maxRewriteSize = std::max(maxRewriteSize, from.size()); } - this->maxRewriteSize = maxRewriteSize; } -void RewritingSink::operator () (std::string_view data) +RewritingSource::RewritingSource(may_change_size_t, StringMap rewrites, Source & inner) + : maxRewriteSize([&, result = size_t(0)]() mutable { + for (auto & [k, v] : rewrites) { + result = std::max(result, k.size()); + } + return result; + }()) + , initials([&]() -> std::string { + std::string initials; + for (const auto & [k, v] : rewrites) { + assert(!k.empty()); + initials.push_back(k[0]); + } + std::ranges::sort(initials); + auto [firstDupe, _end] = std::ranges::unique(initials); + return {initials.begin(), firstDupe}; + }()) + , rewrites(std::move(rewrites)) + , inner(&inner) { - std::string s(prev); - s.append(data); - - s = rewriteStrings(s, rewrites); - - prev = s.size() < maxRewriteSize - ? s - : maxRewriteSize == 0 - ? "" - : std::string(s, s.size() - maxRewriteSize + 1, maxRewriteSize - 1); - - auto consumed = s.size() - prev.size(); - - if (consumed) nextSink(s.substr(0, consumed)); } -void RewritingSink::flush() +size_t RewritingSource::read(char * data, size_t len) { - if (prev.empty()) return; - nextSink(prev); - prev.clear(); + if (rewrites.empty()) { + return inner->read(data, len); + } + + if (unreturned.empty()) { + // always make sure to have at least *two* full rewrites in the buffer, + // otherwise we may end up incorrectly rewriting if the replacement map + // contains keys that are proper infixes of other keys in the map. take + // for example the set { ab -> cc, babb -> bbbb } on the input babb. if + // we feed the input bytewise without additional windowing we will miss + // the full babb match once the second b has been seen and bab has been + // rewritten to ccb, even though babb occurs first in the input string. + while (inner && buffered.size() < std::max(2 * maxRewriteSize, len)) { + try { + auto read = inner->read(data, std::min(2 * maxRewriteSize, len)); + buffered.append(data, read); + } catch (EndOfFile &) { + inner = nullptr; + } + } + + if (buffered.empty() && !inner) { + throw EndOfFile("rewritten source exhausted"); + } + + const size_t reserved = inner ? maxRewriteSize : 0; + size_t j = 0; + while ((j = buffered.find_first_of(initials, j)) < buffered.size() - reserved) { + size_t skip = 1; + for (const auto & [from, to] : rewrites) { + if (buffered.compare(j, from.size(), from) == 0) { + buffered.replace(j, from.size(), to); + skip = to.size(); + break; + } + } + j += skip; + } + + rewritten = std::move(buffered); + buffered = rewritten.substr(rewritten.size() - reserved); + unreturned = rewritten; + unreturned.remove_suffix(reserved); + } + + len = std::min(len, unreturned.size()); + memcpy(data, unreturned.data(), len); + unreturned.remove_prefix(len); + return len; } HashResult computeHashModulo(HashType ht, const std::string & modulus, Source & source) { HashSink hashSink(ht); LengthSink lengthSink; - RewritingSink rewritingSink(modulus, std::string(modulus.size(), 0), hashSink); + RewritingSource rewritingSource(modulus, std::string(modulus.size(), 0), source); - TeeSink tee{rewritingSink, lengthSink}; - source.drainInto(tee); - rewritingSink.flush(); + TeeSink tee{hashSink, lengthSink}; + rewritingSource.drainInto(tee); /* Hash the positions of the self-references. This ensures that a NAR with self-references and a NAR with some of the diff --git a/src/libutil/references.hh b/src/libutil/references.hh index 3fefd824b..f0f467190 100644 --- a/src/libutil/references.hh +++ b/src/libutil/references.hh @@ -23,19 +23,24 @@ public: void operator () (std::string_view data) override; }; -struct RewritingSink : Sink +struct RewritingSource : Source { + const std::string::size_type maxRewriteSize; + const std::string initials; const StringMap rewrites; - std::string::size_type maxRewriteSize; - std::string prev; - Sink & nextSink; + std::string rewritten, buffered; + std::string_view unreturned; + Source * inner; - RewritingSink(const std::string & from, const std::string & to, Sink & nextSink); - RewritingSink(const StringMap & rewrites, Sink & nextSink); + static constexpr struct may_change_size_t { + explicit may_change_size_t() = default; + } may_change_size{}; - void operator () (std::string_view data) override; + RewritingSource(const std::string & from, const std::string & to, Source & inner); + RewritingSource(StringMap rewrites, Source & inner); + RewritingSource(may_change_size_t, StringMap rewrites, Source & inner); - void flush(); + size_t read(char * data, size_t len) override; }; HashResult computeHashModulo(HashType ht, const std::string & modulus, Source & source); diff --git a/src/libutil/strings.cc b/src/libutil/strings.cc index 947478481..df48e9203 100644 --- a/src/libutil/strings.cc +++ b/src/libutil/strings.cc @@ -1,4 +1,5 @@ #include "strings.hh" +#include "references.hh" #include #include @@ -65,30 +66,13 @@ std::string replaceStrings( Rewriter::Rewriter(std::map rewrites) : rewrites(std::move(rewrites)) { - for (const auto & [k, v] : this->rewrites) { - assert(!k.empty()); - initials.push_back(k[0]); - } - std::ranges::sort(initials); - auto [firstDupe, end] = std::ranges::unique(initials); - initials.erase(firstDupe, end); } std::string Rewriter::operator()(std::string s) { - size_t j = 0; - while ((j = s.find_first_of(initials, j)) != std::string::npos) { - size_t skip = 1; - for (auto & [from, to] : rewrites) { - if (s.compare(j, from.size(), from) == 0) { - s.replace(j, from.size(), to); - skip = to.size(); - break; - } - } - j += skip; - } - return s; + StringSource src{s}; + RewritingSource inner{RewritingSource::may_change_size, rewrites, src}; + return inner.drain(); } template diff --git a/tests/unit/libutil/references.cc b/tests/unit/libutil/references.cc index be2b59051..a914e6c70 100644 --- a/tests/unit/libutil/references.cc +++ b/tests/unit/libutil/references.cc @@ -25,11 +25,8 @@ class RewriteTest : public ::testing::TestWithParam { TEST_P(RewriteTest, IdentityRewriteIsIdentity) { RewriteParams param = GetParam(); - StringSink rewritten; - auto rewriter = RewritingSink(param.rewrites, rewritten); - rewriter(param.originalString); - rewriter.flush(); - ASSERT_EQ(rewritten.s, param.finalString); + StringSource src{param.originalString}; + ASSERT_EQ(RewritingSource(param.rewrites, src).drain(), param.finalString); } INSTANTIATE_TEST_CASE_P( @@ -38,7 +35,8 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( RewriteParams{ "foooo", "baroo", {{"foo", "bar"}, {"bar", "baz"}}}, RewriteParams{ "foooo", "bazoo", {{"fou", "bar"}, {"foo", "baz"}}}, - RewriteParams{ "foooo", "foooo", {}} + RewriteParams{ "foooo", "foooo", {}}, + RewriteParams{ "babb", "bbbb", {{"ab", "aa"}, {"babb", "bbbb"}}} ) );