diff --git a/src/libstore/build/local-derivation-goal.cc b/src/libstore/build/local-derivation-goal.cc index d523ef51f..a071883bb 100644 --- a/src/libstore/build/local-derivation-goal.cc +++ b/src/libstore/build/local-derivation-goal.cc @@ -2150,14 +2150,10 @@ SingleDrvOutputs LocalDerivationGoal::registerOutputs() if (!rewrites.empty()) { debug("rewriting hashes in '%1%'; cross fingers", actualPath); - /* FIXME: Is this actually streaming? */ - auto source = sinkToSource([&](Sink & nextSink) { - RewritingSink rsink(rewrites, nextSink); - rsink << dumpPath(actualPath); - rsink.flush(); - }); + GeneratorSource dump{dumpPath(actualPath)}; + RewritingSource rewritten(rewrites, dump); Path tmpPath = actualPath + ".tmp"; - restorePath(tmpPath, *source); + restorePath(tmpPath, rewritten); deletePath(actualPath); movePath(tmpPath, actualPath); diff --git a/src/libstore/make-content-addressed.cc b/src/libstore/make-content-addressed.cc index b5397541c..1aa139b38 100644 --- a/src/libstore/make-content-addressed.cc +++ b/src/libstore/make-content-addressed.cc @@ -61,15 +61,12 @@ std::map makeContentAddressed( printInfo("rewriting '%s' to '%s'", pathS, dstStore.printStorePath(info.path)); - StringSink sink2; - RewritingSink rsink2(oldHashPart, std::string(info.path.hashPart()), sink2); - rsink2(sink.s); - rsink2.flush(); + const auto rewritten = rewriteStrings(sink.s, {{oldHashPart, std::string(info.path.hashPart())}}); - info.narHash = hashString(htSHA256, sink2.s); + info.narHash = hashString(htSHA256, rewritten); info.narSize = sink.s.size(); - StringSource source(sink2.s); + StringSource source(rewritten); dstStore.addToStore(info, source); remappings.insert_or_assign(std::move(path), std::move(info.path)); diff --git a/src/libutil/references.cc b/src/libutil/references.cc index 8792578d4..9f4ab0678 100644 --- a/src/libutil/references.cc +++ b/src/libutil/references.cc @@ -65,56 +65,102 @@ void RefScanSink::operator () (std::string_view data) } -RewritingSink::RewritingSink(const std::string & from, const std::string & to, Sink & nextSink) - : RewritingSink({{from, to}}, nextSink) +RewritingSource::RewritingSource(const std::string & from, const std::string & to, Source & inner) + : RewritingSource({{from, to}}, inner) { } -RewritingSink::RewritingSink(const StringMap & rewrites, Sink & nextSink) - : rewrites(rewrites), nextSink(nextSink) +RewritingSource::RewritingSource(StringMap rewrites, Source & inner) + : RewritingSource(may_change_size, std::move(rewrites), inner) { - std::string::size_type maxRewriteSize = 0; - for (auto & [from, to] : rewrites) { + for (auto & [from, to] : this->rewrites) { assert(from.size() == to.size()); - maxRewriteSize = std::max(maxRewriteSize, from.size()); } - this->maxRewriteSize = maxRewriteSize; } -void RewritingSink::operator () (std::string_view data) +RewritingSource::RewritingSource(may_change_size_t, StringMap rewrites, Source & inner) + : maxRewriteSize([&, result = size_t(0)]() mutable { + for (auto & [k, v] : rewrites) { + result = std::max(result, k.size()); + } + return result; + }()) + , initials([&]() -> std::string { + std::string initials; + for (const auto & [k, v] : rewrites) { + assert(!k.empty()); + initials.push_back(k[0]); + } + std::ranges::sort(initials); + auto [firstDupe, _end] = std::ranges::unique(initials); + return {initials.begin(), firstDupe}; + }()) + , rewrites(std::move(rewrites)) + , inner(&inner) { - std::string s(prev); - s.append(data); - - s = rewriteStrings(s, rewrites); - - prev = s.size() < maxRewriteSize - ? s - : maxRewriteSize == 0 - ? "" - : std::string(s, s.size() - maxRewriteSize + 1, maxRewriteSize - 1); - - auto consumed = s.size() - prev.size(); - - if (consumed) nextSink(s.substr(0, consumed)); } -void RewritingSink::flush() +size_t RewritingSource::read(char * data, size_t len) { - if (prev.empty()) return; - nextSink(prev); - prev.clear(); + if (rewrites.empty()) { + return inner->read(data, len); + } + + if (unreturned.empty()) { + // always make sure to have at least *two* full rewrites in the buffer, + // otherwise we may end up incorrectly rewriting if the replacement map + // contains keys that are proper infixes of other keys in the map. take + // for example the set { ab -> cc, babb -> bbbb } on the input babb. if + // we feed the input bytewise without additional windowing we will miss + // the full babb match once the second b has been seen and bab has been + // rewritten to ccb, even though babb occurs first in the input string. + while (inner && buffered.size() < std::max(2 * maxRewriteSize, len)) { + try { + auto read = inner->read(data, std::min(2 * maxRewriteSize, len)); + buffered.append(data, read); + } catch (EndOfFile &) { + inner = nullptr; + } + } + + if (buffered.empty() && !inner) { + throw EndOfFile("rewritten source exhausted"); + } + + const size_t reserved = inner ? maxRewriteSize : 0; + size_t j = 0; + while ((j = buffered.find_first_of(initials, j)) < buffered.size() - reserved) { + size_t skip = 1; + for (const auto & [from, to] : rewrites) { + if (buffered.compare(j, from.size(), from) == 0) { + buffered.replace(j, from.size(), to); + skip = to.size(); + break; + } + } + j += skip; + } + + rewritten = std::move(buffered); + buffered = rewritten.substr(rewritten.size() - reserved); + unreturned = rewritten; + unreturned.remove_suffix(reserved); + } + + len = std::min(len, unreturned.size()); + memcpy(data, unreturned.data(), len); + unreturned.remove_prefix(len); + return len; } HashResult computeHashModulo(HashType ht, const std::string & modulus, Source & source) { HashSink hashSink(ht); LengthSink lengthSink; - RewritingSink rewritingSink(modulus, std::string(modulus.size(), 0), hashSink); + RewritingSource rewritingSource(modulus, std::string(modulus.size(), 0), source); - TeeSink tee{rewritingSink, lengthSink}; - source.drainInto(tee); - rewritingSink.flush(); + TeeSink tee{hashSink, lengthSink}; + rewritingSource.drainInto(tee); /* Hash the positions of the self-references. This ensures that a NAR with self-references and a NAR with some of the diff --git a/src/libutil/references.hh b/src/libutil/references.hh index 3fefd824b..f0f467190 100644 --- a/src/libutil/references.hh +++ b/src/libutil/references.hh @@ -23,19 +23,24 @@ public: void operator () (std::string_view data) override; }; -struct RewritingSink : Sink +struct RewritingSource : Source { + const std::string::size_type maxRewriteSize; + const std::string initials; const StringMap rewrites; - std::string::size_type maxRewriteSize; - std::string prev; - Sink & nextSink; + std::string rewritten, buffered; + std::string_view unreturned; + Source * inner; - RewritingSink(const std::string & from, const std::string & to, Sink & nextSink); - RewritingSink(const StringMap & rewrites, Sink & nextSink); + static constexpr struct may_change_size_t { + explicit may_change_size_t() = default; + } may_change_size{}; - void operator () (std::string_view data) override; + RewritingSource(const std::string & from, const std::string & to, Source & inner); + RewritingSource(StringMap rewrites, Source & inner); + RewritingSource(may_change_size_t, StringMap rewrites, Source & inner); - void flush(); + size_t read(char * data, size_t len) override; }; HashResult computeHashModulo(HashType ht, const std::string & modulus, Source & source); diff --git a/src/libutil/strings.cc b/src/libutil/strings.cc index 947478481..df48e9203 100644 --- a/src/libutil/strings.cc +++ b/src/libutil/strings.cc @@ -1,4 +1,5 @@ #include "strings.hh" +#include "references.hh" #include #include @@ -65,30 +66,13 @@ std::string replaceStrings( Rewriter::Rewriter(std::map rewrites) : rewrites(std::move(rewrites)) { - for (const auto & [k, v] : this->rewrites) { - assert(!k.empty()); - initials.push_back(k[0]); - } - std::ranges::sort(initials); - auto [firstDupe, end] = std::ranges::unique(initials); - initials.erase(firstDupe, end); } std::string Rewriter::operator()(std::string s) { - size_t j = 0; - while ((j = s.find_first_of(initials, j)) != std::string::npos) { - size_t skip = 1; - for (auto & [from, to] : rewrites) { - if (s.compare(j, from.size(), from) == 0) { - s.replace(j, from.size(), to); - skip = to.size(); - break; - } - } - j += skip; - } - return s; + StringSource src{s}; + RewritingSource inner{RewritingSource::may_change_size, rewrites, src}; + return inner.drain(); } template diff --git a/tests/unit/libutil/references.cc b/tests/unit/libutil/references.cc index be2b59051..a914e6c70 100644 --- a/tests/unit/libutil/references.cc +++ b/tests/unit/libutil/references.cc @@ -25,11 +25,8 @@ class RewriteTest : public ::testing::TestWithParam { TEST_P(RewriteTest, IdentityRewriteIsIdentity) { RewriteParams param = GetParam(); - StringSink rewritten; - auto rewriter = RewritingSink(param.rewrites, rewritten); - rewriter(param.originalString); - rewriter.flush(); - ASSERT_EQ(rewritten.s, param.finalString); + StringSource src{param.originalString}; + ASSERT_EQ(RewritingSource(param.rewrites, src).drain(), param.finalString); } INSTANTIATE_TEST_CASE_P( @@ -38,7 +35,8 @@ INSTANTIATE_TEST_CASE_P( ::testing::Values( RewriteParams{ "foooo", "baroo", {{"foo", "bar"}, {"bar", "baz"}}}, RewriteParams{ "foooo", "bazoo", {{"fou", "bar"}, {"foo", "baz"}}}, - RewriteParams{ "foooo", "foooo", {}} + RewriteParams{ "foooo", "foooo", {}}, + RewriteParams{ "babb", "bbbb", {{"ab", "aa"}, {"babb", "bbbb"}}} ) );