From c852ae60da16b890f77e9e1b274d31dced73ae66 Mon Sep 17 00:00:00 2001 From: piegames Date: Tue, 1 Oct 2024 18:41:54 +0200 Subject: [PATCH] libexpr: Rewrite stripIndentation for indented strings This commit should faithfully reproduce the old behavior down to the bugs. The new code is a lot more readable, all quirks are well documented, and it is overall much more maintainable. Change-Id: I629585918e4f2b7d296b6b8330235cdc90b7bade --- src/libexpr/parser/grammar.hh | 63 +++++--- src/libexpr/parser/parser-impl1.inc.cc | 32 ++-- src/libexpr/parser/state.hh | 204 ++++++++++++++----------- 3 files changed, 183 insertions(+), 116 deletions(-) diff --git a/src/libexpr/parser/grammar.hh b/src/libexpr/parser/grammar.hh index 6e42609c0..92721fa20 100644 --- a/src/libexpr/parser/grammar.hh +++ b/src/libexpr/parser/grammar.hh @@ -225,7 +225,8 @@ struct string : _string, seq< > {}; struct _ind_string { - template + struct line_start : semantic, star> {}; + template struct literal : semantic, seq {}; struct interpolation : semantic, seq< p::string<'$', '{'>, seps, @@ -233,34 +234,54 @@ struct _ind_string { must> > {}; struct escape : semantic, must {}; + /* Marker for non-empty lines */ + struct has_content : semantic, seq<> {}; }; struct ind_string : _ind_string, seq< TAO_PEGTL_STRING("''"), + // Strip first line completely if empty opt>, one<'\n'>>, - star< - sor< - _ind_string::literal< - true, + list< + seq< + // Start a line with some indentation + // (we always match even the empty string if no indentation, as this creates the line) + _ind_string::line_start, + // The actual line + opt< plus< sor< - not_one<'$', '\''>, - seq, not_one<'{', '\''>>, - seq, not_one<'\'', '$'>> - > - > - >, - _ind_string::interpolation, - _ind_string::literal>, - _ind_string::literal, not_at>>, - seq, _ind_string::literal>>, - seq< - p::string<'\'', '\''>, - sor< - _ind_string::literal>, - seq, _ind_string::escape> + _ind_string::literal< + true, + plus< + sor< + not_one<'$', '\'', '\n'>, + // TODO probably factor this out like the others for performance + seq, not_one<'{', '\'', '\n'>>, + seq, at>>, + seq, not_one<'\'', '$', '\n'>>, + seq, at>> + > + > + >, + _ind_string::interpolation, + _ind_string::literal>, + _ind_string::literal, not_at>>, + seq, _ind_string::literal>>, + seq< + p::string<'\'', '\''>, + sor< + _ind_string::literal>, + seq, _ind_string::escape> + > + > + >, + _ind_string::has_content > > - > + >, + // End of line, LF. CR is just ignored and not treated as ending a line + // (for the purpose of indentation stripping) + _ind_string::literal> >, must > {}; diff --git a/src/libexpr/parser/parser-impl1.inc.cc b/src/libexpr/parser/parser-impl1.inc.cc index 0d41324b3..c65fb3ddc 100644 --- a/src/libexpr/parser/parser-impl1.inc.cc +++ b/src/libexpr/parser/parser-impl1.inc.cc @@ -533,36 +533,48 @@ template<> struct BuildAST : change_head { struct IndStringState : SubexprState { using SubexprState::SubexprState; - std::vector, StringToken>>> parts; + std::vector lines; }; -template -struct BuildAST> { +template<> struct BuildAST { static void apply(const auto & in, IndStringState & s, State & ps) { - s.parts.emplace_back(ps.at(in), StringToken{in.string_view(), Indented}); + s.lines.push_back(IndStringLine { in.string_view(), ps.at(in) }); + } +}; + +template +struct BuildAST> { + static void apply(const auto & in, IndStringState & s, State & ps) { + s.lines.back().parts.emplace_back(ps.at(in), StringToken{ in.string_view(), CanMerge }); } }; template<> struct BuildAST { static void apply(const auto & in, IndStringState & s, State & ps) { - s.parts.emplace_back(ps.at(in), s->popExprOnly()); + s.lines.back().parts.emplace_back(ps.at(in), s->popExprOnly()); } }; template<> struct BuildAST { static void apply(const auto & in, IndStringState & s, State & ps) { switch (*in.begin()) { - case 'n': s.parts.emplace_back(ps.at(in), StringToken{"\n"}); break; - case 'r': s.parts.emplace_back(ps.at(in), StringToken{"\r"}); break; - case 't': s.parts.emplace_back(ps.at(in), StringToken{"\t"}); break; - default: s.parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break; + case 'n': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\n"}); break; + case 'r': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\r"}); break; + case 't': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\t"}); break; + default: s.lines.back().parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break; } } }; +template<> struct BuildAST { + static void apply(const auto & in, IndStringState & s, State & ps) { + s.lines.back().hasContent = true; + } +}; + template<> struct BuildAST : change_head { static void success(const auto & in, IndStringState & s, ExprState & e, State & ps) { - e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.parts))); + e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.lines))); } }; diff --git a/src/libexpr/parser/state.hh b/src/libexpr/parser/state.hh index 3b9b90b94..62bf08ae7 100644 --- a/src/libexpr/parser/state.hh +++ b/src/libexpr/parser/state.hh @@ -9,10 +9,28 @@ namespace nix::parser { struct StringToken { std::string_view s; - bool hasIndentation = false; + // canMerge is only used to faithfully reproduce the quirks from the old code base. + bool canMerge = false; operator std::string_view() const { return s; } }; +struct IndStringLine { + // String containing only the leading whitespace of the line. May be empty. + std::string_view indentation; + // Position of the line start (before the indentation) + PosIdx pos; + + // Whether the line contains anything besides indentation and line break + bool hasContent = false; + + std::vector< + std::pair< + PosIdx, + std::variant, StringToken> + > + > parts = {}; +}; + struct State { SymbolTable & symbols; @@ -27,8 +45,7 @@ struct State void overridesFound(const PosIdx pos); void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr e, const PosIdx pos); std::unique_ptr validateFormals(std::unique_ptr formals, PosIdx pos = noPos, Symbol arg = {}); - std::unique_ptr stripIndentation(const PosIdx pos, - std::vector, StringToken>>> && es); + std::unique_ptr stripIndentation(const PosIdx pos, std::vector && line); // lazy positioning means we don't get byte offsets directly, in.position() would work // but also requires line and column (which is expensive) @@ -182,98 +199,115 @@ inline std::unique_ptr State::validateFormals(std::unique_ptr return formals; } -inline std::unique_ptr State::stripIndentation(const PosIdx pos, - std::vector, StringToken>>> && es) +inline std::unique_ptr State::stripIndentation( + const PosIdx pos, + std::vector && lines) { - if (es.empty()) return std::make_unique(""); + /* If the only line is whitespace-only, directly return empty string. + * NOTE: This is not merely an optimization, but `compatStripLeadingEmptyString` + * later on relies on the string not being empty for working. + */ + if (lines.size() == 1 && lines.front().parts.empty()) { + return std::make_unique(""); + } - /* Figure out the minimum indentation. Note that by design - whitespace-only final lines are not taken into account. (So - the " " in "\n ''" is ignored, but the " " in "\n foo''" is.) */ - bool atStartOfLine = true; /* = seen only whitespace in the current line */ + /* If the last line only contains whitespace, trim it to not cause excessive whitespace. + * (Other whitespace-only lines get stripped only of the common indentation, and excess + * whitespace becomes part of the string.) + */ + if (lines.back().parts.empty()) { + lines.back().indentation = {}; + } + + /* + * Quirk compatibility: + * + * » nix-instantiate --parse -E $'\'\'${"foo"}\'\'' + * "foo" + * » nix-instantiate --parse -E $'\'\' ${"foo"}\'\'' + * ("" + "foo") + * + * Our code always produces the form with the additional "" +, so we'll manually + * strip it at the end if necessary. + */ + const bool compatStripLeadingEmptyString = !lines.empty() && lines[0].indentation.empty(); + + /* Figure out the minimum indentation. Note that by design + whitespace-only lines are not taken into account. */ size_t minIndent = 1000000; - size_t curIndent = 0; - for (auto & [i_pos, i] : es) { - auto * str = std::get_if(&i); - if (!str || !str->hasIndentation) { - /* Anti-quotations and escaped characters end the current start-of-line whitespace. */ - if (atStartOfLine) { - atStartOfLine = false; - if (curIndent < minIndent) minIndent = curIndent; - } - continue; - } - for (size_t j = 0; j < str->s.size(); ++j) { - if (atStartOfLine) { - if (str->s[j] == ' ') - curIndent++; - else if (str->s[j] == '\n') { - /* Empty line, doesn't influence minimum - indentation. */ - curIndent = 0; - } else { - atStartOfLine = false; - if (curIndent < minIndent) minIndent = curIndent; - } - } else if (str->s[j] == '\n') { - atStartOfLine = true; - curIndent = 0; - } + for (auto & line : lines) { + if (line.hasContent) { + minIndent = std::min(minIndent, line.indentation.size()); } } /* Strip spaces from each line. */ - std::vector>> es2; - atStartOfLine = true; - size_t curDropped = 0; - size_t n = es.size(); - auto i = es.begin(); - const auto trimExpr = [&] (std::unique_ptr e) { - atStartOfLine = false; - curDropped = 0; - es2.emplace_back(i->first, std::move(e)); - }; - const auto trimString = [&] (const StringToken t) { - std::string s2; - for (size_t j = 0; j < t.s.size(); ++j) { - if (atStartOfLine) { - if (t.s[j] == ' ') { - if (curDropped++ >= minIndent) - s2 += t.s[j]; - } - else if (t.s[j] == '\n') { - curDropped = 0; - s2 += t.s[j]; - } else { - atStartOfLine = false; - curDropped = 0; - s2 += t.s[j]; - } - } else { - s2 += t.s[j]; - if (t.s[j] == '\n') atStartOfLine = true; - } - } - - /* Remove the last line if it is empty and consists only of - spaces. */ - if (n == 1) { - std::string::size_type p = s2.find_last_of('\n'); - if (p != std::string::npos && s2.find_first_not_of(' ', p + 1) == std::string::npos) - s2 = std::string(s2, 0, p + 1); - } - - es2.emplace_back(i->first, std::make_unique(std::move(s2))); - }; - for (; i != es.end(); ++i, --n) { - std::visit(overloaded { trimExpr, trimString }, std::move(i->second)); + for (auto & line : lines) { + line.indentation.remove_prefix(std::min(minIndent, line.indentation.size())); } - /* If this is a single string, then don't do a concatenation. */ - if (es2.size() == 1 && dynamic_cast(es2[0].second.get())) { - return std::move(es2[0].second); + /* Concat the parts together again */ + + /* Note that we don't concat all adjacent string parts to fully reproduce the original code. + * This means that any escapes will result in string concatenation even if this is unnecessary. + */ + std::vector>> parts; + /* Accumulator for merging intermediates */ + PosIdx merged_pos; + std::string merged = ""; + bool has_merged = false; + + auto push_merged = [&] (PosIdx i_pos, std::string_view str) { + merged += str; + if (!has_merged) { + has_merged = true; + merged_pos = i_pos; + } + }; + + auto flush_merged = [&] () { + if (has_merged) { + parts.emplace_back(merged_pos, std::make_unique(std::string(merged))); + merged.clear(); + has_merged = false; + } + }; + + for (auto && [li, line] : enumerate(lines)) { + /* Always merge indentation, except for the first line when compatStripLeadingEmptyString is set (see above) */ + if (!compatStripLeadingEmptyString || li != 0) { + push_merged(line.pos, line.indentation); + } + + for (auto & val : line.parts) { + auto &[i_pos, item] = val; + + std::visit(overloaded{ + [&](StringToken str) { + if (str.canMerge) { + push_merged(i_pos, str.s); + } else { + flush_merged(); + parts.emplace_back(i_pos, std::make_unique(std::string(str.s))); + } + }, + [&](std::unique_ptr expr) { + flush_merged(); + parts.emplace_back(i_pos, std::move(expr)); + }, + }, std::move(item)); + } } - return std::make_unique(pos, true, std::move(es2)); + + flush_merged(); + + /* If this is a single string, then don't do a concatenation. + * (If it's a single expression, still do the ConcatStrings to properly force it being a string.) + */ + if (parts.size() == 1 && dynamic_cast(parts[0].second.get())) { + return std::move(parts[0].second); + } + return std::make_unique(pos, true, std::move(parts)); } }