libexpr: Rewrite stripIndentation for indented strings

This commit should faithfully reproduce the old behavior down to the bugs. The new code is a lot more readable, all quirks are well documented, and it is overall much more maintainable. Change-Id: I629585918e4f2b7d296b6b8330235cdc90b7bade
2024-10-01 18:41:54 +02:00 · 2024-10-01 18:41:54 +02:00 · c852ae60da
parent 765771a355
commit c852ae60da
3 changed files with 183 additions and 116 deletions
--- a/src/libexpr/parser/grammar.hh
+++ b/src/libexpr/parser/grammar.hh
@ -225,7 +225,8 @@ struct string : _string, seq<
 > {};
 struct _ind_string {
-    template<bool Indented, typename... Inner>
+    struct line_start : semantic, star<one<' '>> {};
    template<bool CanMerge, typename... Inner>
    struct literal : semantic, seq<Inner...> {};
    struct interpolation : semantic, seq<
        p::string<'$', '{'>, seps,
@ -233,19 +234,32 @@ struct _ind_string {
        must<one<'}'>>
    > {};
    struct escape : semantic, must<any> {};
    /* Marker for non-empty lines */
    struct has_content : semantic, seq<> {};
 };
 struct ind_string : _ind_string, seq<
    TAO_PEGTL_STRING("''"),
    // Strip first line completely if empty
    opt<star<one<' '>>, one<'\n'>>,
-    star<
+    list<
        seq<
            // Start a line with some indentation
            // (we always match even the empty string if no indentation, as this creates the line)
            _ind_string::line_start,
            // The actual line
            opt<
                plus<
                    sor<
                        _ind_string::literal<
                            true,
                            plus<
                                sor<
-                        not_one<'$', '\''>,
+                                    not_one<'$', '\'', '\n'>,
-                        seq<one<'$'>, not_one<'{', '\''>>,
+                                    // TODO probably factor this out like the others for performance
-                        seq<one<'\''>, not_one<'\'', '$'>>
+                                    seq<one<'$'>, not_one<'{', '\'', '\n'>>,
                                    seq<one<'$'>, at<one<'\n'>>>,
                                    seq<one<'\''>, not_one<'\'', '$', '\n'>>,
                                    seq<one<'\''>, at<one<'\n'>>>
                                >
                            >
                        >,
@ -260,7 +274,14 @@ struct ind_string : _ind_string, seq<
                                seq<one<'\\'>, _ind_string::escape>
                            >
                        >
                    >,
                    _ind_string::has_content
                >
            >
        >,
        // End of line, LF. CR is just ignored and not treated as ending a line
        // (for the purpose of indentation stripping)
        _ind_string::literal<true, one<'\n'>>
    >,
    must<TAO_PEGTL_STRING("''")>
 > {};
--- a/src/libexpr/parser/parser-impl1.inc.cc
+++ b/src/libexpr/parser/parser-impl1.inc.cc
@ -533,36 +533,48 @@ template<> struct BuildAST<grammar::v1::string> : change_head<StringState> {
 struct IndStringState : SubexprState {
    using SubexprState::SubexprState;
-    std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> parts;
+    std::vector<IndStringLine> lines;
 };
-template<bool Indented, typename... Content>
+template<> struct BuildAST<grammar::v1::ind_string::line_start> {
 struct BuildAST<grammar::v1::ind_string::literal<Indented, Content...>> {
    static void apply(const auto & in, IndStringState & s, State & ps) {
-        s.parts.emplace_back(ps.at(in), StringToken{in.string_view(), Indented});
+        s.lines.push_back(IndStringLine { in.string_view(), ps.at(in) });
    }
 };
 template<bool CanMerge, typename... Content>
 struct BuildAST<grammar::v1::ind_string::literal<CanMerge, Content...>> {
    static void apply(const auto & in, IndStringState & s, State & ps) {
        s.lines.back().parts.emplace_back(ps.at(in), StringToken{ in.string_view(), CanMerge });
    }
 };
 template<> struct BuildAST<grammar::v1::ind_string::interpolation> {
    static void apply(const auto & in, IndStringState & s, State & ps) {
-        s.parts.emplace_back(ps.at(in), s->popExprOnly());
+        s.lines.back().parts.emplace_back(ps.at(in), s->popExprOnly());
    }
 };
 template<> struct BuildAST<grammar::v1::ind_string::escape> {
    static void apply(const auto & in, IndStringState & s, State & ps) {
        switch (*in.begin()) {
-        case 'n': s.parts.emplace_back(ps.at(in), StringToken{"\n"}); break;
+        case 'n': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\n"}); break;
-        case 'r': s.parts.emplace_back(ps.at(in), StringToken{"\r"}); break;
+        case 'r': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\r"}); break;
-        case 't': s.parts.emplace_back(ps.at(in), StringToken{"\t"}); break;
+        case 't': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\t"}); break;
-        default:  s.parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break;
+        default:  s.lines.back().parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break;
        }
    }
 };
 template<> struct BuildAST<grammar::v1::ind_string::has_content> {
    static void apply(const auto & in, IndStringState & s, State & ps) {
        s.lines.back().hasContent = true;
    }
 };
 template<> struct BuildAST<grammar::v1::ind_string> : change_head<IndStringState> {
    static void success(const auto & in, IndStringState & s, ExprState & e, State & ps) {
-        e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.parts)));
+        e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.lines)));
    }
 };
--- a/src/libexpr/parser/state.hh
+++ b/src/libexpr/parser/state.hh
@ -9,10 +9,28 @@ namespace nix::parser {
 struct StringToken
 {
    std::string_view s;
-    bool hasIndentation = false;
+    // canMerge is only used to faithfully reproduce the quirks from the old code base.
    bool canMerge = false;
    operator std::string_view() const { return s; }
 };
 struct IndStringLine {
    // String containing only the leading whitespace of the line. May be empty.
    std::string_view indentation;
    // Position of the line start (before the indentation)
    PosIdx pos;
    // Whether the line contains anything besides indentation and line break
    bool hasContent = false;
    std::vector<
        std::pair<
            PosIdx,
            std::variant<std::unique_ptr<Expr>, StringToken>
        >
    > parts = {};
 };
 struct State
 {
    SymbolTable & symbols;
@ -27,8 +45,7 @@ struct State
    void overridesFound(const PosIdx pos);
    void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos);
    std::unique_ptr<Formals> validateFormals(std::unique_ptr<Formals> formals, PosIdx pos = noPos, Symbol arg = {});
-    std::unique_ptr<Expr> stripIndentation(const PosIdx pos,
+    std::unique_ptr<Expr> stripIndentation(const PosIdx pos, std::vector<IndStringLine> && line);
        std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es);
    // lazy positioning means we don't get byte offsets directly, in.position() would work
    // but also requires line and column (which is expensive)
@ -182,98 +199,115 @@ inline std::unique_ptr<Formals> State::validateFormals(std::unique_ptr<Formals>
    return formals;
 }
-inline std::unique_ptr<Expr> State::stripIndentation(const PosIdx pos,
+inline std::unique_ptr<Expr> State::stripIndentation(
-    std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es)
+    const PosIdx pos,
    std::vector<IndStringLine> && lines)
 {
-    if (es.empty()) return std::make_unique<ExprString>("");
+    /* If the only line is whitespace-only, directly return empty string.
     * NOTE: This is not merely an optimization, but `compatStripLeadingEmptyString`
     * later on relies on the string not being empty for working.
     */
    if (lines.size() == 1 && lines.front().parts.empty()) {
        return std::make_unique<ExprString>("");
    }
    /* If the last line only contains whitespace, trim it to not cause excessive whitespace.
     * (Other whitespace-only lines get stripped only of the common indentation, and excess
     * whitespace becomes part of the string.)
     */
    if (lines.back().parts.empty()) {
        lines.back().indentation = {};
    }
    /*
     * Quirk compatibility:
     *
     * » nix-instantiate --parse -E $'\'\'${"foo"}\'\''
     * "foo"
     * » nix-instantiate --parse -E $'\'\'    ${"foo"}\'\''
     * ("" + "foo")
     *
     * Our code always produces the form with the additional "" +, so we'll manually
     * strip it at the end if necessary.
     */
    const bool compatStripLeadingEmptyString = !lines.empty() && lines[0].indentation.empty();
    /* Figure out the minimum indentation. Note that by design
-       whitespace-only final lines are not taken into account.  (So
+       whitespace-only lines are not taken into account. */
       the " " in "\n ''" is ignored, but the " " in "\n foo''" is.) */
    bool atStartOfLine = true; /* = seen only whitespace in the current line */
    size_t minIndent = 1000000;
-    size_t curIndent = 0;
+    for (auto & line : lines) {
-    for (auto & [i_pos, i] : es) {
+        if (line.hasContent) {
-        auto * str = std::get_if<StringToken>(&i);
+            minIndent = std::min(minIndent, line.indentation.size());
        if (!str || !str->hasIndentation) {
            /* Anti-quotations and escaped characters end the current start-of-line whitespace. */
            if (atStartOfLine) {
                atStartOfLine = false;
                if (curIndent < minIndent) minIndent = curIndent;
            }
            continue;
        }
        for (size_t j = 0; j < str->s.size(); ++j) {
            if (atStartOfLine) {
                if (str->s[j] == ' ')
                    curIndent++;
                else if (str->s[j] == '\n') {
                    /* Empty line, doesn't influence minimum
                       indentation. */
                    curIndent = 0;
                } else {
                    atStartOfLine = false;
                    if (curIndent < minIndent) minIndent = curIndent;
                }
            } else if (str->s[j] == '\n') {
                atStartOfLine = true;
                curIndent = 0;
            }
        }
    }
    /* Strip spaces from each line. */
-    std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> es2;
+    for (auto & line : lines) {
-    atStartOfLine = true;
+        line.indentation.remove_prefix(std::min(minIndent, line.indentation.size()));
-    size_t curDropped = 0;
+    }
-    size_t n = es.size();
+
-    auto i = es.begin();
+    /* Concat the parts together again */
-    const auto trimExpr = [&] (std::unique_ptr<Expr> e) {
+
-        atStartOfLine = false;
+    /* Note that we don't concat all adjacent string parts to fully reproduce the original code.
-        curDropped = 0;
+     * This means that any escapes will result in string concatenation even if this is unnecessary.
-        es2.emplace_back(i->first, std::move(e));
+     */
    std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> parts;
    /* Accumulator for merging intermediates */
    PosIdx merged_pos;
    std::string merged = "";
    bool has_merged = false;
    auto push_merged = [&] (PosIdx i_pos, std::string_view str) {
        merged += str;
        if (!has_merged) {
            has_merged = true;
            merged_pos = i_pos;
        }
    };
    const auto trimString = [&] (const StringToken t) {
        std::string s2;
        for (size_t j = 0; j < t.s.size(); ++j) {
            if (atStartOfLine) {
                if (t.s[j] == ' ') {
                    if (curDropped++ >= minIndent)
                        s2 += t.s[j];
                }
                else if (t.s[j] == '\n') {
                    curDropped = 0;
                    s2 += t.s[j];
                } else {
                    atStartOfLine = false;
                    curDropped = 0;
                    s2 += t.s[j];
                }
            } else {
                s2 += t.s[j];
                if (t.s[j] == '\n') atStartOfLine = true;
            }
        }
-        /* Remove the last line if it is empty and consists only of
+    auto flush_merged = [&] () {
-           spaces. */
+        if (has_merged) {
-        if (n == 1) {
+            parts.emplace_back(merged_pos, std::make_unique<ExprString>(std::string(merged)));
-            std::string::size_type p = s2.find_last_of('\n');
+            merged.clear();
-            if (p != std::string::npos && s2.find_first_not_of(' ', p + 1) == std::string::npos)
+            has_merged = false;
                s2 = std::string(s2, 0, p + 1);
        }
        es2.emplace_back(i->first, std::make_unique<ExprString>(std::move(s2)));
    };
-    for (; i != es.end(); ++i, --n) {
+
-        std::visit(overloaded { trimExpr, trimString }, std::move(i->second));
+    for (auto && [li, line] : enumerate(lines)) {
        /* Always merge indentation, except for the first line when compatStripLeadingEmptyString is set (see above) */
        if (!compatStripLeadingEmptyString || li != 0) {
            push_merged(line.pos, line.indentation);
        }
-    /* If this is a single string, then don't do a concatenation. */
+        for (auto & val : line.parts) {
-    if (es2.size() == 1 && dynamic_cast<ExprString *>(es2[0].second.get())) {
+            auto &[i_pos, item] = val;
-        return std::move(es2[0].second);
+
            std::visit(overloaded{
                [&](StringToken str) {
                    if (str.canMerge) {
                        push_merged(i_pos, str.s);
                    } else {
                        flush_merged();
                        parts.emplace_back(i_pos, std::make_unique<ExprString>(std::string(str.s)));
                    }
-    return std::make_unique<ExprConcatStrings>(pos, true, std::move(es2));
+                },
                [&](std::unique_ptr<Expr> expr) {
                    flush_merged();
                    parts.emplace_back(i_pos, std::move(expr));
                },
            }, std::move(item));
        }
    }
    flush_merged();
    /* If this is a single string, then don't do a concatenation.
     * (If it's a single expression, still do the ConcatStrings to properly force it being a string.)
     */
    if (parts.size() == 1 && dynamic_cast<ExprString *>(parts[0].second.get())) {
        return std::move(parts[0].second);
    }
    return std::make_unique<ExprConcatStrings>(pos, true, std::move(parts));
 }
 }