libexpr/nix2: String improvements 2/2

Copied over the escape rules improvements to indented strings (with appropriate adjustments). Indented strings now have their line endings properly normalized. Also the first line is not stripped anymore, and mixing first line and multiline indented strings is now forbidden to minimize the risk of confusion. Co-authored-by: eldritch horrors <pennae@lix.systems> Change-Id: If9e2240a04627cb43aa25c76db021a5acf7c58fa
2024-10-22 13:58:12 +02:00 · 2024-10-22 13:58:12 +02:00 · 635fb7da7f
parent cabf2a300a
commit 635fb7da7f
4 changed files with 91 additions and 9 deletions
--- a/src/libexpr/parser/grammar2.hh
+++ b/src/libexpr/parser/grammar2.hh
@ -257,14 +257,26 @@ struct _ind_string {
        must<expr>, seps,
        must<one<'}'>>
    > {};
-    struct escape : semantic, must<any> {};
+    struct simple_escape : semantic, sor< // single-character escapes: backslash, dollar, single quote, t, r, n
+        one<'\\'>,
+        one<'$'>,
+        one<'\''>,
+        one<'t'>,
+        one<'r'>,
+        one<'n'>
+    > {};
+    struct binary_escape : semantic, seq<rep<2, xdigit>> {}; // exactly two hex digits (the payload following x)
+    struct unicode_escape : semantic, seq<rep_min_max<1, 6, xdigit>> {}; // one to six hex digits (the payload between the braces following u)
+    struct escape : sor< // any escape body; once x or u has matched, the remainder is mandatory (must<>) and raises a parse error if absent
+        simple_escape,
+        seq<one<'x'>, must<binary_escape>>,
+        seq<one<'u'>, must<one<'{'>>, must<unicode_escape>, must<one<'}'>>>
+    > {};
    /* Marker for non-empty lines */
    struct has_content : semantic, seq<> {};
 };
 struct ind_string : _ind_string, seq<
    TAO_PEGTL_STRING("''"),
-    // Strip first line completely if empty
-    opt<star<one<' '>>, one<'\n'>>,
    list<
        seq<
            // Start a line with some indentation
@ -302,9 +314,11 @@ struct ind_string : _ind_string, seq<
                >
            >
        >,
-        // End of line, LF. CR is just ignored and not treated as ending a line
-        // (for the purpose of indentation stripping)
-        _ind_string::literal<one<'\n'>>
+        // End of line: LF or CRLF. We only put the LF into literal because of line ending normalization
+        sor<
+            _ind_string::literal<one<'\n'>>,
+            seq<one<'\r'>, _ind_string::literal<must<one<'\n'>>>>
+        >
    >,
    must<TAO_PEGTL_STRING("''")>
 > {};
--- a/src/libexpr/parser/parser-impl2.inc.cc
+++ b/src/libexpr/parser/parser-impl2.inc.cc
@ -39,6 +39,9 @@ error_message_for(grammar::v2::seps) = "expecting separators";
 error_message_for(grammar::v2::string::escape) = "expecting escape sequence";
 error_message_for(grammar::v2::string::binary_escape) = "expecting two hex digits";
 error_message_for(grammar::v2::string::unicode_escape) = "expecting hex-encoded Unicode code point";
+error_message_for(grammar::v2::ind_string::escape) = "expecting escape sequence";
+error_message_for(grammar::v2::ind_string::binary_escape) = "expecting two hex digits";
+error_message_for(grammar::v2::ind_string::unicode_escape) = "expecting hex-encoded Unicode code point";
 error_message_for(grammar::v2::path::forbid_prefix_triple_slash) = "too many slashes in path";
 error_message_for(grammar::v2::path::forbid_prefix_double_slash_no_interp) = "path has a trailing slash";
 error_message_for(grammar::v2::expr) = "expecting expression";
@ -626,6 +629,9 @@ template<> struct BuildAST<grammar::v2::string> : change_head<StringState> {
 struct IndStringState : SubexprState {
    using SubexprState::SubexprState;

+    // IndStringLine requires string_view, so when the string is not static or in the
+    // parsed input, we need to own and store it somewhere.
+    //
+    // This has to be a container whose elements never move. The stored strings are only
+    // a few bytes long, so their characters live inside the std::string object itself
+    // (small-string optimisation); a std::vector would relocate them when it grows and
+    // leave every previously handed-out string_view dangling. std::list keeps each
+    // element at a fixed address for its whole lifetime.
+    std::list<std::string> interpolTmp = {};
    std::vector<IndStringLine> lines;
 };

@ -648,17 +654,44 @@ template<> struct BuildAST<grammar::v2::ind_string::interpolation> {
    }
 };

-template<> struct BuildAST<grammar::v2::ind_string::escape> {
+template<> struct BuildAST<grammar::v2::ind_string::simple_escape> {
    static void apply(const auto & in, IndStringState & s, State & ps) {
        switch (*in.begin()) {
+        case '\\': s.lines.back().parts.emplace_back(ps.at(in), "\\"); break;
+        case '$': s.lines.back().parts.emplace_back(ps.at(in), "$"); break;
+        case '\'': s.lines.back().parts.emplace_back(ps.at(in), "'"); break;
        case 'n': s.lines.back().parts.emplace_back(ps.at(in), "\n"); break;
        case 'r': s.lines.back().parts.emplace_back(ps.at(in), "\r"); break;
        case 't': s.lines.back().parts.emplace_back(ps.at(in), "\t"); break;
-        default:  s.lines.back().parts.emplace_back(ps.at(in), in.string_view()); break;
+        default: std::abort(); // Unreachable: the simple_escape grammar rule only matches the six characters handled above
        }
    }
 };

+template<> struct BuildAST<grammar::v2::ind_string::binary_escape> {
+    // Turns the two hex digits matched by the binary_escape rule into a single raw byte
+    // and appends it as a literal part of the current line.
+    static void apply(const auto & in, IndStringState & s, State & ps) {
+        // Parse into an *unsigned* byte. With a signed 8-bit target, every value from
+        // 0x80 to 0xff is out of range for std::from_chars, which then reports an error
+        // and leaves the output untouched, i.e. we would emit an uninitialised byte.
+        // The grammar guarantees exactly two hex digits, so with uint8_t the call
+        // cannot fail; the zero initialiser is purely defensive.
+        uint8_t v = 0;
+        std::from_chars(in.begin(), in.end(), v, 16);
+        // The part only holds a view, so the byte must be owned by the parse state.
+        s.interpolTmp.emplace_back(std::string(1, static_cast<char>(v)));
+        s.lines.back().parts.emplace_back(ps.at(in), s.interpolTmp.back());
+    }
+};
+
+template<> struct BuildAST<grammar::v2::ind_string::unicode_escape> {
+    static void apply(const auto & in, IndStringState & s, State & ps) {
+        int32_t v;
+        // No error handling needed: the grammar only lets one to six hex digits through, which always parse and fit into an int32_t
+        std::from_chars(in.begin(), in.end(), v, 16);
+
+        char out[4]{};
+        std::size_t len = saneScalarToWtf8((char32_t) v, out); // NOTE(review): six hex digits allow values above U+10FFFF; confirm saneScalarToWtf8 copes with those within the 4-byte buffer
+        s.interpolTmp.emplace_back(std::string(std::string_view{out, len})); // the part below only stores a view, so the encoded bytes are owned by the parse state
+        s.lines.back().parts.emplace_back(ps.at(in), s.interpolTmp.back());
+    }
+};
+
 template<> struct BuildAST<grammar::v2::ind_string::has_content> {
    static void apply(const auto & in, IndStringState & s, State & ps) {
        s.lines.back().hasContent = true;
@ -667,7 +700,23 @@ template<> struct BuildAST<grammar::v2::ind_string::has_content> {

 template<> struct BuildAST<grammar::v2::ind_string> : change_head<IndStringState> {
    static void success(const auto & in, IndStringState & s, ExprState & e, State & ps) {
-        e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.lines)));
+        if (s.lines.size() == 1) {
+            /* Single-line string: no indentation stripping at all, the parts are only concatenated */
+            e.exprs.emplace_back(noPos, ps.mergeStringParts(ps.at(in), std::move(s.lines)));
+        } else {
+            /* Multi-line string: the first line (what follows the opening quotes up to the first
+               line break) must be empty; it is dropped before the remaining lines get their
+               indentation stripped */
+            auto & firstLine = s.lines.front();
+
+            // TODO: If the first line only contains indentation, do we want to throw an error or just ignore it? (It currently throws.)
+            if (!firstLine.indentation.empty() || firstLine.hasContent)
+                throw ParseError({
+                    .msg = HintFmt("Multi-line string must start with a line break after %s", "''"),
+                    .pos = ps.positions[ps.at(in)],
+                });
+
+            s.lines.erase(s.lines.begin());
+            e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.lines)));
+        }
    }
 };

--- a/src/libexpr/parser/state.hh
+++ b/src/libexpr/parser/state.hh
@ -38,6 +38,7 @@ struct State
    void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos);
    std::unique_ptr<Formals> validateFormals(std::unique_ptr<Formals> formals, PosIdx pos = noPos, Symbol arg = {});
    std::unique_ptr<Expr> stripIndentation(const PosIdx pos, std::vector<IndStringLine> && line);
+    std::unique_ptr<Expr> mergeStringParts(const PosIdx pos, std::vector<IndStringLine> && line); // concatenates the parts of the given lines without stripping indentation; stripIndentation ends by calling this

    // lazy positioning means we don't get byte offsets directly, in.position() would work
    // but also requires line and column (which is expensive)
@ -224,6 +225,13 @@ inline std::unique_ptr<Expr> State::stripIndentation(
        line.indentation.remove_prefix(std::min(minIndent, line.indentation.size()));
    }

+    return mergeStringParts(pos, std::move(lines));
+}
+
+inline std::unique_ptr<Expr> State::mergeStringParts(
+    const PosIdx pos,
+    std::vector<IndStringLine> && lines)
+{
    /* Concat the parts together again */

    std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> parts;
--- a/tests/unit/libexpr/trivial.cc
+++ b/tests/unit/libexpr/trivial.cc
@ -315,4 +315,15 @@ namespace nix {
        ASSERT_THROW(eval("\"$${\"", true, mockFeatureSettings), Error);
    }

+    TEST_F(TrivialExpressionTest, indStringImproved) {
+        FeatureSettings mockFeatureSettings;
+        mockFeatureSettings.set("experimental-features", "nix-lang2");
+
+        auto v = eval("''    foo bar    ''"); // without nix-lang2: leading indentation is stripped even on a single line
+        ASSERT_THAT(v, IsStringEq("foo bar    "));
+        v = eval("''    foo bar    ''", true, mockFeatureSettings); // with nix-lang2: a single-line indented string is kept verbatim
+        ASSERT_THAT(v, IsStringEq("    foo bar    "));
+
+        ASSERT_THROW(eval("''foo\n    bar''", true, mockFeatureSettings), Error); // with nix-lang2: content on the opening line of a multi-line string is rejected
+    }
 } /* namespace nix */