libexpr/nix2: String improvements 1/2

A complete overhaul of the escape semantics, which are just horrible and mostly useless. The design is inspired by JSON and Rust. Indented strings are not affected. Co-authored-by: eldritch horrors <pennae@lix.systems> Change-Id: Id89679d4115d59869090bdbb5d9b305f374447fb
2024-10-19 10:23:50 +02:00 · 2024-10-19 10:23:50 +02:00 · cabf2a300a
parent 5058a00130
commit cabf2a300a
5 changed files with 147 additions and 40 deletions
--- a/src/libexpr/parser/grammar2.hh
+++ b/src/libexpr/parser/grammar2.hh
@ -35,10 +35,13 @@ using
    p::not_at,
    p::opt,
    p::plus,
+    p::rep,
+    p::rep_min_max,
    p::sor,
    p::seq,
    p::star,
    p::until,
+    p::xdigit,
    p8::any,
    p8::not_one,
    p8::one,
@ -213,7 +216,23 @@ struct _string {
        must<expr>, seps,
        must<one<'}'>>
    > {};
-    struct escape : semantic, must<any> {};
+    // What may follow the backslash as a "simple" escape: one of the
+    // characters \ $ " t r n, or a line break. The line-break form is a
+    // string continuation and additionally consumes the spaces and tabs
+    // at the start of the following line.
+    struct simple_escape : semantic, sor<
+        one<'\\', '$', '"', 't', 'r', 'n'>,
+        seq<t::eol, star<one<' ', '\t'>>>
+    > {};
+    // Payload of `\x`: exactly two hexadecimal digits.
+    struct binary_escape : semantic, rep<2, xdigit> {};
+    // Payload of `\u{...}`: one to six hexadecimal digits whose value does
+    // not exceed 0x10FFFF, the largest Unicode code point. Accepted forms:
+    //  - `10` followed by exactly four digits (0x100000..0x10FFFF),
+    //  - `0` followed by exactly five digits (zero-padded six-digit form),
+    //  - one to five digits (always in range).
+    // Every alternative refuses to match when another hex digit follows
+    // (rep_min_max does so by itself), so over-long or out-of-range input
+    // fails this rule instead of being encoded into invalid bytes.
+    struct unicode_escape : semantic, sor<
+        seq<one<'1'>, one<'0'>, rep<4, xdigit>, not_at<xdigit>>,
+        seq<one<'0'>, rep<5, xdigit>, not_at<xdigit>>,
+        rep_min_max<1, 5, xdigit>
+    > {};
+    // Everything that may follow the backslash inside a string. Once the
+    // `x` or `u` introducer has matched, the rest of the sequence is
+    // wrapped in must<>, so a malformed payload is a hard parse error.
+    struct escape : sor<
+        simple_escape,
+        seq<one<'x'>, must<binary_escape>>,
+        seq<one<'u'>, must<one<'{'>, unicode_escape, one<'}'>>>
+    > {};
 };
 struct string : _string, seq<
    one<'"'>,
@ -222,8 +241,8 @@ struct string : _string, seq<
            _string::literal<plus<not_one<'$', '"', '\\', '\r'>>>,
            _string::cr_lf,
            _string::interpolation,
-            _string::literal<one<'$'>, opt<one<'$'>>>,
-            seq<one<'\\'>, _string::escape>
+            _string::literal<one<'$'>>,
+            seq<one<'\\'>, must<_string::escape>>
        >
    >,
    must<one<'"'>>
--- a/src/libexpr/parser/parser-impl2.inc.cc
+++ b/src/libexpr/parser/parser-impl2.inc.cc
@ -36,6 +36,9 @@ error_message_for(p8::any) = "expecting any character";
 error_message_for(p::plus<p::digit>) = "expecting at least one digit";
 error_message_for(grammar::v2::eof) = "expecting end of file";
 error_message_for(grammar::v2::seps) = "expecting separators";
+error_message_for(grammar::v2::string::escape) = "expecting escape sequence";
+error_message_for(grammar::v2::string::binary_escape) = "expecting two hex digits";
+error_message_for(grammar::v2::string::unicode_escape) = "expecting hex-encoded Unicode code point";
 error_message_for(grammar::v2::path::forbid_prefix_triple_slash) = "too many slashes in path";
 error_message_for(grammar::v2::path::forbid_prefix_double_slash_no_interp) = "path has a trailing slash";
 error_message_for(grammar::v2::expr) = "expecting expression";
@ -479,37 +482,9 @@ struct StringState : SubexprState {
        currentLiteral += s;
    }

-    // FIXME this truncates strings on NUL for compat with the old parser. ideally
-    // we should use the decomposition the g gives us instead of iterating over
-    // the entire string again.
-    static void unescapeStr(std::string & str)
-    {
-        char * s = str.data();
-        char * t = s;
-        char c;
-        while ((c = *s++)) {
-            if (c == '\\') {
-                c = *s++;
-                if (c == 'n') *t = '\n';
-                else if (c == 'r') *t = '\r';
-                else if (c == 't') *t = '\t';
-                else *t = c;
-            }
-            else if (c == '\r') {
-                /* Normalise CR and CR/LF into LF. */
-                *t = '\n';
-                if (*s == '\n') s++; /* cr/lf */
-            }
-            else *t = c;
-            t++;
-        }
-        str.resize(t - str.data());
-    }
-
    void endLiteral()
    {
        if (!currentLiteral.empty()) {
-            unescapeStr(currentLiteral);
            parts.emplace_back(currentPos, std::make_unique<ExprString>(std::move(currentLiteral)));
        }
    }
@ -517,7 +492,6 @@ struct StringState : SubexprState {
    std::unique_ptr<Expr> finish()
    {
        if (parts.empty()) {
-            unescapeStr(currentLiteral);
            return std::make_unique<ExprString>(std::move(currentLiteral));
        } else {
            endLiteral();
@ -535,7 +509,104 @@ template<typename... Content> struct BuildAST<grammar::v2::string::literal<Conte

 template<> struct BuildAST<grammar::v2::string::cr_lf> {
    static void apply(const auto & in, StringState & s, State & ps) {
-        s.append(ps.at(in), in.string_view()); // FIXME compat with old parser
+        // Normalize to LF: regardless of the text the cr_lf rule matched,
+        // exactly one "\n" is appended to the string being built.
+        s.append(ps.at(in), "\n");
+    }
+};
+
+template<> struct BuildAST<grammar::v2::string::simple_escape> {
+    /* Appends the text a simple escape stands for. The matched input
+     * starts right after the backslash, so its first character alone
+     * decides what is appended.
+     */
+    static void apply(const auto & in, StringState & s, State & ps) {
+        const char selector = *in.begin();
+
+        /* An escaped line break is a string continuation and adds nothing.
+         * Only LF and CRLF can be escaped, so looking at the first
+         * character is enough to recognise both.
+         */
+        if (selector == '\n' || selector == '\r')
+            return;
+
+        std::string_view replacement;
+        switch (selector) {
+            case '\\': replacement = "\\"; break;
+            case '$':  replacement = "$";  break;
+            case '"':  replacement = "\""; break;
+            case 't':  replacement = "\t"; break;
+            case 'r':  replacement = "\r"; break;
+            case 'n':  replacement = "\n"; break;
+            // Unreachable
+            default:
+                std::abort();
+        }
+        s.append(ps.at(in), replacement);
+    }
+};
+
+template<> struct BuildAST<grammar::v2::string::binary_escape> {
+    /* Appends the single raw byte written as the two hex digits of a
+     * `\x` escape.
+     */
+    static void apply(const auto & in, StringState & s, State & ps) {
+        // The target must be unsigned: 0x80..0xff do not fit into an int8_t,
+        // in which case std::from_chars reports result_out_of_range and
+        // leaves the output untouched, i.e. indeterminate.
+        uint8_t v = 0;
+        // Don't error handle as the function is infallible for our input
+        // (the grammar matched exactly two hex digits).
+        std::from_chars(in.begin(), in.end(), v, 16);
+        const char val[1] = { static_cast<char>(v) };
+        s.append(ps.at(in), std::string_view{val, 1});
+    }
+};
+
+/*
+ * Encodes one code point as WTF-8 into `out` and returns the number of
+ * octets written (1 to 4). `out` must have room for four octets.
+ *
+ * C++ currently offers no good way to UTF-8 encode a Unicode code point:
+ * std::c32rtomb exists, but apart from its awkward API the "multi byte"
+ * representation it produces depends on the locale and may or may not
+ * actually be UTF-8.
+ *
+ * Code copied over and adapted from https://utfcpp.sourceforge.net/ version 2.0,
+ * licensed under the Boost License.
+ *
+ * No validity check is performed here. The caller has to make sure that
+ * `cp` does not exceed 0x10FFFF; larger values produce a malformed lead
+ * octet. Surrogate code points are deliberately encoded like any other
+ * three-octet value, which is what makes this WTF-8 rather than UTF-8.
+ *
+ * See https://simonsapin.github.io/wtf-8 about the WTF encoding.
+ */
+int saneScalarToWtf8(char32_t cp, char * out) {
+    // Low six bits of `bits`, tagged as a continuation octet (10xxxxxx).
+    const auto trail = [](char32_t bits) {
+        return static_cast<uint8_t>((bits & 0x3f) | 0x80);
+    };
+
+    if (cp < 0x80) {                      // one octet: 0xxxxxxx
+        out[0] = static_cast<uint8_t>(cp);
+        return 1;
+    }
+    if (cp < 0x800) {                     // two octets: 110xxxxx + 1 trail
+        out[0] = static_cast<uint8_t>((cp >> 6) | 0xc0);
+        out[1] = trail(cp);
+        return 2;
+    }
+    if (cp < 0x10000) {                   // three octets: 1110xxxx + 2 trail
+        out[0] = static_cast<uint8_t>((cp >> 12) | 0xe0);
+        out[1] = trail(cp >> 6);
+        out[2] = trail(cp);
+        return 3;
+    }
+    // four octets: 11110xxx + 3 trail
+    out[0] = static_cast<uint8_t>((cp >> 18) | 0xf0);
+    out[1] = trail(cp >> 12);
+    out[2] = trail(cp >> 6);
+    out[3] = trail(cp);
+    return 4;
+}
+
+template<> struct BuildAST<grammar::v2::string::unicode_escape> {
+    /* Appends the WTF-8 encoding of the code point written as the hex
+     * digits between the braces of a `\u{...}` escape.
+     */
+    static void apply(const auto & in, StringState & s, State & ps) {
+        // Value-initialized so that an unexpected parse failure can never
+        // feed an indeterminate value into the encoder.
+        uint32_t v = 0;
+        // Don't error handle as the function is infallible for our input
+        // (the grammar matched hex digits only, at most six of them).
+        std::from_chars(in.begin(), in.end(), v, 16);
+
+        char out[4]{};
+        const int len = saneScalarToWtf8(static_cast<char32_t>(v), out);
+        s.append(ps.at(in), std::string_view{out, static_cast<std::size_t>(len)});
    }
 };

@ -546,13 +617,6 @@ template<> struct BuildAST<grammar::v2::string::interpolation> {
    }
 };

-template<> struct BuildAST<grammar::v2::string::escape> {
-    static void apply(const auto & in, StringState & s, State & ps) {
-        s.append(ps.at(in), "\\"); // FIXME compat with old parser
-        s.append(ps.at(in), in.string_view());
-    }
-};
-
 template<> struct BuildAST<grammar::v2::string> : change_head<StringState> {
    static void success0(StringState & s, ExprState & e, State &) {
        e.exprs.emplace_back(noPos, s.finish());
--- a/src/libexpr/parser/parser.cc
+++ b/src/libexpr/parser/parser.cc
@ -14,6 +14,7 @@

 #include <charconv>
 #include <memory>
+#include <cuchar>

 // Linter complains that this is a "suspicious include of file with '.cc' extension".
 // While that is correct and generally not great, it is one of the less bad options to pick
--- a/src/libutil/experimental-features.cc
+++ b/src/libutil/experimental-features.cc
@ -204,6 +204,12 @@ constexpr std::array<ExperimentalFeatureDetails, numXpFeatures> xpFeatureDetails
            - Removed ancient `let {` syntax. See also the `ancient-let` deprecated feature
            - All Nix code must now be fully valid UTF-8 text.
            - Line endings must be LF or CRLF, not CR.
+            - String syntax has now sane escape rules for `\`:
+                - `t`, `r`, `n` yield the usual whitespace characters.
+                - `\`, `$`, `"` yield the character itself.
+                - Newlines can be escaped with a trailing `\` like in Rust's string continuations, which will skip the string until the next non-whitespace character.
+                - `x` and `u` allow to insert raw bytes and UTF-8 encoded Unicode scalars into the string. `\u` uses the curly braces syntax as in Rust instead of hard-coding exactly four hex digits.
+                - No other escape rules. Notably, `$${{}` now is an interpolation.
        )",
    },
    {
--- a/tests/unit/libexpr/trivial.cc
+++ b/tests/unit/libexpr/trivial.cc
@ -298,4 +298,21 @@ namespace nix {
        mockFeatureSettings.set("experimental-features", "nix-lang2");
        ASSERT_THROW(eval("foo\x10\xff\xffobar", true, mockFeatureSettings), Error);
    }
+
+    // Exercises the nix-lang2 string escape rules end to end through eval().
+    TEST_F(TrivialExpressionTest, stringEscapes) {
+        FeatureSettings mockFeatureSettings;
+        mockFeatureSettings.set("experimental-features", "nix-lang2");
+
+        // \x inserts a raw byte; \u{...} takes one to six hex digits in
+        // either case, leading zeros included.
+        auto v = eval("\"\\x42\\u{6c34}\\u{6C34}\\u{06c34}\\u{006c34}\"", true, mockFeatureSettings);
+        ASSERT_THAT(v, IsStringEq("B水水水水"));
+
+        // Single-character escapes: \t \r \n \\ \$ \"
+        v = eval("\"\\t\\r\\n\\\\\\$\\\"\"", true, mockFeatureSettings);
+        ASSERT_THAT(v, IsStringEq("\t\r\n\\$\""));
+
+        // String continuation: an escaped LF swallows the line break and the
+        // indentation of the next line.
+        v = eval("\"foo\\\n               bar\"", true, mockFeatureSettings);
+        ASSERT_THAT(v, IsStringEq("foobar"));
+
+        // Same for an escaped CRLF.
+        v = eval("\"foo\\\r\n               bar\"", true, mockFeatureSettings);
+        ASSERT_THAT(v, IsStringEq("foobar"));
+
+        // `$$` no longer escapes the interpolation that follows it.
+        ASSERT_THROW(eval("\"$${\"", true, mockFeatureSettings), Error);
+
+        // There are no other escapes; an unknown one is rejected.
+        ASSERT_THROW(eval("\"\\q\"", true, mockFeatureSettings), Error);
+    }
+
 } /* namespace nix */