From cabf2a300a3b583b54f2e51eabdc87b39525f072 Mon Sep 17 00:00:00 2001
From: piegames <git@piegames.de>
Date: Sat, 19 Oct 2024 10:23:50 +0200
Subject: [PATCH] libexpr/nix2: String improvements 1/2

A complete overhaul of the string escape semantics, which were just
horrible and mostly useless. The new design is inspired by JSON and
Rust. Indented strings are not affected.

Co-authored-by: eldritch horrors <pennae@lix.systems>
Change-Id: Id89679d4115d59869090bdbb5d9b305f374447fb
---
 src/libexpr/parser/grammar2.hh         |  25 ++++-
 src/libexpr/parser/parser-impl2.inc.cc | 138 ++++++++++++++++++-------
 src/libexpr/parser/parser.cc           |   1 +
 src/libutil/experimental-features.cc   |   6 ++
 tests/unit/libexpr/trivial.cc          |  17 +++
 5 files changed, 147 insertions(+), 40 deletions(-)
diff --git a/src/libexpr/parser/grammar2.hh b/src/libexpr/parser/grammar2.hh
index 35a67beee..61d01cced 100644
--- a/src/libexpr/parser/grammar2.hh
+++ b/src/libexpr/parser/grammar2.hh
@@ -35,10 +35,13 @@ using
     p::not_at,
     p::opt,
     p::plus,
+    p::rep,
+    p::rep_min_max,
     p::sor,
     p::seq,
     p::star,
     p::until,
+    p::xdigit,
     p8::any,
     p8::not_one,
     p8::one,
@@ -213,7 +216,23 @@ struct _string {
         must<expr>, seps,
         must<one<'}'>>
     > {};
-    struct escape : semantic, must<any> {};
+    struct simple_escape : semantic, sor<
+        one<'\\'>, // single-character escapes; the introducing backslash is matched by the enclosing string rule
+        one<'$'>,
+        one<'"'>,
+        one<'t'>,
+        one<'r'>,
+        one<'n'>,
+        // String continuation: an escaped line break is matched together with any spaces/tabs that follow it
+        seq<t::eol, star<one<' ', '\t'>>>
+    > {};
+    struct binary_escape : semantic, seq<rep<2, xdigit>> {};
+    struct unicode_escape : semantic, seq<rep_min_max<1, 6, xdigit>> {}; // 1 to 6 hex digits, so values up to 0xFFFFFF match; NOTE(review): nothing here caps it at 0x10FFFF
+    struct escape : sor<
+        simple_escape,
+        seq<one<'x'>, must<binary_escape>>,
+        seq<one<'u'>, must<one<'{'>>, must<unicode_escape>, must<one<'}'>>>
+    > {};
 };
 struct string : _string, seq<
     one<'"'>,
@@ -222,8 +241,8 @@ struct string : _string, seq<
             _string::literal<plus<not_one<'$', '"', '\\', '\r'>>>,
             _string::cr_lf,
             _string::interpolation,
-            _string::literal<one<'$'>, opt<one<'$'>>>,
-            seq<one<'\\'>, _string::escape>
+            _string::literal<one<'$'>>,
+            seq<one<'\\'>, must<_string::escape>>
         >
     >,
     must<one<'"'>>
diff --git a/src/libexpr/parser/parser-impl2.inc.cc b/src/libexpr/parser/parser-impl2.inc.cc
index 5f0276822..b45e4a21d 100644
--- a/src/libexpr/parser/parser-impl2.inc.cc
+++ b/src/libexpr/parser/parser-impl2.inc.cc
@@ -36,6 +36,9 @@ error_message_for(p8::any) = "expecting any character";
 error_message_for(p::plus<p::digit>) = "expecting at least one digit";
 error_message_for(grammar::v2::eof) = "expecting end of file";
 error_message_for(grammar::v2::seps) = "expecting separators";
+error_message_for(grammar::v2::string::escape) = "expecting escape sequence";
+error_message_for(grammar::v2::string::binary_escape) = "expecting two hex digits";
+error_message_for(grammar::v2::string::unicode_escape) = "expecting hex-encoded Unicode code point";
 error_message_for(grammar::v2::path::forbid_prefix_triple_slash) = "too many slashes in path";
 error_message_for(grammar::v2::path::forbid_prefix_double_slash_no_interp) = "path has a trailing slash";
 error_message_for(grammar::v2::expr) = "expecting expression";
@@ -479,37 +482,9 @@ struct StringState : SubexprState {
         currentLiteral += s;
     }
 
-    // FIXME this truncates strings on NUL for compat with the old parser. ideally
-    // we should use the decomposition the g gives us instead of iterating over
-    // the entire string again.
-    static void unescapeStr(std::string & str)
-    {
-        char * s = str.data();
-        char * t = s;
-        char c;
-        while ((c = *s++)) {
-            if (c == '\\') {
-                c = *s++;
-                if (c == 'n') *t = '\n';
-                else if (c == 'r') *t = '\r';
-                else if (c == 't') *t = '\t';
-                else *t = c;
-            }
-            else if (c == '\r') {
-                /* Normalise CR and CR/LF into LF. */
-                *t = '\n';
-                if (*s == '\n') s++; /* cr/lf */
-            }
-            else *t = c;
-            t++;
-        }
-        str.resize(t - str.data());
-    }
-
     void endLiteral()
     {
         if (!currentLiteral.empty()) {
-            unescapeStr(currentLiteral);
             parts.emplace_back(currentPos, std::make_unique<ExprString>(std::move(currentLiteral)));
         }
     }
@@ -517,7 +492,6 @@ struct StringState : SubexprState {
     std::unique_ptr<Expr> finish()
     {
         if (parts.empty()) {
-            unescapeStr(currentLiteral);
             return std::make_unique<ExprString>(std::move(currentLiteral));
         } else {
             endLiteral();
@@ -535,7 +509,104 @@ template<typename... Content> struct BuildAST<grammar::v2::string::literal<Conte
 
 template<> struct BuildAST<grammar::v2::string::cr_lf> {
     static void apply(const auto & in, StringState & s, State & ps) {
-        s.append(ps.at(in), in.string_view()); // FIXME compat with old parser
+        // Normalize to LF
+        s.append(ps.at(in), "\n");
+    }
+};
+
+template<> struct BuildAST<grammar::v2::string::simple_escape> { // appends the character a simple escape stands for
+    static void apply(const auto & in, StringState & s, State & ps) {
+        switch (*in.begin()) { // dispatch on the first matched character, i.e. the one after the backslash
+            case '\\':
+                s.append(ps.at(in), "\\");
+                break;
+            case '$':
+                s.append(ps.at(in), "$");
+                break;
+            case '"':
+                s.append(ps.at(in), "\"");
+                break;
+            case 't':
+                s.append(ps.at(in), "\t");
+                break;
+            case 'r':
+                s.append(ps.at(in), "\r");
+                break;
+            case 'n':
+                s.append(ps.at(in), "\n");
+                break;
+            /* Escaped line break (string continuation): nothing is appended, so
+             * the line break and the spaces/tabs matched after it are dropped.
+             * Only LF and CRLF can be escaped, so the first character is enough.
+             */
+            case '\n':
+            case '\r':
+                break;
+            // Unreachable: the simple_escape rule only matches the characters handled above
+            default:
+                std::abort();
+        }
+    }
+};
+
+template<> struct BuildAST<grammar::v2::string::binary_escape> { // \xHH: appends the single byte with hex value HH
+    static void apply(const auto & in, StringState & s, State & ps) {
+        uint8_t v = 0; // must be unsigned: for 80..ff from_chars reports out-of-range on int8_t and leaves it unset
+        // No error handling: the grammar guarantees exactly two hex digits, which always fit into uint8_t
+        std::from_chars(in.begin(), in.end(), v, 16);
+        char val[1] = { static_cast<char>(v) };
+        s.append(ps.at(in), std::string_view{val, 1});
+    }
+};
+
+/*
+ * C++ currently offers no good way to UTF-8 encode a Unicode code point.
+ * std::c32rtomb exists but besides the unhinged API its "multi byte" representation
+ * depends on the locale, and may or may not actually be UTF-8.
+ *
+ * Code copied over and adapted from https://utfcpp.sourceforge.net/ version 2.0,
+ * licensed under the Boost License.
+ *
+ * The code point validity check has been removed because we do want to
+ * allow non-scalar surrogate code points. NOTE(review): this function does
+ * not reject values above 0x10FFFF either; confirm that callers bound them.
+ *
+ * See https://simonsapin.github.io/wtf-8 about the WTF encoding.
+ */
+int saneScalarToWtf8(char32_t cp, char * out) { // writes 1 to 4 bytes to out (no terminator) and returns how many
+    if (cp < 0x80) {                      // one octet
+        *(out++) = static_cast<uint8_t>(cp);
+        return 1;
+    }
+    else if (cp < 0x800) {                // two octets
+        *(out++) = static_cast<uint8_t>((cp >> 6)            | 0xc0);
+        *(out++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
+        return 2;
+    }
+    else if (cp < 0x10000) {              // three octets
+        *(out++) = static_cast<uint8_t>((cp >> 12)           | 0xe0);
+        *(out++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
+        *(out++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
+        return 3;
+    }
+    else {                                // four octets
+        *(out++) = static_cast<uint8_t>((cp >> 18)           | 0xf0);
+        *(out++) = static_cast<uint8_t>(((cp >> 12) & 0x3f)  | 0x80);
+        *(out++) = static_cast<uint8_t>(((cp >> 6) & 0x3f)   | 0x80);
+        *(out++) = static_cast<uint8_t>((cp & 0x3f)          | 0x80);
+        return 4;
+    }
+}
+
+template<> struct BuildAST<grammar::v2::string::unicode_escape> { // \u{H...}: appends the encoded code point
+    static void apply(const auto & in, StringState & s, State & ps) {
+        int32_t v;
+        // No error handling: the grammar admits 1 to 6 hex digits, which always parse and fit into int32_t
+        std::from_chars(in.begin(), in.end(), v, 16);
+
+        char out[4]{}; // four bytes is the longest sequence saneScalarToWtf8 writes
+        std::size_t len = saneScalarToWtf8((char32_t) v, out); // NOTE(review): v can reach 0xFFFFFF and is encoded unchecked above 0x10FFFF
+        s.append(ps.at(in), std::string_view{out, len});
     }
 };
 
@@ -546,13 +617,6 @@ template<> struct BuildAST<grammar::v2::string::interpolation> {
     }
 };
 
-template<> struct BuildAST<grammar::v2::string::escape> {
-    static void apply(const auto & in, StringState & s, State & ps) {
-        s.append(ps.at(in), "\\"); // FIXME compat with old parser
-        s.append(ps.at(in), in.string_view());
-    }
-};
-
 template<> struct BuildAST<grammar::v2::string> : change_head<StringState> {
     static void success0(StringState & s, ExprState & e, State &) {
         e.exprs.emplace_back(noPos, s.finish());
diff --git a/src/libexpr/parser/parser.cc b/src/libexpr/parser/parser.cc
index ba432da8d..5f70ab266 100644
--- a/src/libexpr/parser/parser.cc
+++ b/src/libexpr/parser/parser.cc
@@ -14,6 +14,7 @@
 
 #include <charconv>
 #include <memory>
+#include <cuchar>
 
 // Linter complains that this is a "suspicious include of file with '.cc' extension".
 // While that is correct and generally not great, it is one of the less bad options to pick
diff --git a/src/libutil/experimental-features.cc b/src/libutil/experimental-features.cc
index 588ca0126..eb51a8dcd 100644
--- a/src/libutil/experimental-features.cc
+++ b/src/libutil/experimental-features.cc
@@ -204,6 +204,12 @@ constexpr std::array<ExperimentalFeatureDetails, numXpFeatures> xpFeatureDetails
             - Removed ancient `let {` syntax. See also the `ancient-let` deprecated feature
             - All Nix code must now be fully valid UTF-8 text.
             - Line endings must be LF or CRLF, not CR.
+            - String syntax has now sane escape rules for `\`:
+                - `t`, `r`, `n` yield the usual whitespace characters.
+                - `\`, `$`, `"` yield the character itself.
+                - Newlines can be escaped with a trailing `\` like in Rust's string continuations, which will skip the string until the next non-whitespace character.
+                - `x` and `u` allow to insert raw bytes and UTF-8 encoded Unicode scalars into the string. `\u` uses the curly braces syntax as in Rust instead of hard-coding exactly four hex digits.
+                - No other escape rules. Notably, `$${{}` now is an interpolation.
         )",
     },
     {
diff --git a/tests/unit/libexpr/trivial.cc b/tests/unit/libexpr/trivial.cc
index 9c6674163..cd44afe96 100644
--- a/tests/unit/libexpr/trivial.cc
+++ b/tests/unit/libexpr/trivial.cc
@@ -298,4 +298,21 @@ namespace nix {
         mockFeatureSettings.set("experimental-features", "nix-lang2");
         ASSERT_THROW(eval("foo\x10\xff\xffobar", true, mockFeatureSettings), Error);
     }
+
+    TEST_F(TrivialExpressionTest, stringEscapes) {
+        FeatureSettings mockFeatureSettings;
+        mockFeatureSettings.set("experimental-features", "nix-lang2");
+        // \x inserts a raw byte; \u{...} accepts either hex case and leading zeros up to six digits
+        auto v = eval("\"\\x42\\u{6c34}\\u{6C34}\\u{06c34}\\u{006c34}\"", true, mockFeatureSettings);
+        ASSERT_THAT(v, IsStringEq("B水水水水"));
+        // A backslash before LF drops the line break and the indentation that follows it
+        v = eval("\"foo\\\n               bar\"", true, mockFeatureSettings);
+        ASSERT_THAT(v, IsStringEq("foobar"));
+        // Same for a CRLF line break
+        v = eval("\"foo\\\r\n               bar\"", true, mockFeatureSettings);
+        ASSERT_THAT(v, IsStringEq("foobar"));
+        // `$$` no longer escapes: `$${` is a literal `$` followed by the start of an interpolation
+        ASSERT_THROW(eval("\"$${\"", true, mockFeatureSettings), Error);
+    }
+
 } /* namespace nix */