From cabf2a300a3b583b54f2e51eabdc87b39525f072 Mon Sep 17 00:00:00 2001 From: piegames Date: Sat, 19 Oct 2024 10:23:50 +0200 Subject: [PATCH] libexpr/nix2: String improvements 1/2 A complete overhaul of the escape semantics, which are just horrible and mostly useless. The design is inspired by JSON and Rust. Indented strings are not affected Co-authored-by: eldritch horrors Change-Id: Id89679d4115d59869090bdbb5d9b305f374447fb --- src/libexpr/parser/grammar2.hh | 25 ++++- src/libexpr/parser/parser-impl2.inc.cc | 138 ++++++++++++++++++------- src/libexpr/parser/parser.cc | 1 + src/libutil/experimental-features.cc | 6 ++ tests/unit/libexpr/trivial.cc | 17 +++ 5 files changed, 147 insertions(+), 40 deletions(-) diff --git a/src/libexpr/parser/grammar2.hh b/src/libexpr/parser/grammar2.hh index 35a67beee..61d01cced 100644 --- a/src/libexpr/parser/grammar2.hh +++ b/src/libexpr/parser/grammar2.hh @@ -35,10 +35,13 @@ using p::not_at, p::opt, p::plus, + p::rep, + p::rep_min_max, p::sor, p::seq, p::star, p::until, + p::xdigit, p8::any, p8::not_one, p8::one, @@ -213,7 +216,23 @@ struct _string { must, seps, must> > {}; - struct escape : semantic, must {}; + struct simple_escape : semantic, sor< + one<'\\'>, + one<'$'>, + one<'"'>, + one<'t'>, + one<'r'>, + one<'n'>, + // Escaping newlines to break up strings into multiline (string continuation) + seq>> + > {}; + struct binary_escape : semantic, seq> {}; + struct unicode_escape : semantic, seq> {}; + struct escape : sor< + simple_escape, + seq, must>, + seq, must>, must, must>> + > {}; }; struct string : _string, seq< one<'"'>, @@ -222,8 +241,8 @@ struct string : _string, seq< _string::literal>>, _string::cr_lf, _string::interpolation, - _string::literal, opt>>, - seq, _string::escape> + _string::literal>, + seq, must<_string::escape>> > >, must> diff --git a/src/libexpr/parser/parser-impl2.inc.cc b/src/libexpr/parser/parser-impl2.inc.cc index 5f0276822..b45e4a21d 100644 --- a/src/libexpr/parser/parser-impl2.inc.cc +++ 
b/src/libexpr/parser/parser-impl2.inc.cc @@ -36,6 +36,9 @@ error_message_for(p8::any) = "expecting any character"; error_message_for(p::plus) = "expecting at least one digit"; error_message_for(grammar::v2::eof) = "expecting end of file"; error_message_for(grammar::v2::seps) = "expecting separators"; +error_message_for(grammar::v2::string::escape) = "expecting escape sequence"; +error_message_for(grammar::v2::string::binary_escape) = "expecting two hex digits"; +error_message_for(grammar::v2::string::unicode_escape) = "expecting hex-encoded Unicode code point"; error_message_for(grammar::v2::path::forbid_prefix_triple_slash) = "too many slashes in path"; error_message_for(grammar::v2::path::forbid_prefix_double_slash_no_interp) = "path has a trailing slash"; error_message_for(grammar::v2::expr) = "expecting expression"; @@ -479,37 +482,9 @@ struct StringState : SubexprState { currentLiteral += s; } - // FIXME this truncates strings on NUL for compat with the old parser. ideally - // we should use the decomposition the g gives us instead of iterating over - // the entire string again. - static void unescapeStr(std::string & str) - { - char * s = str.data(); - char * t = s; - char c; - while ((c = *s++)) { - if (c == '\\') { - c = *s++; - if (c == 'n') *t = '\n'; - else if (c == 'r') *t = '\r'; - else if (c == 't') *t = '\t'; - else *t = c; - } - else if (c == '\r') { - /* Normalise CR and CR/LF into LF. 
*/ - *t = '\n'; - if (*s == '\n') s++; /* cr/lf */ - } - else *t = c; - t++; - } - str.resize(t - str.data()); - } - void endLiteral() { if (!currentLiteral.empty()) { - unescapeStr(currentLiteral); parts.emplace_back(currentPos, std::make_unique(std::move(currentLiteral))); } } @@ -517,7 +492,6 @@ struct StringState : SubexprState { std::unique_ptr finish() { if (parts.empty()) { - unescapeStr(currentLiteral); return std::make_unique(std::move(currentLiteral)); } else { endLiteral(); @@ -535,7 +509,104 @@ template struct BuildAST struct BuildAST { static void apply(const auto & in, StringState & s, State & ps) { - s.append(ps.at(in), in.string_view()); // FIXME compat with old parser + // Normalize to LF + s.append(ps.at(in), "\n"); + } +}; + +template<> struct BuildAST { + static void apply(const auto & in, StringState & s, State & ps) { + switch (*in.begin()) { + case '\\': + s.append(ps.at(in), "\\"); + break; + case '$': + s.append(ps.at(in), "$"); + break; + case '"': + s.append(ps.at(in), "\""); + break; + case 't': + s.append(ps.at(in), "\t"); + break; + case 'r': + s.append(ps.at(in), "\r"); + break; + case 'n': + s.append(ps.at(in), "\n"); + break; + /* Escape line breaks themselves + * note that only LF and CRLF can be escaped, but distinguishing + * them based on the first character is unambiguous. + */ + case '\n': + case '\r': + break; + // Unreachable + default: + std::abort(); + } + } +}; + +template<> struct BuildAST { + static void apply(const auto & in, StringState & s, State & ps) { + uint8_t v = 0; + // Unsigned on purpose: two hex digits reach 0xff, which int8_t cannot hold (from_chars would fail and leave v unset); with uint8_t the unchecked call cannot overflow + std::from_chars(in.begin(), in.end(), v, 16); + char val[1] = { (char) v }; + s.append(ps.at(in), std::string_view{val, 1}); + } +}; + +/* + * C++ currently offers no good way to UTF-8 encode a Unicode code point. + * std::c32rtomb exists but besides the unhinged API its "multi byte" representation + * depends on the locale, and may or may not actually be UTF-8. 
+ * + * Code copied over and adapted from https://utfcpp.sourceforge.net/ version 2.0, + * licensed under the Boost License. + * + * The code point validity check has been removed: + * The grammar will ensure that it is within range, + * and we do want to allow non-scalar surrogate code points anyways. + * + * See https://simonsapin.github.io/wtf-8 about the WTF encoding. + */ +int saneScalarToWtf8(char32_t cp, char * out) { + if (cp < 0x80) { // one octet + *(out++) = static_cast(cp); + return 1; + } + else if (cp < 0x800) { // two octets + *(out++) = static_cast((cp >> 6) | 0xc0); + *(out++) = static_cast((cp & 0x3f) | 0x80); + return 2; + } + else if (cp < 0x10000) { // three octets + *(out++) = static_cast((cp >> 12) | 0xe0); + *(out++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(out++) = static_cast((cp & 0x3f) | 0x80); + return 3; + } + else { // four octets + *(out++) = static_cast((cp >> 18) | 0xf0); + *(out++) = static_cast(((cp >> 12) & 0x3f) | 0x80); + *(out++) = static_cast(((cp >> 6) & 0x3f) | 0x80); + *(out++) = static_cast((cp & 0x3f) | 0x80); + return 4; + } +} + +template<> struct BuildAST { + static void apply(const auto & in, StringState & s, State & ps) { + int32_t v = 0; + // Result deliberately unchecked (the grammar is expected to hand us hex digits only); v is pre-set so a failed parse can never read an indeterminate value + std::from_chars(in.begin(), in.end(), v, 16); + + char out[4]{}; + std::size_t len = saneScalarToWtf8((char32_t) v, out); + s.append(ps.at(in), std::string_view{out, len}); } }; @@ -546,13 +617,6 @@ template<> struct BuildAST { } }; -template<> struct BuildAST { - static void apply(const auto & in, StringState & s, State & ps) { - s.append(ps.at(in), "\\"); // FIXME compat with old parser - s.append(ps.at(in), in.string_view()); - } -}; - template<> struct BuildAST : change_head { static void success0(StringState & s, ExprState & e, State &) { e.exprs.emplace_back(noPos, s.finish()); diff --git a/src/libexpr/parser/parser.cc b/src/libexpr/parser/parser.cc index ba432da8d..5f70ab266 100644 --- 
a/src/libexpr/parser/parser.cc +++ b/src/libexpr/parser/parser.cc @@ -14,6 +14,7 @@ #include #include +#include // Linter complains that this is a "suspicious include of file with '.cc' extension". // While that is correct and generally not great, it is one of the less bad options to pick diff --git a/src/libutil/experimental-features.cc b/src/libutil/experimental-features.cc index 588ca0126..eb51a8dcd 100644 --- a/src/libutil/experimental-features.cc +++ b/src/libutil/experimental-features.cc @@ -204,6 +204,12 @@ constexpr std::array xpFeatureDetails - Removed ancient `let {` syntax. See also the `ancient-let` deprecated feature - All Nix code must now be fully valid UTF-8 text. - Line endings must be LF or CRLF, not CR. + - String syntax has now sane escape rules for `\`: + - `t`, `r`, `n` yield the usual whitespace characters. + - `\`, `$`, `"` yield the character itself. + - Newlines can be escaped with a trailing `\` like in Rust's string continuations, which will skip the string until the next non-whitespace character. + - `x` and `u` allow to insert raw bytes and UTF-8 encoded Unicode scalars into the string. `\u` uses the curly braces syntax as in Rust instead of hard-coding exactly four hex digits. + - No other escape rules. Notably, `$${{}` now is an interpolation. 
)", }, { diff --git a/tests/unit/libexpr/trivial.cc b/tests/unit/libexpr/trivial.cc index 9c6674163..cd44afe96 100644 --- a/tests/unit/libexpr/trivial.cc +++ b/tests/unit/libexpr/trivial.cc @@ -298,4 +298,21 @@ namespace nix { mockFeatureSettings.set("experimental-features", "nix-lang2"); ASSERT_THROW(eval("foo\x10\xff\xffobar", true, mockFeatureSettings), Error); } + + TEST_F(TrivialExpressionTest, stringEscapes) { + FeatureSettings mockFeatureSettings; + mockFeatureSettings.set("experimental-features", "nix-lang2"); + + auto v = eval("\"\\x42\\u{6c34}\\u{6C34}\\u{06c34}\\u{006c34}\"", true, mockFeatureSettings); + ASSERT_THAT(v, IsStringEq("B水水水水")); + + v = eval("\"foo\\\n bar\"", true, mockFeatureSettings); + ASSERT_THAT(v, IsStringEq("foobar")); + + v = eval("\"foo\\\r\n bar\"", true, mockFeatureSettings); + ASSERT_THAT(v, IsStringEq("foobar")); + + ASSERT_THROW(eval("\"$${\"", true, mockFeatureSettings), Error); + } + } /* namespace nix */