libexpr/nix2: String improvements 1/2

A complete overhaul of the string escape semantics; the old rules were just
horrible and mostly useless. The new design is inspired by JSON and Rust.
Indented strings are not affected.

Co-authored-by: eldritch horrors <pennae@lix.systems>
Change-Id: Id89679d4115d59869090bdbb5d9b305f374447fb
This commit is contained in:
piegames 2024-10-19 10:23:50 +02:00
parent 5058a00130
commit cabf2a300a
5 changed files with 147 additions and 40 deletions

View file

@ -35,10 +35,13 @@ using
p::not_at,
p::opt,
p::plus,
p::rep,
p::rep_min_max,
p::sor,
p::seq,
p::star,
p::until,
p::xdigit,
p8::any,
p8::not_one,
p8::one,
@ -213,7 +216,23 @@ struct _string {
must<expr>, seps,
must<one<'}'>>
> {};
struct escape : semantic, must<any> {};
// Escapes whose payload is a single character following the backslash
// (`\`, `$`, `"`, `t`, `r`, `n`), plus the escaped line break. The leading
// backslash itself is not part of this rule; it is consumed by the rule that
// references `escape`.
struct simple_escape : semantic, sor<
one<'\\'>,
one<'$'>,
one<'"'>,
one<'t'>,
one<'r'>,
one<'n'>,
// String continuation: an escaped line break is matched together with any
// spaces and tabs that follow it, so a long string can be split across
// source lines.
seq<t::eol, star<one<' ', '\t'>>>
> {};
// Payload of an `x` escape: exactly two hexadecimal digits.
struct binary_escape : semantic, seq<rep<2, xdigit>> {};
// Payload of a `u{...}` escape: between one and six hexadecimal digits.
// NOTE(review): six hex digits admit values up to 0xFFFFFF, which is above the
// largest Unicode code point (0x10FFFF) -- confirm that such values are
// rejected somewhere, as this rule does not bound the numeric value.
struct unicode_escape : semantic, seq<rep_min_max<1, 6, xdigit>> {};
// Everything that may follow the backslash of an escape sequence:
//  - a simple escape,
//  - `x` followed by a mandatory two-digit hex payload, or
//  - `u` followed by a mandatory `{`, hex payload and `}`.
// Once `x` or `u` has matched, the `must<>` wrappers turn a malformed payload
// into a parse error instead of letting the alternative fail silently.
struct escape : sor<
simple_escape,
seq<one<'x'>, must<binary_escape>>,
seq<one<'u'>, must<one<'{'>>, must<unicode_escape>, must<one<'}'>>>
> {};
};
struct string : _string, seq<
one<'"'>,
@ -222,8 +241,8 @@ struct string : _string, seq<
_string::literal<plus<not_one<'$', '"', '\\', '\r'>>>,
_string::cr_lf,
_string::interpolation,
_string::literal<one<'$'>, opt<one<'$'>>>,
seq<one<'\\'>, _string::escape>
_string::literal<one<'$'>>,
seq<one<'\\'>, must<_string::escape>>
>
>,
must<one<'"'>>

View file

@ -36,6 +36,9 @@ error_message_for(p8::any) = "expecting any character";
error_message_for(p::plus<p::digit>) = "expecting at least one digit";
error_message_for(grammar::v2::eof) = "expecting end of file";
error_message_for(grammar::v2::seps) = "expecting separators";
// Messages for the string escape rules of the v2 grammar; these rules are
// wrapped in must<> by the grammar, so a failed match is reported as an error.
error_message_for(grammar::v2::string::escape) = "expecting escape sequence";
error_message_for(grammar::v2::string::binary_escape) = "expecting two hex digits";
error_message_for(grammar::v2::string::unicode_escape) = "expecting hex-encoded Unicode code point";
error_message_for(grammar::v2::path::forbid_prefix_triple_slash) = "too many slashes in path";
error_message_for(grammar::v2::path::forbid_prefix_double_slash_no_interp) = "path has a trailing slash";
error_message_for(grammar::v2::expr) = "expecting expression";
@ -479,37 +482,9 @@ struct StringState : SubexprState {
currentLiteral += s;
}
// FIXME this truncates strings on NUL for compat with the old parser. ideally
// we should use the decomposition the grammar gives us instead of iterating
// over the entire string again.
//
// Unescapes `str` in place:
//  - `\n`, `\r`, `\t` become the corresponding control characters;
//  - a backslash followed by any other character yields that character;
//  - an unescaped CR or CR LF is normalised to a single LF;
//  - processing stops at the first unescaped NUL byte (see FIXME above).
// A backslash that is the very last character of the string has nothing to
// escape and is dropped; previously this case read past the end of the buffer.
static void unescapeStr(std::string & str)
{
    const char * const end = str.data() + str.size();
    const char * src = str.data();
    char * dst = str.data(); // write cursor, never ahead of the read cursor

    char c;
    while ((c = *src++) != '\0') {
        if (c == '\\') {
            // Dangling backslash: stop before consuming the terminator.
            if (src == end) break;
            c = *src++;
            switch (c) {
            case 'n': c = '\n'; break;
            case 'r': c = '\r'; break;
            case 't': c = '\t'; break;
            default: break; // any other character stands for itself
            }
        } else if (c == '\r') {
            /* Normalise CR and CR/LF into LF. */
            c = '\n';
            if (*src == '\n') ++src; /* cr/lf */
        }
        *dst++ = c;
    }

    str.resize(dst - str.data());
}
// Flushes the literal text accumulated so far: it is unescaped and moved into
// `parts` as an ExprString located at `currentPos`. Nothing happens when no
// literal text is pending.
void endLiteral()
{
    if (currentLiteral.empty())
        return;

    unescapeStr(currentLiteral);
    auto literal = std::make_unique<ExprString>(std::move(currentLiteral));
    parts.emplace_back(currentPos, std::move(literal));
}
@ -517,7 +492,6 @@ struct StringState : SubexprState {
std::unique_ptr<Expr> finish()
{
if (parts.empty()) {
unescapeStr(currentLiteral);
return std::make_unique<ExprString>(std::move(currentLiteral));
} else {
endLiteral();
@ -535,7 +509,104 @@ template<typename... Content> struct BuildAST<grammar::v2::string::literal<Conte
// Action run when the `cr_lf` rule matches inside a string literal.
// NOTE(review): as written, this appends the matched text verbatim and then an
// additional "\n" -- confirm whether both appends are intended, given that the
// stated goal is to normalize the line ending to LF.
template<> struct BuildAST<grammar::v2::string::cr_lf> {
static void apply(const auto & in, StringState & s, State & ps) {
s.append(ps.at(in), in.string_view()); // FIXME compat with old parser
// Normalize to LF
s.append(ps.at(in), "\n");
}
};
// Action for `simple_escape`: translates the character following the backslash
// into the text it stands for and appends it to the string being built.
template<> struct BuildAST<grammar::v2::string::simple_escape> {
    static void apply(const auto & in, StringState & s, State & ps) {
        const char selector = *in.begin();

        /* An escaped line break (string continuation) contributes nothing.
         * Per the grammar only LF and CRLF can be escaped, so looking at the
         * first matched character is enough to recognise either form.
         */
        if (selector == '\n' || selector == '\r')
            return;

        std::string_view replacement;
        switch (selector) {
        case '\\': replacement = "\\"; break;
        case '$':  replacement = "$";  break;
        case '"':  replacement = "\""; break;
        case 't':  replacement = "\t"; break;
        case 'r':  replacement = "\r"; break;
        case 'n':  replacement = "\n"; break;
        // Unreachable: the grammar admits no other character here.
        default:
            std::abort();
        }
        s.append(ps.at(in), replacement);
    }
};
// Action for `binary_escape`: converts the two matched hex digits into a single
// raw byte and appends it to the string being built.
template<> struct BuildAST<grammar::v2::string::binary_escape> {
    static void apply(const auto & in, StringState & s, State & ps) {
        // Must be unsigned: two hex digits cover 0x00..0xFF. With a signed
        // 8-bit target, std::from_chars reports result_out_of_range for
        // 0x80..0xFF and leaves the output untouched, which previously meant
        // appending an uninitialized byte.
        uint8_t byte = 0;
        // No error handling: the grammar guarantees exactly two hex digits,
        // which always fit into a uint8_t.
        std::from_chars(in.begin(), in.end(), byte, 16);
        const char raw = static_cast<char>(byte);
        s.append(ps.at(in), std::string_view{&raw, 1});
    }
};
/*
 * Encodes the code point `cp` into `out` and returns the number of bytes
 * written (1 to 4). `out` must have room for four bytes.
 *
 * Hand-rolled because C++ currently offers no good way to UTF-8 encode a
 * Unicode code point: std::c32rtomb exists, but besides its unhinged API the
 * "multi byte" representation it produces depends on the locale and may or
 * may not actually be UTF-8.
 *
 * Code copied over and adapted from https://utfcpp.sourceforge.net/ version 2.0,
 * licensed under the Boost License.
 *
 * No code point validity check is performed here: the caller is expected to
 * pass values that are within range, and non-scalar surrogate code points are
 * deliberately allowed (see https://simonsapin.github.io/wtf-8 about the WTF
 * encoding).
 * NOTE(review): values above 0x10FFFF are not rejected and yield a truncated
 * lead byte -- confirm the grammar really excludes them.
 */
int saneScalarToWtf8(char32_t cp, char * out) {
    // Single octet: the code point is its own encoding.
    if (cp < 0x80) {
        out[0] = static_cast<uint8_t>(cp);
        return 1;
    }

    // Lead-byte marker bits, indexed by total sequence length.
    static constexpr uint8_t leadMarker[] = {0x00, 0x00, 0xc0, 0xe0, 0xf0};

    const int length = cp < 0x800 ? 2 : (cp < 0x10000 ? 3 : 4);

    // Lead byte carries the most significant payload bits.
    out[0] = static_cast<uint8_t>((cp >> (6 * (length - 1))) | leadMarker[length]);

    // Each continuation byte carries the next six bits, tagged with 0b10.
    for (int i = 1; i < length; ++i) {
        const int shift = 6 * (length - 1 - i);
        out[i] = static_cast<uint8_t>(((cp >> shift) & 0x3f) | 0x80);
    }
    return length;
}
// Action for `unicode_escape`: parses the matched hex digits as a code point,
// encodes it with saneScalarToWtf8 and appends the resulting bytes to the
// string being built.
// NOTE(review): the numeric value is not range-checked here; the grammar allows
// up to six hex digits -- confirm values above 0x10FFFF cannot reach this point.
template<> struct BuildAST<grammar::v2::string::unicode_escape> {
    static void apply(const auto & in, StringState & s, State & ps) {
        int32_t codePoint;
        // No error handling: the conversion cannot fail for the hex digits the
        // grammar lets through.
        std::from_chars(in.begin(), in.end(), codePoint, 16);

        char encoded[4]{};
        const std::size_t encodedLen =
            saneScalarToWtf8(static_cast<char32_t>(codePoint), encoded);

        const std::string_view bytes{encoded, encodedLen};
        s.append(ps.at(in), bytes);
    }
};
@ -546,13 +617,6 @@ template<> struct BuildAST<grammar::v2::string::interpolation> {
}
};
// Action for the `escape` rule: re-inserts a backslash in front of the matched
// text and appends both unchanged, i.e. the escape sequence is passed through
// verbatim rather than being interpreted here.
template<> struct BuildAST<grammar::v2::string::escape> {
static void apply(const auto & in, StringState & s, State & ps) {
s.append(ps.at(in), "\\"); // FIXME compat with old parser
s.append(ps.at(in), in.string_view());
}
};
template<> struct BuildAST<grammar::v2::string> : change_head<StringState> {
static void success0(StringState & s, ExprState & e, State &) {
e.exprs.emplace_back(noPos, s.finish());

View file

@ -14,6 +14,7 @@
#include <charconv>
#include <memory>
#include <cuchar>
// Linter complains that this is a "suspicious include of file with '.cc' extension".
// While that is correct and generally not great, it is one of the less bad options to pick

View file

@ -204,6 +204,12 @@ constexpr std::array<ExperimentalFeatureDetails, numXpFeatures> xpFeatureDetails
- Removed ancient `let {` syntax. See also the `ancient-let` deprecated feature
- All Nix code must now be fully valid UTF-8 text.
- Line endings must be LF or CRLF, not CR.
- String syntax has now sane escape rules for `\`:
- `t`, `r`, `n` yield the usual whitespace characters.
- `\`, `$`, `"` yield the character itself.
- Newlines can be escaped with a trailing `\` like in Rust's string continuations, which will skip the string until the next non-whitespace character.
- `x` and `u` allow to insert raw bytes and UTF-8 encoded Unicode scalars into the string. `\u` uses the curly braces syntax as in Rust instead of hard-coding exactly four hex digits.
- No other escape rules. Notably, `$${{}` now is an interpolation.
)",
},
{

View file

@ -298,4 +298,21 @@ namespace nix {
mockFeatureSettings.set("experimental-features", "nix-lang2");
ASSERT_THROW(eval("foo\x10\xff\xffobar", true, mockFeatureSettings), Error);
}
// Checks the nix-lang2 string escape rules: \x and \u{...} escapes, string
// continuation over LF and CRLF, and that `$$` no longer escapes `${`.
TEST_F(TrivialExpressionTest, stringEscapes) {
    FeatureSettings mockFeatureSettings;
    mockFeatureSettings.set("experimental-features", "nix-lang2");

    // One byte escape followed by the same code point written with four to six
    // hex digits in either letter case.
    auto hexAndUnicode = eval("\"\\x42\\u{6c34}\\u{6C34}\\u{06c34}\\u{006c34}\"", true, mockFeatureSettings);
    ASSERT_THAT(hexAndUnicode, IsStringEq("B水水水水"));

    // Escaped LF: the line break and the indentation after it disappear.
    auto lfContinuation = eval("\"foo\\\n bar\"", true, mockFeatureSettings);
    ASSERT_THAT(lfContinuation, IsStringEq("foobar"));

    // Same for an escaped CRLF.
    auto crlfContinuation = eval("\"foo\\\r\n bar\"", true, mockFeatureSettings);
    ASSERT_THAT(crlfContinuation, IsStringEq("foobar"));

    // `$${` now opens an interpolation, which is left unterminated here.
    ASSERT_THROW(eval("\"$${\"", true, mockFeatureSettings), Error);
}
} /* namespace nix */