forked from lix-project/lix
libexpr/nix2: String improvements 1/2
A complete overhaul of the escape semantics, which are just horrible and
mostly useless. The design is inspired by JSON and Rust. Indented
strings are not affected.
Co-authored-by: eldritch horrors <pennae@lix.systems>
Change-Id: Id89679d4115d59869090bdbb5d9b305f374447fb
This commit is contained in:
parent
5058a00130
commit
cabf2a300a
|
@ -35,10 +35,13 @@ using
|
|||
p::not_at,
|
||||
p::opt,
|
||||
p::plus,
|
||||
p::rep,
|
||||
p::rep_min_max,
|
||||
p::sor,
|
||||
p::seq,
|
||||
p::star,
|
||||
p::until,
|
||||
p::xdigit,
|
||||
p8::any,
|
||||
p8::not_one,
|
||||
p8::one,
|
||||
|
@ -213,7 +216,23 @@ struct _string {
|
|||
must<expr>, seps,
|
||||
must<one<'}'>>
|
||||
> {};
|
||||
struct escape : semantic, must<any> {};
|
||||
struct simple_escape : semantic, sor<
|
||||
one<'\\'>,
|
||||
one<'$'>,
|
||||
one<'"'>,
|
||||
one<'t'>,
|
||||
one<'r'>,
|
||||
one<'n'>,
|
||||
// Escaping newlines to break up strings into multiline (string continuation)
|
||||
seq<t::eol, star<one<' ', '\t'>>>
|
||||
> {};
|
||||
// Payload of a `\x` escape: exactly two hexadecimal digits, i.e. one byte.
struct binary_escape : semantic, seq<rep<2, xdigit>> {};
|
||||
// Payload of a `\u{...}` escape (the part between the braces): one to six
// hexadecimal digits. Six digits can express values up to 0xFFFFFF, so this
// rule on its own does not limit the value to the Unicode range (<= 0x10FFFF).
struct unicode_escape : semantic, seq<rep_min_max<1, 6, xdigit>> {};
|
||||
struct escape : sor<
|
||||
simple_escape,
|
||||
seq<one<'x'>, must<binary_escape>>,
|
||||
seq<one<'u'>, must<one<'{'>>, must<unicode_escape>, must<one<'}'>>>
|
||||
> {};
|
||||
};
|
||||
struct string : _string, seq<
|
||||
one<'"'>,
|
||||
|
@ -222,8 +241,8 @@ struct string : _string, seq<
|
|||
_string::literal<plus<not_one<'$', '"', '\\', '\r'>>>,
|
||||
_string::cr_lf,
|
||||
_string::interpolation,
|
||||
_string::literal<one<'$'>, opt<one<'$'>>>,
|
||||
seq<one<'\\'>, _string::escape>
|
||||
_string::literal<one<'$'>>,
|
||||
seq<one<'\\'>, must<_string::escape>>
|
||||
>
|
||||
>,
|
||||
must<one<'"'>>
|
||||
|
|
|
@ -36,6 +36,9 @@ error_message_for(p8::any) = "expecting any character";
|
|||
error_message_for(p::plus<p::digit>) = "expecting at least one digit";
|
||||
error_message_for(grammar::v2::eof) = "expecting end of file";
|
||||
error_message_for(grammar::v2::seps) = "expecting separators";
|
||||
error_message_for(grammar::v2::string::escape) = "expecting escape sequence";
|
||||
error_message_for(grammar::v2::string::binary_escape) = "expecting two hex digits";
|
||||
error_message_for(grammar::v2::string::unicode_escape) = "expecting hex-encoded Unicode code point";
|
||||
error_message_for(grammar::v2::path::forbid_prefix_triple_slash) = "too many slashes in path";
|
||||
error_message_for(grammar::v2::path::forbid_prefix_double_slash_no_interp) = "path has a trailing slash";
|
||||
error_message_for(grammar::v2::expr) = "expecting expression";
|
||||
|
@ -479,37 +482,9 @@ struct StringState : SubexprState {
|
|||
currentLiteral += s;
|
||||
}
|
||||
|
||||
// FIXME this truncates strings on NUL for compat with the old parser. ideally
|
||||
// we should use the decomposition the g gives us instead of iterating over
|
||||
// the entire string again.
|
||||
static void unescapeStr(std::string & str)
|
||||
{
|
||||
char * s = str.data();
|
||||
char * t = s;
|
||||
char c;
|
||||
while ((c = *s++)) {
|
||||
if (c == '\\') {
|
||||
c = *s++;
|
||||
if (c == 'n') *t = '\n';
|
||||
else if (c == 'r') *t = '\r';
|
||||
else if (c == 't') *t = '\t';
|
||||
else *t = c;
|
||||
}
|
||||
else if (c == '\r') {
|
||||
/* Normalise CR and CR/LF into LF. */
|
||||
*t = '\n';
|
||||
if (*s == '\n') s++; /* cr/lf */
|
||||
}
|
||||
else *t = c;
|
||||
t++;
|
||||
}
|
||||
str.resize(t - str.data());
|
||||
}
|
||||
|
||||
void endLiteral()
|
||||
{
|
||||
if (!currentLiteral.empty()) {
|
||||
unescapeStr(currentLiteral);
|
||||
parts.emplace_back(currentPos, std::make_unique<ExprString>(std::move(currentLiteral)));
|
||||
}
|
||||
}
|
||||
|
@ -517,7 +492,6 @@ struct StringState : SubexprState {
|
|||
std::unique_ptr<Expr> finish()
|
||||
{
|
||||
if (parts.empty()) {
|
||||
unescapeStr(currentLiteral);
|
||||
return std::make_unique<ExprString>(std::move(currentLiteral));
|
||||
} else {
|
||||
endLiteral();
|
||||
|
@ -535,7 +509,104 @@ template<typename... Content> struct BuildAST<grammar::v2::string::literal<Conte
|
|||
|
||||
template<> struct BuildAST<grammar::v2::string::cr_lf> {
|
||||
static void apply(const auto & in, StringState & s, State & ps) {
|
||||
s.append(ps.at(in), in.string_view()); // FIXME compat with old parser
|
||||
// Normalize to LF
|
||||
s.append(ps.at(in), "\n");
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct BuildAST<grammar::v2::string::simple_escape> {
|
||||
static void apply(const auto & in, StringState & s, State & ps) {
|
||||
switch (*in.begin()) {
|
||||
case '\\':
|
||||
s.append(ps.at(in), "\\");
|
||||
break;
|
||||
case '$':
|
||||
s.append(ps.at(in), "$");
|
||||
break;
|
||||
case '"':
|
||||
s.append(ps.at(in), "\"");
|
||||
break;
|
||||
case 't':
|
||||
s.append(ps.at(in), "\t");
|
||||
break;
|
||||
case 'r':
|
||||
s.append(ps.at(in), "\r");
|
||||
break;
|
||||
case 'n':
|
||||
s.append(ps.at(in), "\n");
|
||||
break;
|
||||
/* Escape line breaks themselves
|
||||
* note that only LF and CRLF can be escaped, but distinguishing
|
||||
* them based on the first character is unambiguous.
|
||||
*/
|
||||
case '\n':
|
||||
case '\r':
|
||||
break;
|
||||
// Unreachable
|
||||
default:
|
||||
std::abort();
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct BuildAST<grammar::v2::string::binary_escape> {
|
||||
static void apply(const auto & in, StringState & s, State & ps) {
|
||||
int8_t v;
|
||||
// Don't error handle as the function is infallible for our input
|
||||
std::from_chars(in.begin(), in.end(), v, 16);
|
||||
char val[1] = { (char) v };
|
||||
s.append(ps.at(in), std::string_view{val, 1});
|
||||
}
|
||||
};
|
||||
|
||||
/*
 * C++ currently offers no good way to UTF-8 encode a Unicode code point.
 * std::c32rtomb exists but besides the unhinged API its "multi byte" representation
 * depends on the locale, and may or may not actually be UTF-8.
 *
 * Code copied over and adapted from https://utfcpp.sourceforge.net/ version 2.0,
 * licensed under the Boost License.
 *
 * The surrogate check has been removed on purpose: we do want to allow
 * non-scalar surrogate code points, which makes the output WTF-8 rather than
 * strict UTF-8. See https://simonsapin.github.io/wtf-8 about the WTF encoding.
 *
 * The upper bound is still enforced here. A `\u{...}` escape may carry up to
 * six hex digits (0xFFFFFF), and anything above 0x10FFFF would otherwise be
 * turned into a four-octet sequence with an invalid lead byte.
 *
 * cp:  code point to encode
 * out: destination buffer; must have room for at least four bytes
 *
 * Returns the number of bytes written (1 to 4), or 0 if cp is greater than
 * 0x10FFFF, in which case nothing is written.
 */
int saneScalarToWtf8(char32_t cp, char * out) {
    if (cp > 0x10FFFF) { // not encodable
        return 0;
    }
    if (cp < 0x80) { // one octet
        out[0] = static_cast<char>(cp);
        return 1;
    }
    if (cp < 0x800) { // two octets
        out[0] = static_cast<char>((cp >> 6) | 0xc0);
        out[1] = static_cast<char>((cp & 0x3f) | 0x80);
        return 2;
    }
    if (cp < 0x10000) { // three octets
        out[0] = static_cast<char>((cp >> 12) | 0xe0);
        out[1] = static_cast<char>(((cp >> 6) & 0x3f) | 0x80);
        out[2] = static_cast<char>((cp & 0x3f) | 0x80);
        return 3;
    }
    // four octets
    out[0] = static_cast<char>((cp >> 18) | 0xf0);
    out[1] = static_cast<char>(((cp >> 12) & 0x3f) | 0x80);
    out[2] = static_cast<char>(((cp >> 6) & 0x3f) | 0x80);
    out[3] = static_cast<char>((cp & 0x3f) | 0x80);
    return 4;
}
|
||||
|
||||
template<> struct BuildAST<grammar::v2::string::unicode_escape> {
|
||||
static void apply(const auto & in, StringState & s, State & ps) {
|
||||
int32_t v;
|
||||
// Don't error handle as the function is infallible for our input
|
||||
std::from_chars(in.begin(), in.end(), v, 16);
|
||||
|
||||
char out[4]{};
|
||||
std::size_t len = saneScalarToWtf8((char32_t) v, out);
|
||||
s.append(ps.at(in), std::string_view{out, len});
|
||||
}
|
||||
};
|
||||
|
||||
|
@ -546,13 +617,6 @@ template<> struct BuildAST<grammar::v2::string::interpolation> {
|
|||
}
|
||||
};
|
||||
|
||||
template<> struct BuildAST<grammar::v2::string::escape> {
|
||||
static void apply(const auto & in, StringState & s, State & ps) {
|
||||
s.append(ps.at(in), "\\"); // FIXME compat with old parser
|
||||
s.append(ps.at(in), in.string_view());
|
||||
}
|
||||
};
|
||||
|
||||
template<> struct BuildAST<grammar::v2::string> : change_head<StringState> {
|
||||
static void success0(StringState & s, ExprState & e, State &) {
|
||||
e.exprs.emplace_back(noPos, s.finish());
|
||||
|
|
|
@ -14,6 +14,7 @@
|
|||
|
||||
#include <charconv>
|
||||
#include <memory>
|
||||
#include <cuchar>
|
||||
|
||||
// Linter complains that this is a "suspicious include of file with '.cc' extension".
|
||||
// While that is correct and generally not great, it is one of the less bad options to pick
|
||||
|
|
|
@ -204,6 +204,12 @@ constexpr std::array<ExperimentalFeatureDetails, numXpFeatures> xpFeatureDetails
|
|||
- Removed ancient `let {` syntax. See also the `ancient-let` deprecated feature
|
||||
- All Nix code must now be fully valid UTF-8 text.
|
||||
- Line endings must be LF or CRLF, not CR.
|
||||
- String syntax has now sane escape rules for `\`:
|
||||
- `t`, `r`, `n` yield the usual whitespace characters.
|
||||
- `\`, `$`, `"` yield the character itself.
|
||||
- Newlines can be escaped with a trailing `\` like in Rust's string continuations, which will skip the string until the next non-whitespace character.
|
||||
- `x` and `u` allow to insert raw bytes and UTF-8 encoded Unicode scalars into the string. `\u` uses the curly braces syntax as in Rust instead of hard-coding exactly four hex digits.
|
||||
- No other escape rules. Notably, `$${{}` now is an interpolation.
|
||||
)",
|
||||
},
|
||||
{
|
||||
|
|
|
@ -298,4 +298,21 @@ namespace nix {
|
|||
mockFeatureSettings.set("experimental-features", "nix-lang2");
|
||||
ASSERT_THROW(eval("foo\x10\xff\xffobar", true, mockFeatureSettings), Error);
|
||||
}
|
||||
|
||||
TEST_F(TrivialExpressionTest, stringEscapes) {
|
||||
FeatureSettings mockFeatureSettings;
|
||||
mockFeatureSettings.set("experimental-features", "nix-lang2");
|
||||
|
||||
auto v = eval("\"\\x42\\u{6c34}\\u{6C34}\\u{06c34}\\u{006c34}\"", true, mockFeatureSettings);
|
||||
ASSERT_THAT(v, IsStringEq("B水水水水"));
|
||||
|
||||
v = eval("\"foo\\\n bar\"", true, mockFeatureSettings);
|
||||
ASSERT_THAT(v, IsStringEq("foobar"));
|
||||
|
||||
v = eval("\"foo\\\r\n bar\"", true, mockFeatureSettings);
|
||||
ASSERT_THAT(v, IsStringEq("foobar"));
|
||||
|
||||
ASSERT_THROW(eval("\"$${\"", true, mockFeatureSettings), Error);
|
||||
}
|
||||
|
||||
} /* namespace nix */
|
||||
|
|
Loading…
Reference in a new issue