libexpr: Optimize complex indented strings

The old behavior results in lots of concatenations happening for no good
reason and is an artifact of the technical limitations of the old parser
(combined with some lack of care for such details).

Change-Id: I0d78d6220ca6aeaa10bc437e48e08bf7922e0bb3
This commit is contained in:
piegames 2024-10-15 21:48:15 +02:00
parent 878e181882
commit e5de1d13c4
4 changed files with 22 additions and 59 deletions

View file

@ -226,7 +226,7 @@ struct string : _string, seq<
struct _ind_string { struct _ind_string {
struct line_start : semantic, star<one<' '>> {}; struct line_start : semantic, star<one<' '>> {};
template<bool CanMerge, typename... Inner> template<typename... Inner>
struct literal : semantic, seq<Inner...> {}; struct literal : semantic, seq<Inner...> {};
struct interpolation : semantic, seq< struct interpolation : semantic, seq<
p::string<'$', '{'>, seps, p::string<'$', '{'>, seps,
@ -251,7 +251,6 @@ struct ind_string : _ind_string, seq<
plus< plus<
sor< sor<
_ind_string::literal< _ind_string::literal<
true,
plus< plus<
sor< sor<
not_one<'$', '\'', '\n'>, not_one<'$', '\'', '\n'>,
@ -264,13 +263,13 @@ struct ind_string : _ind_string, seq<
> >
>, >,
_ind_string::interpolation, _ind_string::interpolation,
_ind_string::literal<false, one<'$'>>, _ind_string::literal<one<'$'>>,
_ind_string::literal<false, one<'\''>, not_at<one<'\''>>>, _ind_string::literal<one<'\''>, not_at<one<'\''>>>,
seq<one<'\''>, _ind_string::literal<false, p::string<'\'', '\''>>>, seq<one<'\''>, _ind_string::literal<p::string<'\'', '\''>>>,
seq< seq<
p::string<'\'', '\''>, p::string<'\'', '\''>,
sor< sor<
_ind_string::literal<false, one<'$'>>, _ind_string::literal<one<'$'>>,
seq<one<'\\'>, _ind_string::escape> seq<one<'\\'>, _ind_string::escape>
> >
> >
@ -281,7 +280,7 @@ struct ind_string : _ind_string, seq<
>, >,
// End of line, LF. CR is just ignored and not treated as ending a line // End of line, LF. CR is just ignored and not treated as ending a line
// (for the purpose of indentation stripping) // (for the purpose of indentation stripping)
_ind_string::literal<true, one<'\n'>> _ind_string::literal<one<'\n'>>
>, >,
must<TAO_PEGTL_STRING("''")> must<TAO_PEGTL_STRING("''")>
> {}; > {};

View file

@ -542,10 +542,10 @@ template<> struct BuildAST<grammar::v1::ind_string::line_start> {
} }
}; };
template<bool CanMerge, typename... Content> template<typename... Content>
struct BuildAST<grammar::v1::ind_string::literal<CanMerge, Content...>> { struct BuildAST<grammar::v1::ind_string::literal<Content...>> {
static void apply(const auto & in, IndStringState & s, State & ps) { static void apply(const auto & in, IndStringState & s, State & ps) {
s.lines.back().parts.emplace_back(ps.at(in), StringToken{ in.string_view(), CanMerge }); s.lines.back().parts.emplace_back(ps.at(in), in.string_view());
} }
}; };
@ -558,10 +558,10 @@ template<> struct BuildAST<grammar::v1::ind_string::interpolation> {
template<> struct BuildAST<grammar::v1::ind_string::escape> { template<> struct BuildAST<grammar::v1::ind_string::escape> {
static void apply(const auto & in, IndStringState & s, State & ps) { static void apply(const auto & in, IndStringState & s, State & ps) {
switch (*in.begin()) { switch (*in.begin()) {
case 'n': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\n"}); break; case 'n': s.lines.back().parts.emplace_back(ps.at(in), "\n"); break;
case 'r': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\r"}); break; case 'r': s.lines.back().parts.emplace_back(ps.at(in), "\r"); break;
case 't': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\t"}); break; case 't': s.lines.back().parts.emplace_back(ps.at(in), "\t"); break;
default: s.lines.back().parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break; default: s.lines.back().parts.emplace_back(ps.at(in), in.string_view()); break;
} }
} }
}; };

View file

@ -6,14 +6,6 @@
namespace nix::parser { namespace nix::parser {
struct StringToken
{
std::string_view s;
// canMerge is only used to faithfully reproduce the quirks from the old code base.
bool canMerge = false;
operator std::string_view() const { return s; }
};
struct IndStringLine { struct IndStringLine {
// String containing only the leading whitespace of the line. May be empty. // String containing only the leading whitespace of the line. May be empty.
std::string_view indentation; std::string_view indentation;
@ -26,7 +18,7 @@ struct IndStringLine {
std::vector< std::vector<
std::pair< std::pair<
PosIdx, PosIdx,
std::variant<std::unique_ptr<Expr>, StringToken> std::variant<std::unique_ptr<Expr>, std::string_view>
> >
> parts = {}; > parts = {};
}; };
@ -204,8 +196,7 @@ inline std::unique_ptr<Expr> State::stripIndentation(
std::vector<IndStringLine> && lines) std::vector<IndStringLine> && lines)
{ {
/* If the only line is whitespace-only, directly return empty string. /* If the only line is whitespace-only, directly return empty string.
* NOTE: This is not merely an optimization, but `compatStripLeadingEmptyString` * The rest of the code relies on the final string not being empty.
* later on relies on the string not being empty for working.
*/ */
if (lines.size() == 1 && lines.front().parts.empty()) { if (lines.size() == 1 && lines.front().parts.empty()) {
return std::make_unique<ExprString>(""); return std::make_unique<ExprString>("");
@ -219,19 +210,6 @@ inline std::unique_ptr<Expr> State::stripIndentation(
lines.back().indentation = {}; lines.back().indentation = {};
} }
/*
* Quirk compatibility:
*
* » nix-instantiate --parse -E $'\'\'${"foo"}\'\''
* "foo"
* » nix-instantiate --parse -E $'\'\' ${"foo"}\'\''
* ("" + "foo")
*
* Our code always produces the form with the additional "" +, so we'll manually
* strip it at the end if necessary.
*/
const bool compatStripLeadingEmptyString = !lines.empty() && lines[0].indentation.empty();
/* Figure out the minimum indentation. Note that by design /* Figure out the minimum indentation. Note that by design
whitespace-only lines are not taken into account. */ whitespace-only lines are not taken into account. */
size_t minIndent = 1000000; size_t minIndent = 1000000;
@ -248,48 +226,34 @@ inline std::unique_ptr<Expr> State::stripIndentation(
/* Concat the parts together again */ /* Concat the parts together again */
/* Note that we don't concat all adjacent string parts to fully reproduce the original code.
* This means that any escapes will result in string concatenation even if this is unnecessary.
*/
std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> parts; std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> parts;
/* Accumulator for merging intermediates */ /* Accumulator for merging intermediates */
PosIdx merged_pos; PosIdx merged_pos;
std::string merged = ""; std::string merged = "";
bool has_merged = false;
auto push_merged = [&] (PosIdx i_pos, std::string_view str) { auto push_merged = [&] (PosIdx i_pos, std::string_view str) {
merged += str; if (merged.empty()) {
if (!has_merged) {
has_merged = true;
merged_pos = i_pos; merged_pos = i_pos;
} }
merged += str;
}; };
auto flush_merged = [&] () { auto flush_merged = [&] () {
if (has_merged) { if (!merged.empty()) {
parts.emplace_back(merged_pos, std::make_unique<ExprString>(std::string(merged))); parts.emplace_back(merged_pos, std::make_unique<ExprString>(std::string(merged)));
merged.clear(); merged.clear();
has_merged = false;
} }
}; };
for (auto && [li, line] : enumerate(lines)) { for (auto && [li, line] : enumerate(lines)) {
/* Always merge indentation, except for the first line when compatStripLeadingEmptyString is set (see above) */
if (!compatStripLeadingEmptyString || li != 0) {
push_merged(line.pos, line.indentation); push_merged(line.pos, line.indentation);
}
for (auto & val : line.parts) { for (auto & val : line.parts) {
auto &[i_pos, item] = val; auto &[i_pos, item] = val;
std::visit(overloaded{ std::visit(overloaded{
[&](StringToken str) { [&](std::string_view str) {
if (str.canMerge) { push_merged(i_pos, str);
push_merged(i_pos, str.s);
} else {
flush_merged();
parts.emplace_back(i_pos, std::make_unique<ExprString>(std::string(str.s)));
}
}, },
[&](std::unique_ptr<Expr> expr) { [&](std::unique_ptr<Expr> expr) {
flush_merged(); flush_merged();

View file

@ -1 +1 @@
(let s1 = "This is an indented multi-line string\nliteral. An amount of whitespace at\nthe start of each line matching the minimum\nindentation of all lines in the string\nliteral together will be removed. Thus,\nin this case four spaces will be\nstripped from each line, even though\n THIS LINE is indented six spaces.\n\nAlso, empty lines don't count in the\ndetermination of the indentation level (the\nprevious empty line has indentation 0, but\nit doesn't matter).\n"; s10 = ""; s11 = ""; s12 = ""; s13 = ("start on network-interfaces\n\nstart script\n\n rm -f /var/run/opengl-driver\n " + "${(if true then "ln -sf 123 /var/run/opengl-driver" else (if true then "ln -sf 456 /var/run/opengl-driver" else ""))}" + "\n\n rm -f /var/log/slim.log\n \nend script\n\nenv SLIM_CFGFILE=" + "abc" + "\nenv SLIM_THEMESDIR=" + "def" + "\nenv FONTCONFIG_FILE=/etc/fonts/fonts.conf \t\t\t\t# !!! cleanup\nenv XKB_BINDIR=" + "foo" + "/bin \t\t\t\t# Needed for the Xkb extension.\nenv LD_LIBRARY_PATH=" + "libX11" + "/lib:" + "libXext" + "/lib:/usr/lib/ # related to xorg-sys-opengl - needed to load libglx for (AI)GLX support (for compiz)\n\n" + "${(if true then ("env XORG_DRI_DRIVER_PATH=" + "nvidiaDrivers" + "/X11R6/lib/modules/drivers/") else (if true then ("env XORG_DRI_DRIVER_PATH=" + "mesa" + "/lib/modules/dri") else ""))}" + " \n\nexec " + "slim" + "/bin/slim\n"); s14 = ("Escaping of ' followed by ': " + "''" + "\nEscaping of $ followed by {: " + "$" + "{\nAnd finally to interpret \\n etc. as in a string: " + "\n" + ", " + "\r" + ", " + "\t" + ".\n"); s15 = (let x = "bla"; in ("foo\n" + "'" + "${x}" + "'\nbar\n")); s16 = ("cut -d " + "$" + "'\\t' -f 1\n"); s17 = ((("ending dollar " + "$") + "$") + "\n"); s18 = " Lines without any indentation effectively disable the indentation\n stripping for the entire string:\n\n cat >$out/foo/data <<EOF\n lasjdöaxnasd\nasdom 12398\nä\"§Æẞ¢«»”alsd\nEOF\n"; s19 = "Empty lines with a bit of whitespace don't affect the indentation calculation:\n\nAnd empty lines with more whitespace will have whitespace in the string:\n \nUnless it's the last line:\n"; s2 = "If the string starts with whitespace\n followed by a newline, it's stripped, but\n that's not the case here. Two spaces are\n stripped because of the \" \" at the start. \n"; s20 = " Indentation stripping\n must not be impressed by\nthe last line not being empty"; s21 = "\t Nor by people\n weirdly mixing tabs\n\tand spaces\n\t"; s3 = "This line is indented\na bit further.\n"; s4 = ("Anti-quotations, like " + "${(if true then "so" else "not so")}" + ", are\nalso allowed.\n"); s5 = (" The \\ is not special here.\n' can be followed by any character except another ', e.g. 'x'.\nLikewise for $, e.g. $$ or $varName.\nBut ' followed by ' is special, as is $ followed by {.\nIf you want them, use anti-quotations: " + "''" + ", " + "\${" + ".\n"); s6 = " Tabs are not interpreted as whitespace (since we can't guess\n what tab settings are intended), so don't use them.\n\tThis line starts with a space and a tab, so only one\n space will be stripped from each line.\n"; s7 = "Also note that if the last line (just before the closing ' ')\nconsists only of whitespace, it's ignored. But here there is\nsome non-whitespace stuff, so the line isn't removed. "; s8 = ("" + "" + "\nThis shows a hacky way to preserve an empty line after the start.\nBut there's no reason to do so: you could just repeat the empty\nline.\n"); s9 = ("" + "" + " Similarly you can force an indentation level,\n in this case to 2 spaces. This works because the anti-quote\n is significant (not whitespace).\n"); in ((((((((((((((((((((s1 + s2) + s3) + s4) + s5) + s6) + s7) + s8) + s9) + s10) + s11) + s12) + s13) + s14) + s15) + s16) + s17) + s18) + s19) + s20) + s21)) (let s1 = "This is an indented multi-line string\nliteral. An amount of whitespace at\nthe start of each line matching the minimum\nindentation of all lines in the string\nliteral together will be removed. Thus,\nin this case four spaces will be\nstripped from each line, even though\n THIS LINE is indented six spaces.\n\nAlso, empty lines don't count in the\ndetermination of the indentation level (the\nprevious empty line has indentation 0, but\nit doesn't matter).\n"; s10 = ""; s11 = ""; s12 = ""; s13 = ("start on network-interfaces\n\nstart script\n\n rm -f /var/run/opengl-driver\n " + "${(if true then "ln -sf 123 /var/run/opengl-driver" else (if true then "ln -sf 456 /var/run/opengl-driver" else ""))}" + "\n\n rm -f /var/log/slim.log\n \nend script\n\nenv SLIM_CFGFILE=" + "abc" + "\nenv SLIM_THEMESDIR=" + "def" + "\nenv FONTCONFIG_FILE=/etc/fonts/fonts.conf \t\t\t\t# !!! cleanup\nenv XKB_BINDIR=" + "foo" + "/bin \t\t\t\t# Needed for the Xkb extension.\nenv LD_LIBRARY_PATH=" + "libX11" + "/lib:" + "libXext" + "/lib:/usr/lib/ # related to xorg-sys-opengl - needed to load libglx for (AI)GLX support (for compiz)\n\n" + "${(if true then ("env XORG_DRI_DRIVER_PATH=" + "nvidiaDrivers" + "/X11R6/lib/modules/drivers/") else (if true then ("env XORG_DRI_DRIVER_PATH=" + "mesa" + "/lib/modules/dri") else ""))}" + " \n\nexec " + "slim" + "/bin/slim\n"); s14 = "Escaping of ' followed by ': ''\nEscaping of $ followed by {: \${\nAnd finally to interpret \\n etc. as in a string: \n, \r, \t.\n"; s15 = (let x = "bla"; in ("foo\n'" + "${x}" + "'\nbar\n")); s16 = "cut -d $'\\t' -f 1\n"; s17 = (("ending dollar $" + "$") + "\n"); s18 = " Lines without any indentation effectively disable the indentation\n stripping for the entire string:\n\n cat >$out/foo/data <<EOF\n lasjdöaxnasd\nasdom 12398\nä\"§Æẞ¢«»”alsd\nEOF\n"; s19 = "Empty lines with a bit of whitespace don't affect the indentation calculation:\n\nAnd empty lines with more whitespace will have whitespace in the string:\n \nUnless it's the last line:\n"; s2 = "If the string starts with whitespace\n followed by a newline, it's stripped, but\n that's not the case here. Two spaces are\n stripped because of the \" \" at the start. \n"; s20 = " Indentation stripping\n must not be impressed by\nthe last line not being empty"; s21 = "\t Nor by people\n weirdly mixing tabs\n\tand spaces\n\t"; s3 = "This line is indented\na bit further.\n"; s4 = ("Anti-quotations, like " + "${(if true then "so" else "not so")}" + ", are\nalso allowed.\n"); s5 = (" The \\ is not special here.\n' can be followed by any character except another ', e.g. 'x'.\nLikewise for $, e.g. $$ or $varName.\nBut ' followed by ' is special, as is $ followed by {.\nIf you want them, use anti-quotations: " + "''" + ", " + "\${" + ".\n"); s6 = " Tabs are not interpreted as whitespace (since we can't guess\n what tab settings are intended), so don't use them.\n\tThis line starts with a space and a tab, so only one\n space will be stripped from each line.\n"; s7 = "Also note that if the last line (just before the closing ' ')\nconsists only of whitespace, it's ignored. But here there is\nsome non-whitespace stuff, so the line isn't removed. "; s8 = ("" + "\nThis shows a hacky way to preserve an empty line after the start.\nBut there's no reason to do so: you could just repeat the empty\nline.\n"); s9 = ("" + " Similarly you can force an indentation level,\n in this case to 2 spaces. This works because the anti-quote\n is significant (not whitespace).\n"); in ((((((((((((((((((((s1 + s2) + s3) + s4) + s5) + s6) + s7) + s8) + s9) + s10) + s11) + s12) + s13) + s14) + s15) + s16) + s17) + s18) + s19) + s20) + s21))