From e5de1d13c493f80ff6cd21c51f77a4ed10088ea2 Mon Sep 17 00:00:00 2001 From: piegames Date: Tue, 15 Oct 2024 21:48:15 +0200 Subject: [PATCH] libexpr: Optimize complex indented strings The old behavior results in lots of concatenations happening for no good reason and is an artifact of the technical limitations of the old parser (combined with some lack of care for such details). Change-Id: I0d78d6220ca6aeaa10bc437e48e08bf7922e0bb3 --- src/libexpr/parser/grammar.hh | 13 +++-- src/libexpr/parser/parser-impl1.inc.cc | 14 ++--- src/libexpr/parser/state.hh | 52 +++---------------- .../functional/lang/parse-okay-ind-string.exp | 2 +- 4 files changed, 22 insertions(+), 59 deletions(-) diff --git a/src/libexpr/parser/grammar.hh b/src/libexpr/parser/grammar.hh index 92721fa20..701b40505 100644 --- a/src/libexpr/parser/grammar.hh +++ b/src/libexpr/parser/grammar.hh @@ -226,7 +226,7 @@ struct string : _string, seq< struct _ind_string { struct line_start : semantic, star> {}; - template + template struct literal : semantic, seq {}; struct interpolation : semantic, seq< p::string<'$', '{'>, seps, @@ -251,7 +251,6 @@ struct ind_string : _ind_string, seq< plus< sor< _ind_string::literal< - true, plus< sor< not_one<'$', '\'', '\n'>, @@ -264,13 +263,13 @@ struct ind_string : _ind_string, seq< > >, _ind_string::interpolation, - _ind_string::literal>, - _ind_string::literal, not_at>>, - seq, _ind_string::literal>>, + _ind_string::literal>, + _ind_string::literal, not_at>>, + seq, _ind_string::literal>>, seq< p::string<'\'', '\''>, sor< - _ind_string::literal>, + _ind_string::literal>, seq, _ind_string::escape> > > @@ -281,7 +280,7 @@ struct ind_string : _ind_string, seq< >, // End of line, LF. CR is just ignored and not treated as ending a line // (for the purpose of indentation stripping) - _ind_string::literal> + _ind_string::literal> >, must > {}; diff --git a/src/libexpr/parser/parser-impl1.inc.cc b/src/libexpr/parser/parser-impl1.inc.cc index c65fb3ddc..5836ab752 100644 --- a/src/libexpr/parser/parser-impl1.inc.cc +++ b/src/libexpr/parser/parser-impl1.inc.cc @@ -542,10 +542,10 @@ template<> struct BuildAST { } }; -template -struct BuildAST> { +template +struct BuildAST> { static void apply(const auto & in, IndStringState & s, State & ps) { - s.lines.back().parts.emplace_back(ps.at(in), StringToken{ in.string_view(), CanMerge }); + s.lines.back().parts.emplace_back(ps.at(in), in.string_view()); } }; @@ -558,10 +558,10 @@ template<> struct BuildAST { template<> struct BuildAST { static void apply(const auto & in, IndStringState & s, State & ps) { switch (*in.begin()) { - case 'n': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\n"}); break; - case 'r': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\r"}); break; - case 't': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\t"}); break; - default: s.lines.back().parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break; + case 'n': s.lines.back().parts.emplace_back(ps.at(in), "\n"); break; + case 'r': s.lines.back().parts.emplace_back(ps.at(in), "\r"); break; + case 't': s.lines.back().parts.emplace_back(ps.at(in), "\t"); break; + default: s.lines.back().parts.emplace_back(ps.at(in), in.string_view()); break; } } }; diff --git a/src/libexpr/parser/state.hh b/src/libexpr/parser/state.hh index 62bf08ae7..b969f73e4 100644 --- a/src/libexpr/parser/state.hh +++ b/src/libexpr/parser/state.hh @@ -6,14 +6,6 @@ namespace nix::parser { -struct StringToken -{ - std::string_view s; - // canMerge is only used to faithfully reproduce the quirks from the old code base. - bool canMerge = false; - operator std::string_view() const { return s; } -}; - struct IndStringLine { // String containing only the leading whitespace of the line. May be empty. std::string_view indentation; @@ -26,7 +18,7 @@ struct IndStringLine { std::vector< std::pair< PosIdx, - std::variant, StringToken> + std::variant, std::string_view> > > parts = {}; }; @@ -204,8 +196,7 @@ inline std::unique_ptr State::stripIndentation( std::vector && lines) { /* If the only line is whitespace-only, directly return empty string. - * NOTE: This is not merely an optimization, but `compatStripLeadingEmptyString` - * later on relies on the string not being empty for working. + * The rest of the code relies on the final string not being empty. */ if (lines.size() == 1 && lines.front().parts.empty()) { return std::make_unique(""); @@ -219,19 +210,6 @@ inline std::unique_ptr State::stripIndentation( lines.back().indentation = {}; } - /* - * Quirk compatibility: - * - * » nix-instantiate --parse -E $'\'\'${"foo"}\'\'' - * "foo" - * » nix-instantiate --parse -E $'\'\' ${"foo"}\'\'' - * ("" + "foo") - * - * Our code always produces the form with the additional "" +, so we'll manually - * strip it at the end if necessary. - */ - const bool compatStripLeadingEmptyString = !lines.empty() && lines[0].indentation.empty(); - /* Figure out the minimum indentation. Note that by design whitespace-only lines are not taken into account. */ size_t minIndent = 1000000; @@ -248,48 +226,34 @@ inline std::unique_ptr State::stripIndentation( /* Concat the parts together again */ - /* Note that we don't concat all adjacent string parts to fully reproduce the original code. - * This means that any escapes will result in string concatenation even if this is unnecessary. - */ std::vector>> parts; /* Accumulator for merging intermediates */ PosIdx merged_pos; std::string merged = ""; - bool has_merged = false; auto push_merged = [&] (PosIdx i_pos, std::string_view str) { - merged += str; - if (!has_merged) { - has_merged = true; + if (merged.empty()) { merged_pos = i_pos; } + merged += str; }; auto flush_merged = [&] () { - if (has_merged) { + if (!merged.empty()) { parts.emplace_back(merged_pos, std::make_unique(std::string(merged))); merged.clear(); - has_merged = false; } }; for (auto && [li, line] : enumerate(lines)) { - /* Always merge indentation, except for the first line when compatStripLeadingEmptyString is set (see above) */ - if (!compatStripLeadingEmptyString || li != 0) { - push_merged(line.pos, line.indentation); - } + push_merged(line.pos, line.indentation); for (auto & val : line.parts) { auto &[i_pos, item] = val; std::visit(overloaded{ - [&](StringToken str) { - if (str.canMerge) { - push_merged(i_pos, str.s); - } else { - flush_merged(); - parts.emplace_back(i_pos, std::make_unique(std::string(str.s))); - } + [&](std::string_view str) { + push_merged(i_pos, str); }, [&](std::unique_ptr expr) { flush_merged(); diff --git a/tests/functional/lang/parse-okay-ind-string.exp b/tests/functional/lang/parse-okay-ind-string.exp index d9154ab60..570785ee2 100644 --- a/tests/functional/lang/parse-okay-ind-string.exp +++ b/tests/functional/lang/parse-okay-ind-string.exp @@ -1 +1 @@ -(let s1 = "This is an indented multi-line string\nliteral. An amount of whitespace at\nthe start of each line matching the minimum\nindentation of all lines in the string\nliteral together will be removed. Thus,\nin this case four spaces will be\nstripped from each line, even though\n THIS LINE is indented six spaces.\n\nAlso, empty lines don't count in the\ndetermination of the indentation level (the\nprevious empty line has indentation 0, but\nit doesn't matter).\n"; s10 = ""; s11 = ""; s12 = ""; s13 = ("start on network-interfaces\n\nstart script\n\n rm -f /var/run/opengl-driver\n " + "${(if true then "ln -sf 123 /var/run/opengl-driver" else (if true then "ln -sf 456 /var/run/opengl-driver" else ""))}" + "\n\n rm -f /var/log/slim.log\n \nend script\n\nenv SLIM_CFGFILE=" + "abc" + "\nenv SLIM_THEMESDIR=" + "def" + "\nenv FONTCONFIG_FILE=/etc/fonts/fonts.conf \t\t\t\t# !!! cleanup\nenv XKB_BINDIR=" + "foo" + "/bin \t\t\t\t# Needed for the Xkb extension.\nenv LD_LIBRARY_PATH=" + "libX11" + "/lib:" + "libXext" + "/lib:/usr/lib/ # related to xorg-sys-opengl - needed to load libglx for (AI)GLX support (for compiz)\n\n" + "${(if true then ("env XORG_DRI_DRIVER_PATH=" + "nvidiaDrivers" + "/X11R6/lib/modules/drivers/") else (if true then ("env XORG_DRI_DRIVER_PATH=" + "mesa" + "/lib/modules/dri") else ""))}" + " \n\nexec " + "slim" + "/bin/slim\n"); s14 = ("Escaping of ' followed by ': " + "''" + "\nEscaping of $ followed by {: " + "$" + "{\nAnd finally to interpret \\n etc. as in a string: " + "\n" + ", " + "\r" + ", " + "\t" + ".\n"); s15 = (let x = "bla"; in ("foo\n" + "'" + "${x}" + "'\nbar\n")); s16 = ("cut -d " + "$" + "'\\t' -f 1\n"); s17 = ((("ending dollar " + "$") + "$") + "\n"); s18 = " Lines without any indentation effectively disable the indentation\n stripping for the entire string:\n\n cat >$out/foo/data <