forked from lix-project/lix
libexpr: Rewrite stripIndentation for indented strings
This commit should faithfully reproduce the old behavior down to the
bugs. The new code is a lot more readable, all quirks are well
documented, and it is overall much more maintainable.
Change-Id: I629585918e4f2b7d296b6b8330235cdc90b7bade
This commit is contained in:
parent
765771a355
commit
c852ae60da
|
@ -225,7 +225,8 @@ struct string : _string, seq<
|
||||||
> {};
|
> {};
|
||||||
|
|
||||||
struct _ind_string {
|
struct _ind_string {
|
||||||
template<bool Indented, typename... Inner>
|
struct line_start : semantic, star<one<' '>> {};
|
||||||
|
template<bool CanMerge, typename... Inner>
|
||||||
struct literal : semantic, seq<Inner...> {};
|
struct literal : semantic, seq<Inner...> {};
|
||||||
struct interpolation : semantic, seq<
|
struct interpolation : semantic, seq<
|
||||||
p::string<'$', '{'>, seps,
|
p::string<'$', '{'>, seps,
|
||||||
|
@ -233,19 +234,32 @@ struct _ind_string {
|
||||||
must<one<'}'>>
|
must<one<'}'>>
|
||||||
> {};
|
> {};
|
||||||
struct escape : semantic, must<any> {};
|
struct escape : semantic, must<any> {};
|
||||||
|
/* Marker for non-empty lines */
|
||||||
|
struct has_content : semantic, seq<> {};
|
||||||
};
|
};
|
||||||
struct ind_string : _ind_string, seq<
|
struct ind_string : _ind_string, seq<
|
||||||
TAO_PEGTL_STRING("''"),
|
TAO_PEGTL_STRING("''"),
|
||||||
|
// Strip first line completely if empty
|
||||||
opt<star<one<' '>>, one<'\n'>>,
|
opt<star<one<' '>>, one<'\n'>>,
|
||||||
star<
|
list<
|
||||||
|
seq<
|
||||||
|
// Start a line with some indentation
|
||||||
|
// (this always matches, even on the empty string when there is no indentation, because the match itself is what creates the line)
|
||||||
|
_ind_string::line_start,
|
||||||
|
// The actual line
|
||||||
|
opt<
|
||||||
|
plus<
|
||||||
sor<
|
sor<
|
||||||
_ind_string::literal<
|
_ind_string::literal<
|
||||||
true,
|
true,
|
||||||
plus<
|
plus<
|
||||||
sor<
|
sor<
|
||||||
not_one<'$', '\''>,
|
not_one<'$', '\'', '\n'>,
|
||||||
seq<one<'$'>, not_one<'{', '\''>>,
|
// TODO probably factor this out like the others for performance
|
||||||
seq<one<'\''>, not_one<'\'', '$'>>
|
seq<one<'$'>, not_one<'{', '\'', '\n'>>,
|
||||||
|
seq<one<'$'>, at<one<'\n'>>>,
|
||||||
|
seq<one<'\''>, not_one<'\'', '$', '\n'>>,
|
||||||
|
seq<one<'\''>, at<one<'\n'>>>
|
||||||
>
|
>
|
||||||
>
|
>
|
||||||
>,
|
>,
|
||||||
|
@ -260,7 +274,14 @@ struct ind_string : _ind_string, seq<
|
||||||
seq<one<'\\'>, _ind_string::escape>
|
seq<one<'\\'>, _ind_string::escape>
|
||||||
>
|
>
|
||||||
>
|
>
|
||||||
|
>,
|
||||||
|
_ind_string::has_content
|
||||||
>
|
>
|
||||||
|
>
|
||||||
|
>,
|
||||||
|
// End of line, LF. CR is just ignored and not treated as ending a line
|
||||||
|
// (for the purpose of indentation stripping)
|
||||||
|
_ind_string::literal<true, one<'\n'>>
|
||||||
>,
|
>,
|
||||||
must<TAO_PEGTL_STRING("''")>
|
must<TAO_PEGTL_STRING("''")>
|
||||||
> {};
|
> {};
|
||||||
|
|
|
@ -533,36 +533,48 @@ template<> struct BuildAST<grammar::v1::string> : change_head<StringState> {
|
||||||
struct IndStringState : SubexprState {
|
struct IndStringState : SubexprState {
|
||||||
using SubexprState::SubexprState;
|
using SubexprState::SubexprState;
|
||||||
|
|
||||||
std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> parts;
|
std::vector<IndStringLine> lines;
|
||||||
};
|
};
|
||||||
|
|
||||||
template<bool Indented, typename... Content>
|
template<> struct BuildAST<grammar::v1::ind_string::line_start> {
|
||||||
struct BuildAST<grammar::v1::ind_string::literal<Indented, Content...>> {
|
|
||||||
static void apply(const auto & in, IndStringState & s, State & ps) {
|
static void apply(const auto & in, IndStringState & s, State & ps) {
|
||||||
s.parts.emplace_back(ps.at(in), StringToken{in.string_view(), Indented});
|
s.lines.push_back(IndStringLine { in.string_view(), ps.at(in) });
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
template<bool CanMerge, typename... Content>
|
||||||
|
struct BuildAST<grammar::v1::ind_string::literal<CanMerge, Content...>> {
|
||||||
|
static void apply(const auto & in, IndStringState & s, State & ps) {
|
||||||
|
s.lines.back().parts.emplace_back(ps.at(in), StringToken{ in.string_view(), CanMerge });
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<> struct BuildAST<grammar::v1::ind_string::interpolation> {
|
template<> struct BuildAST<grammar::v1::ind_string::interpolation> {
|
||||||
static void apply(const auto & in, IndStringState & s, State & ps) {
|
static void apply(const auto & in, IndStringState & s, State & ps) {
|
||||||
s.parts.emplace_back(ps.at(in), s->popExprOnly());
|
s.lines.back().parts.emplace_back(ps.at(in), s->popExprOnly());
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
template<> struct BuildAST<grammar::v1::ind_string::escape> {
|
template<> struct BuildAST<grammar::v1::ind_string::escape> {
|
||||||
static void apply(const auto & in, IndStringState & s, State & ps) {
|
static void apply(const auto & in, IndStringState & s, State & ps) {
|
||||||
switch (*in.begin()) {
|
switch (*in.begin()) {
|
||||||
case 'n': s.parts.emplace_back(ps.at(in), StringToken{"\n"}); break;
|
case 'n': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\n"}); break;
|
||||||
case 'r': s.parts.emplace_back(ps.at(in), StringToken{"\r"}); break;
|
case 'r': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\r"}); break;
|
||||||
case 't': s.parts.emplace_back(ps.at(in), StringToken{"\t"}); break;
|
case 't': s.lines.back().parts.emplace_back(ps.at(in), StringToken{"\t"}); break;
|
||||||
default: s.parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break;
|
default: s.lines.back().parts.emplace_back(ps.at(in), StringToken{in.string_view()}); break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
template<> struct BuildAST<grammar::v1::ind_string::has_content> {
|
||||||
|
static void apply(const auto & in, IndStringState & s, State & ps) {
|
||||||
|
s.lines.back().hasContent = true;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
template<> struct BuildAST<grammar::v1::ind_string> : change_head<IndStringState> {
|
template<> struct BuildAST<grammar::v1::ind_string> : change_head<IndStringState> {
|
||||||
static void success(const auto & in, IndStringState & s, ExprState & e, State & ps) {
|
static void success(const auto & in, IndStringState & s, ExprState & e, State & ps) {
|
||||||
e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.parts)));
|
e.exprs.emplace_back(noPos, ps.stripIndentation(ps.at(in), std::move(s.lines)));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
|
@ -9,10 +9,28 @@ namespace nix::parser {
|
||||||
struct StringToken
|
struct StringToken
|
||||||
{
|
{
|
||||||
std::string_view s;
|
std::string_view s;
|
||||||
bool hasIndentation = false;
|
// canMerge is only used to faithfully reproduce the quirks from the old code base.
|
||||||
|
bool canMerge = false;
|
||||||
operator std::string_view() const { return s; }
|
operator std::string_view() const { return s; }
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct IndStringLine {
|
||||||
|
// String containing only the leading whitespace of the line. May be empty.
|
||||||
|
std::string_view indentation;
|
||||||
|
// Position of the line start (before the indentation)
|
||||||
|
PosIdx pos;
|
||||||
|
|
||||||
|
// Whether the line contains anything besides indentation and line break
|
||||||
|
bool hasContent = false;
|
||||||
|
|
||||||
|
std::vector<
|
||||||
|
std::pair<
|
||||||
|
PosIdx,
|
||||||
|
std::variant<std::unique_ptr<Expr>, StringToken>
|
||||||
|
>
|
||||||
|
> parts = {};
|
||||||
|
};
|
||||||
|
|
||||||
struct State
|
struct State
|
||||||
{
|
{
|
||||||
SymbolTable & symbols;
|
SymbolTable & symbols;
|
||||||
|
@ -27,8 +45,7 @@ struct State
|
||||||
void overridesFound(const PosIdx pos);
|
void overridesFound(const PosIdx pos);
|
||||||
void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos);
|
void addAttr(ExprAttrs * attrs, AttrPath && attrPath, std::unique_ptr<Expr> e, const PosIdx pos);
|
||||||
std::unique_ptr<Formals> validateFormals(std::unique_ptr<Formals> formals, PosIdx pos = noPos, Symbol arg = {});
|
std::unique_ptr<Formals> validateFormals(std::unique_ptr<Formals> formals, PosIdx pos = noPos, Symbol arg = {});
|
||||||
std::unique_ptr<Expr> stripIndentation(const PosIdx pos,
|
std::unique_ptr<Expr> stripIndentation(const PosIdx pos, std::vector<IndStringLine> && line);
|
||||||
std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es);
|
|
||||||
|
|
||||||
// lazy positioning means we don't get byte offsets directly, in.position() would work
|
// lazy positioning means we don't get byte offsets directly, in.position() would work
|
||||||
// but also requires line and column (which is expensive)
|
// but also requires line and column (which is expensive)
|
||||||
|
@ -182,98 +199,115 @@ inline std::unique_ptr<Formals> State::validateFormals(std::unique_ptr<Formals>
|
||||||
return formals;
|
return formals;
|
||||||
}
|
}
|
||||||
|
|
||||||
inline std::unique_ptr<Expr> State::stripIndentation(const PosIdx pos,
|
inline std::unique_ptr<Expr> State::stripIndentation(
|
||||||
std::vector<std::pair<PosIdx, std::variant<std::unique_ptr<Expr>, StringToken>>> && es)
|
const PosIdx pos,
|
||||||
|
std::vector<IndStringLine> && lines)
|
||||||
{
|
{
|
||||||
if (es.empty()) return std::make_unique<ExprString>("");
|
/* If the only line is whitespace-only, directly return empty string.
|
||||||
|
* NOTE: This is not merely an optimization, but `compatStripLeadingEmptyString`
|
||||||
|
* later on relies on the string being non-empty in order to work.
|
||||||
|
*/
|
||||||
|
if (lines.size() == 1 && lines.front().parts.empty()) {
|
||||||
|
return std::make_unique<ExprString>("");
|
||||||
|
}
|
||||||
|
|
||||||
|
/* If the last line only contains whitespace, trim it to not cause excessive whitespace.
|
||||||
|
* (Other whitespace-only lines get stripped only of the common indentation, and excess
|
||||||
|
* whitespace becomes part of the string.)
|
||||||
|
*/
|
||||||
|
if (lines.back().parts.empty()) {
|
||||||
|
lines.back().indentation = {};
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Quirk compatibility:
|
||||||
|
*
|
||||||
|
* » nix-instantiate --parse -E $'\'\'${"foo"}\'\''
|
||||||
|
* "foo"
|
||||||
|
* » nix-instantiate --parse -E $'\'\' ${"foo"}\'\''
|
||||||
|
* ("" + "foo")
|
||||||
|
*
|
||||||
|
* Our code always produces the form with the additional "" +, so we'll manually
|
||||||
|
* strip it at the end if necessary.
|
||||||
|
*/
|
||||||
|
const bool compatStripLeadingEmptyString = !lines.empty() && lines[0].indentation.empty();
|
||||||
|
|
||||||
/* Figure out the minimum indentation. Note that by design
|
/* Figure out the minimum indentation. Note that by design
|
||||||
whitespace-only final lines are not taken into account. (So
|
whitespace-only lines are not taken into account. */
|
||||||
the " " in "\n ''" is ignored, but the " " in "\n foo''" is.) */
|
|
||||||
bool atStartOfLine = true; /* = seen only whitespace in the current line */
|
|
||||||
size_t minIndent = 1000000;
|
size_t minIndent = 1000000;
|
||||||
size_t curIndent = 0;
|
for (auto & line : lines) {
|
||||||
for (auto & [i_pos, i] : es) {
|
if (line.hasContent) {
|
||||||
auto * str = std::get_if<StringToken>(&i);
|
minIndent = std::min(minIndent, line.indentation.size());
|
||||||
if (!str || !str->hasIndentation) {
|
|
||||||
/* Anti-quotations and escaped characters end the current start-of-line whitespace. */
|
|
||||||
if (atStartOfLine) {
|
|
||||||
atStartOfLine = false;
|
|
||||||
if (curIndent < minIndent) minIndent = curIndent;
|
|
||||||
}
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
for (size_t j = 0; j < str->s.size(); ++j) {
|
|
||||||
if (atStartOfLine) {
|
|
||||||
if (str->s[j] == ' ')
|
|
||||||
curIndent++;
|
|
||||||
else if (str->s[j] == '\n') {
|
|
||||||
/* Empty line, doesn't influence minimum
|
|
||||||
indentation. */
|
|
||||||
curIndent = 0;
|
|
||||||
} else {
|
|
||||||
atStartOfLine = false;
|
|
||||||
if (curIndent < minIndent) minIndent = curIndent;
|
|
||||||
}
|
|
||||||
} else if (str->s[j] == '\n') {
|
|
||||||
atStartOfLine = true;
|
|
||||||
curIndent = 0;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Strip spaces from each line. */
|
/* Strip spaces from each line. */
|
||||||
std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> es2;
|
for (auto & line : lines) {
|
||||||
atStartOfLine = true;
|
line.indentation.remove_prefix(std::min(minIndent, line.indentation.size()));
|
||||||
size_t curDropped = 0;
|
}
|
||||||
size_t n = es.size();
|
|
||||||
auto i = es.begin();
|
/* Concat the parts together again */
|
||||||
const auto trimExpr = [&] (std::unique_ptr<Expr> e) {
|
|
||||||
atStartOfLine = false;
|
/* Note that, in order to fully reproduce the original code, we deliberately don't concat all adjacent string parts.
|
||||||
curDropped = 0;
|
* This means that any escapes will result in string concatenation even if this is unnecessary.
|
||||||
es2.emplace_back(i->first, std::move(e));
|
*/
|
||||||
|
std::vector<std::pair<PosIdx, std::unique_ptr<Expr>>> parts;
|
||||||
|
/* Accumulator for merging intermediates */
|
||||||
|
PosIdx merged_pos;
|
||||||
|
std::string merged = "";
|
||||||
|
bool has_merged = false;
|
||||||
|
|
||||||
|
auto push_merged = [&] (PosIdx i_pos, std::string_view str) {
|
||||||
|
merged += str;
|
||||||
|
if (!has_merged) {
|
||||||
|
has_merged = true;
|
||||||
|
merged_pos = i_pos;
|
||||||
|
}
|
||||||
};
|
};
|
||||||
const auto trimString = [&] (const StringToken t) {
|
|
||||||
std::string s2;
|
|
||||||
for (size_t j = 0; j < t.s.size(); ++j) {
|
|
||||||
if (atStartOfLine) {
|
|
||||||
if (t.s[j] == ' ') {
|
|
||||||
if (curDropped++ >= minIndent)
|
|
||||||
s2 += t.s[j];
|
|
||||||
}
|
|
||||||
else if (t.s[j] == '\n') {
|
|
||||||
curDropped = 0;
|
|
||||||
s2 += t.s[j];
|
|
||||||
} else {
|
|
||||||
atStartOfLine = false;
|
|
||||||
curDropped = 0;
|
|
||||||
s2 += t.s[j];
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
s2 += t.s[j];
|
|
||||||
if (t.s[j] == '\n') atStartOfLine = true;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Remove the last line if it is empty and consists only of
|
auto flush_merged = [&] () {
|
||||||
spaces. */
|
if (has_merged) {
|
||||||
if (n == 1) {
|
parts.emplace_back(merged_pos, std::make_unique<ExprString>(std::string(merged)));
|
||||||
std::string::size_type p = s2.find_last_of('\n');
|
merged.clear();
|
||||||
if (p != std::string::npos && s2.find_first_not_of(' ', p + 1) == std::string::npos)
|
has_merged = false;
|
||||||
s2 = std::string(s2, 0, p + 1);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
es2.emplace_back(i->first, std::make_unique<ExprString>(std::move(s2)));
|
|
||||||
};
|
};
|
||||||
for (; i != es.end(); ++i, --n) {
|
|
||||||
std::visit(overloaded { trimExpr, trimString }, std::move(i->second));
|
for (auto && [li, line] : enumerate(lines)) {
|
||||||
|
/* Always merge indentation, except for the first line when compatStripLeadingEmptyString is set (see above) */
|
||||||
|
if (!compatStripLeadingEmptyString || li != 0) {
|
||||||
|
push_merged(line.pos, line.indentation);
|
||||||
}
|
}
|
||||||
|
|
||||||
/* If this is a single string, then don't do a concatenation. */
|
for (auto & val : line.parts) {
|
||||||
if (es2.size() == 1 && dynamic_cast<ExprString *>(es2[0].second.get())) {
|
auto &[i_pos, item] = val;
|
||||||
return std::move(es2[0].second);
|
|
||||||
|
std::visit(overloaded{
|
||||||
|
[&](StringToken str) {
|
||||||
|
if (str.canMerge) {
|
||||||
|
push_merged(i_pos, str.s);
|
||||||
|
} else {
|
||||||
|
flush_merged();
|
||||||
|
parts.emplace_back(i_pos, std::make_unique<ExprString>(std::string(str.s)));
|
||||||
}
|
}
|
||||||
return std::make_unique<ExprConcatStrings>(pos, true, std::move(es2));
|
},
|
||||||
|
[&](std::unique_ptr<Expr> expr) {
|
||||||
|
flush_merged();
|
||||||
|
parts.emplace_back(i_pos, std::move(expr));
|
||||||
|
},
|
||||||
|
}, std::move(item));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
flush_merged();
|
||||||
|
|
||||||
|
/* If this is a single string, then don't do a concatenation.
|
||||||
|
* (If it's a single expression, still do the ConcatStrings to properly force it being a string.)
|
||||||
|
*/
|
||||||
|
if (parts.size() == 1 && dynamic_cast<ExprString *>(parts[0].second.get())) {
|
||||||
|
return std::move(parts[0].second);
|
||||||
|
}
|
||||||
|
return std::make_unique<ExprConcatStrings>(pos, true, std::move(parts));
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue