libexpr: Replace regex engine with boost::regex

This avoids C++'s standard library regexes, which aren't the same
across platforms, and have many other issues, like using stack
so much that they stack overflow when processing a lot of data.

To avoid backwards and forward compatibility issues, regexes are
processed using a function converting libstdc++ regexes into Boost
regexes, escaping characters that Boost needs to have escaped, and
rejecting features that Boost has and libstdc++ doesn't.

Related context:

- Original failed attempt to use `boost::regex` in CppNix, failed due to
  boost icu dependency being large (disabling ICU is no longer necessary
  because linking ICU requires using a different header file,
  `boost/regex/icu.hpp`): https://github.com/NixOS/nix/pull/3826

- An attempt to use PCRE, rejected due to providing less backwards
  compatibility with `std::regex` than `boost::regex`:
  https://github.com/NixOS/nix/pull/7336

- Second attempt to use `boost::regex`, failed due to `}` regex failing
  to compile (dealt with by writing a wrapper that parses a regular
  expression and escapes `}` characters):
  https://github.com/NixOS/nix/pull/7762

Closes #34. Closes #476.

Change-Id: Ieb0eb9e270a93e4c7eed412ba4f9f96cb00a5fa4
This commit is contained in:
sugar🍬🍭🏳️‍⚧️ 2024-08-20 00:21:59 +02:00
parent e727dbc3a3
commit 447212fa65
4 changed files with 384 additions and 12 deletions

View file

@ -129,6 +129,10 @@ roberth:
display_name: Robert Hensing
github: roberth
sugar:
forgejo: sugar
github: sugar700
thufschmitt:
display_name: Théophane Hufschmitt
github: thufschmitt

View file

@ -0,0 +1,37 @@
---
synopsis: Replace regex engine with boost::regex
issues: [fj#34, fj#476]
cls: [1821]
category: Fixes
credits: [sugar]
---
Previously, the C++ standard regex expression library was used, the
behaviour of which varied depending on the platform. This has been
replaced with the Boost regex library, which works identically across
platforms.
The visible behaviour of the regex functions doesn't change. While
the new library has more features, Lix will reject regular expressions
using them.
This also fixes regex matching reporting stack overflow when matching
on too much data.
Before:
nix-repl> builtins.match ".*" (
builtins.concatStringsSep "" (
builtins.genList (_: "a") 1000000
)
)
error: stack overflow (possible infinite recursion)
After:
nix-repl> builtins.match ".*" (
builtins.concatStringsSep "" (
builtins.genList (_: "a") 1000000
)
)
[ ]

View file

@ -17,6 +17,7 @@
#include "fetch-to-store.hh"
#include <boost/container/small_vector.hpp>
#include <boost/regex.hpp>
#include <nlohmann/json.hpp>
#include <sys/types.h>
@ -26,7 +27,6 @@
#include <algorithm>
#include <cstring>
#include <sstream>
#include <regex>
#include <dlfcn.h>
#include <cmath>
@ -3878,19 +3878,205 @@ static RegisterPrimOp primop_hashString({
.fun = prim_hashString,
});
enum class RegexParseState {
// Anything outside of those
Regular,
// Bounded repeats, `}` shouldn't be escaped in those
//
// a{2,5}b
// ^^^^
BoundedRepeat,
// Backslashes, as C++ regexes only support escaping what needs to be
// escaped and nothing else
//
// a\nb
// ^
Backslash,
// Initial part of character set, as `[]]` is a regex for `]` character
//
// [abc] [^abc]
// ^ ^
CharacterSetStart,
// Initial part of negated character set, as `[^]]` is a regex for
// anything but `]` character
//
// [^abc]
// ^
NegatedCharacterSetStart,
// Character set after its first character
//
// [abc]
// ^^
CharacterSetMiddle,
// Parser state after seeing [, assumes the input is character extension
// after seeing `:`, `.`, or `=`
//
// [a[:alpha:]b]
// ^
PossibleCharacterSetExtension,
// Within character extension
//
// [a[:alpha:]b]
// ^^^^^^^
CharacterSetExtension,
// Within equivalence class expression
//
// [[=a=]]
// ^
EquivalenceClassExpression,
};
static boost::regex compile_regex(std::string_view re) {
// Make sure that Boost supports everything that C++ regexes do,
// and no non-standard extensions are available.
//
// In particular, C++ regexes only support escaping regex metacharacters.
// They don't support other escape sequences like `\n` and `\d`.
// Additionally, within character groups, it's not possible to escape
// anything, backslash is a literal character in those. `[\]` in regexes
// is a weird way to write `\\`.
std::string boost_re;
boost_re.reserve(re.size());
auto state = RegexParseState::Regular;
for (char c : re) {
switch (state) {
case RegexParseState::Regular:
switch (c) {
// Boost regex engine supports more escape sequences than C++ regexes,
// and as such it's necessary to ensure only escapes supported by C++
// are allowed.
case '\\':
state = RegexParseState::Backslash;
break;
case '[':
state = RegexParseState::CharacterSetStart;
break;
case '{':
state = RegexParseState::BoundedRepeat;
break;
// Boost doesn't permit unescaped `}`, escape it outside of
// bounded repeats.
case '}':
boost_re.push_back('\\');
break;
default:
break;
}
break;
case RegexParseState::BoundedRepeat:
if (c == '}') {
state = RegexParseState::Regular;
}
break;
case RegexParseState::Backslash:
switch (c) {
case '.': case '|': case '*': case '?': case '+': case '{':
case '^': case '$': case '[': case '(': case ')': case '\\':
state = RegexParseState::Regular;
break;
default:
throw boost::regex_error(
boost::regex_constants::error_type::error_escape
);
}
break;
case RegexParseState::CharacterSetStart:
if (c == '^') {
state = RegexParseState::NegatedCharacterSetStart;
break;
}
[[fallthrough]];
case RegexParseState::NegatedCharacterSetStart:
if (c == ']') {
state = RegexParseState::CharacterSetMiddle;
break;
}
[[fallthrough]];
case RegexParseState::CharacterSetMiddle:
middle:
switch (c) {
case '[':
state = RegexParseState::PossibleCharacterSetExtension;
break;
case '\\':
// Backslashes aren't supported in character groups, escape them
boost_re.push_back('\\');
state = RegexParseState::CharacterSetMiddle;
break;
case ']':
state = RegexParseState::Regular;
break;
default:
state = RegexParseState::CharacterSetMiddle;
break;
}
break;
case RegexParseState::PossibleCharacterSetExtension:
switch (c) {
case ':': case '.':
state = RegexParseState::CharacterSetExtension;
break;
case '=':
state = RegexParseState::EquivalenceClassExpression;
break;
default:
goto middle;
}
break;
case RegexParseState::CharacterSetExtension:
if (c == ']') {
state = RegexParseState::CharacterSetMiddle;
}
break;
case RegexParseState::EquivalenceClassExpression:
// C++'s regex parser only supports equivalence classes for
// alphabetic characters
if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
throw boost::regex_error(
boost::regex_constants::error_type::error_brack
);
}
// After verifying first character, this can be parsed as
// a regular character set extension, Boost will notice issues
// after that.
state = RegexParseState::CharacterSetExtension;
break;
}
boost_re.push_back(c);
}
return boost::regex(boost_re, boost::regex::extended);
}
struct RegexCache
{
// TODO use C++20 transparent comparison when available
std::unordered_map<std::string_view, std::regex> cache;
std::unordered_map<std::string_view, boost::regex> cache;
std::list<std::string> keys;
std::regex get(std::string_view re)
boost::regex get(std::string_view re)
{
auto it = cache.find(re);
if (it != cache.end())
return it->second;
keys.emplace_back(re);
return cache.emplace(keys.back(), std::regex(keys.back(), std::regex::extended)).first->second;
return cache.emplace(keys.back(), compile_regex(re)).first->second;
}
};
@ -3910,8 +4096,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
NixStringContext context;
const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.match");
std::cmatch match;
if (!std::regex_match(str.begin(), str.end(), match, regex)) {
boost::cmatch match;
if (!boost::regex_match(str.begin(), str.end(), match, regex)) {
v.mkNull();
return;
}
@ -3926,8 +4112,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
(v.listElems()[i] = state.allocValue())->mkString(match[i + 1].str());
}
} catch (std::regex_error & e) {
if (e.code() == std::regex_constants::error_space) {
} catch (boost::regex_error & e) {
if (e.code() == boost::regex_constants::error_space) {
// limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
state.error<EvalError>("memory limit exceeded by regular expression '%s'", re)
.atPos(pos)
@ -3988,8 +4174,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)
NixStringContext context;
const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.split");
auto begin = std::cregex_iterator(str.begin(), str.end(), regex);
auto end = std::cregex_iterator();
auto begin = boost::cregex_iterator(str.begin(), str.end(), regex);
auto end = boost::cregex_iterator();
// Any matches results are surrounded by non-matching results.
const size_t len = std::distance(begin, end);
@ -4028,8 +4214,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)
assert(idx == 2 * len + 1);
} catch (std::regex_error & e) {
if (e.code() == std::regex_constants::error_space) {
} catch (boost::regex_error & e) {
if (e.code() == boost::regex_constants::error_space) {
// limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
state.error<EvalError>("memory limit exceeded by regular expression '%s'", re)
.atPos(pos)

View file

@ -816,6 +816,151 @@ namespace nix {
ASSERT_THAT(*v.listElems()[0], IsStringEq("FOO"));
}
TEST_F(PrimOpTest, match5) {
auto v = eval("builtins.match ''}'' ''}''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match6) {
auto v = eval("builtins.match '']'' '']''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match7) {
auto v = eval("builtins.match ''[[]'' ''[''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match8) {
auto v = eval("builtins.match ''[]]'' '']''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match9) {
auto v = eval("builtins.match ''[[=a=]]'' ''A''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match10) {
auto v = eval("builtins.match ''[[.right-curly-bracket.]]'' ''}''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match11) {
auto v = eval("builtins.match ''[[.tilde.]]'' ''~''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match12) {
auto v = eval("builtins.match ''[\\n]'' ''\\''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match13) {
auto v = eval("builtins.match ''[\\[]'' ''\\''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match14) {
auto v = eval("builtins.match ''[\\]]'' ''\\]''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match15) {
auto v = eval("builtins.match ''[\\-z]'' ''y''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match16) {
auto v = eval("builtins.match ''[\\\\]'' ''\\''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match17) {
auto v = eval("builtins.match ''[\\]'' ''\\''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match18) {
auto v = eval("builtins.match ''[\\]]'' ''\\]''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match19) {
auto v = eval("builtins.match ''.*[Ω].*'' ''β''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match20) {
auto v = eval("builtins.match ''[^]]'' ''a''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match21) {
auto v = eval("builtins.match ''[[[:alpha:]]'' ''[''");
ASSERT_THAT(v, IsListOfSize(0));
}
TEST_F(PrimOpTest, match_unsupported_syntax1) {
ASSERT_THROW(eval("builtins.match ''{'' ''{''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax2) {
ASSERT_THROW(eval("builtins.match ''(a)\\1'' ''aa''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax3) {
ASSERT_THROW(eval("builtins.match ''\\}'' ''}''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax4) {
ASSERT_THROW(eval("builtins.match ''\\]'' '']''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax5) {
ASSERT_THROW(eval("builtins.match ''\\x41'' ''A''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax6) {
ASSERT_THROW(eval("builtins.match ''\\n'' \"\\n\""), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax7) {
ASSERT_THROW(eval("builtins.match ''\\d'' ''1''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax8) {
ASSERT_THROW(eval("builtins.match ''\\b1'' ''1''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax9) {
ASSERT_THROW(eval("builtins.match ''\\A1'' ''1''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax10) {
ASSERT_THROW(eval("builtins.match ''(?:1)'' ''1''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax11) {
ASSERT_THROW(eval("builtins.match ''[b-a]'' ''b''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax12) {
ASSERT_THROW(eval("builtins.match ''[[:alpha:]-a]'' ''b''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax13) {
ASSERT_THROW(eval("builtins.match ''[[=1=]]'' ''1''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax14) {
ASSERT_THROW(eval("builtins.match ''[[=]=]]'' '']''"), EvalError);
}
TEST_F(PrimOpTest, match_unsupported_syntax15) {
ASSERT_THROW(eval("builtins.match ''[a-b-c]'' ''b''"), EvalError);
}
TEST_F(PrimOpTest, attrNames) {
auto v = eval("builtins.attrNames { x = 1; y = 2; z = 3; a = 2; }");
ASSERT_THAT(v, IsListOfSize(4));