From 447212fa65a80180150b265411924cc638a2c52c Mon Sep 17 00:00:00 2001 From: sugar Date: Tue, 20 Aug 2024 00:21:59 +0200 Subject: [PATCH] libexpr: Replace regex engine with boost::regex This avoids C++'s standard library regexes, which aren't the same across platforms, and have many other issues, like using stack so much that they stack overflow when processing a lot of data. To avoid backwards and forward compatibility issues, regexes are processed using a function converting libstdc++ regexes into Boost regexes, escaping characters that Boost needs to have escaped, and rejecting features that Boost has and libstdc++ doesn't. Related context: - Original failed attempt to use `boost::regex` in CppNix, failed due to boost icu dependency being large (disabling ICU is no longer necessary because linking ICU requires using a different header file, `boost/regex/icu.hpp`): https://github.com/NixOS/nix/pull/3826 - An attempt to use PCRE, rejected due to providing less backwards compatibility with `std::regex` than `boost::regex`: https://github.com/NixOS/nix/pull/7336 - Second attempt to use `boost::regex`, failed due to `}` regex failing to compile (dealt with by writing a wrapper that parses a regular expression and escapes `}` characters): https://github.com/NixOS/nix/pull/7762 Closes #34. Closes #476. Change-Id: Ieb0eb9e270a93e4c7eed412ba4f9f96cb00a5fa4 --- doc/manual/change-authors.yml | 4 + doc/manual/rl-next/boost-regex.md | 37 ++++++ src/libexpr/primops.cc | 210 ++++++++++++++++++++++++++++-- tests/unit/libexpr/primops.cc | 145 +++++++++++++++++++++ 4 files changed, 384 insertions(+), 12 deletions(-) create mode 100644 doc/manual/rl-next/boost-regex.md diff --git a/doc/manual/change-authors.yml b/doc/manual/change-authors.yml index e18abada1..d9303a747 100644 --- a/doc/manual/change-authors.yml +++ b/doc/manual/change-authors.yml @@ -129,6 +129,10 @@ roberth: display_name: Robert Hensing github: roberth +sugar: + forgejo: sugar + github: sugar700 + thufschmitt: display_name: Théophane Hufschmitt github: thufschmitt diff --git a/doc/manual/rl-next/boost-regex.md b/doc/manual/rl-next/boost-regex.md new file mode 100644 index 000000000..c541434d0 --- /dev/null +++ b/doc/manual/rl-next/boost-regex.md @@ -0,0 +1,37 @@ +--- +synopsis: Replace regex engine with boost::regex +issues: [fj#34, fj#476] +cls: [1821] +category: Fixes +credits: [sugar] +--- + +Previously, the C++ standard regex expression library was used, the +behaviour of which varied depending on the platform. This has been +replaced with the Boost regex library, which works identically across +platforms. + +The visible behaviour of the regex functions doesn't change. While +the new library has more features, Lix will reject regular expressions +using them. + +This also fixes regex matching reporting stack overflow when matching +on too much data. + +Before: + + nix-repl> builtins.match ".*" ( + builtins.concatStringsSep "" ( + builtins.genList (_: "a") 1000000 + ) + ) + error: stack overflow (possible infinite recursion) + +After: + + nix-repl> builtins.match ".*" ( + builtins.concatStringsSep "" ( + builtins.genList (_: "a") 1000000 + ) + ) + [ ] diff --git a/src/libexpr/primops.cc b/src/libexpr/primops.cc index dab96d6d4..d6618df2a 100644 --- a/src/libexpr/primops.cc +++ b/src/libexpr/primops.cc @@ -17,6 +17,7 @@ #include "fetch-to-store.hh" #include +#include #include #include @@ -26,7 +27,6 @@ #include #include #include -#include #include #include @@ -3878,19 +3878,205 @@ static RegisterPrimOp primop_hashString({ .fun = prim_hashString, }); +enum class RegexParseState { + // Anything outside of those + Regular, + + // Bounded repeats, `}` shouldn't be escaped in those + // + // a{2,5}b + // ^^^^ + BoundedRepeat, + + // Backslashes, as C++ regexes only support escaping what needs to be + // escaped and nothing else + // + // a\nb + // ^ + Backslash, + + // Initial part of character set, as `[]]` is a regex for `]` character + // + // [abc] [^abc] + // ^ ^ + CharacterSetStart, + + // Initial part of negated character set, as `[^]]` is a regex for + // anything but `]` character + // + // [^abc] + // ^ + NegatedCharacterSetStart, + + // Character set after its first character + // + // [abc] + // ^^ + CharacterSetMiddle, + + // Parser state after seeing [, assumes the input is character extension + // after seeing `:`, `.`, or `=` + // + // [a[:alpha:]b] + // ^ + PossibleCharacterSetExtension, + + // Within character extension + // + // [a[:alpha:]b] + // ^^^^^^^ + CharacterSetExtension, + + // Within equivalence class expression + // + // [[=a=]] + // ^ + EquivalenceClassExpression, +}; + +static boost::regex compile_regex(std::string_view re) { + // Make sure that Boost supports everything that C++ regexes do, + // and no non-standard extensions are available. + // + // In particular, C++ regexes only support escaping regex metacharacters. + // They don't support other escape sequences like `\n` and `\d`. + // Additionally, within character groups, it's not possible to escape + // anything, backslash is a literal character in those. `[\]` in regexes + // is a weird way to write `\\`. + std::string boost_re; + boost_re.reserve(re.size()); + auto state = RegexParseState::Regular; + for (char c : re) { + switch (state) { + case RegexParseState::Regular: + switch (c) { + // Boost regex engine supports more escape sequences than C++ regexes, + // and as such it's necessary to ensure only escapes supported by C++ + // are allowed. + case '\\': + state = RegexParseState::Backslash; + break; + case '[': + state = RegexParseState::CharacterSetStart; + break; + case '{': + state = RegexParseState::BoundedRepeat; + break; + // Boost doesn't permit unescaped `}`, escape it outside of + // bounded repeats. + case '}': + boost_re.push_back('\\'); + break; + default: + break; + } + break; + + case RegexParseState::BoundedRepeat: + if (c == '}') { + state = RegexParseState::Regular; + } + break; + + case RegexParseState::Backslash: + switch (c) { + case '.': case '|': case '*': case '?': case '+': case '{': + case '^': case '$': case '[': case '(': case ')': case '\\': + state = RegexParseState::Regular; + break; + default: + throw boost::regex_error( + boost::regex_constants::error_type::error_escape + ); + } + break; + + case RegexParseState::CharacterSetStart: + if (c == '^') { + state = RegexParseState::NegatedCharacterSetStart; + break; + } + [[fallthrough]]; + + case RegexParseState::NegatedCharacterSetStart: + if (c == ']') { + state = RegexParseState::CharacterSetMiddle; + break; + } + [[fallthrough]]; + + case RegexParseState::CharacterSetMiddle: + middle: + switch (c) { + case '[': + state = RegexParseState::PossibleCharacterSetExtension; + break; + case '\\': + // Backslashes aren't supported in character groups, escape them + boost_re.push_back('\\'); + state = RegexParseState::CharacterSetMiddle; + break; + case ']': + state = RegexParseState::Regular; + break; + default: + state = RegexParseState::CharacterSetMiddle; + break; + } + break; + + case RegexParseState::PossibleCharacterSetExtension: + switch (c) { + case ':': case '.': + state = RegexParseState::CharacterSetExtension; + break; + case '=': + state = RegexParseState::EquivalenceClassExpression; + break; + default: + goto middle; + } + break; + + case RegexParseState::CharacterSetExtension: + if (c == ']') { + state = RegexParseState::CharacterSetMiddle; + } + break; + + case RegexParseState::EquivalenceClassExpression: + // C++'s regex parser only supports equivalence classes for + // alphabetic characters + if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) { + throw boost::regex_error( + boost::regex_constants::error_type::error_brack + ); + } + // After verifying first character, this can be parsed as + // a regular character set extension, Boost will notice issues + // after that. + state = RegexParseState::CharacterSetExtension; + break; + } + + boost_re.push_back(c); + } + return boost::regex(boost_re, boost::regex::extended); +} + struct RegexCache { // TODO use C++20 transparent comparison when available - std::unordered_map cache; + std::unordered_map cache; std::list keys; - std::regex get(std::string_view re) + boost::regex get(std::string_view re) { auto it = cache.find(re); if (it != cache.end()) return it->second; keys.emplace_back(re); - return cache.emplace(keys.back(), std::regex(keys.back(), std::regex::extended)).first->second; + return cache.emplace(keys.back(), compile_regex(re)).first->second; } }; @@ -3910,8 +4096,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v) NixStringContext context; const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.match"); - std::cmatch match; - if (!std::regex_match(str.begin(), str.end(), match, regex)) { + boost::cmatch match; + if (!boost::regex_match(str.begin(), str.end(), match, regex)) { v.mkNull(); return; } @@ -3926,8 +4112,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v) (v.listElems()[i] = state.allocValue())->mkString(match[i + 1].str()); } - } catch (std::regex_error & e) { - if (e.code() == std::regex_constants::error_space) { + } catch (boost::regex_error & e) { + if (e.code() == boost::regex_constants::error_space) { // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++ state.error("memory limit exceeded by regular expression '%s'", re) .atPos(pos) @@ -3988,8 +4174,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v) NixStringContext context; const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.split"); - auto begin = std::cregex_iterator(str.begin(), str.end(), regex); - auto end = std::cregex_iterator(); + auto begin = boost::cregex_iterator(str.begin(), str.end(), regex); + auto end = boost::cregex_iterator(); // Any matches results are surrounded by non-matching results. const size_t len = std::distance(begin, end); @@ -4028,8 +4214,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v) assert(idx == 2 * len + 1); - } catch (std::regex_error & e) { - if (e.code() == std::regex_constants::error_space) { + } catch (boost::regex_error & e) { + if (e.code() == boost::regex_constants::error_space) { // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++ state.error("memory limit exceeded by regular expression '%s'", re) .atPos(pos) diff --git a/tests/unit/libexpr/primops.cc b/tests/unit/libexpr/primops.cc index bd174a6c0..b73fdff72 100644 --- a/tests/unit/libexpr/primops.cc +++ b/tests/unit/libexpr/primops.cc @@ -816,6 +816,151 @@ namespace nix { ASSERT_THAT(*v.listElems()[0], IsStringEq("FOO")); } + TEST_F(PrimOpTest, match5) { + auto v = eval("builtins.match ''}'' ''}''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match6) { + auto v = eval("builtins.match '']'' '']''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match7) { + auto v = eval("builtins.match ''[[]'' ''[''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match8) { + auto v = eval("builtins.match ''[]]'' '']''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match9) { + auto v = eval("builtins.match ''[[=a=]]'' ''A''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match10) { + auto v = eval("builtins.match ''[[.right-curly-bracket.]]'' ''}''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match11) { + auto v = eval("builtins.match ''[[.tilde.]]'' ''~''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match12) { + auto v = eval("builtins.match ''[\\n]'' ''\\''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match13) { + auto v = eval("builtins.match ''[\\[]'' ''\\''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match14) { + auto v = eval("builtins.match ''[\\]]'' ''\\]''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match15) { + auto v = eval("builtins.match ''[\\-z]'' ''y''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match16) { + auto v = eval("builtins.match ''[\\\\]'' ''\\''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match17) { + auto v = eval("builtins.match ''[\\]'' ''\\''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match18) { + auto v = eval("builtins.match ''[\\]]'' ''\\]''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match19) { + auto v = eval("builtins.match ''.*[Ω].*'' ''β''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match20) { + auto v = eval("builtins.match ''[^]]'' ''a''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match21) { + auto v = eval("builtins.match ''[[[:alpha:]]'' ''[''"); + ASSERT_THAT(v, IsListOfSize(0)); + } + + TEST_F(PrimOpTest, match_unsupported_syntax1) { + ASSERT_THROW(eval("builtins.match ''{'' ''{''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax2) { + ASSERT_THROW(eval("builtins.match ''(a)\\1'' ''aa''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax3) { + ASSERT_THROW(eval("builtins.match ''\\}'' ''}''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax4) { + ASSERT_THROW(eval("builtins.match ''\\]'' '']''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax5) { + ASSERT_THROW(eval("builtins.match ''\\x41'' ''A''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax6) { + ASSERT_THROW(eval("builtins.match ''\\n'' \"\\n\""), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax7) { + ASSERT_THROW(eval("builtins.match ''\\d'' ''1''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax8) { + ASSERT_THROW(eval("builtins.match ''\\b1'' ''1''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax9) { + ASSERT_THROW(eval("builtins.match ''\\A1'' ''1''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax10) { + ASSERT_THROW(eval("builtins.match ''(?:1)'' ''1''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax11) { + ASSERT_THROW(eval("builtins.match ''[b-a]'' ''b''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax12) { + ASSERT_THROW(eval("builtins.match ''[[:alpha:]-a]'' ''b''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax13) { + ASSERT_THROW(eval("builtins.match ''[[=1=]]'' ''1''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax14) { + ASSERT_THROW(eval("builtins.match ''[[=]=]]'' '']''"), EvalError); + } + + TEST_F(PrimOpTest, match_unsupported_syntax15) { + ASSERT_THROW(eval("builtins.match ''[a-b-c]'' ''b''"), EvalError); + } + TEST_F(PrimOpTest, attrNames) { auto v = eval("builtins.attrNames { x = 1; y = 2; z = 3; a = 2; }"); ASSERT_THAT(v, IsListOfSize(4));