From 447212fa65a80180150b265411924cc638a2c52c Mon Sep 17 00:00:00 2001
From: sugar <sugar@sylveon.social>
Date: Tue, 20 Aug 2024 00:21:59 +0200
Subject: [PATCH] libexpr: Replace regex engine with boost::regex

This avoids C++'s standard library regexes, which aren't the same
across platforms, and have many other issues, like using stack
so much that they stack overflow when processing a lot of data.

To avoid backwards and forward compatibility issues, regexes are
processed using a function converting libstdc++ regexes into Boost
regexes, escaping characters that Boost needs to have escaped, and
rejecting features that Boost has and libstdc++ doesn't.

Related context:

- Original failed attempt to use `boost::regex` in CppNix, failed due to
  boost icu dependency being large (disabling ICU is no longer necessary
  because linking ICU requires using a different header file,
  `boost/regex/icu.hpp`): https://github.com/NixOS/nix/pull/3826

- An attempt to use PCRE, rejected due to providing less backwards
  compatibility with `std::regex` than `boost::regex`:
  https://github.com/NixOS/nix/pull/7336

- Second attempt to use `boost::regex`, failed due to `}` regex failing
  to compile (dealt with by writing a wrapper that parses a regular
  expression and escapes `}` characters):
  https://github.com/NixOS/nix/pull/7762

Closes #34. Closes #476.

Change-Id: Ieb0eb9e270a93e4c7eed412ba4f9f96cb00a5fa4
---
 doc/manual/change-authors.yml     |   4 +
 doc/manual/rl-next/boost-regex.md |  37 ++++++
 src/libexpr/primops.cc            | 210 ++++++++++++++++++++++++++++--
 tests/unit/libexpr/primops.cc     | 145 +++++++++++++++++++++
 4 files changed, 384 insertions(+), 12 deletions(-)
 create mode 100644 doc/manual/rl-next/boost-regex.md
diff --git a/doc/manual/change-authors.yml b/doc/manual/change-authors.yml
index e18abada1..d9303a747 100644
--- a/doc/manual/change-authors.yml
+++ b/doc/manual/change-authors.yml
@@ -129,6 +129,10 @@ roberth:
   display_name: Robert Hensing
   github: roberth
 
+sugar:
+  forgejo: sugar
+  github: sugar700
+
 thufschmitt:
   display_name: Théophane Hufschmitt
   github: thufschmitt
diff --git a/doc/manual/rl-next/boost-regex.md b/doc/manual/rl-next/boost-regex.md
new file mode 100644
index 000000000..c541434d0
--- /dev/null
+++ b/doc/manual/rl-next/boost-regex.md
@@ -0,0 +1,37 @@
+---
+synopsis: Replace regex engine with boost::regex
+issues: [fj#34, fj#476]
+cls: [1821]
+category: Fixes
+credits: [sugar]
+---
+
+Previously, the C++ standard regex expression library was used, the
+behaviour of which varied depending on the platform. This has been
+replaced with the Boost regex library, which works identically across
+platforms.
+
+The visible behaviour of the regex functions doesn't change. While
+the new library has more features, Lix will reject regular expressions
+using them.
+
+This also fixes regex matching reporting stack overflow when matching
+on too much data.
+
+Before:
+
+    nix-repl> builtins.match ".*" (
+                builtins.concatStringsSep "" (
+                  builtins.genList (_: "a") 1000000
+                )
+              )
+    error: stack overflow (possible infinite recursion)
+
+After:
+
+    nix-repl> builtins.match ".*" (
+                builtins.concatStringsSep "" (
+                  builtins.genList (_: "a") 1000000
+                )
+              )
+    [ ]
diff --git a/src/libexpr/primops.cc b/src/libexpr/primops.cc
index dab96d6d4..d6618df2a 100644
--- a/src/libexpr/primops.cc
+++ b/src/libexpr/primops.cc
@@ -17,6 +17,7 @@
 #include "fetch-to-store.hh"
 
 #include <boost/container/small_vector.hpp>
+#include <boost/regex.hpp>
 #include <nlohmann/json.hpp>
 
 #include <sys/types.h>
@@ -26,7 +27,6 @@
 #include <algorithm>
 #include <cstring>
 #include <sstream>
-#include <regex>
 #include <dlfcn.h>
 
 #include <cmath>
@@ -3878,19 +3878,205 @@ static RegisterPrimOp primop_hashString({
     .fun = prim_hashString,
 });
 
+enum class RegexParseState {
+    // Anything outside of those
+    Regular,
+
+    // Bounded repeats, `}` shouldn't be escaped in those
+    //
+    // a{2,5}b
+    //   ^^^^
+    BoundedRepeat,
+
+    // Backslashes, as C++ regexes only support escaping what needs to be
+    // escaped and nothing else
+    //
+    // a\nb
+    //   ^
+    Backslash,
+
+    // Initial part of character set, as `[]]` is a regex for `]` character
+    //
+    // [abc] [^abc]
+    //  ^     ^
+    CharacterSetStart,
+
+    // Initial part of negated character set, as `[^]]` is a regex for
+    // anything but `]` character
+    //
+    // [^abc]
+    //   ^
+    NegatedCharacterSetStart,
+
+    // Character set after its first character
+    //
+    // [abc]
+    //   ^^
+    CharacterSetMiddle,
+
+    // Parser state after seeing [, assumes the input is character extension
+    // after seeing `:`, `.`, or `=`
+    //
+    // [a[:alpha:]b]
+    //    ^
+    PossibleCharacterSetExtension,
+
+    // Within character extension
+    //
+    // [a[:alpha:]b]
+    //     ^^^^^^^
+    CharacterSetExtension,
+
+    // Within equivalence class expression
+    //
+    // [[=a=]]
+    //    ^
+    EquivalenceClassExpression,
+};
+
+static boost::regex compile_regex(std::string_view re) {
+    // Make sure that Boost supports everything that C++ regexes do,
+    // and no non-standard extensions are available.
+    //
+    // In particular, C++ regexes only support escaping regex metacharacters.
+    // They don't support other escape sequences like `\n` and `\d`.
+    // Additionally, within character groups, it's not possible to escape
+    // anything, backslash is a literal character in those. `[\]` in regexes
+    // is a weird way to write `\\`.
+    std::string boost_re;
+    boost_re.reserve(re.size());
+    auto state = RegexParseState::Regular;
+    for (char c : re) {
+        switch (state) {
+        case RegexParseState::Regular:
+            switch (c) {
+            // Boost regex engine supports more escape sequences than C++ regexes,
+            // and as such it's necessary to ensure only escapes supported by C++
+            // are allowed.
+            case '\\':
+                state = RegexParseState::Backslash;
+                break;
+            case '[':
+                state = RegexParseState::CharacterSetStart;
+                break;
+            case '{':
+                state = RegexParseState::BoundedRepeat;
+                break;
+            // Boost doesn't permit unescaped `}`, escape it outside of
+            // bounded repeats.
+            case '}':
+                boost_re.push_back('\\');
+                break;
+            default:
+                break;
+            }
+            break;
+
+        case RegexParseState::BoundedRepeat:
+            if (c == '}') {
+                state = RegexParseState::Regular;
+            }
+            break;
+
+        case RegexParseState::Backslash:
+            switch (c) {
+                case '.': case '|': case '*': case '?': case '+': case '{':
+                case '^': case '$': case '[': case '(': case ')': case '\\':
+                    state = RegexParseState::Regular;
+                    break;
+                default:
+                    throw boost::regex_error(
+                        boost::regex_constants::error_type::error_escape
+                    );
+            }
+            break;
+
+        case RegexParseState::CharacterSetStart:
+            if (c == '^') {
+                state = RegexParseState::NegatedCharacterSetStart;
+                break;
+            }
+            [[fallthrough]];
+
+        case RegexParseState::NegatedCharacterSetStart:
+            if (c == ']') {
+                state = RegexParseState::CharacterSetMiddle;
+                break;
+            }
+            [[fallthrough]];
+
+        case RegexParseState::CharacterSetMiddle:
+        middle:
+            switch (c) {
+            case '[':
+                state = RegexParseState::PossibleCharacterSetExtension;
+                break;
+            case '\\':
+                // Backslashes aren't supported in character groups, escape them
+                boost_re.push_back('\\');
+                state = RegexParseState::CharacterSetMiddle;
+                break;
+            case ']':
+                state = RegexParseState::Regular;
+                break;
+            default:
+                state = RegexParseState::CharacterSetMiddle;
+                break;
+            }
+            break;
+
+        case RegexParseState::PossibleCharacterSetExtension:
+            switch (c) {
+                case ':': case '.':
+                    state = RegexParseState::CharacterSetExtension;
+                    break;
+                case '=':
+                    state = RegexParseState::EquivalenceClassExpression;
+                    break;
+                default:
+                    goto middle;
+            }
+            break;
+
+        case RegexParseState::CharacterSetExtension:
+            if (c == ']') {
+                state = RegexParseState::CharacterSetMiddle;
+            }
+            break;
+
+        case RegexParseState::EquivalenceClassExpression:
+            // C++'s regex parser only supports equivalence classes for
+            // alphabetic characters
+            if (!((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))) {
+                throw boost::regex_error(
+                    boost::regex_constants::error_type::error_brack
+                );
+            }
+            // After verifying first character, this can be parsed as
+            // a regular character set extension, Boost will notice issues
+            // after that.
+            state = RegexParseState::CharacterSetExtension;
+            break;
+        }
+
+        boost_re.push_back(c);
+    }
+    return boost::regex(boost_re, boost::regex::extended);
+}
+
 struct RegexCache
 {
     // TODO use C++20 transparent comparison when available
-    std::unordered_map<std::string_view, std::regex> cache;
+    std::unordered_map<std::string_view, boost::regex> cache;
     std::list<std::string> keys;
 
-    std::regex get(std::string_view re)
+    boost::regex get(std::string_view re)
     {
         auto it = cache.find(re);
         if (it != cache.end())
             return it->second;
         keys.emplace_back(re);
-        return cache.emplace(keys.back(), std::regex(keys.back(), std::regex::extended)).first->second;
+        return cache.emplace(keys.back(), compile_regex(re)).first->second;
     }
 };
 
@@ -3910,8 +4096,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
         NixStringContext context;
         const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.match");
 
-        std::cmatch match;
-        if (!std::regex_match(str.begin(), str.end(), match, regex)) {
+        boost::cmatch match;
+        if (!boost::regex_match(str.begin(), str.end(), match, regex)) {
             v.mkNull();
             return;
         }
@@ -3926,8 +4112,8 @@ void prim_match(EvalState & state, const PosIdx pos, Value * * args, Value & v)
                 (v.listElems()[i] = state.allocValue())->mkString(match[i + 1].str());
         }
 
-    } catch (std::regex_error & e) {
-        if (e.code() == std::regex_constants::error_space) {
+    } catch (boost::regex_error & e) {
+        if (e.code() == boost::regex_constants::error_space) {
             // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
             state.error<EvalError>("memory limit exceeded by regular expression '%s'", re)
                 .atPos(pos)
@@ -3988,8 +4174,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)
         NixStringContext context;
         const auto str = state.forceString(*args[1], context, pos, "while evaluating the second argument passed to builtins.split");
 
-        auto begin = std::cregex_iterator(str.begin(), str.end(), regex);
-        auto end = std::cregex_iterator();
+        auto begin = boost::cregex_iterator(str.begin(), str.end(), regex);
+        auto end = boost::cregex_iterator();
 
         // Any matches results are surrounded by non-matching results.
         const size_t len = std::distance(begin, end);
@@ -4028,8 +4214,8 @@ void prim_split(EvalState & state, const PosIdx pos, Value * * args, Value & v)
 
         assert(idx == 2 * len + 1);
 
-    } catch (std::regex_error & e) {
-        if (e.code() == std::regex_constants::error_space) {
+    } catch (boost::regex_error & e) {
+        if (e.code() == boost::regex_constants::error_space) {
             // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
             state.error<EvalError>("memory limit exceeded by regular expression '%s'", re)
                 .atPos(pos)
diff --git a/tests/unit/libexpr/primops.cc b/tests/unit/libexpr/primops.cc
index bd174a6c0..b73fdff72 100644
--- a/tests/unit/libexpr/primops.cc
+++ b/tests/unit/libexpr/primops.cc
@@ -816,6 +816,151 @@ namespace nix {
         ASSERT_THAT(*v.listElems()[0], IsStringEq("FOO"));
     }
 
+    TEST_F(PrimOpTest, match5) {
+        auto v = eval("builtins.match ''}'' ''}''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match6) {
+        auto v = eval("builtins.match '']'' '']''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match7) {
+        auto v = eval("builtins.match ''[[]'' ''[''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match8) {
+        auto v = eval("builtins.match ''[]]'' '']''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match9) {
+        auto v = eval("builtins.match ''[[=a=]]'' ''A''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match10) {
+        auto v = eval("builtins.match ''[[.right-curly-bracket.]]'' ''}''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match11) {
+        auto v = eval("builtins.match ''[[.tilde.]]'' ''~''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match12) {
+        auto v = eval("builtins.match ''[\\n]'' ''\\''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match13) {
+        auto v = eval("builtins.match ''[\\[]'' ''\\''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match14) {
+        auto v = eval("builtins.match ''[\\]]'' ''\\]''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match15) {
+        auto v = eval("builtins.match ''[\\-z]'' ''y''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match16) {
+        auto v = eval("builtins.match ''[\\\\]'' ''\\''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match17) {
+        auto v = eval("builtins.match ''[\\]'' ''\\''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match18) {
+        auto v = eval("builtins.match ''[\\]]'' ''\\]''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match19) {
+        auto v = eval("builtins.match ''.*[Ω].*'' ''β''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match20) {
+        auto v = eval("builtins.match ''[^]]'' ''a''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match21) {
+        auto v = eval("builtins.match ''[[[:alpha:]]'' ''[''");
+        ASSERT_THAT(v, IsListOfSize(0));
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax1) {
+        ASSERT_THROW(eval("builtins.match ''{'' ''{''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax2) {
+        ASSERT_THROW(eval("builtins.match ''(a)\\1'' ''aa''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax3) {
+        ASSERT_THROW(eval("builtins.match ''\\}'' ''}''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax4) {
+        ASSERT_THROW(eval("builtins.match ''\\]'' '']''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax5) {
+        ASSERT_THROW(eval("builtins.match ''\\x41'' ''A''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax6) {
+        ASSERT_THROW(eval("builtins.match ''\\n'' \"\\n\""), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax7) {
+        ASSERT_THROW(eval("builtins.match ''\\d'' ''1''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax8) {
+        ASSERT_THROW(eval("builtins.match ''\\b1'' ''1''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax9) {
+        ASSERT_THROW(eval("builtins.match ''\\A1'' ''1''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax10) {
+        ASSERT_THROW(eval("builtins.match ''(?:1)'' ''1''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax11) {
+        ASSERT_THROW(eval("builtins.match ''[b-a]'' ''b''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax12) {
+        ASSERT_THROW(eval("builtins.match ''[[:alpha:]-a]'' ''b''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax13) {
+        ASSERT_THROW(eval("builtins.match ''[[=1=]]'' ''1''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax14) {
+        ASSERT_THROW(eval("builtins.match ''[[=]=]]'' '']''"), EvalError);
+    }
+
+    TEST_F(PrimOpTest, match_unsupported_syntax15) {
+        ASSERT_THROW(eval("builtins.match ''[a-b-c]'' ''b''"), EvalError);
+    }
+
     TEST_F(PrimOpTest, attrNames) {
         auto v = eval("builtins.attrNames { x = 1; y = 2; z = 3; a = 2; }");
         ASSERT_THAT(v, IsListOfSize(4));