From c8bdd1547b6252786493907b11f4f902ddeabee2 Mon Sep 17 00:00:00 2001 From: Alois Wohlschlager Date: Thu, 29 Aug 2024 19:36:29 +0200 Subject: [PATCH] initial commit --- .envrc | 1 + .gitignore | 4 ++ README.md | 47 ++++++++++++++++++ default.nix | 14 ++++++ meson.build | 30 ++++++++++++ package.nix | 39 +++++++++++++++ shell.nix | 15 ++++++ src/data.cc | 47 ++++++++++++++++++ src/data.hh | 9 ++++ src/engine.hh | 13 +++++ src/engine_boost.cc | 33 +++++++++++++ src/engine_c.cc | 52 ++++++++++++++++++++ src/engine_oniguruma.cc | 69 +++++++++++++++++++++++++++ src/engine_pcre.cc | 57 ++++++++++++++++++++++ src/engine_re2.cc | 37 +++++++++++++++ src/engine_std.cc | 33 +++++++++++++ src/engine_tre.cc | 50 ++++++++++++++++++++ src/main.cc | 102 ++++++++++++++++++++++++++++++++++++++++ 18 files changed, 652 insertions(+) create mode 100644 .envrc create mode 100644 .gitignore create mode 100644 README.md create mode 100644 default.nix create mode 100644 meson.build create mode 100644 package.nix create mode 100644 shell.nix create mode 100644 src/data.cc create mode 100644 src/data.hh create mode 100644 src/engine.hh create mode 100644 src/engine_boost.cc create mode 100644 src/engine_c.cc create mode 100644 src/engine_oniguruma.cc create mode 100644 src/engine_pcre.cc create mode 100644 src/engine_re2.cc create mode 100644 src/engine_std.cc create mode 100644 src/engine_tre.cc create mode 100644 src/main.cc diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..1d953f4 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use nix diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..774339f --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/.cache +/.direnv +result +result-* diff --git a/README.md b/README.md new file mode 100644 index 0000000..431532c --- /dev/null +++ b/README.md @@ -0,0 +1,47 @@ +# Regular expressions are hard + +Writing a high-quality implementation of POSIX Extended Regular Expressions does not seem easy. 
+Ideally, the following features would be offered at the same time: + +* Strict standards compliance (see https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap09.html and https://pubs.opengroup.org/onlinepubs/9799919799/functions/regexec.html). +* Predictable performance (polynomial in the length of the inputs, and linear in the length of the matched string). +* Limited resource usage (memory and CPU time; the latter is also related to the previous point). +* Absence of vendor-specific syntax extensions (for portability). + +In practice, most implementations fall short of these goals, due to a variety of issues: + +* Unclear wording in the standards, leading to diverging outcomes between implementations. +* Exponential matching time due to a backtracking implementation (sometimes even mandated by non-standard syntax extensions). +* Excessive consumption of stack memory. +* Spurious matching failures (either an indication of a non-match, or an error). +* Incorrect capturing behaviour of parenthesised subexpressions. + +Here we test a couple of popular regex engines against a small sample of regular expressions that are known to be hard to match properly. 
+ +## Run it + +Build using Lix: + + nix-build -A default # or libcxx, or musl + +List all available engines: + + result/bin/driver list + +Check the specified engines (tries all if none is specified, not recommended on libstdc++ because `std` may crash): + + result/bin/driver check [engine]… + +Print the match results of the specified engines (tries all if none is specified, not recommended on libstdc++ because `std` may crash): + + result/bin/driver results [engine]… + +## List of supported engines + +* Boost.Regex (`boost`) +* C standard library (`c`) +* Oniguruma (`oniguruma`, does not claim POSIX compliance) +* PCRE2 (`pcre`, does not claim POSIX compliance) +* RE2 (`re2`, does not claim POSIX compliance) +* C++ standard library (`std`) +* TRE (`tre`) diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..2aa00be --- /dev/null +++ b/default.nix @@ -0,0 +1,14 @@ +let + pkgs = import (builtins.fetchTarball { + url = "https://github.com/NixOS/nixpkgs/archive/d0e1602ddde669d5beb01aec49d71a51937ed7be.tar.gz"; + sha256 = "0g0m7zhpnbgzwn4gmqhjvqd9v6d917p1dg3fk1kwxs2x7v7c1zd4"; + }) { }; +in +{ + inherit pkgs; + default = pkgs.callPackage ./package.nix { }; + libcxx = pkgs.pkgsLLVM.callPackage ./package.nix { + boost = null; # fails to compile + }; + musl = pkgs.pkgsStatic.callPackage ./package.nix { }; +} diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..d11bd0b --- /dev/null +++ b/meson.build @@ -0,0 +1,30 @@ +project('regex-is-hard', 'cpp', default_options: [ + 'buildtype=debugoptimized', + 'cpp_std=c++20', + 'warning_level=3', +]) + +boost = dependency('boost', modules: ['regex'], required: false) +oniguruma = dependency('oniguruma') +pcre = dependency('libpcre2-8') +re2 = dependency('re2') +tre = dependency('tre') + +config_h = configure_file(configuration: { + 'HAVE_BOOST': boost.found().to_int(), +}, output: 'config.h') + +sources = [ + 'src/main.cc', + 'src/data.cc', + 'src/engine_c.cc', + 'src/engine_oniguruma.cc', 
+ 'src/engine_pcre.cc', + 'src/engine_re2.cc', + 'src/engine_std.cc', + 'src/engine_tre.cc', +] +if boost.found() + sources += 'src/engine_boost.cc' +endif +driver = executable('driver', sources, dependencies: [boost, oniguruma, pcre, re2, tre], install: true) diff --git a/package.nix b/package.nix new file mode 100644 index 0000000..0e83540 --- /dev/null +++ b/package.nix @@ -0,0 +1,39 @@ +{ + stdenv, + lib, + meson, + ninja, + pkg-config, + boost, + oniguruma, + pcre2, + re2, + tre, +}: +stdenv.mkDerivation { + name = "regex-is-hard"; + + src = lib.fileset.toSource { + root = ./.; + fileset = lib.fileset.unions [ + ./meson.build + ./src + ]; + }; + + strictDeps = true; + + nativeBuildInputs = [ + meson + ninja + pkg-config + ]; + + buildInputs = [ + boost + oniguruma + pcre2 + re2 + tre + ]; +} diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..7585d01 --- /dev/null +++ b/shell.nix @@ -0,0 +1,15 @@ +let + defaultNix = import ./.; + inherit (defaultNix) pkgs; +in +pkgs.mkShell { + strictDeps = true; + + inputsFrom = [ defaultNix.default ]; + + nativeBuildInputs = with pkgs; [ + clang-tools + nil + nixfmt-rfc-style + ]; +} diff --git a/src/data.cc b/src/data.cc new file mode 100644 index 0000000..9921bd5 --- /dev/null +++ b/src/data.cc @@ -0,0 +1,47 @@ +#include "config.h" +#include "data.hh" + +#if HAVE_BOOST +std::unique_ptr compileBoost(const std::string & re); +#endif +std::unique_ptr compileC(const std::string & re); +std::unique_ptr compileOniguruma(const std::string & re); +std::unique_ptr compilePCRE(const std::string & re); +std::unique_ptr compileRE2(const std::string & re); +std::unique_ptr compileStd(const std::string & re); +std::unique_ptr compileTRE(const std::string & re); + +std::map(*)(const std::string &)> engines { +#if HAVE_BOOST + {"boost", compileBoost}, +#endif + {"c", compileC}, + {"oniguruma", compileOniguruma}, + {"pcre", compilePCRE}, + {"re2", compileRE2}, + {"std", compileStd}, + {"tre", compileTRE}, +}; + 
+std::string aaaaa(500000, 'a'); + +std::vector>>> testCases { + {"\\.*(.*)", {{".keep", {{".keep", "keep"}}}}}, + {".*(ex|gexp).*", {{"regexp", {{"regexp", "ex"}}}}}, + {".*(gexp|ex).*", {{"regexp", {{"regexp", "ex"}}}}}, + {"F.chsin", {{"Füchsin", {{"Füchsin"}}}}}, + {"F..chsin", {{"Füchsin", std::nullopt}}}, + {"(a*)*", { + {"a", {{"a", "a"}}}, // unclear + {"aaaaaaaaaaaaaah", std::nullopt}, + {"aaaaaaaaaaaaaaah", std::nullopt}, + {"aaaaaaaaaaaaaaaah", std::nullopt}, + }}, + {"(a+)+", { + {"a", {{"a", "a"}}}, + {"aaaaaaaaaaaaaaaaaaaaaah", std::nullopt}, + }}, + {"(a(h)?)*", {{"aha", {{"aha", "a", std::nullopt}}}}}, + {"((aa)*)*", {{"aa", {{"aa", "", std::nullopt}}}}}, // unclear + {".*", {{aaaaa, {{aaaaa}}}}}, // keep this at the bottom because libstdc++ crashes +}; diff --git a/src/data.hh b/src/data.hh new file mode 100644 index 0000000..0d3376e --- /dev/null +++ b/src/data.hh @@ -0,0 +1,9 @@ +#pragma once + +#include "engine.hh" + +#include +#include + +extern std::map(*)(const std::string &)> engines; +extern std::vector>>> testCases; diff --git a/src/engine.hh b/src/engine.hh new file mode 100644 index 0000000..79c4444 --- /dev/null +++ b/src/engine.hh @@ -0,0 +1,13 @@ +#pragma once + +#include +#include +#include + +using MatchResult = std::optional>>; + +class Regex { +public: + virtual ~Regex() {} + virtual MatchResult match(std::string_view haystack) = 0; +}; diff --git a/src/engine_boost.cc b/src/engine_boost.cc new file mode 100644 index 0000000..19f90c9 --- /dev/null +++ b/src/engine_boost.cc @@ -0,0 +1,33 @@ +#include "engine.hh" + +#include + +class RegexBoost : public Regex { + boost::regex inner; + +public: + RegexBoost(const std::string & re) + : inner(re, boost::regex::extended) + {} + + MatchResult match(std::string_view haystack) override { + boost::cmatch matches; + if (boost::regex_match(haystack.begin(), haystack.end(), matches, inner)) { + std::vector> result; + for (const auto & match : matches) { + if (match.matched) { + 
result.push_back(std::string_view(match.first, match.second)); + } else { + result.push_back(std::nullopt); + } + } + return result; + } else { + return std::nullopt; + } + } +}; + +std::unique_ptr compileBoost(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/engine_c.cc b/src/engine_c.cc new file mode 100644 index 0000000..61a3e14 --- /dev/null +++ b/src/engine_c.cc @@ -0,0 +1,52 @@ +#include "engine.hh" + +#include +#include +#include +#include + +class RegexC : public Regex { + regex_t inner; + std::vector matches; + +public: + RegexC(const std::string & re) { + // Bug: does not work for regex with embedded null bytes. + int code = regcomp(&inner, std::format("^{}$", re).data(), REG_EXTENDED); + if (code != 0) { + throw code; + } + matches = std::vector(inner.re_nsub + 1); + } + + RegexC(const RegexC &) = delete; + RegexC & operator =(const RegexC &) = delete; + + ~RegexC() { + regfree(&inner); + } + + MatchResult match(std::string_view haystack) override { + // Bug: does not work for haystack with embedded null bytes. 
+ int code = regexec(&inner, std::string(haystack).data(), inner.re_nsub + 1, matches.data(), 0); + if (code == 0) { + std::vector> result; + for (const auto & match : matches) { + if (match.rm_so != -1 || match.rm_eo != -1) { + result.push_back(std::string_view(haystack.data() + match.rm_so, haystack.data() + match.rm_eo)); + } else { + result.push_back(std::nullopt); + } + } + return result; + } else if (code == REG_NOMATCH) { + return std::nullopt; + } else { + throw code; + } + } +}; + +std::unique_ptr compileC(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/engine_oniguruma.cc b/src/engine_oniguruma.cc new file mode 100644 index 0000000..ec5148b --- /dev/null +++ b/src/engine_oniguruma.cc @@ -0,0 +1,69 @@ +#include "engine.hh" + +#include +#include + +bool onigurumaInitialised = false; + +class RegexOniguruma : public Regex { + OnigRegex inner; + OnigRegion * matches; + +public: + RegexOniguruma(const std::string & re) { + if (!onigurumaInitialised) { + OnigEncoding encodings[] { ONIG_ENCODING_UTF8 }; + onig_initialize(encodings, sizeof(encodings) / sizeof(OnigEncoding)); + onigurumaInitialised = true; + } + OnigErrorInfo error; + int code = onig_new( + &inner, + reinterpret_cast(re.data()), + reinterpret_cast(re.data() + re.size()), + ONIG_OPTION_NONE, + ONIG_ENCODING_UTF8, + ONIG_SYNTAX_POSIX_EXTENDED, + &error + ); + if (code != ONIG_NORMAL) { + throw code; + } + matches = onig_region_new(); + } + + RegexOniguruma(const RegexOniguruma &) = delete; + RegexOniguruma & operator =(const RegexOniguruma &) = delete; + + ~RegexOniguruma() { + onig_region_free(matches, 1); + onig_free(inner); + } + + MatchResult match(std::string_view haystack) override { + int code = onig_match( + inner, + reinterpret_cast(haystack.data()), + reinterpret_cast(haystack.data() + haystack.size()), + reinterpret_cast(haystack.data()), + matches, + ONIG_OPTION_MATCH_WHOLE_STRING + ); + if (code >= 0) { + std::vector> result; + size_t n = matches->num_regs; 
+ for (size_t i = 0; i < n; ++i) { + result.push_back(std::string_view(haystack.data() + matches->beg[i], haystack.data() + matches->end[i])); + } + return result; + } else if (code == ONIG_MISMATCH) { + return std::nullopt; + } else { + throw code; + } + } +}; + +std::unique_ptr compileOniguruma(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/engine_pcre.cc b/src/engine_pcre.cc new file mode 100644 index 0000000..49450c2 --- /dev/null +++ b/src/engine_pcre.cc @@ -0,0 +1,57 @@ +#include "engine.hh" + +#define PCRE2_CODE_UNIT_WIDTH 8 + +#include +#include +#include + +class RegexPCRE : public Regex { + pcre2_code * inner; + pcre2_match_data * matches; + +public: + RegexPCRE(const std::string & re) { + int errcode; + size_t erroffset; + inner = pcre2_compile(reinterpret_cast(re.data()), re.size(), PCRE2_ANCHORED | PCRE2_ENDANCHORED, &errcode, &erroffset, nullptr); + if (!inner) { + throw errcode; + } + matches = pcre2_match_data_create_from_pattern(inner, nullptr); + } + + RegexPCRE(const RegexPCRE &) = delete; + RegexPCRE & operator =(const RegexPCRE &) = delete; + + ~RegexPCRE() { + pcre2_match_data_free(matches); + pcre2_code_free(inner); + } + + MatchResult match(std::string_view haystack) override { + int code = pcre2_match(inner, reinterpret_cast(haystack.data()), haystack.size(), 0, 0, matches, nullptr); + if (code > 0) { + auto pMatch = pcre2_get_ovector_pointer(matches); + std::vector> result; + for (size_t i = 0; i < static_cast(pcre2_get_ovector_count(matches)); ++i) { + auto start = pMatch[2 * i]; + auto end = pMatch[2 * i + 1]; + if (start != static_cast(-1) || end != static_cast(-1)) { + result.push_back(std::string_view(haystack.data() + start, haystack.data() + end)); + } else { + result.push_back(std::nullopt); + } + } + return result; + } else if (code == PCRE2_ERROR_NOMATCH) { + return std::nullopt; + } else { + throw code; + } + } +}; + +std::unique_ptr compilePCRE(const std::string & re) { + return 
std::make_unique(re); +} diff --git a/src/engine_re2.cc b/src/engine_re2.cc new file mode 100644 index 0000000..48f40c4 --- /dev/null +++ b/src/engine_re2.cc @@ -0,0 +1,37 @@ +#include "engine.hh" + +#include + +class RegexRE2 : public Regex { + RE2 inner; + std::vector> matches; + std::vector matchArgs; + std::vector matchArgPointers; + +public: + RegexRE2(const std::string & re) + : inner(re) + { + size_t n = inner.NumberOfCapturingGroups(); + matches = std::vector>(n + 1); + for (size_t i = 1; i <= n; ++i) { + matchArgs.emplace_back(&matches[i]); + } + for (const auto & arg : matchArgs) { + matchArgPointers.push_back(&arg); + } + } + + MatchResult match(std::string_view haystack) override { + if (RE2::FullMatchN(haystack, inner, matchArgPointers.data(), matchArgPointers.size())) { + matches[0] = haystack; + return matches; + } else { + return std::nullopt; + } + } +}; + +std::unique_ptr compileRE2(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/engine_std.cc b/src/engine_std.cc new file mode 100644 index 0000000..bba2662 --- /dev/null +++ b/src/engine_std.cc @@ -0,0 +1,33 @@ +#include "engine.hh" + +#include + +class RegexStd : public Regex { + std::regex inner; + +public: + RegexStd(const std::string & re) + : inner(re, std::regex::extended) + {} + + MatchResult match(std::string_view haystack) override { + std::cmatch matches; + if (std::regex_match(haystack.begin(), haystack.end(), matches, inner)) { + std::vector> result; + for (const auto & match : matches) { + if (match.matched) { + result.push_back(std::string_view(match.first, match.second)); + } else { + result.push_back(std::nullopt); + } + } + return result; + } else { + return std::nullopt; + } + } +}; + +std::unique_ptr compileStd(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/engine_tre.cc b/src/engine_tre.cc new file mode 100644 index 0000000..c5085f9 --- /dev/null +++ b/src/engine_tre.cc @@ -0,0 +1,50 @@ +#include "engine.hh" + +#include 
+#include +#include + +class RegexTRE : public Regex { + regex_t inner; + std::vector matches; + +public: + RegexTRE(const std::string & re) { + auto anchored = std::format("^{}$", re); + int code = tre_regncomp(&inner, anchored.data(), anchored.size(), REG_EXTENDED); + if (code != 0) { + throw code; + } + matches = std::vector(inner.re_nsub + 1); + } + + RegexTRE(const RegexTRE &) = delete; + RegexTRE & operator =(const RegexTRE &) = delete; + + ~RegexTRE() { + tre_regfree(&inner); + } + + MatchResult match(std::string_view haystack) override { + int code = tre_regnexec(&inner, haystack.data(), haystack.size(), inner.re_nsub + 1, matches.data(), 0); + if (code == 0) { + std::vector> result; + for (const auto & match : matches) { + if (match.rm_so != -1 || match.rm_eo != -1) { + result.push_back(std::string_view(haystack.data() + match.rm_so, haystack.data() + match.rm_eo)); + } else { + result.push_back(std::nullopt); + } + } + return result; + } else if (code == REG_NOMATCH) { + return std::nullopt; + } else { + throw code; + } + } +}; + +std::unique_ptr compileTRE(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/main.cc b/src/main.cc new file mode 100644 index 0000000..abcd8cb --- /dev/null +++ b/src/main.cc @@ -0,0 +1,102 @@ +#include "data.hh" + +#include +#include +#include +#include +#include + +void runStep(const std::string & description, std::function action) { + auto start = std::chrono::steady_clock::now(); + const char * result; + try { + result = action() ? "\x1b[32mOK\x1b[0m" : "\x1b[31mFAIL\x1b[0m"; + } catch (...) 
{ + result = "\x1b[31mEXCEPTION\x1b[0m"; + } + auto time = std::chrono::duration_cast(std::chrono::steady_clock::now() - start); + std::cout << std::format("{}: {} ({})", description, result, time) << std::endl; +} + +void check(std::vector enginesToTest) { + for (const auto & engine : enginesToTest) { + std::cout << std::format("Engine: {}", engine) << std::endl; + for (const auto & [re, examples] : testCases) { + std::unique_ptr needle; + runStep(std::format(" compile \x1b[35m\"{}\"\x1b[0m", re), [&]() { + needle = engines.at(engine)(re); + return true; + }); + if (!needle) { + continue; + } + for (const auto & [haystack, expectedMatches] : examples) { + auto haystackDescription = haystack.size() <= 20 ? haystack : std::format("{}[{} characters omitted]{}", haystack.substr(0, 10), haystack.size() - 20, haystack.substr(haystack.size() - 10)); + runStep(std::format(" match \x1b[35m\"{}\"\x1b[0m", haystackDescription), [&]() { + return needle->match(haystack) == expectedMatches; + }); + } + } + } +} + +void results(std::vector enginesToTest) { + for (const auto & engine : enginesToTest) { + std::cout << std::format("Engine: {}", engine) << std::endl; + for (const auto & [re, examples] : testCases) { + std::unique_ptr needle; + runStep(std::format(" compile \x1b[35m\"{}\"\x1b[0m", re), [&]() { + needle = engines.at(engine)(re); + return true; + }); + if (!needle) { + continue; + } + for (const auto & [haystack, _] : examples) { + try { + auto matches = needle->match(haystack); + std::cout << std::format(" match \x1b[35m\"{}\"\x1b[0m: ", haystack); + if (matches) { + std::cout << "["; + for (const auto & match : *matches) { + if (match) { + std::cout << std::format(" \x1b[35m\"{}\"\x1b[0m", *match); + } else { + std::cout << " null"; + } + } + std::cout << " ]" << std::endl; + } else { + std::cout << "null" << std::endl; + } + } catch (...) 
{ + std::cout << std::format(" match \x1b[35m\"{}\"\x1b[0m: \x1b[31mEXCEPTION\x1b[0m", haystack) << std::endl; + } + } + } + } +} + +int main(int argc, char ** argv) { + std::vector enginesToTest; + if (argc >= 3) { + for (int i = 2; i < argc; ++i) { + enginesToTest.emplace_back(argv[i]); + } + } else { + for (const auto & [key, _] : engines) { + enginesToTest.push_back(key); + } + } + if (argc <= 1 || strcmp(argv[1], "check") == 0) { + check(enginesToTest); + } else if (strcmp(argv[1], "results") == 0) { + results(enginesToTest); + } else if (strcmp(argv[1], "list") == 0) { + for (const auto & engine : enginesToTest) { + std::cout << engine << std::endl; + } + } else { + return 1; + } +}