commit c8bdd1547b6252786493907b11f4f902ddeabee2 Author: Alois Wohlschlager Date: Thu Aug 29 19:36:29 2024 +0200 initial commit diff --git a/.envrc b/.envrc new file mode 100644 index 0000000..1d953f4 --- /dev/null +++ b/.envrc @@ -0,0 +1 @@ +use nix diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..774339f --- /dev/null +++ b/.gitignore @@ -0,0 +1,4 @@ +/.cache +/.direnv +result +result-* diff --git a/README.md b/README.md new file mode 100644 index 0000000..431532c --- /dev/null +++ b/README.md @@ -0,0 +1,47 @@ +# Regular expressions are hard + +Writing a high-quality implementation of POSIX Extended Regular Expressions does not seem easy. +Ideally, the following features would be offered at the same time: + +* Strict standards compliance (see https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap09.html and https://pubs.opengroup.org/onlinepubs/9799919799/functions/regexec.html). +* Predictable performance (polynomial in the length of the inputs, and linear in the length of the matched string). +* Limited resource usage (memory, CPU time (also related to the previous point)). +* Absence of vendor-specific syntax extensions (for portability). + +In practice, most implementations fall short of these goals, due to a variety of issues: + +* Unclear wording in the standards, leading to diverging outcomes between implementations. +* Exponential matching time due to backtracking implementation (sometimes even mandated by non-standard syntax extensions). +* Excessive consumption of stack memory. +* Spurious matching failures (either indication of a non-match, or an error). +* Incorrect capturing behaviour of parenthesised subexpressions. + +Here we test a small sample of regular expressions that are known to be hard to match properly against a couple of popular regex engines. + +## Run it + +Build using Lix: + + nix-build -A default # or libcxx, or musl + +List all available engines: + + result/bin/driver list + +Check the specified engines (tries all if none is specified, not recommended on libstdc++ because `std` may crash): + + result/bin/driver check [engine]… + +Print the match results of the specified engines (tries all if none is specified, not recommended on libstdc++ because `std` may crash): + + result/bin/driver results [engine]… + +## List of supported engines + +* Boost.Regex (`boost`) +* C standard library (`c`) +* Oniguruma (`oniguruma`, does not claim POSIX compliance) +* PCRE2 (`pcre`, does not claim POSIX compliance) +* RE2 (`re2`, does not claim POSIX compliance) +* C++ standard library (`std`) +* TRE (`tre`) diff --git a/default.nix b/default.nix new file mode 100644 index 0000000..2aa00be --- /dev/null +++ b/default.nix @@ -0,0 +1,14 @@ +let + pkgs = import (builtins.fetchTarball { + url = "https://github.com/NixOS/nixpkgs/archive/d0e1602ddde669d5beb01aec49d71a51937ed7be.tar.gz"; + sha256 = "0g0m7zhpnbgzwn4gmqhjvqd9v6d917p1dg3fk1kwxs2x7v7c1zd4"; + }) { }; +in +{ + inherit pkgs; + default = pkgs.callPackage ./package.nix { }; + libcxx = pkgs.pkgsLLVM.callPackage ./package.nix { + boost = null; # fails to compile + }; + musl = pkgs.pkgsStatic.callPackage ./package.nix { }; +} diff --git a/meson.build b/meson.build new file mode 100644 index 0000000..d11bd0b --- /dev/null +++ b/meson.build @@ -0,0 +1,30 @@ +project('regex-is-hard', 'cpp', default_options: [ + 'buildtype=debugoptimized', + 'cpp_std=c++20', + 'warning_level=3', +]) + +boost = dependency('boost', modules: ['regex'], required: false) +oniguruma = dependency('oniguruma') +pcre = dependency('libpcre2-8') +re2 = dependency('re2') +tre = dependency('tre') + +config_h = configure_file(configuration: { + 'HAVE_BOOST': boost.found().to_int(), +}, output: 'config.h') + +sources = [ + 'src/main.cc', + 'src/data.cc', + 'src/engine_c.cc', + 'src/engine_oniguruma.cc', + 'src/engine_pcre.cc', + 'src/engine_re2.cc', + 'src/engine_std.cc', + 'src/engine_tre.cc', +] +if boost.found() + sources += 'src/engine_boost.cc' +endif +driver = executable('driver', sources, dependencies: [boost, oniguruma, pcre, re2, tre], install: true) diff --git a/package.nix b/package.nix new file mode 100644 index 0000000..0e83540 --- /dev/null +++ b/package.nix @@ -0,0 +1,39 @@ +{ + stdenv, + lib, + meson, + ninja, + pkg-config, + boost, + oniguruma, + pcre2, + re2, + tre, +}: +stdenv.mkDerivation { + name = "regex-is-hard"; + + src = lib.fileset.toSource { + root = ./.; + fileset = lib.fileset.unions [ + ./meson.build + ./src + ]; + }; + + strictDeps = true; + + nativeBuildInputs = [ + meson + ninja + pkg-config + ]; + + buildInputs = [ + boost + oniguruma + pcre2 + re2 + tre + ]; +} diff --git a/shell.nix b/shell.nix new file mode 100644 index 0000000..7585d01 --- /dev/null +++ b/shell.nix @@ -0,0 +1,15 @@ +let + defaultNix = import ./.; + inherit (defaultNix) pkgs; +in +pkgs.mkShell { + strictDeps = true; + + inputsFrom = [ defaultNix.default ]; + + nativeBuildInputs = with pkgs; [ + clang-tools + nil + nixfmt-rfc-style + ]; +} diff --git a/src/data.cc b/src/data.cc new file mode 100644 index 0000000..9921bd5 --- /dev/null +++ b/src/data.cc @@ -0,0 +1,47 @@ +#include "config.h" +#include "data.hh" + +#if HAVE_BOOST +std::unique_ptr compileBoost(const std::string & re); +#endif +std::unique_ptr compileC(const std::string & re); +std::unique_ptr compileOniguruma(const std::string & re); +std::unique_ptr compilePCRE(const std::string & re); +std::unique_ptr compileRE2(const std::string & re); +std::unique_ptr compileStd(const std::string & re); +std::unique_ptr compileTRE(const std::string & re); + +std::map(*)(const std::string &)> engines { +#if HAVE_BOOST + {"boost", compileBoost}, +#endif + {"c", compileC}, + {"oniguruma", compileOniguruma}, + {"pcre", compilePCRE}, + {"re2", compileRE2}, + {"std", compileStd}, + {"tre", compileTRE}, +}; + +std::string aaaaa(500000, 'a'); + +std::vector>>> testCases { + {"\\.*(.*)", {{".keep", {{".keep", "keep"}}}}}, + {".*(ex|gexp).*", {{"regexp", {{"regexp", "ex"}}}}}, + {".*(gexp|ex).*", {{"regexp", {{"regexp", "ex"}}}}}, + {"F.chsin", {{"Füchsin", {{"Füchsin"}}}}}, + {"F..chsin", {{"Füchsin", std::nullopt}}}, + {"(a*)*", { + {"a", {{"a", "a"}}}, // unclear + {"aaaaaaaaaaaaaah", std::nullopt}, + {"aaaaaaaaaaaaaaah", std::nullopt}, + {"aaaaaaaaaaaaaaaah", std::nullopt}, + }}, + {"(a+)+", { + {"a", {{"a", "a"}}}, + {"aaaaaaaaaaaaaaaaaaaaaah", std::nullopt}, + }}, + {"(a(h)?)*", {{"aha", {{"aha", "a", std::nullopt}}}}}, + {"((aa)*)*", {{"aa", {{"aa", "", std::nullopt}}}}}, // unclear + {".*", {{aaaaa, {{aaaaa}}}}}, // keep this at the bottom because libstdc++ crashes +}; diff --git a/src/data.hh b/src/data.hh new file mode 100644 index 0000000..0d3376e --- /dev/null +++ b/src/data.hh @@ -0,0 +1,9 @@ +#pragma once + +#include "engine.hh" + +#include +#include + +extern std::map(*)(const std::string &)> engines; +extern std::vector>>> testCases; diff --git a/src/engine.hh b/src/engine.hh new file mode 100644 index 0000000..79c4444 --- /dev/null +++ b/src/engine.hh @@ -0,0 +1,13 @@ +#pragma once + +#include +#include +#include + +using MatchResult = std::optional>>; + +class Regex { +public: + virtual ~Regex() {} + virtual MatchResult match(std::string_view haystack) = 0; +}; diff --git a/src/engine_boost.cc b/src/engine_boost.cc new file mode 100644 index 0000000..19f90c9 --- /dev/null +++ b/src/engine_boost.cc @@ -0,0 +1,33 @@ +#include "engine.hh" + +#include + +class RegexBoost : public Regex { + boost::regex inner; + +public: + RegexBoost(const std::string & re) + : inner(re, boost::regex::extended) + {} + + MatchResult match(std::string_view haystack) override { + boost::cmatch matches; + if (boost::regex_match(haystack.begin(), haystack.end(), matches, inner)) { + std::vector> result; + for (const auto & match : matches) { + if (match.matched) { + result.push_back(std::string_view(match.first, match.second)); + } else { + result.push_back(std::nullopt); + } + } + return result; + } else { + return std::nullopt; + } + } +}; + +std::unique_ptr compileBoost(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/engine_c.cc b/src/engine_c.cc new file mode 100644 index 0000000..61a3e14 --- /dev/null +++ b/src/engine_c.cc @@ -0,0 +1,52 @@ +#include "engine.hh" + +#include +#include +#include +#include + +class RegexC : public Regex { + regex_t inner; + std::vector matches; + +public: + RegexC(const std::string & re) { + // Bug: does not work for regex with embedded null bytes. + int code = regcomp(&inner, std::format("^{}$", re).data(), REG_EXTENDED); + if (code != 0) { + throw code; + } + matches = std::vector(inner.re_nsub + 1); + } + + RegexC(const RegexC &) = delete; + RegexC & operator =(const RegexC &) = delete; + + ~RegexC() { + regfree(&inner); + } + + MatchResult match(std::string_view haystack) override { + // Bug: does not work for haystack with embedded null bytes. + int code = regexec(&inner, std::string(haystack).data(), inner.re_nsub + 1, matches.data(), 0); + if (code == 0) { + std::vector> result; + for (const auto & match : matches) { + if (match.rm_so != -1 || match.rm_eo != -1) { + result.push_back(std::string_view(haystack.data() + match.rm_so, haystack.data() + match.rm_eo)); + } else { + result.push_back(std::nullopt); + } + } + return result; + } else if (code == REG_NOMATCH) { + return std::nullopt; + } else { + throw code; + } + } +}; + +std::unique_ptr compileC(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/engine_oniguruma.cc b/src/engine_oniguruma.cc new file mode 100644 index 0000000..ec5148b --- /dev/null +++ b/src/engine_oniguruma.cc @@ -0,0 +1,69 @@ +#include "engine.hh" + +#include +#include + +bool onigurumaInitialised = false; + +class RegexOniguruma : public Regex { + OnigRegex inner; + OnigRegion * matches; + +public: + RegexOniguruma(const std::string & re) { + if (!onigurumaInitialised) { + OnigEncoding encodings[] { ONIG_ENCODING_UTF8 }; + onig_initialize(encodings, sizeof(encodings) / sizeof(OnigEncoding)); + onigurumaInitialised = true; + } + OnigErrorInfo error; + int code = onig_new( + &inner, + reinterpret_cast(re.data()), + reinterpret_cast(re.data() + re.size()), + ONIG_OPTION_NONE, + ONIG_ENCODING_UTF8, + ONIG_SYNTAX_POSIX_EXTENDED, + &error + ); + if (code != ONIG_NORMAL) { + throw code; + } + matches = onig_region_new(); + } + + RegexOniguruma(const RegexOniguruma &) = delete; + RegexOniguruma & operator =(const RegexOniguruma &) = delete; + + ~RegexOniguruma() { + onig_region_free(matches, 1); + onig_free(inner); + } + + MatchResult match(std::string_view haystack) override { + int code = onig_match( + inner, + reinterpret_cast(haystack.data()), + reinterpret_cast(haystack.data() + haystack.size()), + reinterpret_cast(haystack.data()), + matches, + ONIG_OPTION_MATCH_WHOLE_STRING + ); + if (code >= 0) { + std::vector> result; + size_t n = matches->num_regs; + for (size_t i = 0; i < n; ++i) { + result.push_back(std::string_view(haystack.data() + matches->beg[i], haystack.data() + matches->end[i])); + } + return result; + } else if (code == ONIG_MISMATCH) { + return std::nullopt; + } else { + throw code; + } + } +}; + +std::unique_ptr compileOniguruma(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/engine_pcre.cc b/src/engine_pcre.cc new file mode 100644 index 0000000..49450c2 --- /dev/null +++ b/src/engine_pcre.cc @@ -0,0 +1,57 @@ +#include "engine.hh" + +#define PCRE2_CODE_UNIT_WIDTH 8 + +#include +#include +#include + +class RegexPCRE : public Regex { + pcre2_code * inner; + pcre2_match_data * matches; + +public: + RegexPCRE(const std::string & re) { + int errcode; + size_t erroffset; + inner = pcre2_compile(reinterpret_cast(re.data()), re.size(), PCRE2_ANCHORED | PCRE2_ENDANCHORED, &errcode, &erroffset, nullptr); + if (!inner) { + throw errcode; + } + matches = pcre2_match_data_create_from_pattern(inner, nullptr); + } + + RegexPCRE(const RegexPCRE &) = delete; + RegexPCRE & operator =(const RegexPCRE &) = delete; + + ~RegexPCRE() { + pcre2_match_data_free(matches); + pcre2_code_free(inner); + } + + MatchResult match(std::string_view haystack) override { + int code = pcre2_match(inner, reinterpret_cast(haystack.data()), haystack.size(), 0, 0, matches, nullptr); + if (code > 0) { + auto pMatch = pcre2_get_ovector_pointer(matches); + std::vector> result; + for (size_t i = 0; i < static_cast(pcre2_get_ovector_count(matches)); ++i) { + auto start = pMatch[2 * i]; + auto end = pMatch[2 * i + 1]; + if (start != static_cast(-1) || end != static_cast(-1)) { + result.push_back(std::string_view(haystack.data() + start, haystack.data() + end)); + } else { + result.push_back(std::nullopt); + } + } + return result; + } else if (code == PCRE2_ERROR_NOMATCH) { + return std::nullopt; + } else { + throw code; + } + } +}; + +std::unique_ptr compilePCRE(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/engine_re2.cc b/src/engine_re2.cc new file mode 100644 index 0000000..48f40c4 --- /dev/null +++ b/src/engine_re2.cc @@ -0,0 +1,37 @@ +#include "engine.hh" + +#include + +class RegexRE2 : public Regex { + RE2 inner; + std::vector> matches; + std::vector matchArgs; + std::vector matchArgPointers; + +public: + RegexRE2(const std::string & re) + : inner(re) + { + size_t n = inner.NumberOfCapturingGroups(); + matches = std::vector>(n + 1); + for (size_t i = 1; i <= n; ++i) { + matchArgs.emplace_back(&matches[i]); + } + for (const auto & arg : matchArgs) { + matchArgPointers.push_back(&arg); + } + } + + MatchResult match(std::string_view haystack) override { + if (RE2::FullMatchN(haystack, inner, matchArgPointers.data(), matchArgPointers.size())) { + matches[0] = haystack; + return matches; + } else { + return std::nullopt; + } + } +}; + +std::unique_ptr compileRE2(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/engine_std.cc b/src/engine_std.cc new file mode 100644 index 0000000..bba2662 --- /dev/null +++ b/src/engine_std.cc @@ -0,0 +1,33 @@ +#include "engine.hh" + +#include + +class RegexStd : public Regex { + std::regex inner; + +public: + RegexStd(const std::string & re) + : inner(re, std::regex::extended) + {} + + MatchResult match(std::string_view haystack) override { + std::cmatch matches; + if (std::regex_match(haystack.begin(), haystack.end(), matches, inner)) { + std::vector> result; + for (const auto & match : matches) { + if (match.matched) { + result.push_back(std::string_view(match.first, match.second)); + } else { + result.push_back(std::nullopt); + } + } + return result; + } else { + return std::nullopt; + } + } +}; + +std::unique_ptr compileStd(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/engine_tre.cc b/src/engine_tre.cc new file mode 100644 index 0000000..c5085f9 --- /dev/null +++ b/src/engine_tre.cc @@ -0,0 +1,50 @@ +#include "engine.hh" + +#include +#include +#include + +class RegexTRE : public Regex { + regex_t inner; + std::vector matches; + +public: + RegexTRE(const std::string & re) { + auto anchored = std::format("^{}$", re); + int code = tre_regncomp(&inner, anchored.data(), anchored.size(), REG_EXTENDED); + if (code != 0) { + throw code; + } + matches = std::vector(inner.re_nsub + 1); + } + + RegexTRE(const RegexTRE &) = delete; + RegexTRE & operator =(const RegexTRE &) = delete; + + ~RegexTRE() { + tre_regfree(&inner); + } + + MatchResult match(std::string_view haystack) override { + int code = tre_regnexec(&inner, haystack.data(), haystack.size(), inner.re_nsub + 1, matches.data(), 0); + if (code == 0) { + std::vector> result; + for (const auto & match : matches) { + if (match.rm_so != -1 || match.rm_eo != -1) { + result.push_back(std::string_view(haystack.data() + match.rm_so, haystack.data() + match.rm_eo)); + } else { + result.push_back(std::nullopt); + } + } + return result; + } else if (code == REG_NOMATCH) { + return std::nullopt; + } else { + throw code; + } + } +}; + +std::unique_ptr compileTRE(const std::string & re) { + return std::make_unique(re); +} diff --git a/src/main.cc b/src/main.cc new file mode 100644 index 0000000..abcd8cb --- /dev/null +++ b/src/main.cc @@ -0,0 +1,102 @@ +#include "data.hh" + +#include +#include +#include +#include +#include + +void runStep(const std::string & description, std::function action) { + auto start = std::chrono::steady_clock::now(); + const char * result; + try { + result = action() ? "\x1b[32mOK\x1b[0m" : "\x1b[31mFAIL\x1b[0m"; + } catch (...) { + result = "\x1b[31mEXCEPTION\x1b[0m"; + } + auto time = std::chrono::duration_cast(std::chrono::steady_clock::now() - start); + std::cout << std::format("{}: {} ({})", description, result, time) << std::endl; +} + +void check(std::vector enginesToTest) { + for (const auto & engine : enginesToTest) { + std::cout << std::format("Engine: {}", engine) << std::endl; + for (const auto & [re, examples] : testCases) { + std::unique_ptr needle; + runStep(std::format(" compile \x1b[35m\"{}\"\x1b[0m", re), [&]() { + needle = engines.at(engine)(re); + return true; + }); + if (!needle) { + continue; + } + for (const auto & [haystack, expectedMatches] : examples) { + auto haystackDescription = haystack.size() <= 20 ? haystack : std::format("{}[{} characters omitted]{}", haystack.substr(0, 10), haystack.size() - 20, haystack.substr(haystack.size() - 10)); + runStep(std::format(" match \x1b[35m\"{}\"\x1b[0m", haystackDescription), [&]() { + return needle->match(haystack) == expectedMatches; + }); + } + } + } +} + +void results(std::vector enginesToTest) { + for (const auto & engine : enginesToTest) { + std::cout << std::format("Engine: {}", engine) << std::endl; + for (const auto & [re, examples] : testCases) { + std::unique_ptr needle; + runStep(std::format(" compile \x1b[35m\"{}\"\x1b[0m", re), [&]() { + needle = engines.at(engine)(re); + return true; + }); + if (!needle) { + continue; + } + for (const auto & [haystack, _] : examples) { + try { + auto matches = needle->match(haystack); + std::cout << std::format(" match \x1b[35m\"{}\"\x1b[0m: ", haystack); + if (matches) { + std::cout << "["; + for (const auto & match : *matches) { + if (match) { + std::cout << std::format(" \x1b[35m\"{}\"\x1b[0m", *match); + } else { + std::cout << " null"; + } + } + std::cout << " ]" << std::endl; + } else { + std::cout << "null" << std::endl; + } + } catch (...) { + std::cout << std::format(" match \x1b[35m\"{}\"\x1b[0m: \x1b[31mEXCEPTION\x1b[0m", haystack) << std::endl; + } + } + } + } +} + +int main(int argc, char ** argv) { + std::vector enginesToTest; + if (argc >= 3) { + for (int i = 2; i < argc; ++i) { + enginesToTest.emplace_back(argv[i]); + } + } else { + for (const auto & [key, _] : engines) { + enginesToTest.push_back(key); + } + } + if (argc <= 1 || strcmp(argv[1], "check") == 0) { + check(enginesToTest); + } else if (strcmp(argv[1], "results") == 0) { + results(enginesToTest); + } else if (strcmp(argv[1], "list") == 0) { + for (const auto & engine : enginesToTest) { + std::cout << engine << std::endl; + } + } else { + return 1; + } +}