initial commit

This commit is contained in:
alois31 2024-08-29 19:36:29 +02:00
commit c8bdd1547b
Signed by: alois31
GPG key ID: E0F59EA5E5216914
18 changed files with 652 additions and 0 deletions

1
.envrc Normal file
View file

@ -0,0 +1 @@
use nix

4
.gitignore vendored Normal file
View file

@ -0,0 +1,4 @@
/.cache
/.direnv
result
result-*

47
README.md Normal file
View file

@ -0,0 +1,47 @@
# Regular expressions are hard
Writing a high-quality implementation of POSIX Extended Regular Expressions does not seem easy.
Ideally, the following features would be offered at the same time:
* Strict standards compliance (see https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap09.html and https://pubs.opengroup.org/onlinepubs/9799919799/functions/regexec.html).
* Predictable performance (polynomial in the length of the inputs, and linear in the length of the matched string).
* Limited resource usage (memory, CPU time (also related to the previous point)).
* Absence of vendor-specific syntax extensions (for portability).
In practice, most implementations fall short of these goals, due to a variety of issues:
* Unclear wording in the standards, leading to diverging outcomes between implementations.
* Exponential matching time due to a backtracking implementation (sometimes even mandated by non-standard syntax extensions).
* Excessive consumption of stack memory.
* Spurious matching failures (either indication of a non-match, or an error).
* Incorrect capturing behaviour of parenthesised subexpressions.
Here we test a couple of popular regex engines against a small sample of regular expressions that are known to be hard to match properly.
## Run it
Build using Lix:
nix-build -A default # or libcxx, or musl
List all available engines:
result/bin/driver list
Check the specified engines (tries all if none is specified, not recommended on libstdc++ because `std` may crash):
result/bin/driver check [engine]…
Print the match results of the specified engines (tries all if none is specified, not recommended on libstdc++ because `std` may crash):
result/bin/driver results [engine]…
## List of supported engines
* Boost.Regex (`boost`)
* C standard library (`c`)
* Oniguruma (`oniguruma`, does not claim POSIX compliance)
* PCRE2 (`pcre`, does not claim POSIX compliance)
* RE2 (`re2`, does not claim POSIX compliance)
* C++ standard library (`std`)
* TRE (`tre`)

14
default.nix Normal file
View file

@ -0,0 +1,14 @@
# Entry point for nix-build: pins one nixpkgs revision and exposes the package
# built against several package sets of that revision.
let
# nixpkgs pinned to a fixed commit; sha256 is the hash of the fetched tarball.
pkgs = import (builtins.fetchTarball {
url = "https://github.com/NixOS/nixpkgs/archive/d0e1602ddde669d5beb01aec49d71a51937ed7be.tar.gz";
sha256 = "0g0m7zhpnbgzwn4gmqhjvqd9v6d917p1dg3fk1kwxs2x7v7c1zd4";
}) { };
in
{
# Re-exported so that other Nix files can reuse the same pinned package set.
inherit pkgs;
# Build using the default package set.
default = pkgs.callPackage ./package.nix { };
# Build using the pkgsLLVM package set, with the Boost engine left out.
libcxx = pkgs.pkgsLLVM.callPackage ./package.nix {
boost = null; # fails to compile
};
# Build using the pkgsStatic package set.
musl = pkgs.pkgsStatic.callPackage ./package.nix { };
}

30
meson.build Normal file
View file

@ -0,0 +1,30 @@
# Build definition for the `driver` executable, which links every regex engine under test.
project('regex-is-hard', 'cpp', default_options: [
'buildtype=debugoptimized',
'cpp_std=c++20',
'warning_level=3',
])
# Boost.Regex is the only dependency marked as optional (required: false).
boost = dependency('boost', modules: ['regex'], required: false)
oniguruma = dependency('oniguruma')
pcre = dependency('libpcre2-8')
re2 = dependency('re2')
tre = dependency('tre')
# Generates config.h defining HAVE_BOOST from whether Boost was found.
config_h = configure_file(configuration: {
'HAVE_BOOST': boost.found().to_int(),
}, output: 'config.h')
sources = [
'src/main.cc',
'src/data.cc',
'src/engine_c.cc',
'src/engine_oniguruma.cc',
'src/engine_pcre.cc',
'src/engine_re2.cc',
'src/engine_std.cc',
'src/engine_tre.cc',
]
# The Boost engine source is only compiled when Boost is available.
if boost.found()
sources += 'src/engine_boost.cc'
endif
driver = executable('driver', sources, dependencies: [boost, oniguruma, pcre, re2, tre], install: true)

39
package.nix Normal file
View file

@ -0,0 +1,39 @@
# Derivation for the regex test driver. All arguments are supplied by the caller
# (default.nix uses callPackage and may override individual ones, e.g. boost).
{
stdenv,
lib,
meson,
ninja,
pkg-config,
boost,
oniguruma,
pcre2,
re2,
tre,
}:
stdenv.mkDerivation {
name = "regex-is-hard";
# Only meson.build and the src directory are included in the build source.
src = lib.fileset.toSource {
root = ./.;
fileset = lib.fileset.unions [
./meson.build
./src
];
};
strictDeps = true;
# Build-time tools.
nativeBuildInputs = [
meson
ninja
pkg-config
];
# Regex engine libraries; these are the packages meson.build looks up as dependencies.
buildInputs = [
boost
oniguruma
pcre2
re2
tre
];
}

15
shell.nix Normal file
View file

@ -0,0 +1,15 @@
# Development shell: takes its inputs from the default package and adds editor tooling.
let
# Importing ./. evaluates default.nix, which also provides the pinned pkgs.
defaultNix = import ./.;
inherit (defaultNix) pkgs;
in
pkgs.mkShell {
strictDeps = true;
# Reuse the build inputs of the default package variant.
inputsFrom = [ defaultNix.default ];
# Extra tools available in the shell only.
nativeBuildInputs = with pkgs; [
clang-tools
nil
nixfmt-rfc-style
];
}

47
src/data.cc Normal file
View file

@ -0,0 +1,47 @@
#include "config.h"
#include "data.hh"
#if HAVE_BOOST
std::unique_ptr<Regex> compileBoost(const std::string & re);
#endif
std::unique_ptr<Regex> compileC(const std::string & re);
std::unique_ptr<Regex> compileOniguruma(const std::string & re);
std::unique_ptr<Regex> compilePCRE(const std::string & re);
std::unique_ptr<Regex> compileRE2(const std::string & re);
std::unique_ptr<Regex> compileStd(const std::string & re);
std::unique_ptr<Regex> compileTRE(const std::string & re);
std::map<std::string, std::unique_ptr<Regex>(*)(const std::string &)> engines {
#if HAVE_BOOST
{"boost", compileBoost},
#endif
{"c", compileC},
{"oniguruma", compileOniguruma},
{"pcre", compilePCRE},
{"re2", compileRE2},
{"std", compileStd},
{"tre", compileTRE},
};
// A string of 500000 'a' characters, used as the haystack of the last test case.
std::string aaaaa(500000, 'a');
// Test table. Each entry pairs a regex with a list of (haystack, expected result) examples.
// An expected result of std::nullopt means the haystack is expected not to match.
// Otherwise the expected result lists the text of the whole match first, followed by one
// entry per parenthesised group; std::nullopt marks a group expected to be unmatched.
// The string_views stored in the expected results refer to string literals or to `aaaaa`.
std::vector<std::pair<std::string, std::vector<std::pair<std::string, MatchResult>>>> testCases {
{"\\.*(.*)", {{".keep", {{".keep", "keep"}}}}},
// The same alternatives in both orders, with the same expected capture.
{".*(ex|gexp).*", {{"regexp", {{"regexp", "ex"}}}}},
{".*(gexp|ex).*", {{"regexp", {{"regexp", "ex"}}}}},
// NOTE(review): presumably probes whether '.' matches the non-ASCII character 'ü' as one
// character; this depends on the source file encoding and the engine's locale handling.
{"F.chsin", {{"Füchsin", {{"Füchsin"}}}}},
{"F..chsin", {{"Füchsin", std::nullopt}}},
// Nested repetition with haystacks that must be rejected because of the trailing 'h'.
{"(a*)*", {
{"a", {{"a", "a"}}}, // unclear
{"aaaaaaaaaaaaaah", std::nullopt},
{"aaaaaaaaaaaaaaah", std::nullopt},
{"aaaaaaaaaaaaaaaah", std::nullopt},
}},
{"(a+)+", {
{"a", {{"a", "a"}}},
{"aaaaaaaaaaaaaaaaaaaaaah", std::nullopt},
}},
// The inner group is expected to be reported as unmatched.
{"(a(h)?)*", {{"aha", {{"aha", "a", std::nullopt}}}}},
{"((aa)*)*", {{"aa", {{"aa", "", std::nullopt}}}}}, // unclear
{".*", {{aaaaa, {{aaaaa}}}}}, // keep this at the bottom because libstdc++ crashes
};

9
src/data.hh Normal file
View file

@ -0,0 +1,9 @@
#pragma once
#include "engine.hh"
#include <map>
#include <memory>
// Maps an engine name to the function that compiles a regex (given as text) with that engine.
extern std::map<std::string, std::unique_ptr<Regex>(*)(const std::string &)> engines;
// Test table: each entry is a regex paired with a list of (haystack, expected match result).
extern std::vector<std::pair<std::string, std::vector<std::pair<std::string, MatchResult>>>> testCases;

13
src/engine.hh Normal file
View file

@ -0,0 +1,13 @@
#pragma once
#include <optional>
#include <string_view>
#include <vector>
/// Outcome of a match attempt.
///
/// An empty outer optional means "no match". Otherwise the vector holds one
/// entry per reported group, and an empty inner optional marks a group
/// without a value.
using MatchResult = std::optional<std::vector<std::optional<std::string_view>>>;

/// Abstract interface implemented by every regex engine adapter.
class Regex {
public:
    virtual ~Regex() = default;

    /// Matches `haystack` against the compiled regex and reports the groups.
    virtual MatchResult match(std::string_view haystack) = 0;
};

33
src/engine_boost.cc Normal file
View file

@ -0,0 +1,33 @@
#include "engine.hh"
#include <boost/regex.hpp>
/// Engine adapter for Boost.Regex using the POSIX extended syntax.
class RegexBoost : public Regex {
    boost::regex inner;

public:
    /// Compiles `re`; Boost reports invalid patterns by throwing from the
    /// boost::regex constructor.
    RegexBoost(const std::string & re)
        : inner(re, boost::regex::extended)
    {}

    /// Matches the whole haystack. Returns std::nullopt on mismatch, otherwise
    /// one entry per sub-match (index 0 is the whole match); sub-matches that
    /// did not participate are reported as std::nullopt.
    MatchResult match(std::string_view haystack) override {
        // boost::cmatch works on `const char *` iterators. The iterator type
        // of std::string_view is implementation-defined, so pass raw pointers
        // instead of begin()/end() to stay portable across standard libraries.
        const char * const first = haystack.data();
        const char * const last = first + haystack.size();

        boost::cmatch matches;
        if (!boost::regex_match(first, last, matches, inner)) {
            return std::nullopt;
        }

        std::vector<std::optional<std::string_view>> result;
        result.reserve(matches.size());
        for (const auto & match : matches) {
            if (match.matched) {
                result.push_back(std::string_view(match.first, match.second));
            } else {
                result.push_back(std::nullopt);
            }
        }
        return result;
    }
};
/// Factory for the Boost.Regex engine: compiles `re` and returns it behind the Regex interface.
std::unique_ptr<Regex> compileBoost(const std::string & re)
{
    auto compiled = std::make_unique<RegexBoost>(re);
    return compiled;
}

52
src/engine_c.cc Normal file
View file

@ -0,0 +1,52 @@
#include "engine.hh"
#include <format>
#include <memory>
#include <regex.h>
#include <string>
/// Engine adapter for the C library's <regex.h> (regcomp/regexec) with REG_EXTENDED.
///
/// Whole-string matching is emulated by compiling "^" + re + "$".
/// NOTE(review): in extended regular expressions alternation binds more weakly
/// than the anchors, so a pattern with a top-level `|` would presumably not be
/// anchored as a whole; confirm before adding such a test case.
class RegexC : public Regex {
    regex_t inner;
    std::vector<regmatch_t> matches;

public:
    /// Compiles the anchored pattern; throws the regcomp error code (an int) on failure.
    RegexC(const std::string & re) {
        // Bug: does not work for regex with embedded null bytes.
        const std::string anchored = std::format("^{}$", re);
        const int status = regcomp(&inner, anchored.data(), REG_EXTENDED);
        if (status != 0) {
            throw status;
        }
        // One slot for the whole match plus one per parenthesised subexpression.
        matches.resize(inner.re_nsub + 1);
    }

    RegexC(const RegexC &) = delete;
    RegexC & operator =(const RegexC &) = delete;

    ~RegexC() {
        regfree(&inner);
    }

    /// Runs regexec on a NUL-terminated copy of the haystack. Returns
    /// std::nullopt for REG_NOMATCH and throws any other error code (an int).
    MatchResult match(std::string_view haystack) override {
        // Bug: does not work for haystack with embedded null bytes.
        const std::string terminated(haystack);
        const int status = regexec(&inner, terminated.data(), inner.re_nsub + 1, matches.data(), 0);
        switch (status) {
        case 0:
            break;
        case REG_NOMATCH:
            return std::nullopt;
        default:
            throw status;
        }

        // Offsets refer to the copy, but are applied to the caller's buffer so
        // that the returned views stay valid after this function returns.
        std::vector<std::optional<std::string_view>> result;
        for (size_t i = 0; i < matches.size(); ++i) {
            const regmatch_t & group = matches[i];
            if (group.rm_so == -1 && group.rm_eo == -1) {
                result.push_back(std::nullopt);
            } else {
                result.push_back(std::string_view(haystack.data() + group.rm_so, haystack.data() + group.rm_eo));
            }
        }
        return result;
    }
};
/// Factory for the C library engine: compiles `re` and returns it behind the Regex interface.
std::unique_ptr<Regex> compileC(const std::string & re)
{
    auto compiled = std::make_unique<RegexC>(re);
    return compiled;
}

69
src/engine_oniguruma.cc Normal file
View file

@ -0,0 +1,69 @@
#include "engine.hh"
#include <memory>
#include <oniguruma.h>
bool onigurumaInitialised = false;
class RegexOniguruma : public Regex {
OnigRegex inner;
OnigRegion * matches;
public:
RegexOniguruma(const std::string & re) {
if (!onigurumaInitialised) {
OnigEncoding encodings[] { ONIG_ENCODING_UTF8 };
onig_initialize(encodings, sizeof(encodings) / sizeof(OnigEncoding));
onigurumaInitialised = true;
}
OnigErrorInfo error;
int code = onig_new(
&inner,
reinterpret_cast<const unsigned char *>(re.data()),
reinterpret_cast<const unsigned char *>(re.data() + re.size()),
ONIG_OPTION_NONE,
ONIG_ENCODING_UTF8,
ONIG_SYNTAX_POSIX_EXTENDED,
&error
);
if (code != ONIG_NORMAL) {
throw code;
}
matches = onig_region_new();
}
RegexOniguruma(const RegexOniguruma &) = delete;
RegexOniguruma & operator =(const RegexOniguruma &) = delete;
~RegexOniguruma() {
onig_region_free(matches, 1);
onig_free(inner);
}
MatchResult match(std::string_view haystack) override {
int code = onig_match(
inner,
reinterpret_cast<const unsigned char *>(haystack.data()),
reinterpret_cast<const unsigned char *>(haystack.data() + haystack.size()),
reinterpret_cast<const unsigned char *>(haystack.data()),
matches,
ONIG_OPTION_MATCH_WHOLE_STRING
);
if (code >= 0) {
std::vector<std::optional<std::string_view>> result;
size_t n = matches->num_regs;
for (size_t i = 0; i < n; ++i) {
result.push_back(std::string_view(haystack.data() + matches->beg[i], haystack.data() + matches->end[i]));
}
return result;
} else if (code == ONIG_MISMATCH) {
return std::nullopt;
} else {
throw code;
}
}
};
/// Factory for the Oniguruma engine: compiles `re` and returns it behind the Regex interface.
std::unique_ptr<Regex> compileOniguruma(const std::string & re)
{
    auto compiled = std::make_unique<RegexOniguruma>(re);
    return compiled;
}

57
src/engine_pcre.cc Normal file
View file

@ -0,0 +1,57 @@
#include "engine.hh"
#define PCRE2_CODE_UNIT_WIDTH 8
#include <memory>
#include <pcre2.h>
#include <string>
/// Engine adapter for PCRE2 (8-bit code units). Whole-string matching is
/// requested through PCRE2_ANCHORED | PCRE2_ENDANCHORED at compile time.
class RegexPCRE : public Regex {
    pcre2_code * inner;
    pcre2_match_data * matches;

public:
    /// Compiles `re`; throws the PCRE2 error code (an int) on failure.
    RegexPCRE(const std::string & re) {
        int errcode;
        size_t erroffset;
        const auto * pattern = reinterpret_cast<const unsigned char *>(re.data());
        inner = pcre2_compile(pattern, re.size(), PCRE2_ANCHORED | PCRE2_ENDANCHORED, &errcode, &erroffset, nullptr);
        if (inner == nullptr) {
            throw errcode;
        }
        // Sized from the pattern so that every group has an offset pair.
        matches = pcre2_match_data_create_from_pattern(inner, nullptr);
    }

    RegexPCRE(const RegexPCRE &) = delete;
    RegexPCRE & operator =(const RegexPCRE &) = delete;

    ~RegexPCRE() {
        pcre2_match_data_free(matches);
        pcre2_code_free(inner);
    }

    /// Returns std::nullopt for PCRE2_ERROR_NOMATCH, throws any other
    /// non-positive result code (an int), and otherwise reports one entry per
    /// offset pair; pairs that are entirely unset become std::nullopt.
    MatchResult match(std::string_view haystack) override {
        const auto * subject = reinterpret_cast<const unsigned char *>(haystack.data());
        const int code = pcre2_match(inner, subject, haystack.size(), 0, 0, matches, nullptr);
        if (code == PCRE2_ERROR_NOMATCH) {
            return std::nullopt;
        }
        if (code <= 0) {
            throw code;
        }

        const auto offsets = pcre2_get_ovector_pointer(matches);
        const size_t pairCount = static_cast<size_t>(pcre2_get_ovector_count(matches));
        const size_t unset = static_cast<size_t>(-1);

        std::vector<std::optional<std::string_view>> result;
        for (size_t i = 0; i < pairCount; ++i) {
            const auto start = offsets[2 * i];
            const auto end = offsets[2 * i + 1];
            if (start == unset && end == unset) {
                result.push_back(std::nullopt);
            } else {
                result.push_back(std::string_view(haystack.data() + start, haystack.data() + end));
            }
        }
        return result;
    }
};
/// Factory for the PCRE2 engine: compiles `re` and returns it behind the Regex interface.
std::unique_ptr<Regex> compilePCRE(const std::string & re)
{
    auto compiled = std::make_unique<RegexPCRE>(re);
    return compiled;
}

37
src/engine_re2.cc Normal file
View file

@ -0,0 +1,37 @@
#include "engine.hh"
#include <re2/re2.h>
class RegexRE2 : public Regex {
RE2 inner;
std::vector<std::optional<std::string_view>> matches;
std::vector<RE2::Arg> matchArgs;
std::vector<const RE2::Arg *> matchArgPointers;
public:
RegexRE2(const std::string & re)
: inner(re)
{
size_t n = inner.NumberOfCapturingGroups();
matches = std::vector<std::optional<std::string_view>>(n + 1);
for (size_t i = 1; i <= n; ++i) {
matchArgs.emplace_back(&matches[i]);
}
for (const auto & arg : matchArgs) {
matchArgPointers.push_back(&arg);
}
}
MatchResult match(std::string_view haystack) override {
if (RE2::FullMatchN(haystack, inner, matchArgPointers.data(), matchArgPointers.size())) {
matches[0] = haystack;
return matches;
} else {
return std::nullopt;
}
}
};
/// Factory for the RE2 engine: compiles `re` and returns it behind the Regex interface.
std::unique_ptr<Regex> compileRE2(const std::string & re)
{
    auto compiled = std::make_unique<RegexRE2>(re);
    return compiled;
}

33
src/engine_std.cc Normal file
View file

@ -0,0 +1,33 @@
#include "engine.hh"
#include <regex>
/// Engine adapter for the C++ standard library's std::regex using the POSIX extended syntax.
class RegexStd : public Regex {
    std::regex inner;

public:
    /// Compiles `re`; std::regex reports invalid patterns by throwing from its constructor.
    RegexStd(const std::string & re)
        : inner(re, std::regex::extended)
    {}

    /// Matches the whole haystack. Returns std::nullopt on mismatch, otherwise
    /// one entry per sub-match (index 0 is the whole match); sub-matches that
    /// did not participate are reported as std::nullopt.
    MatchResult match(std::string_view haystack) override {
        // std::cmatch is std::match_results<const char *>. The iterator type
        // of std::string_view is implementation-defined, so pass raw pointers
        // instead of begin()/end() to stay portable across standard libraries.
        const char * const first = haystack.data();
        const char * const last = first + haystack.size();

        std::cmatch matches;
        if (!std::regex_match(first, last, matches, inner)) {
            return std::nullopt;
        }

        std::vector<std::optional<std::string_view>> result;
        result.reserve(matches.size());
        for (const auto & match : matches) {
            if (match.matched) {
                result.push_back(std::string_view(match.first, match.second));
            } else {
                result.push_back(std::nullopt);
            }
        }
        return result;
    }
};
/// Factory for the std::regex engine: compiles `re` and returns it behind the Regex interface.
std::unique_ptr<Regex> compileStd(const std::string & re)
{
    auto compiled = std::make_unique<RegexStd>(re);
    return compiled;
}

50
src/engine_tre.cc Normal file
View file

@ -0,0 +1,50 @@
#include "engine.hh"
#include <format>
#include <memory>
#include <tre/tre.h>
/// Engine adapter for TRE using the length-aware tre_regncomp/tre_regnexec calls.
///
/// Whole-string matching is emulated by compiling "^" + re + "$".
/// NOTE(review): in extended regular expressions alternation binds more weakly
/// than the anchors, so a pattern with a top-level `|` would presumably not be
/// anchored as a whole; confirm before adding such a test case.
class RegexTRE : public Regex {
    regex_t inner;
    std::vector<regmatch_t> matches;

public:
    /// Compiles the anchored pattern; throws the TRE error code (an int) on failure.
    RegexTRE(const std::string & re) {
        const std::string pattern = std::format("^{}$", re);
        const int status = tre_regncomp(&inner, pattern.data(), pattern.size(), REG_EXTENDED);
        if (status != 0) {
            throw status;
        }
        // One slot for the whole match plus one per parenthesised subexpression.
        matches.resize(inner.re_nsub + 1);
    }

    RegexTRE(const RegexTRE &) = delete;
    RegexTRE & operator =(const RegexTRE &) = delete;

    ~RegexTRE() {
        tre_regfree(&inner);
    }

    /// Returns std::nullopt for REG_NOMATCH, throws any other error code (an
    /// int), and otherwise reports one entry per slot; slots whose offsets are
    /// both -1 become std::nullopt.
    MatchResult match(std::string_view haystack) override {
        const char * const base = haystack.data();
        const int status = tre_regnexec(&inner, base, haystack.size(), inner.re_nsub + 1, matches.data(), 0);
        if (status == REG_NOMATCH) {
            return std::nullopt;
        }
        if (status != 0) {
            throw status;
        }

        std::vector<std::optional<std::string_view>> result;
        for (const regmatch_t & group : matches) {
            const bool unmatched = group.rm_so == -1 && group.rm_eo == -1;
            if (unmatched) {
                result.push_back(std::nullopt);
            } else {
                result.push_back(std::string_view(base + group.rm_so, base + group.rm_eo));
            }
        }
        return result;
    }
};
/// Factory for the TRE engine: compiles `re` and returns it behind the Regex interface.
std::unique_ptr<Regex> compileTRE(const std::string & re)
{
    auto compiled = std::make_unique<RegexTRE>(re);
    return compiled;
}

102
src/main.cc Normal file
View file

@ -0,0 +1,102 @@
#include "data.hh"
#include <chrono>
#include <cstring>
#include <format>
#include <functional>
#include <iostream>
/// Runs `action`, measures how long it took, and prints one line of the form
/// "<description>: <verdict> (<duration>)".
///
/// The verdict is OK when the action returns true, FAIL when it returns false
/// and EXCEPTION when it throws anything; exceptions are not propagated.
void runStep(const std::string & description, std::function<bool()> action) {
    using Clock = std::chrono::steady_clock;

    const Clock::time_point begin = Clock::now();
    const char * verdict = nullptr;
    try {
        if (action()) {
            verdict = "\x1b[32mOK\x1b[0m";
        } else {
            verdict = "\x1b[31mFAIL\x1b[0m";
        }
    } catch (...) {
        verdict = "\x1b[31mEXCEPTION\x1b[0m";
    }
    const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(Clock::now() - begin);

    std::cout << std::format("{}: {} ({})", description, verdict, elapsed) << std::endl;
}
/// For every requested engine, compiles each regex of the test table and
/// compares the engine's result for each haystack with the expected one.
/// Each compile and each match is reported as one runStep line.
void check(std::vector<std::string> enginesToTest) {
    for (const std::string & engineName : enginesToTest) {
        std::cout << std::format("Engine: {}", engineName) << std::endl;
        for (const auto & testCase : testCases) {
            const std::string & pattern = testCase.first;

            std::unique_ptr<Regex> compiled;
            runStep(std::format(" compile \x1b[35m\"{}\"\x1b[0m", pattern), [&]() {
                compiled = engines.at(engineName)(pattern);
                return true;
            });
            // Compilation threw (already reported by runStep): nothing to match with.
            if (compiled == nullptr) {
                continue;
            }

            for (const auto & example : testCase.second) {
                const std::string & subject = example.first;
                const MatchResult & expected = example.second;

                // Long haystacks are shown as their first and last 10 characters.
                std::string label;
                if (subject.size() <= 20) {
                    label = subject;
                } else {
                    label = std::format("{}[{} characters omitted]{}", subject.substr(0, 10), subject.size() - 20, subject.substr(subject.size() - 10));
                }

                runStep(std::format(" match \x1b[35m\"{}\"\x1b[0m", label), [&]() {
                    return compiled->match(subject) == expected;
                });
            }
        }
    }
}
/// For every requested engine, compiles each regex of the test table and
/// prints what the engine reports for each haystack: "null" for a mismatch,
/// otherwise the list of groups (unmatched groups are printed as null).
///
/// Haystacks and captured groups longer than 20 characters are abbreviated
/// with the same rule check() uses, so that very long test strings do not
/// flood the output.
void results(std::vector<std::string> enginesToTest) {
    // Keeps the first and last 10 characters of long texts.
    const auto abbreviate = [](std::string_view text) -> std::string {
        if (text.size() <= 20) {
            return std::string(text);
        }
        return std::format("{}[{} characters omitted]{}", text.substr(0, 10), text.size() - 20, text.substr(text.size() - 10));
    };

    for (const auto & engine : enginesToTest) {
        std::cout << std::format("Engine: {}", engine) << std::endl;
        for (const auto & testCase : testCases) {
            const std::string & re = testCase.first;

            std::unique_ptr<Regex> needle;
            runStep(std::format(" compile \x1b[35m\"{}\"\x1b[0m", re), [&]() {
                needle = engines.at(engine)(re);
                return true;
            });
            // Compilation threw (already reported by runStep): nothing to match with.
            if (!needle) {
                continue;
            }

            for (const auto & example : testCase.second) {
                const std::string & haystack = example.first;
                const std::string haystackDescription = abbreviate(haystack);
                try {
                    // Match first so that nothing is printed before a possible exception.
                    auto matches = needle->match(haystack);
                    std::cout << std::format(" match \x1b[35m\"{}\"\x1b[0m: ", haystackDescription);
                    if (matches) {
                        std::cout << "[";
                        for (const auto & match : *matches) {
                            if (match) {
                                std::cout << std::format(" \x1b[35m\"{}\"\x1b[0m", abbreviate(*match));
                            } else {
                                std::cout << " null";
                            }
                        }
                        std::cout << " ]" << std::endl;
                    } else {
                        std::cout << "null" << std::endl;
                    }
                } catch (...) {
                    std::cout << std::format(" match \x1b[35m\"{}\"\x1b[0m: \x1b[31mEXCEPTION\x1b[0m", haystackDescription) << std::endl;
                }
            }
        }
    }
}
/// Command-line entry point: `driver [check|results|list] [engine]...`.
///
/// Without a command, `check` is run. Without engine names, all registered
/// engines are used. Returns 1 for an unknown command or an unknown engine
/// name, after printing a diagnostic to stderr.
int main(int argc, char ** argv) {
    std::vector<std::string> enginesToTest;
    if (argc >= 3) {
        for (int i = 2; i < argc; ++i) {
            // Reject unknown names up front instead of reporting an exception
            // for every single regex later on.
            if (engines.find(argv[i]) == engines.end()) {
                std::cerr << std::format("unknown engine: {}", argv[i]) << std::endl;
                return 1;
            }
            enginesToTest.emplace_back(argv[i]);
        }
    } else {
        for (const auto & entry : engines) {
            enginesToTest.push_back(entry.first);
        }
    }

    const std::string_view command = argc <= 1 ? "check" : argv[1];
    if (command == "check") {
        check(enginesToTest);
    } else if (command == "results") {
        results(enginesToTest);
    } else if (command == "list") {
        for (const auto & engine : enginesToTest) {
            std::cout << engine << std::endl;
        }
    } else {
        std::cerr << std::format("unknown command: {}", command) << std::endl;
        std::cerr << "usage: driver [check|results|list] [engine]..." << std::endl;
        return 1;
    }
    return 0;
}