initial commit
This commit is contained in:
commit
c8bdd1547b
4
.gitignore
vendored
Normal file
4
.gitignore
vendored
Normal file
|
@ -0,0 +1,4 @@
|
||||||
|
/.cache
|
||||||
|
/.direnv
|
||||||
|
result
|
||||||
|
result-*
|
47
README.md
Normal file
47
README.md
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
# Regular expressions are hard
|
||||||
|
|
||||||
|
Writing a high-quality implementation of POSIX Extended Regular Expressions does not seem easy.
|
||||||
|
Ideally, the following features would be offered at the same time:
|
||||||
|
|
||||||
|
* Strict standards compliance (see https://pubs.opengroup.org/onlinepubs/9799919799/basedefs/V1_chap09.html and https://pubs.opengroup.org/onlinepubs/9799919799/functions/regexec.html).
|
||||||
|
* Predictable performance (polynomial in the length of the inputs, and linear in the length of the matched string).
|
||||||
|
* Limited resource usage (memory and CPU time; the latter is closely related to the previous point).
|
||||||
|
* Absence of vendor-specific syntax extensions (for portability).
|
||||||
|
|
||||||
|
In practice, most implementations fall short of these goals, due to a variety of issues:
|
||||||
|
|
||||||
|
* Unclear wording in the standards, leading to diverging outcomes between implementations.
|
||||||
|
* Exponential matching time due to a backtracking implementation (sometimes even mandated by non-standard syntax extensions).
|
||||||
|
* Excessive consumption of stack memory.
|
||||||
|
* Spurious matching failures (reported either as a non-match or as an error).
|
||||||
|
* Incorrect capturing behaviour of parenthesised subexpressions.
|
||||||
|
|
||||||
|
Here we test a couple of popular regex engines against a small sample of regular expressions that are known to be hard to match properly.
|
||||||
|
|
||||||
|
## Run it
|
||||||
|
|
||||||
|
Build using Lix:
|
||||||
|
|
||||||
|
nix-build -A default # or libcxx, or musl
|
||||||
|
|
||||||
|
List all available engines:
|
||||||
|
|
||||||
|
result/bin/driver list
|
||||||
|
|
||||||
|
Check the specified engines (all engines are tried if none is specified; this is not recommended on libstdc++ because the `std` engine may crash):
|
||||||
|
|
||||||
|
result/bin/driver check [engine]…
|
||||||
|
|
||||||
|
Print the match results of the specified engines (all engines are tried if none is specified; this is not recommended on libstdc++ because the `std` engine may crash):
|
||||||
|
|
||||||
|
result/bin/driver results [engine]…
|
||||||
|
|
||||||
|
## List of supported engines
|
||||||
|
|
||||||
|
* Boost.Regex (`boost`)
|
||||||
|
* C standard library (`c`)
|
||||||
|
* Oniguruma (`oniguruma`, does not claim POSIX compliance)
|
||||||
|
* PCRE2 (`pcre`, does not claim POSIX compliance)
|
||||||
|
* RE2 (`re2`, does not claim POSIX compliance)
|
||||||
|
* C++ standard library (`std`)
|
||||||
|
* TRE (`tre`)
|
14
default.nix
Normal file
14
default.nix
Normal file
|
@ -0,0 +1,14 @@
|
||||||
|
# Entry point of the Nix build: pins nixpkgs and exposes one build of the
# driver per toolchain variant.
let
  # nixpkgs snapshot, pinned by commit and content hash.
  nixpkgsSource = builtins.fetchTarball {
    url = "https://github.com/NixOS/nixpkgs/archive/d0e1602ddde669d5beb01aec49d71a51937ed7be.tar.gz";
    sha256 = "0g0m7zhpnbgzwn4gmqhjvqd9v6d917p1dg3fk1kwxs2x7v7c1zd4";
  };

  pkgs = import nixpkgsSource { };
in
{
  # Re-exported so that shell.nix can reuse the same package set.
  inherit pkgs;

  # Build with the default toolchain of the pinned nixpkgs.
  default = pkgs.callPackage ./package.nix { };

  # Build with the LLVM package set (the README calls this the libcxx build).
  libcxx = pkgs.pkgsLLVM.callPackage ./package.nix {
    boost = null; # fails to compile
  };

  # Build with the static package set (the README calls this the musl build).
  musl = pkgs.pkgsStatic.callPackage ./package.nix { };
}
|
30
meson.build
Normal file
30
meson.build
Normal file
|
@ -0,0 +1,30 @@
|
||||||
|
# Build definition for the `driver` executable that exercises the regex engines.
project(
    'regex-is-hard',
    'cpp',
    default_options: [
        'buildtype=debugoptimized',
        'cpp_std=c++20',
        'warning_level=3',
    ],
)

# Boost.Regex is the only optional engine; all other engines are required.
boost = dependency('boost', modules: ['regex'], required: false)
oniguruma = dependency('oniguruma')
pcre = dependency('libpcre2-8')
re2 = dependency('re2')
tre = dependency('tre')

# config.h tells the sources (via HAVE_BOOST) whether the Boost engine is built.
config_values = {
    'HAVE_BOOST': boost.found().to_int(),
}
config_h = configure_file(output: 'config.h', configuration: config_values)

sources = [
    'src/main.cc',
    'src/data.cc',
    'src/engine_c.cc',
    'src/engine_oniguruma.cc',
    'src/engine_pcre.cc',
    'src/engine_re2.cc',
    'src/engine_std.cc',
    'src/engine_tre.cc',
]
if boost.found()
    sources += ['src/engine_boost.cc']
endif

engine_dependencies = [boost, oniguruma, pcre, re2, tre]
driver = executable(
    'driver',
    sources,
    dependencies: engine_dependencies,
    install: true,
)
|
39
package.nix
Normal file
39
package.nix
Normal file
|
@ -0,0 +1,39 @@
|
||||||
|
# Derivation that builds the regex-is-hard driver with Meson and Ninja.
# `boost` may be passed as null to build without the Boost.Regex engine
# (default.nix does so for the libcxx variant).
{
  stdenv,
  lib,
  meson,
  ninja,
  pkg-config,
  boost,
  oniguruma,
  pcre2,
  re2,
  tre,
}:
let
  # Only the build definition and the sources are part of the build input.
  sourceFiles = lib.fileset.unions [
    ./meson.build
    ./src
  ];
in
stdenv.mkDerivation {
  name = "regex-is-hard";

  src = lib.fileset.toSource {
    root = ./.;
    fileset = sourceFiles;
  };

  strictDeps = true;

  # Tools that run at build time.
  nativeBuildInputs = [
    meson
    ninja
    pkg-config
  ];

  # Regex engine libraries that the driver links against.
  buildInputs = [
    boost
    oniguruma
    pcre2
    re2
    tre
  ];
}
|
15
shell.nix
Normal file
15
shell.nix
Normal file
|
@ -0,0 +1,15 @@
|
||||||
|
# Development shell: the inputs of the default build plus editor tooling.
let
  project = import ./.;
  pkgs = project.pkgs;
in
pkgs.mkShell {
  strictDeps = true;

  # Take over the build inputs of the default package.
  inputsFrom = [ project.default ];

  # Extra development tools from the same pinned package set.
  nativeBuildInputs = [
    pkgs.clang-tools
    pkgs.nil
    pkgs.nixfmt-rfc-style
  ];
}
|
47
src/data.cc
Normal file
47
src/data.cc
Normal file
|
@ -0,0 +1,47 @@
|
||||||
|
#include "config.h"
|
||||||
|
#include "data.hh"
|
||||||
|
|
||||||
|
#if HAVE_BOOST
|
||||||
|
std::unique_ptr<Regex> compileBoost(const std::string & re);
|
||||||
|
#endif
|
||||||
|
std::unique_ptr<Regex> compileC(const std::string & re);
|
||||||
|
std::unique_ptr<Regex> compileOniguruma(const std::string & re);
|
||||||
|
std::unique_ptr<Regex> compilePCRE(const std::string & re);
|
||||||
|
std::unique_ptr<Regex> compileRE2(const std::string & re);
|
||||||
|
std::unique_ptr<Regex> compileStd(const std::string & re);
|
||||||
|
std::unique_ptr<Regex> compileTRE(const std::string & re);
|
||||||
|
|
||||||
|
std::map<std::string, std::unique_ptr<Regex>(*)(const std::string &)> engines {
|
||||||
|
#if HAVE_BOOST
|
||||||
|
{"boost", compileBoost},
|
||||||
|
#endif
|
||||||
|
{"c", compileC},
|
||||||
|
{"oniguruma", compileOniguruma},
|
||||||
|
{"pcre", compilePCRE},
|
||||||
|
{"re2", compileRE2},
|
||||||
|
{"std", compileStd},
|
||||||
|
{"tre", compileTRE},
|
||||||
|
};
|
||||||
|
|
||||||
|
std::string aaaaa(500000, 'a');
|
||||||
|
|
||||||
|
std::vector<std::pair<std::string, std::vector<std::pair<std::string, MatchResult>>>> testCases {
|
||||||
|
{"\\.*(.*)", {{".keep", {{".keep", "keep"}}}}},
|
||||||
|
{".*(ex|gexp).*", {{"regexp", {{"regexp", "ex"}}}}},
|
||||||
|
{".*(gexp|ex).*", {{"regexp", {{"regexp", "ex"}}}}},
|
||||||
|
{"F.chsin", {{"Füchsin", {{"Füchsin"}}}}},
|
||||||
|
{"F..chsin", {{"Füchsin", std::nullopt}}},
|
||||||
|
{"(a*)*", {
|
||||||
|
{"a", {{"a", "a"}}}, // unclear
|
||||||
|
{"aaaaaaaaaaaaaah", std::nullopt},
|
||||||
|
{"aaaaaaaaaaaaaaah", std::nullopt},
|
||||||
|
{"aaaaaaaaaaaaaaaah", std::nullopt},
|
||||||
|
}},
|
||||||
|
{"(a+)+", {
|
||||||
|
{"a", {{"a", "a"}}},
|
||||||
|
{"aaaaaaaaaaaaaaaaaaaaaah", std::nullopt},
|
||||||
|
}},
|
||||||
|
{"(a(h)?)*", {{"aha", {{"aha", "a", std::nullopt}}}}},
|
||||||
|
{"((aa)*)*", {{"aa", {{"aa", "", std::nullopt}}}}}, // unclear
|
||||||
|
{".*", {{aaaaa, {{aaaaa}}}}}, // keep this at the bottom because libstdc++ crashes
|
||||||
|
};
|
9
src/data.hh
Normal file
9
src/data.hh
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
#pragma once

#include "engine.hh"

// Include everything this header names itself instead of relying on
// transitive includes (std::string, std::pair and std::vector are used below).
#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

/// Available regex engines: maps an engine name to the factory function that
/// compiles a regular expression with that engine. Defined in data.cc.
extern std::map<std::string, std::unique_ptr<Regex>(*)(const std::string &)> engines;

/// Test table: each entry pairs a regular expression with a list of
/// (haystack, expected match result) examples. Defined in data.cc.
extern std::vector<std::pair<std::string, std::vector<std::pair<std::string, MatchResult>>>> testCases;
|
13
src/engine.hh
Normal file
13
src/engine.hh
Normal file
|
@ -0,0 +1,13 @@
|
||||||
|
#pragma once
|
||||||
|
|
||||||
|
#include <optional>
|
||||||
|
#include <string_view>
|
||||||
|
#include <vector>
|
||||||
|
|
||||||
|
/// Result of matching one haystack.
///
/// An empty outer optional means that the haystack did not match. Otherwise the
/// vector holds the whole match at index 0 followed by one entry per capture
/// group; the engines in this project store std::nullopt for a group without a
/// match and build the views from the haystack that was passed to match().
using MatchResult = std::optional<std::vector<std::optional<std::string_view>>>;

/// Interface implemented by every regex engine wrapper.
class Regex {
public:
    /// Virtual so that engines can be destroyed through a Regex pointer.
    virtual ~Regex() = default;

    /// Matches `haystack` against the compiled regular expression and returns
    /// the captures, or an empty optional if there is no match.
    virtual MatchResult match(std::string_view haystack) = 0;
};
|
33
src/engine_boost.cc
Normal file
33
src/engine_boost.cc
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
#include "engine.hh"
|
||||||
|
|
||||||
|
#include <boost/regex.hpp>
|
||||||
|
|
||||||
|
class RegexBoost : public Regex {
|
||||||
|
boost::regex inner;
|
||||||
|
|
||||||
|
public:
|
||||||
|
RegexBoost(const std::string & re)
|
||||||
|
: inner(re, boost::regex::extended)
|
||||||
|
{}
|
||||||
|
|
||||||
|
MatchResult match(std::string_view haystack) override {
|
||||||
|
boost::cmatch matches;
|
||||||
|
if (boost::regex_match(haystack.begin(), haystack.end(), matches, inner)) {
|
||||||
|
std::vector<std::optional<std::string_view>> result;
|
||||||
|
for (const auto & match : matches) {
|
||||||
|
if (match.matched) {
|
||||||
|
result.push_back(std::string_view(match.first, match.second));
|
||||||
|
} else {
|
||||||
|
result.push_back(std::nullopt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
} else {
|
||||||
|
return std::nullopt;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::unique_ptr<Regex> compileBoost(const std::string & re) {
|
||||||
|
return std::make_unique<RegexBoost>(re);
|
||||||
|
}
|
52
src/engine_c.cc
Normal file
52
src/engine_c.cc
Normal file
|
@ -0,0 +1,52 @@
|
||||||
|
#include "engine.hh"
|
||||||
|
|
||||||
|
#include <format>
|
||||||
|
#include <memory>
|
||||||
|
#include <regex.h>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
/// Engine backed by the C library's regcomp()/regexec() in extended mode.
class RegexC : public Regex {
    regex_t inner;
    std::vector<regmatch_t> matches;

public:
    /// Compiles `re` wrapped in `^...$`. Throws the regcomp() error code (an
    /// int) on failure.
    explicit RegexC(const std::string & re) {
        // Bug: does not work for regex with embedded null bytes.
        const std::string anchored = std::format("^{}$", re);
        int code = regcomp(&inner, anchored.c_str(), REG_EXTENDED);
        if (code != 0) {
            throw code;
        }
        try {
            matches.resize(inner.re_nsub + 1);
        } catch (...) {
            // The destructor does not run when the constructor throws, so
            // release the compiled pattern here.
            regfree(&inner);
            throw;
        }
    }

    RegexC(const RegexC &) = delete;
    RegexC & operator =(const RegexC &) = delete;

    ~RegexC() {
        regfree(&inner);
    }

    /// Matches the whole of `haystack`.
    ///
    /// Returns std::nullopt if there is no match; otherwise the whole match
    /// followed by the capture groups (std::nullopt for unset groups). Throws
    /// the regexec() error code (an int) on any other failure.
    MatchResult match(std::string_view haystack) override {
        // Limitation: regexec() needs a null-terminated string, so it only sees
        // the haystack up to the first embedded null byte.
        const std::string terminated(haystack);
        int code = regexec(&inner, terminated.c_str(), matches.size(), matches.data(), 0);
        if (code == REG_NOMATCH) {
            return std::nullopt;
        }
        if (code != 0) {
            throw code;
        }

        // "^re$" does not anchor a top-level alternation ("^a|b$" is "(^a)|(b$)"),
        // and the haystack may have been cut off at a null byte. Only accept
        // matches that really span the complete haystack.
        const regmatch_t & whole = matches.front();
        if (whole.rm_so != 0 || static_cast<std::size_t>(whole.rm_eo) != haystack.size()) {
            return std::nullopt;
        }

        std::vector<std::optional<std::string_view>> result;
        result.reserve(matches.size());
        for (const auto & match : matches) {
            // Unset groups have both offsets at -1; treat a half-set entry as
            // unset as well rather than building a view from a bogus offset.
            if (match.rm_so < 0 || match.rm_eo < 0) {
                result.push_back(std::nullopt);
            } else {
                result.push_back(haystack.substr(
                    static_cast<std::size_t>(match.rm_so),
                    static_cast<std::size_t>(match.rm_eo - match.rm_so)));
            }
        }
        return result;
    }
};

/// Factory registered under the engine name "c".
std::unique_ptr<Regex> compileC(const std::string & re) {
    return std::make_unique<RegexC>(re);
}
|
69
src/engine_oniguruma.cc
Normal file
69
src/engine_oniguruma.cc
Normal file
|
@ -0,0 +1,69 @@
|
||||||
|
#include "engine.hh"
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <oniguruma.h>
|
||||||
|
|
||||||
|
bool onigurumaInitialised = false;
|
||||||
|
|
||||||
|
class RegexOniguruma : public Regex {
|
||||||
|
OnigRegex inner;
|
||||||
|
OnigRegion * matches;
|
||||||
|
|
||||||
|
public:
|
||||||
|
RegexOniguruma(const std::string & re) {
|
||||||
|
if (!onigurumaInitialised) {
|
||||||
|
OnigEncoding encodings[] { ONIG_ENCODING_UTF8 };
|
||||||
|
onig_initialize(encodings, sizeof(encodings) / sizeof(OnigEncoding));
|
||||||
|
onigurumaInitialised = true;
|
||||||
|
}
|
||||||
|
OnigErrorInfo error;
|
||||||
|
int code = onig_new(
|
||||||
|
&inner,
|
||||||
|
reinterpret_cast<const unsigned char *>(re.data()),
|
||||||
|
reinterpret_cast<const unsigned char *>(re.data() + re.size()),
|
||||||
|
ONIG_OPTION_NONE,
|
||||||
|
ONIG_ENCODING_UTF8,
|
||||||
|
ONIG_SYNTAX_POSIX_EXTENDED,
|
||||||
|
&error
|
||||||
|
);
|
||||||
|
if (code != ONIG_NORMAL) {
|
||||||
|
throw code;
|
||||||
|
}
|
||||||
|
matches = onig_region_new();
|
||||||
|
}
|
||||||
|
|
||||||
|
RegexOniguruma(const RegexOniguruma &) = delete;
|
||||||
|
RegexOniguruma & operator =(const RegexOniguruma &) = delete;
|
||||||
|
|
||||||
|
~RegexOniguruma() {
|
||||||
|
onig_region_free(matches, 1);
|
||||||
|
onig_free(inner);
|
||||||
|
}
|
||||||
|
|
||||||
|
MatchResult match(std::string_view haystack) override {
|
||||||
|
int code = onig_match(
|
||||||
|
inner,
|
||||||
|
reinterpret_cast<const unsigned char *>(haystack.data()),
|
||||||
|
reinterpret_cast<const unsigned char *>(haystack.data() + haystack.size()),
|
||||||
|
reinterpret_cast<const unsigned char *>(haystack.data()),
|
||||||
|
matches,
|
||||||
|
ONIG_OPTION_MATCH_WHOLE_STRING
|
||||||
|
);
|
||||||
|
if (code >= 0) {
|
||||||
|
std::vector<std::optional<std::string_view>> result;
|
||||||
|
size_t n = matches->num_regs;
|
||||||
|
for (size_t i = 0; i < n; ++i) {
|
||||||
|
result.push_back(std::string_view(haystack.data() + matches->beg[i], haystack.data() + matches->end[i]));
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
} else if (code == ONIG_MISMATCH) {
|
||||||
|
return std::nullopt;
|
||||||
|
} else {
|
||||||
|
throw code;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::unique_ptr<Regex> compileOniguruma(const std::string & re) {
|
||||||
|
return std::make_unique<RegexOniguruma>(re);
|
||||||
|
}
|
57
src/engine_pcre.cc
Normal file
57
src/engine_pcre.cc
Normal file
|
@ -0,0 +1,57 @@
|
||||||
|
#include "engine.hh"
|
||||||
|
|
||||||
|
#define PCRE2_CODE_UNIT_WIDTH 8
|
||||||
|
|
||||||
|
#include <memory>
|
||||||
|
#include <pcre2.h>
|
||||||
|
#include <string>
|
||||||
|
|
||||||
|
class RegexPCRE : public Regex {
|
||||||
|
pcre2_code * inner;
|
||||||
|
pcre2_match_data * matches;
|
||||||
|
|
||||||
|
public:
|
||||||
|
RegexPCRE(const std::string & re) {
|
||||||
|
int errcode;
|
||||||
|
size_t erroffset;
|
||||||
|
inner = pcre2_compile(reinterpret_cast<const unsigned char *>(re.data()), re.size(), PCRE2_ANCHORED | PCRE2_ENDANCHORED, &errcode, &erroffset, nullptr);
|
||||||
|
if (!inner) {
|
||||||
|
throw errcode;
|
||||||
|
}
|
||||||
|
matches = pcre2_match_data_create_from_pattern(inner, nullptr);
|
||||||
|
}
|
||||||
|
|
||||||
|
RegexPCRE(const RegexPCRE &) = delete;
|
||||||
|
RegexPCRE & operator =(const RegexPCRE &) = delete;
|
||||||
|
|
||||||
|
~RegexPCRE() {
|
||||||
|
pcre2_match_data_free(matches);
|
||||||
|
pcre2_code_free(inner);
|
||||||
|
}
|
||||||
|
|
||||||
|
MatchResult match(std::string_view haystack) override {
|
||||||
|
int code = pcre2_match(inner, reinterpret_cast<const unsigned char *>(haystack.data()), haystack.size(), 0, 0, matches, nullptr);
|
||||||
|
if (code > 0) {
|
||||||
|
auto pMatch = pcre2_get_ovector_pointer(matches);
|
||||||
|
std::vector<std::optional<std::string_view>> result;
|
||||||
|
for (size_t i = 0; i < static_cast<size_t>(pcre2_get_ovector_count(matches)); ++i) {
|
||||||
|
auto start = pMatch[2 * i];
|
||||||
|
auto end = pMatch[2 * i + 1];
|
||||||
|
if (start != static_cast<size_t>(-1) || end != static_cast<size_t>(-1)) {
|
||||||
|
result.push_back(std::string_view(haystack.data() + start, haystack.data() + end));
|
||||||
|
} else {
|
||||||
|
result.push_back(std::nullopt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
} else if (code == PCRE2_ERROR_NOMATCH) {
|
||||||
|
return std::nullopt;
|
||||||
|
} else {
|
||||||
|
throw code;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::unique_ptr<Regex> compilePCRE(const std::string & re) {
|
||||||
|
return std::make_unique<RegexPCRE>(re);
|
||||||
|
}
|
37
src/engine_re2.cc
Normal file
37
src/engine_re2.cc
Normal file
|
@ -0,0 +1,37 @@
|
||||||
|
#include "engine.hh"
|
||||||
|
|
||||||
|
#include <re2/re2.h>
|
||||||
|
|
||||||
|
class RegexRE2 : public Regex {
|
||||||
|
RE2 inner;
|
||||||
|
std::vector<std::optional<std::string_view>> matches;
|
||||||
|
std::vector<RE2::Arg> matchArgs;
|
||||||
|
std::vector<const RE2::Arg *> matchArgPointers;
|
||||||
|
|
||||||
|
public:
|
||||||
|
RegexRE2(const std::string & re)
|
||||||
|
: inner(re)
|
||||||
|
{
|
||||||
|
size_t n = inner.NumberOfCapturingGroups();
|
||||||
|
matches = std::vector<std::optional<std::string_view>>(n + 1);
|
||||||
|
for (size_t i = 1; i <= n; ++i) {
|
||||||
|
matchArgs.emplace_back(&matches[i]);
|
||||||
|
}
|
||||||
|
for (const auto & arg : matchArgs) {
|
||||||
|
matchArgPointers.push_back(&arg);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
MatchResult match(std::string_view haystack) override {
|
||||||
|
if (RE2::FullMatchN(haystack, inner, matchArgPointers.data(), matchArgPointers.size())) {
|
||||||
|
matches[0] = haystack;
|
||||||
|
return matches;
|
||||||
|
} else {
|
||||||
|
return std::nullopt;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::unique_ptr<Regex> compileRE2(const std::string & re) {
|
||||||
|
return std::make_unique<RegexRE2>(re);
|
||||||
|
}
|
33
src/engine_std.cc
Normal file
33
src/engine_std.cc
Normal file
|
@ -0,0 +1,33 @@
|
||||||
|
#include "engine.hh"
|
||||||
|
|
||||||
|
#include <regex>
|
||||||
|
|
||||||
|
class RegexStd : public Regex {
|
||||||
|
std::regex inner;
|
||||||
|
|
||||||
|
public:
|
||||||
|
RegexStd(const std::string & re)
|
||||||
|
: inner(re, std::regex::extended)
|
||||||
|
{}
|
||||||
|
|
||||||
|
MatchResult match(std::string_view haystack) override {
|
||||||
|
std::cmatch matches;
|
||||||
|
if (std::regex_match(haystack.begin(), haystack.end(), matches, inner)) {
|
||||||
|
std::vector<std::optional<std::string_view>> result;
|
||||||
|
for (const auto & match : matches) {
|
||||||
|
if (match.matched) {
|
||||||
|
result.push_back(std::string_view(match.first, match.second));
|
||||||
|
} else {
|
||||||
|
result.push_back(std::nullopt);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return result;
|
||||||
|
} else {
|
||||||
|
return std::nullopt;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
std::unique_ptr<Regex> compileStd(const std::string & re) {
|
||||||
|
return std::make_unique<RegexStd>(re);
|
||||||
|
}
|
50
src/engine_tre.cc
Normal file
50
src/engine_tre.cc
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
#include "engine.hh"
|
||||||
|
|
||||||
|
#include <format>
|
||||||
|
#include <memory>
|
||||||
|
#include <tre/tre.h>
|
||||||
|
|
||||||
|
/// Engine backed by TRE's length-aware regncomp/regnexec API in extended mode.
class RegexTRE : public Regex {
    regex_t inner;
    std::vector<regmatch_t> matches;

public:
    /// Compiles `re` wrapped in `^...$`. Throws the TRE error code (an int) on
    /// failure.
    explicit RegexTRE(const std::string & re) {
        const std::string anchored = std::format("^{}$", re);
        int code = tre_regncomp(&inner, anchored.data(), anchored.size(), REG_EXTENDED);
        if (code != 0) {
            throw code;
        }
        try {
            matches.resize(inner.re_nsub + 1);
        } catch (...) {
            // The destructor does not run when the constructor throws, so
            // release the compiled pattern here.
            tre_regfree(&inner);
            throw;
        }
    }

    RegexTRE(const RegexTRE &) = delete;
    RegexTRE & operator =(const RegexTRE &) = delete;

    ~RegexTRE() {
        tre_regfree(&inner);
    }

    /// Matches the whole of `haystack`.
    ///
    /// Returns std::nullopt if there is no match; otherwise the whole match
    /// followed by the capture groups (std::nullopt for unset groups). Throws
    /// the TRE error code (an int) on any other failure.
    MatchResult match(std::string_view haystack) override {
        int code = tre_regnexec(&inner, haystack.data(), haystack.size(), matches.size(), matches.data(), 0);
        if (code == REG_NOMATCH) {
            return std::nullopt;
        }
        if (code != 0) {
            throw code;
        }

        // "^re$" does not anchor a top-level alternation ("^a|b$" is "(^a)|(b$)"),
        // so only accept matches that really span the complete haystack.
        const regmatch_t & whole = matches.front();
        if (whole.rm_so != 0 || static_cast<std::size_t>(whole.rm_eo) != haystack.size()) {
            return std::nullopt;
        }

        std::vector<std::optional<std::string_view>> result;
        result.reserve(matches.size());
        for (const auto & match : matches) {
            // Unset groups have both offsets at -1; treat a half-set entry as
            // unset as well rather than building a view from a bogus offset.
            if (match.rm_so < 0 || match.rm_eo < 0) {
                result.push_back(std::nullopt);
            } else {
                result.push_back(haystack.substr(
                    static_cast<std::size_t>(match.rm_so),
                    static_cast<std::size_t>(match.rm_eo - match.rm_so)));
            }
        }
        return result;
    }
};

/// Factory registered under the engine name "tre".
std::unique_ptr<Regex> compileTRE(const std::string & re) {
    return std::make_unique<RegexTRE>(re);
}
|
102
src/main.cc
Normal file
102
src/main.cc
Normal file
|
@ -0,0 +1,102 @@
|
||||||
|
#include "data.hh"
|
||||||
|
|
||||||
|
#include <chrono>
|
||||||
|
#include <cstring>
|
||||||
|
#include <format>
|
||||||
|
#include <functional>
|
||||||
|
#include <iostream>
|
||||||
|
|
||||||
|
/// Runs `action`, measures how long it takes and prints one line of the form
/// "<description>: <outcome> (<duration>)".
///
/// The outcome is OK if `action` returns true, FAIL if it returns false and
/// EXCEPTION if it throws anything; exceptions do not leave this function.
void runStep(const std::string & description, std::function<bool()> action) {
    using Clock = std::chrono::steady_clock;

    const auto begin = Clock::now();

    const char * outcome = nullptr;
    try {
        if (action()) {
            outcome = "\x1b[32mOK\x1b[0m";
        } else {
            outcome = "\x1b[31mFAIL\x1b[0m";
        }
    } catch (...) {
        outcome = "\x1b[31mEXCEPTION\x1b[0m";
    }

    // The clock is stopped after the catch block, so a throwing action is timed as well.
    const auto elapsed = std::chrono::duration_cast<std::chrono::microseconds>(Clock::now() - begin);

    std::cout << std::format("{}: {} ({})", description, outcome, elapsed) << std::endl;
}
|
||||||
|
|
||||||
|
/// Compiles every regular expression of the test table with each of the given
/// engines and reports, per haystack, whether the engine's result equals the
/// expected one. Each step is run and printed through runStep().
void check(std::vector<std::string> enginesToTest) {
    for (const auto & engineName : enginesToTest) {
        std::cout << std::format("Engine: {}", engineName) << std::endl;

        for (const auto & testCase : testCases) {
            const auto & pattern = testCase.first;

            std::unique_ptr<Regex> compiled;
            runStep(std::format(" compile \x1b[35m\"{}\"\x1b[0m", pattern), [&]() {
                compiled = engines.at(engineName)(pattern);
                return true;
            });
            // Compilation threw (already reported by runStep): skip the haystacks.
            if (compiled == nullptr) {
                continue;
            }

            for (const auto & example : testCase.second) {
                const auto & input = example.first;
                const auto & expected = example.second;

                // Long haystacks are shortened to their first and last 10 characters.
                std::string shown;
                if (input.size() <= 20) {
                    shown = input;
                } else {
                    shown = std::format(
                        "{}[{} characters omitted]{}",
                        input.substr(0, 10),
                        input.size() - 20,
                        input.substr(input.size() - 10));
                }

                runStep(std::format(" match \x1b[35m\"{}\"\x1b[0m", shown), [&]() {
                    return compiled->match(input) == expected;
                });
            }
        }
    }
}
|
||||||
|
|
||||||
|
/// Compiles every regular expression of the test table with each of the given
/// engines and prints what the engine actually returns for every haystack:
/// "null" for no match, otherwise the list of captures ("null" for unset
/// groups), or EXCEPTION if the engine throws.
///
/// Haystacks and captures longer than 20 characters are shortened in the same
/// way as in check(), so the 500000-character stress test does not flood the
/// terminal.
void results(std::vector<std::string> enginesToTest) {
    // Keeps the first and last 10 characters of long strings.
    const auto abbreviate = [](std::string_view text) -> std::string {
        if (text.size() <= 20) {
            return std::string(text);
        }
        return std::format(
            "{}[{} characters omitted]{}",
            text.substr(0, 10),
            text.size() - 20,
            text.substr(text.size() - 10));
    };

    for (const auto & engine : enginesToTest) {
        std::cout << std::format("Engine: {}", engine) << std::endl;

        for (const auto & [re, examples] : testCases) {
            std::unique_ptr<Regex> needle;
            runStep(std::format(" compile \x1b[35m\"{}\"\x1b[0m", re), [&]() {
                needle = engines.at(engine)(re);
                return true;
            });
            // Compilation threw (already reported by runStep): skip the haystacks.
            if (!needle) {
                continue;
            }

            for (const auto & [haystack, _] : examples) {
                const std::string shownHaystack = abbreviate(haystack);
                try {
                    // Match first so that nothing is printed before a possible exception.
                    auto matches = needle->match(haystack);

                    std::cout << std::format(" match \x1b[35m\"{}\"\x1b[0m: ", shownHaystack);
                    if (!matches) {
                        std::cout << "null" << std::endl;
                        continue;
                    }

                    std::cout << "[";
                    for (const auto & match : *matches) {
                        if (match) {
                            std::cout << std::format(" \x1b[35m\"{}\"\x1b[0m", abbreviate(*match));
                        } else {
                            std::cout << " null";
                        }
                    }
                    std::cout << " ]" << std::endl;
                } catch (...) {
                    std::cout << std::format(" match \x1b[35m\"{}\"\x1b[0m: \x1b[31mEXCEPTION\x1b[0m", shownHaystack) << std::endl;
                }
            }
        }
    }
}
|
||||||
|
|
||||||
|
/// Command-line entry point: `driver [check|results|list] [engine]...`.
///
/// - `check` (default when no command is given): compare each engine's results
///   with the expected ones.
/// - `results`: print each engine's actual results.
/// - `list`: print the names of all available engines.
///
/// Without engine arguments all registered engines are used. Returns 1 for an
/// unknown command or an unknown engine name, 0 otherwise.
int main(int argc, char ** argv) {
    const std::string command = argc >= 2 ? argv[1] : "check";

    if (command == "list") {
        // Always list what is available, regardless of further arguments.
        for (const auto & [name, _] : engines) {
            std::cout << name << std::endl;
        }
        return 0;
    }

    if (command != "check" && command != "results") {
        const char * program = (argc >= 1 && argv[0]) ? argv[0] : "driver";
        std::cerr << std::format("usage: {} [check|results|list] [engine]...", program) << std::endl;
        return 1;
    }

    std::vector<std::string> enginesToTest;
    for (int i = 2; i < argc; ++i) {
        // Reject unknown names up front instead of failing on every test case.
        if (engines.find(argv[i]) == engines.end()) {
            std::cerr << std::format("unknown engine: {}", argv[i]) << std::endl;
            return 1;
        }
        enginesToTest.emplace_back(argv[i]);
    }
    if (enginesToTest.empty()) {
        for (const auto & [name, _] : engines) {
            enginesToTest.push_back(name);
        }
    }

    if (command == "check") {
        check(enginesToTest);
    } else {
        results(enginesToTest);
    }
    return 0;
}
|
Loading…
Reference in a new issue