Merge branch 'tokenize' of https://github.com/nbp/nix
This commit is contained in:
commit
2ee1b9359b
3 changed files with 153 additions and 0 deletions
|
@ -873,6 +873,43 @@ builtins.sort builtins.lessThan [ 483 249 526 147 42 77 ]
|
||||||
</varlistentry>
|
</varlistentry>
|
||||||
|
|
||||||
|
|
||||||
|
<varlistentry><term><function>builtins.split</function>
|
||||||
|
<replaceable>regex</replaceable> <replaceable>str</replaceable></term>
|
||||||
|
|
||||||
|
<listitem><para>Returns a list composed of non-matched strings interleaved
|
||||||
|
with the lists of the <link
|
||||||
|
xlink:href="http://pubs.opengroup.org/onlinepubs/9699919799/basedefs/V1_chap09.html#tag_09_04">extended
|
||||||
|
POSIX regular expression</link> <replaceable>regex</replaceable> matches
|
||||||
|
of <replaceable>str</replaceable>. Each item in the lists of matched
|
||||||
|
sequences is a regex group.
|
||||||
|
|
||||||
|
<programlisting>
|
||||||
|
builtins.split "(a)b" "abc"
|
||||||
|
</programlisting>
|
||||||
|
|
||||||
|
Evaluates to <literal>[ "" [ "a" ] "c" ]</literal>.
|
||||||
|
|
||||||
|
<programlisting>
|
||||||
|
builtins.split "([ac])" "abc"
|
||||||
|
</programlisting>
|
||||||
|
|
||||||
|
Evaluates to <literal>[ "" [ "a" ] "b" [ "c" ] "" ]</literal>.
|
||||||
|
|
||||||
|
<programlisting>
|
||||||
|
builtins.split "(a)|(c)" "abc"
|
||||||
|
</programlisting>
|
||||||
|
|
||||||
|
Evaluates to <literal>[ "" [ "a" null ] "b" [ null "c" ] "" ]</literal>.
|
||||||
|
|
||||||
|
<programlisting>
|
||||||
|
builtins.split "([[:upper:]]+)" " FOO "
|
||||||
|
</programlisting>
|
||||||
|
|
||||||
|
Evaluates to <literal>[ " " [ "FOO" ] " " ]</literal>.
|
||||||
|
|
||||||
|
</para></listitem>
|
||||||
|
</varlistentry>
|
||||||
|
|
||||||
<varlistentry><term><function>builtins.stringLength</function>
|
<varlistentry><term><function>builtins.stringLength</function>
|
||||||
<replaceable>e</replaceable></term>
|
<replaceable>e</replaceable></term>
|
||||||
|
|
||||||
|
|
|
@ -1745,6 +1745,73 @@ static void prim_match(EvalState & state, const Pos & pos, Value * * args, Value
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
/* Split a string with a regular expression, and return a list of the
|
||||||
|
non-matching parts interleaved by the lists of the matching groups. */
|
||||||
|
static void prim_split(EvalState & state, const Pos & pos, Value * * args, Value & v)
|
||||||
|
{
|
||||||
|
auto re = state.forceStringNoCtx(*args[0], pos);
|
||||||
|
|
||||||
|
try {
|
||||||
|
|
||||||
|
std::regex regex(re, std::regex::extended);
|
||||||
|
|
||||||
|
PathSet context;
|
||||||
|
const std::string str = state.forceString(*args[1], context, pos);
|
||||||
|
|
||||||
|
auto begin = std::sregex_iterator(str.begin(), str.end(), regex);
|
||||||
|
auto end = std::sregex_iterator();
|
||||||
|
|
||||||
|
// Any matches results are surrounded by non-matching results.
|
||||||
|
const size_t len = std::distance(begin, end);
|
||||||
|
state.mkList(v, 2 * len + 1);
|
||||||
|
size_t idx = 0;
|
||||||
|
Value * elem;
|
||||||
|
|
||||||
|
if (len == 0) {
|
||||||
|
v.listElems()[idx++] = args[1];
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
for (std::sregex_iterator i = begin; i != end; ++i) {
|
||||||
|
assert(idx <= 2 * len + 1 - 3);
|
||||||
|
std::smatch match = *i;
|
||||||
|
|
||||||
|
// Add a string for non-matched characters.
|
||||||
|
elem = v.listElems()[idx++] = state.allocValue();
|
||||||
|
mkString(*elem, match.prefix().str().c_str());
|
||||||
|
|
||||||
|
// Add a list for matched substrings.
|
||||||
|
const size_t slen = match.size() - 1;
|
||||||
|
elem = v.listElems()[idx++] = state.allocValue();
|
||||||
|
|
||||||
|
// Start at 1, beacause the first match is the whole string.
|
||||||
|
state.mkList(*elem, slen);
|
||||||
|
for (size_t si = 0; si < slen; ++si) {
|
||||||
|
if (!match[si + 1].matched)
|
||||||
|
mkNull(*(elem->listElems()[si] = state.allocValue()));
|
||||||
|
else
|
||||||
|
mkString(*(elem->listElems()[si] = state.allocValue()), match[si + 1].str().c_str());
|
||||||
|
}
|
||||||
|
|
||||||
|
// Add a string for non-matched suffix characters.
|
||||||
|
if (idx == 2 * len) {
|
||||||
|
elem = v.listElems()[idx++] = state.allocValue();
|
||||||
|
mkString(*elem, match.suffix().str().c_str());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
assert(idx == 2 * len + 1);
|
||||||
|
|
||||||
|
} catch (std::regex_error &e) {
|
||||||
|
if (e.code() == std::regex_constants::error_space) {
|
||||||
|
// limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
|
||||||
|
throw EvalError("memory limit exceeded by regular expression '%s', at %s", re, pos);
|
||||||
|
} else {
|
||||||
|
throw EvalError("invalid regular expression '%s', at %s", re, pos);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
static void prim_concatStringSep(EvalState & state, const Pos & pos, Value * * args, Value & v)
|
static void prim_concatStringSep(EvalState & state, const Pos & pos, Value * * args, Value & v)
|
||||||
{
|
{
|
||||||
PathSet context;
|
PathSet context;
|
||||||
|
@ -2039,6 +2106,7 @@ void EvalState::createBaseEnv()
|
||||||
addPrimOp("__unsafeDiscardOutputDependency", 1, prim_unsafeDiscardOutputDependency);
|
addPrimOp("__unsafeDiscardOutputDependency", 1, prim_unsafeDiscardOutputDependency);
|
||||||
addPrimOp("__hashString", 2, prim_hashString);
|
addPrimOp("__hashString", 2, prim_hashString);
|
||||||
addPrimOp("__match", 2, prim_match);
|
addPrimOp("__match", 2, prim_match);
|
||||||
|
addPrimOp("__split", 2, prim_split);
|
||||||
addPrimOp("__concatStringsSep", 2, prim_concatStringSep);
|
addPrimOp("__concatStringsSep", 2, prim_concatStringSep);
|
||||||
addPrimOp("__replaceStrings", 3, prim_replaceStrings);
|
addPrimOp("__replaceStrings", 3, prim_replaceStrings);
|
||||||
|
|
||||||
|
|
48
tests/lang/eval-okay-regex-split.nix
Normal file
48
tests/lang/eval-okay-regex-split.nix
Normal file
|
@ -0,0 +1,48 @@
|
||||||
|
with builtins;
|
||||||
|
|
||||||
|
# Non-capturing regexes return empty lists
|
||||||
|
assert split "foobar" "foobar" == ["" [] ""];
|
||||||
|
assert split "fo*" "f" == ["" [] ""];
|
||||||
|
assert split "fo+" "f" == ["f"];
|
||||||
|
assert split "fo*" "fo" == ["" [] ""];
|
||||||
|
assert split "fo*" "foo" == ["" [] ""];
|
||||||
|
assert split "fo+" "foo" == ["" [] ""];
|
||||||
|
assert split "fo{1,2}" "foo" == ["" [] ""];
|
||||||
|
assert split "fo{1,2}" "fooo" == ["" [] "o"];
|
||||||
|
assert split "fo*" "foobar" == ["" [] "bar"];
|
||||||
|
|
||||||
|
# Capturing regex returns a list of sub-matches
|
||||||
|
assert split "(fo*)" "f" == ["" ["f"] ""];
|
||||||
|
assert split "(fo+)" "f" == ["f"];
|
||||||
|
assert split "(fo*)" "fo" == ["" ["fo"] ""];
|
||||||
|
assert split "(f)(o*)" "f" == ["" ["f" ""] ""];
|
||||||
|
assert split "(f)(o*)" "foo" == ["" ["f" "oo"] ""];
|
||||||
|
assert split "(fo+)" "foo" == ["" ["foo"] ""];
|
||||||
|
assert split "(fo{1,2})" "foo" == ["" ["foo"] ""];
|
||||||
|
assert split "(fo{1,2})" "fooo" == ["" ["foo"] "o"];
|
||||||
|
assert split "(fo*)" "foobar" == ["" ["foo"] "bar"];
|
||||||
|
|
||||||
|
# Matches are greedy.
|
||||||
|
assert split "(o+)" "oooofoooo" == ["" ["oooo"] "f" ["oooo"] ""];
|
||||||
|
|
||||||
|
# Matches multiple times.
|
||||||
|
assert split "(b)" "foobarbaz" == ["foo" ["b"] "ar" ["b"] "az"];
|
||||||
|
|
||||||
|
# Split large strings containing newlines. A null is inserted for each
|
||||||
|
# capture group that did not participate in the current match.
|
||||||
|
assert split "[[:space:]]+|([',.!?])" ''
|
||||||
|
Nix Rocks!
|
||||||
|
That's why I use it.
|
||||||
|
'' == [
|
||||||
|
"Nix" [ null ] "Rocks" ["!"] "" [ null ]
|
||||||
|
"That" ["'"] "s" [ null ] "why" [ null ] "I" [ null ] "use" [ null ] "it" ["."] "" [ null ]
|
||||||
|
""
|
||||||
|
];
|
||||||
|
|
||||||
|
# Documentation examples
|
||||||
|
assert split "(a)b" "abc" == [ "" [ "a" ] "c" ];
|
||||||
|
assert split "([ac])" "abc" == [ "" [ "a" ] "b" [ "c" ] "" ];
|
||||||
|
assert split "(a)|(c)" "abc" == [ "" [ "a" null ] "b" [ null "c" ] "" ];
|
||||||
|
assert split "([[:upper:]]+)" " FOO " == [ " " [ "FOO" ] " " ];
|
||||||
|
|
||||||
|
true
|
Loading…
Reference in a new issue