diff --git a/doc/manual/expressions/builtins.xml b/doc/manual/expressions/builtins.xml
index 86c36da1b..615314880 100644
--- a/doc/manual/expressions/builtins.xml
+++ b/doc/manual/expressions/builtins.xml
@@ -873,6 +873,43 @@ builtins.sort builtins.lessThan [ 483 249 526 147 42 77 ]
+ builtins.split
+ regex str
+
+ Returns a list composed of the non-matching parts of
+ str interleaved with lists, one for each match of the extended
+ POSIX regular expression regex in
+ str. Each item in a list of matched
+ sequences is a regex group, or null if that group did not match.
+
+
+builtins.split "(a)b" "abc"
+
+
+Evaluates to [ "" [ "a" ] "c" ].
+
+
+builtins.split "([ac])" "abc"
+
+
+Evaluates to [ "" [ "a" ] "b" [ "c" ] "" ].
+
+
+builtins.split "(a)|(c)" "abc"
+
+
+Evaluates to [ "" [ "a" null ] "b" [ null "c" ] "" ].
+
+
+builtins.split "([[:upper:]]+)" " FOO "
+
+
+Evaluates to [ " " [ "FOO" ] " " ].
+
+
+
+
builtins.stringLength
e
diff --git a/src/libexpr/primops.cc b/src/libexpr/primops.cc
index 4e51e8ff2..fcd3f8efe 100644
--- a/src/libexpr/primops.cc
+++ b/src/libexpr/primops.cc
@@ -1745,6 +1745,73 @@ static void prim_match(EvalState & state, const Pos & pos, Value * * args, Value
}
+/* Split the string args[1] with the extended POSIX regular expression
+   args[0].  For n matches, v becomes a list of 2 * n + 1 elements: the
+   non-matching parts of the string interleaved with one list per match
+   holding its capture groups (null for a group that did not take part
+   in the match).  With no match, v is the one-element list [ args[1] ].
+   Throws EvalError when std::regex reports an error. */
+static void prim_split(EvalState & state, const Pos & pos, Value * * args, Value & v)
+{
+    auto re = state.forceStringNoCtx(*args[0], pos);
+
+    try {
+        std::regex regex(re, std::regex::extended);
+        // `context` is only passed to forceString; it is not used for the results below.
+        PathSet context;
+        const std::string str = state.forceString(*args[1], context, pos);
+
+        auto begin = std::sregex_iterator(str.begin(), str.end(), regex);
+        auto end = std::sregex_iterator();
+
+        // Every match is surrounded by non-matching strings: 2 * len + 1.
+        const size_t len = std::distance(begin, end);
+        state.mkList(v, 2 * len + 1);
+        size_t idx = 0;
+
+        if (len == 0) {
+            v.listElems()[idx++] = args[1];
+            return;
+        }
+
+        for (std::sregex_iterator i = begin; i != end; ++i) {
+            assert(idx <= 2 * len + 1 - 3);
+            // Bind by reference; copying the match results is not needed.
+            const std::smatch & match = *i;
+            // Add a string for the non-matched characters before the match.
+            Value * elem = v.listElems()[idx++] = state.allocValue();
+            mkString(*elem, match.prefix().str().c_str());
+
+            // Add a list of capture groups; match[0] is the whole match.
+            const size_t slen = match.size() - 1;
+            elem = v.listElems()[idx++] = state.allocValue();
+            state.mkList(*elem, slen);
+            for (size_t si = 0; si < slen; ++si) {
+                Value * group = elem->listElems()[si] = state.allocValue();
+                if (match[si + 1].matched)
+                    mkString(*group, match[si + 1].str().c_str());
+                else
+                    mkNull(*group);
+            }
+
+            // After the last match, add a string for the suffix characters.
+            if (idx == 2 * len) {
+                elem = v.listElems()[idx++] = state.allocValue();
+                mkString(*elem, match.suffix().str().c_str());
+            }
+        }
+        assert(idx == 2 * len + 1);
+    } catch (const std::regex_error & e) {
+        if (e.code() == std::regex_constants::error_space) {
+            // limit is _GLIBCXX_REGEX_STATE_LIMIT for libstdc++
+            throw EvalError("memory limit exceeded by regular expression '%s', at %s", re, pos);
+        } else {
+            throw EvalError("invalid regular expression '%s', at %s", re, pos);
+        }
+    }
+}
+
+
static void prim_concatStringSep(EvalState & state, const Pos & pos, Value * * args, Value & v)
{
PathSet context;
@@ -2039,6 +2106,7 @@ void EvalState::createBaseEnv()
addPrimOp("__unsafeDiscardOutputDependency", 1, prim_unsafeDiscardOutputDependency);
addPrimOp("__hashString", 2, prim_hashString);
addPrimOp("__match", 2, prim_match);
+ addPrimOp("__split", 2, prim_split);
addPrimOp("__concatStringsSep", 2, prim_concatStringSep);
addPrimOp("__replaceStrings", 3, prim_replaceStrings);
diff --git a/tests/lang/eval-okay-regex-split.nix b/tests/lang/eval-okay-regex-split.nix
new file mode 100644
index 000000000..0073e0577
--- /dev/null
+++ b/tests/lang/eval-okay-regex-split.nix
@@ -0,0 +1,48 @@
+with builtins;
+
+# A regex without capture groups yields an empty list for each match.
+assert split "foobar" "foobar" == ["" [] ""];
+assert split "fo*" "f" == ["" [] ""];
+assert split "fo+" "f" == ["f"];
+assert split "fo*" "fo" == ["" [] ""];
+assert split "fo*" "foo" == ["" [] ""];
+assert split "fo+" "foo" == ["" [] ""];
+assert split "fo{1,2}" "foo" == ["" [] ""];
+assert split "fo{1,2}" "fooo" == ["" [] "o"];
+assert split "fo*" "foobar" == ["" [] "bar"];
+
+# A regex with capture groups yields a list of the captured sub-matches.
+assert split "(fo*)" "f" == ["" ["f"] ""];
+assert split "(fo+)" "f" == ["f"];
+assert split "(fo*)" "fo" == ["" ["fo"] ""];
+assert split "(f)(o*)" "f" == ["" ["f" ""] ""];
+assert split "(f)(o*)" "foo" == ["" ["f" "oo"] ""];
+assert split "(fo+)" "foo" == ["" ["foo"] ""];
+assert split "(fo{1,2})" "foo" == ["" ["foo"] ""];
+assert split "(fo{1,2})" "fooo" == ["" ["foo"] "o"];
+assert split "(fo*)" "foobar" == ["" ["foo"] "bar"];
+
+# Matches are greedy.
+assert split "(o+)" "oooofoooo" == ["" ["oooo"] "f" ["oooo"] ""];
+
+# A regex can match several times in the same string.
+assert split "(b)" "foobarbaz" == ["foo" ["b"] "ar" ["b"] "az"];
+
+# Split a multi-line string. A null is inserted for each capture group
+# that did not take part in the current match.
+assert split "[[:space:]]+|([',.!?])" ''
+ Nix Rocks!
+ That's why I use it.
+'' == [
+ "Nix" [ null ] "Rocks" ["!"] "" [ null ]
+ "That" ["'"] "s" [ null ] "why" [ null ] "I" [ null ] "use" [ null ] "it" ["."] "" [ null ]
+ ""
+];
+
+# Documentation examples
+assert split "(a)b" "abc" == [ "" [ "a" ] "c" ];
+assert split "([ac])" "abc" == [ "" [ "a" ] "b" [ "c" ] "" ];
+assert split "(a)|(c)" "abc" == [ "" [ "a" null ] "b" [ null "c" ] "" ];
+assert split "([[:upper:]]+)" " FOO " == [ " " [ "FOO" ] " " ];
+
+true