From a2628b43bbfe4368a3b5963e8b80eb6f463d94c3 Mon Sep 17 00:00:00 2001 From: Eelco Dolstra Date: Mon, 3 Feb 2020 15:27:26 +0100 Subject: [PATCH] Fix URL parser Fixes #3062. --- src/libexpr/flake/flakeref.cc | 4 +++- src/libstore/fetchers/parse.cc | 18 ++++++++---------- src/libstore/fetchers/regex.hh | 24 ++++++++++++++---------- 3 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/libexpr/flake/flakeref.cc b/src/libexpr/flake/flakeref.cc index a34d766d7..c46661df8 100644 --- a/src/libexpr/flake/flakeref.cc +++ b/src/libexpr/flake/flakeref.cc @@ -67,8 +67,10 @@ std::pair parseFlakeRefWithFragment( { using namespace fetchers; + static std::string fnRegex = "[0-9a-zA-Z-._~!$&'\"()*+,;=]+"; + static std::regex pathUrlRegex( - "(" + pathRegex + "/?)" + "(/?" + fnRegex + "(?:/" + fnRegex + ")*/?)" + "(?:\\?(" + queryRegex + "))?" + "(?:#(" + queryRegex + "))?", std::regex::ECMAScript); diff --git a/src/libstore/fetchers/parse.cc b/src/libstore/fetchers/parse.cc index dc1b3efe6..4f7cb3c6b 100644 --- a/src/libstore/fetchers/parse.cc +++ b/src/libstore/fetchers/parse.cc @@ -11,24 +11,22 @@ std::regex flakeIdRegex(flakeIdRegexS, std::regex::ECMAScript); ParsedURL parseURL(const std::string & url) { static std::regex uriRegex( - "(((" + schemeRegex + "):" - + "(//(" + authorityRegex + "))?" - + "(" + pathRegex + "))" + "((" + schemeRegex + "):" + + "(?:(?://(" + authorityRegex + ")(" + absPathRegex + "))|(/?" + pathRegex + ")))" + "(?:\\?(" + queryRegex + "))?" - + "(?:#(" + queryRegex + "))?" - + ")", + + "(?:#(" + queryRegex + "))?", std::regex::ECMAScript); std::smatch match; if (std::regex_match(url, match, uriRegex)) { - auto & base = match[2]; - std::string scheme = match[3]; + auto & base = match[1]; + std::string scheme = match[2]; auto authority = match[4].matched ? std::optional(match[5]) : std::nullopt; - std::string path = match[6]; - auto & query = match[7]; - auto & fragment = match[8]; + std::string path = match[4].matched ? match[4] : match[5]; + auto & query = match[6]; + auto & fragment = match[7]; auto isFile = scheme.find("file") != std::string::npos; diff --git a/src/libstore/fetchers/regex.hh b/src/libstore/fetchers/regex.hh index 504d7bf18..e0989edfc 100644 --- a/src/libstore/fetchers/regex.hh +++ b/src/libstore/fetchers/regex.hh @@ -5,16 +5,20 @@ namespace nix::fetchers { // URI stuff. -const static std::string pctEncoded = "%[0-9a-fA-F][0-9a-fA-F]"; -const static std::string schemeRegex = "[a-z+]+"; -const static std::string authorityRegex = - "(?:(?:[a-z])*@)?" - "[a-zA-Z0-9._~-]*"; -const static std::string segmentRegex = "[a-zA-Z0-9._~-]+"; -const static std::string pathRegex = "(?:/?" + segmentRegex + "(?:/" + segmentRegex + ")*|/?)"; -const static std::string pcharRegex = - "(?:[a-zA-Z0-9-._~!$&'\"()*+,;=:@ ]|" + pctEncoded + ")"; -const static std::string queryRegex = "(?:" + pcharRegex + "|[/?])*"; +const static std::string pctEncoded = "(?:%[0-9a-fA-F][0-9a-fA-F])"; +const static std::string schemeRegex = "(?:[a-z+]+)"; +const static std::string ipv6AddressRegex = "(?:\\[[0-9a-fA-F:]+\\])"; +const static std::string unreservedRegex = "(?:[a-zA-Z0-9-._~])"; +const static std::string subdelimsRegex = "(?:[!$&'\"()*+,;=])"; +const static std::string hostnameRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + ")*)"; +const static std::string hostRegex = "(?:" + ipv6AddressRegex + "|" + hostnameRegex + ")"; +const static std::string userRegex = "(?:(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|:)*)"; +const static std::string authorityRegex = "(?:" + userRegex + "@)?" + hostRegex + "(?::[0-9]+)?"; +const static std::string pcharRegex = "(?:" + unreservedRegex + "|" + pctEncoded + "|" + subdelimsRegex + "|[:@])"; +const static std::string queryRegex = "(?:" + pcharRegex + "|[/? \"])*"; +const static std::string segmentRegex = "(?:" + pcharRegex + "+)"; +const static std::string absPathRegex = "(?:(?:/" + segmentRegex + ")*/?)"; +const static std::string pathRegex = "(?:" + segmentRegex + "(?:/" + segmentRegex + ")*/?)"; // A Git ref (i.e. branch or tag name). const static std::string refRegexS = "[a-zA-Z0-9][a-zA-Z0-9_.-]*"; // FIXME: check