From aee3d639b5096349413021537ae842c8c33ef6cf Mon Sep 17 00:00:00 2001 From: Rebecca Turner Date: Thu, 14 Mar 2024 17:44:43 -0700 Subject: [PATCH] Move `shell_words` into its own file Change-Id: I34c0ebfb6dcea49bf632d8880e04075335a132bf --- src/libutil/meson.build | 2 + src/libutil/shlex.cc | 77 +++++++++++++++++++++++++++++++++++++ src/libutil/shlex.hh | 30 +++++++++++++++ src/nix-build/nix-build.cc | 61 +---------------------------- tests/unit/libutil/shlex.cc | 57 +++++++++++++++++++++++++++ 5 files changed, 168 insertions(+), 59 deletions(-) create mode 100644 src/libutil/shlex.cc create mode 100644 src/libutil/shlex.hh create mode 100644 tests/unit/libutil/shlex.cc diff --git a/src/libutil/meson.build b/src/libutil/meson.build index 084d7ed11..91a7b33ba 100644 --- a/src/libutil/meson.build +++ b/src/libutil/meson.build @@ -20,6 +20,7 @@ libutil_sources = files( 'position.cc', 'references.cc', 'serialise.cc', + 'shlex.cc', 'signals.cc', 'source-path.cc', 'suggestions.cc', @@ -69,6 +70,7 @@ libutil_headers = files( 'regex-combinators.hh', 'repair-flag.hh', 'serialise.hh', + 'shlex.hh', 'signals.hh', 'source-path.hh', 'split.hh', diff --git a/src/libutil/shlex.cc b/src/libutil/shlex.cc new file mode 100644 index 000000000..b5f340251 --- /dev/null +++ b/src/libutil/shlex.cc @@ -0,0 +1,77 @@ +#include "shlex.hh" +#include "util.hh" + +namespace nix { + +std::vector shell_split(const std::string & input) +{ + std::vector result; + + // Hack: `shell_split` is janky and parses ` a` as `{"", "a"}`, so we trim + // whitespace before starting. + auto inputTrimmed = trim(input); + + if (inputTrimmed.empty()) { + return result; + } + + std::regex whitespace("^\\s+"); + auto begin = inputTrimmed.cbegin(); + std::string currentToken; + enum State { sBegin, sSingleQuote, sDoubleQuote }; + State state = sBegin; + auto iterator = begin; + + for (; iterator != inputTrimmed.cend(); ++iterator) { + if (state == sBegin) { + std::smatch match; + if (regex_search(iterator, inputTrimmed.cend(), match, whitespace)) { + currentToken.append(begin, iterator); + result.push_back(currentToken); + iterator = match[0].second; + if (iterator == inputTrimmed.cend()) { + return result; + } + begin = iterator; + currentToken.clear(); + } + } + + switch (*iterator) { + case '\'': + if (state != sDoubleQuote) { + currentToken.append(begin, iterator); + begin = iterator + 1; + state = state == sBegin ? sSingleQuote : sBegin; + } + break; + + case '"': + if (state != sSingleQuote) { + currentToken.append(begin, iterator); + begin = iterator + 1; + state = state == sBegin ? sDoubleQuote : sBegin; + } + break; + + case '\\': + if (state != sSingleQuote) { + // perl shellwords mostly just treats the next char as part + // of the string with no special processing + currentToken.append(begin, iterator); + begin = ++iterator; + } + break; + } + } + + if (state != sBegin) { + throw ShlexError(input); + } + + currentToken.append(begin, iterator); + result.push_back(currentToken); + return result; +} + +} diff --git a/src/libutil/shlex.hh b/src/libutil/shlex.hh new file mode 100644 index 000000000..4e7a48597 --- /dev/null +++ b/src/libutil/shlex.hh @@ -0,0 +1,30 @@ +#pragma once + +#include +#include +#include + +#include "error.hh" + +namespace nix { + +class ShlexError : public Error +{ +public: + const std::string input; + + ShlexError(const std::string input) + : Error("Failed to parse shell arguments (unterminated quote?): %1%", input) + , input(input) + { + } +}; + +/** + * Parse a string into shell arguments. + * + * Takes care of whitespace, quotes, and backslashes (at least a bit). + */ +std::vector shell_split(const std::string & input); + +} // namespace nix diff --git a/src/nix-build/nix-build.cc b/src/nix-build/nix-build.cc index 3928e39a9..9a3994842 100644 --- a/src/nix-build/nix-build.cc +++ b/src/nix-build/nix-build.cc @@ -23,70 +23,13 @@ #include "common-eval-args.hh" #include "attr-path.hh" #include "legacy.hh" +#include "shlex.hh" using namespace nix; using namespace std::string_literals; extern char * * environ __attribute__((weak)); -/* Recreate the effect of the perl shellwords function, breaking up a - * string into arguments like a shell word, including escapes - */ -static std::vector shellwords(const std::string & s) -{ - std::regex whitespace("^\\s+"); - auto begin = s.cbegin(); - std::vector res; - std::string cur; - enum state { - sBegin, - sSingleQuote, - sDoubleQuote - }; - state st = sBegin; - auto it = begin; - for (; it != s.cend(); ++it) { - if (st == sBegin) { - std::smatch match; - if (regex_search(it, s.cend(), match, whitespace)) { - cur.append(begin, it); - res.push_back(cur); - it = match[0].second; - if (it == s.cend()) return res; - begin = it; - cur.clear(); - } - } - switch (*it) { - case '\'': - if (st != sDoubleQuote) { - cur.append(begin, it); - begin = it + 1; - st = st == sBegin ? sSingleQuote : sBegin; - } - break; - case '"': - if (st != sSingleQuote) { - cur.append(begin, it); - begin = it + 1; - st = st == sBegin ? sDoubleQuote : sBegin; - } - break; - case '\\': - if (st != sSingleQuote) { - /* perl shellwords mostly just treats the next char as part of the string with no special processing */ - cur.append(begin, it); - begin = ++it; - } - break; - } - } - if (st != sBegin) throw Error("unterminated quote in shebang line"); - cur.append(begin, it); - res.push_back(cur); - return res; -} - static void main_nix_build(int argc, char * * argv) { auto dryRun = false; @@ -143,7 +86,7 @@ static void main_nix_build(int argc, char * * argv) line = chomp(line); std::smatch match; if (std::regex_match(line, match, std::regex("^#!\\s*nix-shell\\s+(.*)$"))) - for (const auto & word : shellwords(match[1].str())) + for (const auto & word : shell_split(match[1].str())) args.push_back(word); } } diff --git a/tests/unit/libutil/shlex.cc b/tests/unit/libutil/shlex.cc new file mode 100644 index 000000000..2a13635f0 --- /dev/null +++ b/tests/unit/libutil/shlex.cc @@ -0,0 +1,57 @@ +#include "shlex.hh" + +#include +#include +#include + +using testing::Eq; + +namespace nix { + +TEST(Shlex, shell_split) { + ASSERT_THAT(shell_split(""), Eq>({})); + ASSERT_THAT(shell_split(" "), Eq>({})); + + ASSERT_THAT( + shell_split("puppy doggy"), + Eq>({ + "puppy", + "doggy", + }) + ); + + ASSERT_THAT( + shell_split("goldie \"puppy 'doggy'\" sweety"), + Eq>({ + "goldie", + "puppy 'doggy'", + "sweety", + }) + ); + + ASSERT_THAT( + shell_split("\"pupp\\\"y\""), + Eq>({ "pupp\"y" }) + ); + + ASSERT_THAT( + shell_split("goldie 'puppy' doggy"), + Eq>({ + "goldie", + "puppy", + "doggy", + }) + ); + + ASSERT_THAT( + shell_split("'pupp\\\"y'"), + Eq>({ + "pupp\\\"y", + }) + ); + + ASSERT_THROW(shell_split("\"puppy"), ShlexError); + ASSERT_THROW(shell_split("'puppy"), ShlexError); +} + +} // namespace nix