Move shellwords into its own file as shell_split

Change-Id: I34c0ebfb6dcea49bf632d8880e04075335a132bf
2024-03-14 17:44:43 -07:00 · 2024-03-14 17:44:43 -07:00 · aee3d639b5
parent da22dbc333
commit aee3d639b5
5 changed files with 168 additions and 59 deletions
--- a/src/libutil/meson.build
+++ b/src/libutil/meson.build
@ -20,6 +20,7 @@ libutil_sources = files(
  'position.cc',
  'references.cc',
  'serialise.cc',
  'shlex.cc',
  'signals.cc',
  'source-path.cc',
  'suggestions.cc',
@ -69,6 +70,7 @@ libutil_headers = files(
  'regex-combinators.hh',
  'repair-flag.hh',
  'serialise.hh',
  'shlex.hh',
  'signals.hh',
  'source-path.hh',
  'split.hh',
--- a/src/libutil/shlex.cc
+++ b/src/libutil/shlex.cc
@ -0,0 +1,77 @@
 #include "shlex.hh"
 #include "util.hh"
 namespace nix {
 /**
 * Split `input` into words the way perl's `shellwords` roughly does.
 *
 * Words are separated by runs of unquoted whitespace. Single quotes keep
 * everything up to the closing quote verbatim; double quotes do the same
 * except that a backslash still escapes the next character. Outside of
 * quotes a backslash also escapes the next character, including whitespace.
 *
 * Returns an empty vector for empty or whitespace-only input.
 *
 * Throws ShlexError if a quote is left open or if the input ends with a
 * backslash that has nothing left to escape.
 */
 std::vector<std::string> shell_split(const std::string & input)
 {
    std::vector<std::string> result;

    // Anchored so that it only matches whitespace starting exactly at the
    // position the search is started from. Compiled once instead of on
    // every call.
    static const std::regex whitespace("^\\s+");

    const auto end = input.cend();
    auto begin = input.cbegin();

    // The loop below would parse ` a` as `{"", "a"}`, so skip leading
    // whitespace before starting. Trailing whitespace is deliberately left
    // alone: the loop already handles it, and stripping it blindly would
    // eat the space of a trailing escaped space (`a\ `), leaving a dangling
    // backslash behind.
    {
        std::smatch leading;
        if (std::regex_search(begin, end, leading, whitespace)) {
            begin = leading[0].second;
        }
    }

    if (begin == end) {
        return result;
    }

    std::string currentToken;
    enum State { sBegin, sSingleQuote, sDoubleQuote };
    State state = sBegin;
    auto iterator = begin;

    for (; iterator != end; ++iterator) {
        if (state == sBegin) {
            // Unquoted whitespace terminates the current word.
            std::smatch match;
            if (std::regex_search(iterator, end, match, whitespace)) {
                currentToken.append(begin, iterator);
                result.push_back(currentToken);
                iterator = match[0].second;
                if (iterator == end) {
                    return result;
                }
                begin = iterator;
                currentToken.clear();
            }
        }

        switch (*iterator) {
        case '\'':
            if (state != sDoubleQuote) {
                // Flush what we have and drop the quote character itself.
                currentToken.append(begin, iterator);
                begin = iterator + 1;
                state = state == sBegin ? sSingleQuote : sBegin;
            }
            break;
        case '"':
            if (state != sSingleQuote) {
                currentToken.append(begin, iterator);
                begin = iterator + 1;
                state = state == sBegin ? sDoubleQuote : sBegin;
            }
            break;
        case '\\':
            if (state != sSingleQuote) {
                // A backslash at the very end has no character to escape.
                // Stepping over it would move the iterator past the end of
                // the string, so report it as a parse error instead.
                if (iterator + 1 == end) {
                    throw ShlexError(input);
                }
                // perl shellwords mostly just treats the next char as part
                // of the string with no special processing
                currentToken.append(begin, iterator);
                begin = ++iterator;
            }
            break;
        }
    }

    if (state != sBegin) {
        throw ShlexError(input);
    }

    currentToken.append(begin, iterator);
    result.push_back(currentToken);
    return result;
 }
 }
--- a/src/libutil/shlex.hh
+++ b/src/libutil/shlex.hh
@ -0,0 +1,30 @@
 #pragma once
 #include <regex>
 #include <string>
 #include <vector>
 #include "error.hh"
 namespace nix {
 /**
 * Error raised when a string cannot be split into shell arguments.
 */
 class ShlexError : public Error
 {
 public:
    /** The string that failed to parse, exactly as it was passed in. */
    const std::string input;

    // Takes the input by const reference: it is only read here (once for
    // the message, once to initialise the member), so a by-value parameter
    // would just add a copy.
    ShlexError(const std::string & input)
        : Error("Failed to parse shell arguments (unterminated quote?): %1%", input)
        , input(input)
    {
    }
 };
 /**
 * Parse a string into shell arguments.
 *
 * Takes care of whitespace, quotes, and backslashes (at least a bit):
 * words are separated by unquoted whitespace, single-quoted text is taken
 * verbatim, and a backslash outside of single quotes makes the following
 * character part of the word without any special processing.
 *
 * Empty or whitespace-only input yields an empty vector.
 *
 * Throws ShlexError if the input ends while a single or double quote is
 * still open.
 */
 std::vector<std::string> shell_split(const std::string & input);
 } // namespace nix
--- a/src/nix-build/nix-build.cc
+++ b/src/nix-build/nix-build.cc
@ -23,70 +23,13 @@
 #include "common-eval-args.hh"
 #include "attr-path.hh"
 #include "legacy.hh"
 #include "shlex.hh"
 using namespace nix;
 using namespace std::string_literals;
 extern char * * environ __attribute__((weak));
 /* Recreate the effect of the perl shellwords function, breaking up a
 * string into arguments like a shell word, including escapes
 */
 static std::vector<std::string> shellwords(const std::string & s)
 {
    std::regex whitespace("^\\s+");
    auto begin = s.cbegin();
    std::vector<std::string> res;
    std::string cur;
    enum state {
        sBegin,
        sSingleQuote,
        sDoubleQuote
    };
    state st = sBegin;
    auto it = begin;
    for (; it != s.cend(); ++it) {
        if (st == sBegin) {
            std::smatch match;
            if (regex_search(it, s.cend(), match, whitespace)) {
                cur.append(begin, it);
                res.push_back(cur);
                it = match[0].second;
                if (it == s.cend()) return res;
                begin = it;
                cur.clear();
            }
        }
        switch (*it) {
            case '\'':
                if (st != sDoubleQuote) {
                    cur.append(begin, it);
                    begin = it + 1;
                    st = st == sBegin ? sSingleQuote : sBegin;
                }
                break;
            case '"':
                if (st != sSingleQuote) {
                    cur.append(begin, it);
                    begin = it + 1;
                    st = st == sBegin ? sDoubleQuote : sBegin;
                }
                break;
            case '\\':
                if (st != sSingleQuote) {
                    /* perl shellwords mostly just treats the next char as part of the string with no special processing */
                    cur.append(begin, it);
                    begin = ++it;
                }
                break;
        }
    }
    if (st != sBegin) throw Error("unterminated quote in shebang line");
    cur.append(begin, it);
    res.push_back(cur);
    return res;
 }
 static void main_nix_build(int argc, char * * argv)
 {
    auto dryRun = false;
@ -143,7 +86,7 @@ static void main_nix_build(int argc, char * * argv)
                    line = chomp(line);
                    std::smatch match;
                    if (std::regex_match(line, match, std::regex("^#!\\s*nix-shell\\s+(.*)$")))
-                        for (const auto & word : shellwords(match[1].str()))
+                        for (const auto & word : shell_split(match[1].str()))
                            args.push_back(word);
                }
            }
--- a/tests/unit/libutil/shlex.cc
+++ b/tests/unit/libutil/shlex.cc
@ -0,0 +1,57 @@
 #include "shlex.hh"
 #include <gtest/gtest.h>
 #include <gmock/gmock.h>
 #include <sstream>
 using testing::Eq;
 namespace nix {
 TEST(Shlex, shell_split) {
    using Words = std::vector<std::string>;

    // Empty and whitespace-only inputs produce no words at all.
    ASSERT_THAT(shell_split(""), Eq(Words{}));
    ASSERT_THAT(shell_split("  "), Eq(Words{}));

    // Unquoted words are separated by whitespace.
    ASSERT_THAT(shell_split("puppy doggy"), Eq(Words{"puppy", "doggy"}));

    // Double quotes group words; single quotes inside them stay literal.
    ASSERT_THAT(
        shell_split("goldie \"puppy 'doggy'\" sweety"),
        Eq(Words{"goldie", "puppy 'doggy'", "sweety"})
    );

    // A backslash inside double quotes escapes the following character.
    ASSERT_THAT(shell_split("\"pupp\\\"y\""), Eq(Words{"pupp\"y"}));

    // Single quotes group words as well.
    ASSERT_THAT(
        shell_split("goldie 'puppy' doggy"),
        Eq(Words{"goldie", "puppy", "doggy"})
    );

    // Inside single quotes, backslashes and double quotes are kept verbatim.
    ASSERT_THAT(shell_split("'pupp\\\"y'"), Eq(Words{"pupp\\\"y"}));

    // A quote of either kind that is never closed is an error.
    ASSERT_THROW(shell_split("\"puppy"), ShlexError);
    ASSERT_THROW(shell_split("'puppy"), ShlexError);
 }
 } // namespace nix