Move shellwords into its own file (as shell_split)

Change-Id: I34c0ebfb6dcea49bf632d8880e04075335a132bf
2024-03-14 17:44:43 -07:00 · 2024-03-14 17:44:43 -07:00 · 17d3572fe8
parent b4d07656ff
commit 17d3572fe8
6 changed files with 190 additions and 71 deletions
--- a/src/libutil/shlex.cc
+++ b/src/libutil/shlex.cc
@ -0,0 +1,77 @@
+#include "shlex.hh"
+#include "util.hh"
+
+namespace nix {
+
+/**
+ * Split `input` into words, recreating the effect of Perl's `shellwords`.
+ *
+ * Words are separated by runs of unquoted whitespace. Single and double
+ * quotes group text (including whitespace) into one word and are removed
+ * from the result. Outside single quotes a backslash is dropped and the
+ * character after it is taken literally; inside single quotes a backslash
+ * is an ordinary character.
+ *
+ * @param input The string to split.
+ * @return The words in order; empty when `input` trims to nothing.
+ * @throws ShlexError if a quote is left unterminated or the input ends in
+ *         a backslash that has nothing left to escape.
+ */
+std::vector<std::string> shell_split(const std::string & input)
+{
+    std::vector<std::string> result;
+
+    // Hack: the loop below would parse ` a` as `{"", "a"}`, so we trim
+    // whitespace before starting.
+    auto inputTrimmed = trim(input);
+
+    if (inputTrimmed.empty()) {
+        return result;
+    }
+
+    // Compiling a std::regex is expensive, so build it once instead of on
+    // every call. Anchored so that it only matches whitespace starting
+    // exactly at the position being examined.
+    static const std::regex whitespace("^\\s+");
+
+    const auto end = inputTrimmed.cend();
+    // `begin` marks the start of the not-yet-copied part of the current token.
+    auto begin = inputTrimmed.cbegin();
+    std::string currentToken;
+    enum State { sBegin, sSingleQuote, sDoubleQuote };
+    State state = sBegin;
+    auto iterator = begin;
+
+    for (; iterator != end; ++iterator) {
+        if (state == sBegin) {
+            // Unquoted whitespace terminates the current token.
+            std::smatch match;
+            if (regex_search(iterator, end, match, whitespace)) {
+                currentToken.append(begin, iterator);
+                result.push_back(currentToken);
+                // Skip the whole whitespace run.
+                iterator = match[0].second;
+                if (iterator == end) {
+                    return result;
+                }
+                begin = iterator;
+                currentToken.clear();
+            }
+        }
+
+        switch (*iterator) {
+        case '\'':
+            if (state != sDoubleQuote) {
+                // Flush the pending text and drop the quote character itself.
+                currentToken.append(begin, iterator);
+                begin = iterator + 1;
+                state = state == sBegin ? sSingleQuote : sBegin;
+            }
+            break;
+
+        case '"':
+            if (state != sSingleQuote) {
+                currentToken.append(begin, iterator);
+                begin = iterator + 1;
+                state = state == sBegin ? sDoubleQuote : sBegin;
+            }
+            break;
+
+        case '\\':
+            if (state != sSingleQuote) {
+                // perl shellwords mostly just treats the next char as part
+                // of the string with no special processing
+                currentToken.append(begin, iterator);
+                begin = ++iterator;
+                if (iterator == end) {
+                    // Dangling backslash: there is no character to escape.
+                    // Bail out here, otherwise the loop increment would
+                    // step past the end of the string.
+                    throw ShlexError(input);
+                }
+            }
+            break;
+        }
+    }
+
+    // Running out of input while still inside a quote is an error.
+    if (state != sBegin) {
+        throw ShlexError(input);
+    }
+
+    currentToken.append(begin, iterator);
+    result.push_back(currentToken);
+    return result;
+}
+
+}
--- a/src/libutil/shlex.hh
+++ b/src/libutil/shlex.hh
@ -0,0 +1,30 @@
+#pragma once
+
+#include <regex>
+#include <string>
+#include <vector>
+
+#include "error.hh"
+
+namespace nix {
+
+/**
+ * Error raised when a string cannot be split into shell arguments.
+ *
+ * Keeps a copy of the offending input so callers can inspect it in
+ * addition to the formatted message.
+ */
+class ShlexError : public Error
+{
+public:
+    /** The string that failed to parse. */
+    const std::string input;
+
+    /**
+     * @param input The string that failed to parse; it is embedded in the
+     *        error message and copied into the `input` member.
+     */
+    explicit ShlexError(const std::string & input)
+        : Error("Failed to parse shell arguments (unterminated quote?): %1%", input)
+        , input(input)
+    {
+    }
+};
+
+/**
+ * Parse a string into shell arguments.
+ *
+ * Takes care of whitespace, quotes, and backslashes (at least a bit).
+ */
+std::vector<std::string> shell_split(const std::string & input);
+
+} // namespace nix
--- a/src/nix-build/nix-build.cc
+++ b/src/nix-build/nix-build.cc
@ -23,70 +23,13 @@
 #include "common-eval-args.hh"
 #include "attr-path.hh"
 #include "legacy.hh"
+#include "shlex.hh"

 using namespace nix;
 using namespace std::string_literals;

 extern char * * environ __attribute__((weak));

-/* Recreate the effect of the perl shellwords function, breaking up a
- * string into arguments like a shell word, including escapes
- */
-static std::vector<std::string> shellwords(const std::string & s)
-{
-    std::regex whitespace("^\\s+");
-    auto begin = s.cbegin();
-    std::vector<std::string> res;
-    std::string cur;
-    enum state {
-        sBegin,
-        sSingleQuote,
-        sDoubleQuote
-    };
-    state st = sBegin;
-    auto it = begin;
-    for (; it != s.cend(); ++it) {
-        if (st == sBegin) {
-            std::smatch match;
-            if (regex_search(it, s.cend(), match, whitespace)) {
-                cur.append(begin, it);
-                res.push_back(cur);
-                it = match[0].second;
-                if (it == s.cend()) return res;
-                begin = it;
-                cur.clear();
-            }
-        }
-        switch (*it) {
-            case '\'':
-                if (st != sDoubleQuote) {
-                    cur.append(begin, it);
-                    begin = it + 1;
-                    st = st == sBegin ? sSingleQuote : sBegin;
-                }
-                break;
-            case '"':
-                if (st != sSingleQuote) {
-                    cur.append(begin, it);
-                    begin = it + 1;
-                    st = st == sBegin ? sDoubleQuote : sBegin;
-                }
-                break;
-            case '\\':
-                if (st != sSingleQuote) {
-                    /* perl shellwords mostly just treats the next char as part of the string with no special processing */
-                    cur.append(begin, it);
-                    begin = ++it;
-                }
-                break;
-        }
-    }
-    if (st != sBegin) throw Error("unterminated quote in shebang line");
-    cur.append(begin, it);
-    res.push_back(cur);
-    return res;
-}
-
 static void main_nix_build(int argc, char * * argv)
 {
    auto dryRun = false;
@ -143,7 +86,7 @@ static void main_nix_build(int argc, char * * argv)
                    line = chomp(line);
                    std::smatch match;
                    if (std::regex_match(line, match, std::regex("^#!\\s*nix-shell\\s+(.*)$")))
-                        for (const auto & word : shellwords(match[1].str()))
+                        for (const auto & word : shell_split(match[1].str()))
                            args.push_back(word);
                }
            }
--- a/tests/unit/libutil-support/tests/cli-literate-parser.cc
+++ b/tests/unit/libutil-support/tests/cli-literate-parser.cc
@ -23,7 +23,8 @@ constexpr auto CLILiterateParser::stateDebug(State const & s) -> const char *
            [](Command const&) -> const char * { return "command"; },
            [](OutputLine const&) -> const char * { return "output_line"; }},
        // clang-format on
-        s);
+        s
+    );
 }

 auto CLILiterateParser::Node::print() const -> std::string
@ -51,7 +52,8 @@ void PrintTo(std::vector<CLILiterateParser::Node> const & nodes, std::ostream *
    }
 }

-auto CLILiterateParser::parse(std::string prompt, std::string_view const & input, size_t indent) -> std::vector<Node>
+auto CLILiterateParser::parse(std::string prompt, std::string_view const & input, size_t indent)
+    -> std::vector<Node>
 {
    CLILiterateParser p{std::move(prompt), indent};
    p.feed(input);
@ -105,13 +107,17 @@ void CLILiterateParser::feed(char c)
                } else {
                    // didn't match the prompt, so it must have actually been output.
                    s.lineAccumulator.push_back(c);
-                    transition(OutputLine{AccumulatingState{.lineAccumulator = std::move(s.lineAccumulator)}});
+                    transition(OutputLine{
+                        AccumulatingState{.lineAccumulator = std::move(s.lineAccumulator)}
+                    });
                    return;
                }
                s.lineAccumulator.push_back(c);
            },
-            [&](AccumulatingState & s) { s.lineAccumulator.push_back(c); }},
-        state_);
+            [&](AccumulatingState & s) { s.lineAccumulator.push_back(c); }
+        },
+        state_
+    );
 }

 void CLILiterateParser::onNewline()
@ -140,8 +146,10 @@ void CLILiterateParser::onNewline()
            [&](Prompt & s) {
                // INDENT followed by newline is also considered a blank output line
                return Node::mkOutput(std::move(s.lineAccumulator));
-            }},
-        lastState));
+            }
+        },
+        lastState
+    ));

    transition(Indent{});
    lastWasOutput_ = newLastWasOutput;
@ -171,8 +179,9 @@ auto CLILiterateParser::syntax() const -> std::vector<Node> const &
    return syntax_;
 }

-auto CLILiterateParser::unparse(const std::string & prompt, const std::vector<Node> & syntax, size_t indent)
-    -> std::string
+auto CLILiterateParser::unparse(
+    const std::string & prompt, const std::vector<Node> & syntax, size_t indent
+) -> std::string
 {
    std::string indent_str(indent, ' ');
    std::ostringstream out{};
--- a/tests/unit/libutil-support/tests/cli-literate-parser.hh
+++ b/tests/unit/libutil-support/tests/cli-literate-parser.hh
@ -79,10 +79,13 @@ public:
    void feed(std::string_view s);

    /** Parses an input in a non-streaming fashion */
-    static auto parse(std::string prompt, std::string_view const & input, size_t indent = 2) -> std::vector<Node>;
+    static auto parse(std::string prompt, std::string_view const & input, size_t indent = 2)
+        -> std::vector<Node>;

    /** Returns, losslessly, the string that would have generated a syntax tree */
-    static auto unparse(std::string const & prompt, std::vector<Node> const & syntax, size_t indent = 2) -> std::string;
+    static auto
+    unparse(std::string const & prompt, std::vector<Node> const & syntax, size_t indent = 2)
+        -> std::string;

    /** Consumes a CLILiterateParser and gives you the syntax out of it */
    auto intoSyntax() && -> std::vector<Node>;
@ -115,7 +118,7 @@ private:
    using State = std::variant<Indent, Commentary, Prompt, Command, OutputLine>;
    State state_;

-    constexpr static auto stateDebug(State const&) -> const char *;
+    constexpr static auto stateDebug(State const &) -> const char *;

    const std::string prompt_;
    const size_t indent_;
--- a/tests/unit/libutil/shlex.cc
+++ b/tests/unit/libutil/shlex.cc
@ -0,0 +1,57 @@
+#include "shlex.hh"
+
+#include <gtest/gtest.h>
+#include <gmock/gmock.h>
+#include <sstream>
+
+using testing::Eq;
+
+namespace nix {
+
+// Checks shell_split on empty input, plain words, both quote styles,
+// backslash escapes, and unterminated quotes.
+TEST(Shlex, shell_split) {
+    using Words = std::vector<std::string>;
+
+    // Empty and whitespace-only inputs produce no words at all.
+    ASSERT_THAT(shell_split(""), Eq(Words{}));
+    ASSERT_THAT(shell_split("  "), Eq(Words{}));
+
+    // Unquoted words are separated by whitespace.
+    ASSERT_THAT(shell_split("puppy doggy"), Eq(Words{"puppy", "doggy"}));
+
+    // Double quotes keep whitespace and single quotes as literal text.
+    ASSERT_THAT(
+        shell_split("goldie \"puppy 'doggy'\" sweety"),
+        Eq(Words{"goldie", "puppy 'doggy'", "sweety"})
+    );
+
+    // Inside double quotes a backslash escapes the following character.
+    ASSERT_THAT(shell_split("\"pupp\\\"y\""), Eq(Words{"pupp\"y"}));
+
+    // Single quotes are stripped from the resulting word.
+    ASSERT_THAT(
+        shell_split("goldie 'puppy' doggy"),
+        Eq(Words{"goldie", "puppy", "doggy"})
+    );
+
+    // Inside single quotes a backslash is kept verbatim.
+    ASSERT_THAT(shell_split("'pupp\\\"y'"), Eq(Words{"pupp\\\"y"}));
+
+    // Unterminated quotes of either kind are rejected.
+    ASSERT_THROW(shell_split("\"puppy"), ShlexError);
+    ASSERT_THROW(shell_split("'puppy"), ShlexError);
+}
+
+} // namespace nix