Merge pull request #5812 from pennae/small-perf-improvements

improve parser performance a bit
2022-01-17 19:49:52 +01:00 · 2022-01-17 19:49:52 +01:00 · fc2443a67c
parent 008ddef4b0 72f42093e7
commit fc2443a67c
9 changed files with 82 additions and 53 deletions
--- a/src/libexpr/eval.hh
+++ b/src/libexpr/eval.hh
@ -179,8 +179,8 @@ public:
    Expr * parseExprFromFile(const Path & path, StaticEnv & staticEnv);
    /* Parse a Nix expression from the specified string. */
-    Expr * parseExprFromString(std::string_view s, const Path & basePath, StaticEnv & staticEnv);
+    Expr * parseExprFromString(std::string s, const Path & basePath, StaticEnv & staticEnv);
-    Expr * parseExprFromString(std::string_view s, const Path & basePath);
+    Expr * parseExprFromString(std::string s, const Path & basePath);
    Expr * parseStdin();
@ -308,7 +308,7 @@ private:
    friend struct ExprAttrs;
    friend struct ExprLet;
-    Expr * parse(const char * text, FileOrigin origin, const Path & path,
+    Expr * parse(char * text, size_t length, FileOrigin origin, const Path & path,
        const Path & basePath, StaticEnv & staticEnv);
 public:
--- a/src/libexpr/lexer.l
+++ b/src/libexpr/lexer.l
@ -64,29 +64,32 @@ static void adjustLoc(YYLTYPE * loc, const char * s, size_t len)
 }
-// FIXME: optimize
+// we make use of the fact that the parser receives a private copy of the input
-static Expr * unescapeStr(SymbolTable & symbols, const char * s, size_t length)
+// string and can munge around in it.
 static Expr * unescapeStr(SymbolTable & symbols, char * s, size_t length)
 {
-    string t;
+    char * result = s;
-    t.reserve(length);
+    char * t = s;
    char c;
    // the input string is terminated with *two* NULs, so we can safely take
    // *one* character after the one being checked against.
    while ((c = *s++)) {
        if (c == '\\') {
            assert(*s);
            c = *s++;
-            if (c == 'n') t += '\n';
+            if (c == 'n') *t = '\n';
-            else if (c == 'r') t += '\r';
+            else if (c == 'r') *t = '\r';
-            else if (c == 't') t += '\t';
+            else if (c == 't') *t = '\t';
-            else t += c;
+            else *t = c;
        }
        else if (c == '\r') {
            /* Normalise CR and CR/LF into LF. */
-            t += '\n';
+            *t = '\n';
            if (*s == '\n') s++; /* cr/lf */
        }
-        else t += c;
+        else *t = c;
        t++;
    }
-    return new ExprString(symbols.create(t));
+    return new ExprString(symbols.create({result, size_t(t - result)}));
 }
@ -139,7 +142,7 @@ or          { return OR_KW; }
 \/\/        { return UPDATE; }
 \+\+        { return CONCAT; }
-{ID}        { yylval->id = strdup(yytext); return ID; }
+{ID}        { yylval->id = {yytext, (size_t) yyleng}; return ID; }
 {INT}       { errno = 0;
              try {
                  yylval->n = boost::lexical_cast<int64_t>(yytext);
@ -221,14 +224,14 @@ or          { return OR_KW; }
 <PATH_START>{PATH_SEG} {
  POP_STATE();
  PUSH_STATE(INPATH_SLASH);
-  yylval->path = strdup(yytext);
+  yylval->path = {yytext, (size_t) yyleng};
  return PATH;
 }
 <PATH_START>{HPATH_START} {
  POP_STATE();
  PUSH_STATE(INPATH_SLASH);
-  yylval->path = strdup(yytext);
+  yylval->path = {yytext, (size_t) yyleng};
  return HPATH;
 }
@ -237,7 +240,7 @@ or          { return OR_KW; }
    PUSH_STATE(INPATH_SLASH);
  else
    PUSH_STATE(INPATH);
-  yylval->path = strdup(yytext);
+  yylval->path = {yytext, (size_t) yyleng};
  return PATH;
 }
 {HPATH} {
@ -245,7 +248,7 @@ or          { return OR_KW; }
    PUSH_STATE(INPATH_SLASH);
  else
    PUSH_STATE(INPATH);
-  yylval->path = strdup(yytext);
+  yylval->path = {yytext, (size_t) yyleng};
  return HPATH;
 }
@ -280,8 +283,8 @@ or          { return OR_KW; }
  throw ParseError("path has a trailing slash");
 }
-{SPATH}     { yylval->path = strdup(yytext); return SPATH; }
+{SPATH}     { yylval->path = {yytext, (size_t) yyleng}; return SPATH; }
-{URI}       { yylval->uri = strdup(yytext); return URI; }
+{URI}       { yylval->uri = {yytext, (size_t) yyleng}; return URI; }
 [ \t\r\n]+    /* eat up whitespace */
 \#[^\r\n]*    /* single-line comments */
--- a/src/libexpr/nixexpr.cc
+++ b/src/libexpr/nixexpr.cc
@ -473,7 +473,7 @@ string ExprLambda::showNamePos() const
 size_t SymbolTable::totalSize() const
 {
    size_t n = 0;
-    for (auto & i : symbols)
+    for (auto & i : store)
        n += i.size();
    return n;
 }
--- a/src/libexpr/parser.y
+++ b/src/libexpr/parser.y
@ -273,9 +273,16 @@ void yyerror(YYLTYPE * loc, yyscan_t scanner, ParseData * data, const char * err
  nix::Formal * formal;
  nix::NixInt n;
  nix::NixFloat nf;
-  const char * id; // !!! -> Symbol
+  // using C a struct allows us to avoid having to define the special
-  char * path;
+  // members that using string_view here would implicitly delete.
-  char * uri;
+  struct StringToken {
    const char * p;
    size_t l;
    operator std::string_view() const { return {p, l}; }
  };
  StringToken id; // !!! -> Symbol
  StringToken path;
  StringToken uri;
  std::vector<nix::AttrName> * attrNames;
  std::vector<std::pair<nix::Pos, nix::Expr *> > * string_parts;
 }
@ -397,7 +404,7 @@ expr_select
 expr_simple
  : ID {
-      if (strcmp($1, "__curPos") == 0)
+      if (strncmp($1.p, "__curPos", $1.l) == 0)
          $$ = new ExprPos(CUR_POS);
      else
          $$ = new ExprVar(CUR_POS, data->symbols.create($1));
@ -414,7 +421,7 @@ expr_simple
      $$ = new ExprConcatStrings(CUR_POS, false, $2);
  }
  | SPATH {
-      string path($1 + 1, strlen($1) - 2);
+      string path($1.p + 1, $1.l - 2);
      $$ = new ExprCall(CUR_POS,
          new ExprVar(data->symbols.create("__findFile")),
          {new ExprVar(data->symbols.create("__nixPath")),
@ -460,14 +467,14 @@ string_parts_interpolated
 path_start
  : PATH {
-    Path path(absPath($1, data->basePath));
+    Path path(absPath({$1.p, $1.l}, data->basePath));
    /* add back in the trailing '/' to the first segment */
-    if ($1[strlen($1)-1] == '/' && strlen($1) > 1)
+    if ($1.p[$1.l-1] == '/' && $1.l > 1)
      path += "/";
    $$ = new ExprPath(path);
  }
  | HPATH {
-    Path path(getHome() + string($1 + 1));
+    Path path(getHome() + string($1.p + 1, $1.l - 1));
    $$ = new ExprPath(path);
  }
  ;
@ -543,7 +550,7 @@ attrpath
 attr
  : ID { $$ = $1; }
-  | OR_KW { $$ = "or"; }
+  | OR_KW { $$ = {"or", 2}; }
  ;
 string_attr
@ -589,7 +596,7 @@ formal
 namespace nix {
-Expr * EvalState::parse(const char * text, FileOrigin origin,
+Expr * EvalState::parse(char * text, size_t length, FileOrigin origin,
    const Path & path, const Path & basePath, StaticEnv & staticEnv)
 {
    yyscan_t scanner;
@ -609,7 +616,7 @@ Expr * EvalState::parse(const char * text, FileOrigin origin,
    data.basePath = basePath;
    yylex_init(&scanner);
-    yy_scan_string(text, scanner);
+    yy_scan_buffer(text, length, scanner);
    int res = yyparse(scanner, &data);
    yylex_destroy(scanner);
@ -655,26 +662,33 @@ Expr * EvalState::parseExprFromFile(const Path & path)
 Expr * EvalState::parseExprFromFile(const Path & path, StaticEnv & staticEnv)
 {
-    return parse(readFile(path).c_str(), foFile, path, dirOf(path), staticEnv);
+    auto buffer = readFile(path);
    // readFile should have left some extra space for terminators
    buffer.append("\0\0", 2);
    return parse(buffer.data(), buffer.size(), foFile, path, dirOf(path), staticEnv);
 }
-Expr * EvalState::parseExprFromString(std::string_view s, const Path & basePath, StaticEnv & staticEnv)
+Expr * EvalState::parseExprFromString(std::string s, const Path & basePath, StaticEnv & staticEnv)
 {
-    return parse(s.data(), foString, "", basePath, staticEnv);
+    s.append("\0\0", 2);
    return parse(s.data(), s.size(), foString, "", basePath, staticEnv);
 }
-Expr * EvalState::parseExprFromString(std::string_view s, const Path & basePath)
+Expr * EvalState::parseExprFromString(std::string s, const Path & basePath)
 {
-    return parseExprFromString(s, basePath, staticBaseEnv);
+    return parseExprFromString(std::move(s), basePath, staticBaseEnv);
 }
 Expr * EvalState::parseStdin()
 {
    //Activity act(*logger, lvlTalkative, format("parsing standard input"));
-    return parse(drainFD(0).data(), foStdin, "", absPath("."), staticBaseEnv);
+    auto buffer = drainFD(0);
    // drainFD should have left some extra space for terminators
    buffer.append("\0\0", 2);
    return parse(buffer.data(), buffer.size(), foStdin, "", absPath("."), staticBaseEnv);
 }
--- a/src/libexpr/primops.cc
+++ b/src/libexpr/primops.cc
@ -378,7 +378,7 @@ void prim_exec(EvalState & state, const Pos & pos, Value * * args, Value & v)
    auto output = runProgram(program, true, commandArgs);
    Expr * parsed;
    try {
-        parsed = state.parseExprFromString(output, pos.file);
+        parsed = state.parseExprFromString(std::move(output), pos.file);
    } catch (Error & e) {
        e.addTrace(pos, "While parsing the output from '%1%'", program);
        throw;
@ -3915,9 +3915,12 @@ void EvalState::createBaseEnv()
    /* Note: we have to initialize the 'derivation' constant *after*
       building baseEnv/staticBaseEnv because it uses 'builtins'. */
-    eval(parse(
+    char code[] =
        #include "primops/derivation.nix.gen.hh"
-        , foFile, sDerivationNix, "/", staticBaseEnv), *vDerivation);
+        // the parser needs two NUL bytes as terminators; one of them
        // is implied by being a C string.
        "\0";
    eval(parse(code, sizeof(code), foFile, sDerivationNix, "/", staticBaseEnv), *vDerivation);
 }
--- a/src/libexpr/symbol-table.hh
+++ b/src/libexpr/symbol-table.hh
@ -1,7 +1,8 @@
 #pragma once
 #include <list>
 #include <map>
-#include <unordered_set>
+#include <unordered_map>
 #include "types.hh"
@ -70,15 +71,21 @@ public:
 class SymbolTable
 {
 private:
-    typedef std::unordered_set<string> Symbols;
+    std::unordered_map<std::string_view, Symbol> symbols;
-    Symbols symbols;
+    std::list<string> store;
 public:
    Symbol create(std::string_view s)
    {
-        // FIXME: avoid allocation if 's' already exists in the symbol table.
+        // Most symbols are looked up more than once, so we trade off insertion performance
-        std::pair<Symbols::iterator, bool> res = symbols.emplace(std::string(s));
+        // for lookup performance.
-        return Symbol(&*res.first);
+        // TODO: could probably be done more efficiently with transparent Hash and Equals
        // on the original implementation using unordered_set
        auto it = symbols.find(s);
        if (it != symbols.end()) return it->second;
        const string & rawSym = store.emplace_back(s);
        return symbols.emplace(rawSym, Symbol(&rawSym)).first->second;
    }
    size_t size() const
@ -91,7 +98,7 @@ public:
    template<typename T>
    void dump(T callback)
    {
-        for (auto & s : symbols)
+        for (auto & s : store)
            callback(s);
    }
 };
--- a/src/libutil/util.cc
+++ b/src/libutil/util.cc
@ -668,7 +668,9 @@ void writeFull(int fd, std::string_view s, bool allowInterrupts)
 string drainFD(int fd, bool block, const size_t reserveSize)
 {
-    StringSink sink(reserveSize);
+    // the parser needs two extra bytes to append terminating characters, other users will
    // not care very much about the extra memory.
    StringSink sink(reserveSize + 2);
    drainFD(fd, sink, block);
    return std::move(*sink.s);
 }
--- a/src/nix-build/nix-build.cc
+++ b/src/nix-build/nix-build.cc
@ -292,7 +292,7 @@ static void main_nix_build(int argc, char * * argv)
    else
        for (auto i : left) {
            if (fromArgs)
-                exprs.push_back(state->parseExprFromString(i, absPath(".")));
+                exprs.push_back(state->parseExprFromString(std::move(i), absPath(".")));
            else {
                auto absolute = i;
                try {
--- a/src/nix/repl.cc
+++ b/src/nix/repl.cc
@ -703,7 +703,7 @@ void NixRepl::addVarToScope(const Symbol & name, Value & v)
 Expr * NixRepl::parseString(string s)
 {
-    Expr * e = state->parseExprFromString(s, curDir, staticEnv);
+    Expr * e = state->parseExprFromString(std::move(s), curDir, staticEnv);
    return e;
 }