Merge pull request #5812 from pennae/small-perf-improvements

improve parser performance a bit
This commit is contained in:
Eelco Dolstra 2022-01-17 19:49:52 +01:00 committed by GitHub
commit fc2443a67c
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 82 additions and 53 deletions

View file

@ -179,8 +179,8 @@ public:
Expr * parseExprFromFile(const Path & path, StaticEnv & staticEnv); Expr * parseExprFromFile(const Path & path, StaticEnv & staticEnv);
/* Parse a Nix expression from the specified string. */ /* Parse a Nix expression from the specified string. */
Expr * parseExprFromString(std::string_view s, const Path & basePath, StaticEnv & staticEnv); Expr * parseExprFromString(std::string s, const Path & basePath, StaticEnv & staticEnv);
Expr * parseExprFromString(std::string_view s, const Path & basePath); Expr * parseExprFromString(std::string s, const Path & basePath);
Expr * parseStdin(); Expr * parseStdin();
@ -308,7 +308,7 @@ private:
friend struct ExprAttrs; friend struct ExprAttrs;
friend struct ExprLet; friend struct ExprLet;
Expr * parse(const char * text, FileOrigin origin, const Path & path, Expr * parse(char * text, size_t length, FileOrigin origin, const Path & path,
const Path & basePath, StaticEnv & staticEnv); const Path & basePath, StaticEnv & staticEnv);
public: public:

View file

@ -64,29 +64,32 @@ static void adjustLoc(YYLTYPE * loc, const char * s, size_t len)
} }
// FIXME: optimize // we make use of the fact that the parser receives a private copy of the input
static Expr * unescapeStr(SymbolTable & symbols, const char * s, size_t length) // string and can munge around in it.
static Expr * unescapeStr(SymbolTable & symbols, char * s, size_t length)
{ {
string t; char * result = s;
t.reserve(length); char * t = s;
char c; char c;
// the input string is terminated with *two* NULs, so we can safely take
// *one* character after the one being checked against.
while ((c = *s++)) { while ((c = *s++)) {
if (c == '\\') { if (c == '\\') {
assert(*s);
c = *s++; c = *s++;
if (c == 'n') t += '\n'; if (c == 'n') *t = '\n';
else if (c == 'r') t += '\r'; else if (c == 'r') *t = '\r';
else if (c == 't') t += '\t'; else if (c == 't') *t = '\t';
else t += c; else *t = c;
} }
else if (c == '\r') { else if (c == '\r') {
/* Normalise CR and CR/LF into LF. */ /* Normalise CR and CR/LF into LF. */
t += '\n'; *t = '\n';
if (*s == '\n') s++; /* cr/lf */ if (*s == '\n') s++; /* cr/lf */
} }
else t += c; else *t = c;
t++;
} }
return new ExprString(symbols.create(t)); return new ExprString(symbols.create({result, size_t(t - result)}));
} }
@ -139,7 +142,7 @@ or { return OR_KW; }
\/\/ { return UPDATE; } \/\/ { return UPDATE; }
\+\+ { return CONCAT; } \+\+ { return CONCAT; }
{ID} { yylval->id = strdup(yytext); return ID; } {ID} { yylval->id = {yytext, (size_t) yyleng}; return ID; }
{INT} { errno = 0; {INT} { errno = 0;
try { try {
yylval->n = boost::lexical_cast<int64_t>(yytext); yylval->n = boost::lexical_cast<int64_t>(yytext);
@ -221,14 +224,14 @@ or { return OR_KW; }
<PATH_START>{PATH_SEG} { <PATH_START>{PATH_SEG} {
POP_STATE(); POP_STATE();
PUSH_STATE(INPATH_SLASH); PUSH_STATE(INPATH_SLASH);
yylval->path = strdup(yytext); yylval->path = {yytext, (size_t) yyleng};
return PATH; return PATH;
} }
<PATH_START>{HPATH_START} { <PATH_START>{HPATH_START} {
POP_STATE(); POP_STATE();
PUSH_STATE(INPATH_SLASH); PUSH_STATE(INPATH_SLASH);
yylval->path = strdup(yytext); yylval->path = {yytext, (size_t) yyleng};
return HPATH; return HPATH;
} }
@ -237,7 +240,7 @@ or { return OR_KW; }
PUSH_STATE(INPATH_SLASH); PUSH_STATE(INPATH_SLASH);
else else
PUSH_STATE(INPATH); PUSH_STATE(INPATH);
yylval->path = strdup(yytext); yylval->path = {yytext, (size_t) yyleng};
return PATH; return PATH;
} }
{HPATH} { {HPATH} {
@ -245,7 +248,7 @@ or { return OR_KW; }
PUSH_STATE(INPATH_SLASH); PUSH_STATE(INPATH_SLASH);
else else
PUSH_STATE(INPATH); PUSH_STATE(INPATH);
yylval->path = strdup(yytext); yylval->path = {yytext, (size_t) yyleng};
return HPATH; return HPATH;
} }
@ -280,8 +283,8 @@ or { return OR_KW; }
throw ParseError("path has a trailing slash"); throw ParseError("path has a trailing slash");
} }
{SPATH} { yylval->path = strdup(yytext); return SPATH; } {SPATH} { yylval->path = {yytext, (size_t) yyleng}; return SPATH; }
{URI} { yylval->uri = strdup(yytext); return URI; } {URI} { yylval->uri = {yytext, (size_t) yyleng}; return URI; }
[ \t\r\n]+ /* eat up whitespace */ [ \t\r\n]+ /* eat up whitespace */
\#[^\r\n]* /* single-line comments */ \#[^\r\n]* /* single-line comments */

View file

@ -473,7 +473,7 @@ string ExprLambda::showNamePos() const
size_t SymbolTable::totalSize() const size_t SymbolTable::totalSize() const
{ {
size_t n = 0; size_t n = 0;
for (auto & i : symbols) for (auto & i : store)
n += i.size(); n += i.size();
return n; return n;
} }

View file

@ -273,9 +273,16 @@ void yyerror(YYLTYPE * loc, yyscan_t scanner, ParseData * data, const char * err
nix::Formal * formal; nix::Formal * formal;
nix::NixInt n; nix::NixInt n;
nix::NixFloat nf; nix::NixFloat nf;
const char * id; // !!! -> Symbol // using C a struct allows us to avoid having to define the special
char * path; // members that using string_view here would implicitly delete.
char * uri; struct StringToken {
const char * p;
size_t l;
operator std::string_view() const { return {p, l}; }
};
StringToken id; // !!! -> Symbol
StringToken path;
StringToken uri;
std::vector<nix::AttrName> * attrNames; std::vector<nix::AttrName> * attrNames;
std::vector<std::pair<nix::Pos, nix::Expr *> > * string_parts; std::vector<std::pair<nix::Pos, nix::Expr *> > * string_parts;
} }
@ -397,7 +404,7 @@ expr_select
expr_simple expr_simple
: ID { : ID {
if (strcmp($1, "__curPos") == 0) if (strncmp($1.p, "__curPos", $1.l) == 0)
$$ = new ExprPos(CUR_POS); $$ = new ExprPos(CUR_POS);
else else
$$ = new ExprVar(CUR_POS, data->symbols.create($1)); $$ = new ExprVar(CUR_POS, data->symbols.create($1));
@ -414,7 +421,7 @@ expr_simple
$$ = new ExprConcatStrings(CUR_POS, false, $2); $$ = new ExprConcatStrings(CUR_POS, false, $2);
} }
| SPATH { | SPATH {
string path($1 + 1, strlen($1) - 2); string path($1.p + 1, $1.l - 2);
$$ = new ExprCall(CUR_POS, $$ = new ExprCall(CUR_POS,
new ExprVar(data->symbols.create("__findFile")), new ExprVar(data->symbols.create("__findFile")),
{new ExprVar(data->symbols.create("__nixPath")), {new ExprVar(data->symbols.create("__nixPath")),
@ -460,14 +467,14 @@ string_parts_interpolated
path_start path_start
: PATH { : PATH {
Path path(absPath($1, data->basePath)); Path path(absPath({$1.p, $1.l}, data->basePath));
/* add back in the trailing '/' to the first segment */ /* add back in the trailing '/' to the first segment */
if ($1[strlen($1)-1] == '/' && strlen($1) > 1) if ($1.p[$1.l-1] == '/' && $1.l > 1)
path += "/"; path += "/";
$$ = new ExprPath(path); $$ = new ExprPath(path);
} }
| HPATH { | HPATH {
Path path(getHome() + string($1 + 1)); Path path(getHome() + string($1.p + 1, $1.l - 1));
$$ = new ExprPath(path); $$ = new ExprPath(path);
} }
; ;
@ -543,7 +550,7 @@ attrpath
attr attr
: ID { $$ = $1; } : ID { $$ = $1; }
| OR_KW { $$ = "or"; } | OR_KW { $$ = {"or", 2}; }
; ;
string_attr string_attr
@ -589,7 +596,7 @@ formal
namespace nix { namespace nix {
Expr * EvalState::parse(const char * text, FileOrigin origin, Expr * EvalState::parse(char * text, size_t length, FileOrigin origin,
const Path & path, const Path & basePath, StaticEnv & staticEnv) const Path & path, const Path & basePath, StaticEnv & staticEnv)
{ {
yyscan_t scanner; yyscan_t scanner;
@ -609,7 +616,7 @@ Expr * EvalState::parse(const char * text, FileOrigin origin,
data.basePath = basePath; data.basePath = basePath;
yylex_init(&scanner); yylex_init(&scanner);
yy_scan_string(text, scanner); yy_scan_buffer(text, length, scanner);
int res = yyparse(scanner, &data); int res = yyparse(scanner, &data);
yylex_destroy(scanner); yylex_destroy(scanner);
@ -655,26 +662,33 @@ Expr * EvalState::parseExprFromFile(const Path & path)
Expr * EvalState::parseExprFromFile(const Path & path, StaticEnv & staticEnv) Expr * EvalState::parseExprFromFile(const Path & path, StaticEnv & staticEnv)
{ {
return parse(readFile(path).c_str(), foFile, path, dirOf(path), staticEnv); auto buffer = readFile(path);
// readFile should have left some extra space for terminators
buffer.append("\0\0", 2);
return parse(buffer.data(), buffer.size(), foFile, path, dirOf(path), staticEnv);
} }
Expr * EvalState::parseExprFromString(std::string_view s, const Path & basePath, StaticEnv & staticEnv) Expr * EvalState::parseExprFromString(std::string s, const Path & basePath, StaticEnv & staticEnv)
{ {
return parse(s.data(), foString, "", basePath, staticEnv); s.append("\0\0", 2);
return parse(s.data(), s.size(), foString, "", basePath, staticEnv);
} }
Expr * EvalState::parseExprFromString(std::string_view s, const Path & basePath) Expr * EvalState::parseExprFromString(std::string s, const Path & basePath)
{ {
return parseExprFromString(s, basePath, staticBaseEnv); return parseExprFromString(std::move(s), basePath, staticBaseEnv);
} }
Expr * EvalState::parseStdin() Expr * EvalState::parseStdin()
{ {
//Activity act(*logger, lvlTalkative, format("parsing standard input")); //Activity act(*logger, lvlTalkative, format("parsing standard input"));
return parse(drainFD(0).data(), foStdin, "", absPath("."), staticBaseEnv); auto buffer = drainFD(0);
// drainFD should have left some extra space for terminators
buffer.append("\0\0", 2);
return parse(buffer.data(), buffer.size(), foStdin, "", absPath("."), staticBaseEnv);
} }

View file

@ -378,7 +378,7 @@ void prim_exec(EvalState & state, const Pos & pos, Value * * args, Value & v)
auto output = runProgram(program, true, commandArgs); auto output = runProgram(program, true, commandArgs);
Expr * parsed; Expr * parsed;
try { try {
parsed = state.parseExprFromString(output, pos.file); parsed = state.parseExprFromString(std::move(output), pos.file);
} catch (Error & e) { } catch (Error & e) {
e.addTrace(pos, "While parsing the output from '%1%'", program); e.addTrace(pos, "While parsing the output from '%1%'", program);
throw; throw;
@ -3915,9 +3915,12 @@ void EvalState::createBaseEnv()
/* Note: we have to initialize the 'derivation' constant *after* /* Note: we have to initialize the 'derivation' constant *after*
building baseEnv/staticBaseEnv because it uses 'builtins'. */ building baseEnv/staticBaseEnv because it uses 'builtins'. */
eval(parse( char code[] =
#include "primops/derivation.nix.gen.hh" #include "primops/derivation.nix.gen.hh"
, foFile, sDerivationNix, "/", staticBaseEnv), *vDerivation); // the parser needs two NUL bytes as terminators; one of them
// is implied by being a C string.
"\0";
eval(parse(code, sizeof(code), foFile, sDerivationNix, "/", staticBaseEnv), *vDerivation);
} }

View file

@ -1,7 +1,8 @@
#pragma once #pragma once
#include <list>
#include <map> #include <map>
#include <unordered_set> #include <unordered_map>
#include "types.hh" #include "types.hh"
@ -70,15 +71,21 @@ public:
class SymbolTable class SymbolTable
{ {
private: private:
typedef std::unordered_set<string> Symbols; std::unordered_map<std::string_view, Symbol> symbols;
Symbols symbols; std::list<string> store;
public: public:
Symbol create(std::string_view s) Symbol create(std::string_view s)
{ {
// FIXME: avoid allocation if 's' already exists in the symbol table. // Most symbols are looked up more than once, so we trade off insertion performance
std::pair<Symbols::iterator, bool> res = symbols.emplace(std::string(s)); // for lookup performance.
return Symbol(&*res.first); // TODO: could probably be done more efficiently with transparent Hash and Equals
// on the original implementation using unordered_set
auto it = symbols.find(s);
if (it != symbols.end()) return it->second;
const string & rawSym = store.emplace_back(s);
return symbols.emplace(rawSym, Symbol(&rawSym)).first->second;
} }
size_t size() const size_t size() const
@ -91,7 +98,7 @@ public:
template<typename T> template<typename T>
void dump(T callback) void dump(T callback)
{ {
for (auto & s : symbols) for (auto & s : store)
callback(s); callback(s);
} }
}; };

View file

@ -668,7 +668,9 @@ void writeFull(int fd, std::string_view s, bool allowInterrupts)
string drainFD(int fd, bool block, const size_t reserveSize) string drainFD(int fd, bool block, const size_t reserveSize)
{ {
StringSink sink(reserveSize); // the parser needs two extra bytes to append terminating characters, other users will
// not care very much about the extra memory.
StringSink sink(reserveSize + 2);
drainFD(fd, sink, block); drainFD(fd, sink, block);
return std::move(*sink.s); return std::move(*sink.s);
} }

View file

@ -292,7 +292,7 @@ static void main_nix_build(int argc, char * * argv)
else else
for (auto i : left) { for (auto i : left) {
if (fromArgs) if (fromArgs)
exprs.push_back(state->parseExprFromString(i, absPath("."))); exprs.push_back(state->parseExprFromString(std::move(i), absPath(".")));
else { else {
auto absolute = i; auto absolute = i;
try { try {

View file

@ -703,7 +703,7 @@ void NixRepl::addVarToScope(const Symbol & name, Value & v)
Expr * NixRepl::parseString(string s) Expr * NixRepl::parseString(string s)
{ {
Expr * e = state->parseExprFromString(s, curDir, staticEnv); Expr * e = state->parseExprFromString(std::move(s), curDir, staticEnv);
return e; return e;
} }