diff --git a/configure.ac b/configure.ac index 6b7fbae73..b2d652926 100644 --- a/configure.ac +++ b/configure.ac @@ -342,6 +342,16 @@ AC_SUBST(doc_generate) # Look for lowdown library. PKG_CHECK_MODULES([LOWDOWN], [lowdown >= 0.9.0], [CXXFLAGS="$LOWDOWN_CFLAGS $CXXFLAGS"]) +# Look for pegtl. +# pegtl has only cmake support, no pkg-config. +AC_ARG_VAR([PEGTL_HEADERS], [include path of pegtl headers]) +AC_LANG_PUSH(C++) +AC_SUBST(PEGTL_HEADERS) +[CXXFLAGS="-I $PEGTL_HEADERS $CXXFLAGS"] +AC_CHECK_HEADER(tao/pegtl.hpp, [], [AC_MSG_ERROR([PEGTL not found.])]) +AC_LANG_POP(C++) + + # Setuid installations. AC_CHECK_FUNCS([setresuid setreuid lchown]) diff --git a/flake.nix b/flake.nix index 62037e1e3..913908760 100644 --- a/flake.nix +++ b/flake.nix @@ -192,6 +192,25 @@ boehmgc = final.boehmgc-nix; busybox-sandbox-shell = final.busybox-sandbox-shell or final.default-busybox-sandbox-shell; }; + + pegtl = final.callPackage ( + { stdenv, cmake, ninja }: + + stdenv.mkDerivation { + pname = "pegtl"; + version = "3.2.7"; + + src = final.fetchFromGitHub { + repo = "PEGTL"; + owner = "taocpp"; + rev = "refs/tags/3.2.7"; + hash = "sha256-IV5YNGE4EWVrmg2Sia/rcU8jCuiBynQGJM6n3DCWTQU="; + }; + + nativeBuildInputs = [ cmake ninja ]; + } + ) {}; + }; in { diff --git a/package.nix b/package.nix index 8d33759b7..c61b44eff 100644 --- a/package.nix +++ b/package.nix @@ -7,7 +7,6 @@ aws-sdk-cpp, boehmgc, nlohmann_json, - bison, changelog-d, boost, brotli, @@ -16,7 +15,6 @@ doxygen, editline, fileset, - flex, git, gtest, jq, @@ -29,6 +27,7 @@ mdbook-linkcheck, mercurial, openssl, + pegtl, pkg-config, rapidcheck, sqlite, @@ -127,9 +126,6 @@ in stdenv.mkDerivation (finalAttrs: { dontBuild = false; nativeBuildInputs = [ - bison - flex - ] ++ [ (lib.getBin lowdown) mdbook mdbook-linkcheck @@ -212,7 +208,8 @@ in stdenv.mkDerivation (finalAttrs: { ++ lib.optionals (finalAttrs.doCheck || internalApiDocs) testConfigureFlags ++ lib.optional (!canRunInstalled) "--disable-doc-gen" ++ [ (lib.enableFeature internalApiDocs "internal-api-docs") ] - ++ lib.optional (!forDevShell) "--sysconfdir=/etc"; + ++ lib.optional (!forDevShell) "--sysconfdir=/etc" + ++ [ "PEGTL_HEADERS=${lib.getDev pegtl}/include" ]; installTargets = lib.optional internalApiDocs "internal-api-html"; diff --git a/src/libexpr/eval.cc b/src/libexpr/eval.cc index da064b667..1f0dfda0a 100644 --- a/src/libexpr/eval.cc +++ b/src/libexpr/eval.cc @@ -18,7 +18,6 @@ #include "gc-small-vector.hh" #include "fetch-to-store.hh" #include "flake/flakeref.hh" -#include "parser-tab.hh" #include #include @@ -2858,21 +2857,6 @@ std::optional EvalState::resolveSearchPathPath(const SearchPath::Pa } -Expr * EvalState::parse( - char * text, - size_t length, - Pos::Origin origin, - const SourcePath & basePath, - std::shared_ptr & staticEnv) -{ - auto result = parseExprFromBuf(text, length, origin, basePath, symbols, positions, exprSymbols); - - result->bindVars(*this, staticEnv); - - return result; -} - - std::string ExternalValueBase::coerceToString(EvalState & state, const PosIdx & pos, NixStringContext & context, bool copyMore, bool copyToStore) const { state.error( diff --git a/src/libexpr/lexer.l b/src/libexpr/lexer.l deleted file mode 100644 index e98166f15..000000000 --- a/src/libexpr/lexer.l +++ /dev/null @@ -1,302 +0,0 @@ -%option reentrant bison-bridge bison-locations -%option align -%option noyywrap -%option never-interactive -%option stack -%option nodefault -%option nounput noyy_top_state - - -%s DEFAULT -%x STRING -%x IND_STRING -%x INPATH -%x INPATH_SLASH -%x PATH_START - - -%{ -#ifdef __clang__ -#pragma clang diagnostic ignored "-Wunneeded-internal-declaration" -#endif - -// yacc generates code that uses unannotated fallthrough. -#pragma GCC diagnostic ignored "-Wimplicit-fallthrough" -#ifdef __clang__ -#pragma clang diagnostic ignored "-Wimplicit-fallthrough" -#endif - -#include - -#include "nixexpr.hh" -#include "parser-tab.hh" - -using namespace nix; - -#define THROW(...) \ - do { \ - state->error.reset(new auto(__VA_ARGS__)); \ - return YYerror; \ - } while (0) - -namespace nix { - -#define CUR_POS state->at(*yylloc) - -static void initLoc(YYLTYPE * loc) -{ - loc->first_line = loc->last_line = 0; - loc->first_column = loc->last_column = 0; -} - -static void adjustLoc(YYLTYPE * loc, const char * s, size_t len) -{ - loc->stash(); - - loc->first_column = loc->last_column; - loc->last_column += len; -} - - -// we make use of the fact that the parser receives a private copy of the input -// string and can munge around in it. -static StringToken unescapeStr(SymbolTable & symbols, char * s, size_t length) -{ - char * result = s; - char * t = s; - char c; - // the input string is terminated with *two* NULs, so we can safely take - // *one* character after the one being checked against. - while ((c = *s++)) { - if (c == '\\') { - c = *s++; - if (c == 'n') *t = '\n'; - else if (c == 'r') *t = '\r'; - else if (c == 't') *t = '\t'; - else *t = c; - } - else if (c == '\r') { - /* Normalise CR and CR/LF into LF. */ - *t = '\n'; - if (*s == '\n') s++; /* cr/lf */ - } - else *t = c; - t++; - } - return {result, size_t(t - result)}; -} - - -} - -#define YY_USER_INIT initLoc(yylloc) -#define YY_USER_ACTION adjustLoc(yylloc, yytext, yyleng); - -#define PUSH_STATE(state) yy_push_state(state, yyscanner) -#define POP_STATE() yy_pop_state(yyscanner) - -%} - - -ANY .|\n -ID [a-zA-Z\_][a-zA-Z0-9\_\'\-]* -INT [0-9]+ -FLOAT (([1-9][0-9]*\.[0-9]*)|(0?\.[0-9]+))([Ee][+-]?[0-9]+)? -PATH_CHAR [a-zA-Z0-9\.\_\-\+] -PATH {PATH_CHAR}*(\/{PATH_CHAR}+)+\/? -PATH_SEG {PATH_CHAR}*\/ -HPATH \~(\/{PATH_CHAR}+)+\/? -HPATH_START \~\/ -SPATH \<{PATH_CHAR}+(\/{PATH_CHAR}+)*\> -URI [a-zA-Z][a-zA-Z0-9\+\-\.]*\:[a-zA-Z0-9\%\/\?\:\@\&\=\+\$\,\-\_\.\!\~\*\']+ - - -%% - - -if { return IF; } -then { return THEN; } -else { return ELSE; } -assert { return ASSERT; } -with { return WITH; } -let { return LET; } -in { return IN; } -rec { return REC; } -inherit { return INHERIT; } -or { return OR_KW; } -\.\.\. { return ELLIPSIS; } - -\=\= { return EQ; } -\!\= { return NEQ; } -\<\= { return LEQ; } -\>\= { return GEQ; } -\&\& { return AND; } -\|\| { return OR; } -\-\> { return IMPL; } -\/\/ { return UPDATE; } -\+\+ { return CONCAT; } - -{ID} { yylval->id = {yytext, (size_t) yyleng}; return ID; } -{INT} { errno = 0; - try { - yylval->n = boost::lexical_cast(yytext); - } catch (const boost::bad_lexical_cast &) { - THROW(ParseError({ - .msg = HintFmt("invalid integer '%1%'", yytext), - .pos = state->positions[CUR_POS], - })); - } - return INT; - } -{FLOAT} { errno = 0; - yylval->nf = strtod(yytext, 0); - if (errno != 0) - THROW(ParseError({ - .msg = HintFmt("invalid float '%1%'", yytext), - .pos = state->positions[CUR_POS], - })); - return FLOAT; - } - -\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; } - -\} { /* State INITIAL only exists at the bottom of the stack and is - used as a marker. DEFAULT replaces it everywhere else. - Popping when in INITIAL state causes an empty stack exception, - so don't */ - if (YYSTATE != INITIAL) - POP_STATE(); - return '}'; - } -\{ { PUSH_STATE(DEFAULT); return '{'; } - -\" { PUSH_STATE(STRING); return '"'; } -([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})*\$/\" | -([^\$\"\\]|\$[^\{\"\\]|\\{ANY}|\$\\{ANY})+ { - /* It is impossible to match strings ending with '$' with one - regex because trailing contexts are only valid at the end - of a rule. (A sane but undocumented limitation.) */ - yylval->str = unescapeStr(state->symbols, yytext, yyleng); - return STR; - } -\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; } -\" { POP_STATE(); return '"'; } -\$|\\|\$\\ { - /* This can only occur when we reach EOF, otherwise the above - (...|\$[^\{\"\\]|\\.|\$\\.)+ would have triggered. - This is technically invalid, but we leave the problem to the - parser who fails with exact location. */ - return EOF; - } - -\'\'(\ *\n)? { PUSH_STATE(IND_STRING); return IND_STRING_OPEN; } -([^\$\']|\$[^\{\']|\'[^\'\$])+ { - yylval->str = {yytext, (size_t) yyleng, true}; - return IND_STR; - } -\'\'\$ | -\$ { - yylval->str = {"$", 1}; - return IND_STR; - } -\'\'\' { - yylval->str = {"''", 2}; - return IND_STR; - } -\'\'\\{ANY} { - yylval->str = unescapeStr(state->symbols, yytext + 2, yyleng - 2); - return IND_STR; - } -\$\{ { PUSH_STATE(DEFAULT); return DOLLAR_CURLY; } -\'\' { POP_STATE(); return IND_STRING_CLOSE; } -\' { - yylval->str = {"'", 1}; - return IND_STR; - } - -{PATH_SEG}\$\{ | -{HPATH_START}\$\{ { - PUSH_STATE(PATH_START); - yyless(0); - yylloc->unstash(); -} - -{PATH_SEG} { - POP_STATE(); - PUSH_STATE(INPATH_SLASH); - yylval->path = {yytext, (size_t) yyleng}; - return PATH; -} - -{HPATH_START} { - POP_STATE(); - PUSH_STATE(INPATH_SLASH); - yylval->path = {yytext, (size_t) yyleng}; - return HPATH; -} - -{PATH} { - if (yytext[yyleng-1] == '/') - PUSH_STATE(INPATH_SLASH); - else - PUSH_STATE(INPATH); - yylval->path = {yytext, (size_t) yyleng}; - return PATH; -} -{HPATH} { - if (yytext[yyleng-1] == '/') - PUSH_STATE(INPATH_SLASH); - else - PUSH_STATE(INPATH); - yylval->path = {yytext, (size_t) yyleng}; - return HPATH; -} - -\$\{ { - POP_STATE(); - PUSH_STATE(INPATH); - PUSH_STATE(DEFAULT); - return DOLLAR_CURLY; -} -{PATH}|{PATH_SEG}|{PATH_CHAR}+ { - POP_STATE(); - if (yytext[yyleng-1] == '/') - PUSH_STATE(INPATH_SLASH); - else - PUSH_STATE(INPATH); - yylval->str = {yytext, (size_t) yyleng}; - return STR; -} -{ANY} | -<> { - /* if we encounter a non-path character we inform the parser that the path has - ended with a PATH_END token and re-parse this character in the default - context (it may be ')', ';', or something of that sort) */ - POP_STATE(); - yyless(0); - yylloc->unstash(); - return PATH_END; -} - -{ANY} | -<> { - THROW(ParseError({ - .msg = HintFmt("path has a trailing slash"), - .pos = state->positions[CUR_POS], - })); -} - -{SPATH} { yylval->path = {yytext, (size_t) yyleng}; return SPATH; } -{URI} { yylval->uri = {yytext, (size_t) yyleng}; return URI; } - -[ \t\r\n]+ /* eat up whitespace */ -\#[^\r\n]* /* single-line comments */ -\/\*([^*]|\*+[^*/])*\*+\/ /* long comments */ - -{ANY} { - /* Don't return a negative number, as this will cause - Bison to stop parsing without an error. */ - return (unsigned char) yytext[0]; - } - -%% diff --git a/src/libexpr/local.mk b/src/libexpr/local.mk index ed6bc761a..d24a410cf 100644 --- a/src/libexpr/local.mk +++ b/src/libexpr/local.mk @@ -7,12 +7,12 @@ libexpr_DIR := $(d) libexpr_SOURCES := \ $(wildcard $(d)/*.cc) \ $(wildcard $(d)/value/*.cc) \ + $(wildcard $(d)/parser/*.cc) \ $(wildcard $(d)/primops/*.cc) \ - $(wildcard $(d)/flake/*.cc) \ - $(d)/lexer-tab.cc \ - $(d)/parser-tab.cc + $(wildcard $(d)/flake/*.cc) -libexpr_CXXFLAGS += -I src/libutil -I src/libstore -I src/libfetchers -I src/libmain -I src/libexpr +libexpr_CXXFLAGS += -I src/libutil -I src/libstore -I src/libfetchers -I src/libmain -I src/libexpr \ + -I pegtl/include libexpr_LIBS = libutil libstore libfetchers @@ -26,16 +26,6 @@ endif # because inline functions in libexpr's header files call libgc. libexpr_LDFLAGS_PROPAGATED = $(BDW_GC_LIBS) -libexpr_ORDER_AFTER := $(d)/parser-tab.cc $(d)/parser-tab.hh $(d)/lexer-tab.cc $(d)/lexer-tab.hh - -$(d)/parser-tab.cc $(d)/parser-tab.hh: $(d)/parser.y - $(trace-gen) bison -v -o $(libexpr_DIR)/parser-tab.cc $< -d - -$(d)/lexer-tab.cc $(d)/lexer-tab.hh: $(d)/lexer.l - $(trace-gen) flex --outfile $(libexpr_DIR)/lexer-tab.cc --header-file=$(libexpr_DIR)/lexer-tab.hh $< - -clean-files += $(d)/parser-tab.cc $(d)/parser-tab.hh $(d)/lexer-tab.cc $(d)/lexer-tab.hh - $(eval $(call install-file-in, $(buildprefix)$(d)/nix-expr.pc, $(libdir)/pkgconfig, 0644)) $(foreach i, $(wildcard src/libexpr/value/*.hh), \ diff --git a/src/libexpr/parser.y b/src/libexpr/parser.y deleted file mode 100644 index 43960a4a2..000000000 --- a/src/libexpr/parser.y +++ /dev/null @@ -1,503 +0,0 @@ -%glr-parser -%define api.pure -%locations -%define parse.error verbose -%defines -/* %no-lines */ -%parse-param { void * scanner } -%parse-param { nix::ParserState * state } -%lex-param { void * scanner } -%lex-param { nix::ParserState * state } -%expect 1 -%expect-rr 1 - -%code requires { - -#ifndef BISON_HEADER -#define BISON_HEADER - -#include - -#include "finally.hh" -#include "util.hh" - -#include "nixexpr.hh" -#include "eval.hh" -#include "eval-settings.hh" -#include "globals.hh" -#include "parser-state.hh" - -#define YYLTYPE ::nix::ParserLocation -#define YY_DECL int yylex \ - (YYSTYPE * yylval_param, YYLTYPE * yylloc_param, yyscan_t yyscanner, nix::ParserState * state) - -namespace nix { - -Expr * parseExprFromBuf( - char * text, - size_t length, - Pos::Origin origin, - const SourcePath & basePath, - SymbolTable & symbols, - PosTable & positions, - const Expr::AstSymbols & astSymbols); - -} - -#endif - -} - -%{ - -#include "parser-tab.hh" -#include "lexer-tab.hh" - -YY_DECL; - -using namespace nix; - -#define CUR_POS state->at(*yylocp) - -// otherwise destructors cause compiler errors -#pragma GCC diagnostic ignored "-Wswitch-enum" - -#define THROW(err, ...) \ - do { \ - state->error.reset(new auto(err)); \ - [](auto... d) { (delete d, ...); }(__VA_ARGS__); \ - YYABORT; \ - } while (0) - -void yyerror(YYLTYPE * loc, yyscan_t scanner, ParserState * state, const char * error) -{ - if (std::string_view(error).starts_with("syntax error, unexpected end of file")) { - loc->first_column = loc->last_column; - loc->first_line = loc->last_line; - } - throw ParseError({ - .msg = HintFmt(error), - .pos = state->positions[state->at(*loc)] - }); -} - -template -static std::unique_ptr unp(T * e) -{ - return std::unique_ptr(e); -} - -template, typename... Args> -static std::vector vec(Args && ... args) -{ - std::vector result; - result.reserve(sizeof...(Args)); - (result.emplace_back(std::forward(args)), ...); - return result; -} - - -%} - -%union { - // !!! We're probably leaking stuff here. - nix::Expr * e; - nix::ExprList * list; - nix::ExprAttrs * attrs; - nix::Formals * formals; - nix::Formal * formal; - nix::NixInt n; - nix::NixFloat nf; - nix::StringToken id; // !!! -> Symbol - nix::StringToken path; - nix::StringToken uri; - nix::StringToken str; - std::vector * attrNames; - std::vector> * inheritAttrs; - std::vector>> * string_parts; - std::vector, nix::StringToken>>> * ind_string_parts; -} - -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } -%destructor { delete $$; } - -%type start -%type expr expr_function expr_if expr_op -%type expr_select expr_simple expr_app -%type expr_list -%type binds -%type formals -%type formal -%type attrpath -%type attrs -%type string_parts_interpolated -%type ind_string_parts -%type path_start string_parts string_attr -%type attr -%token ID -%token STR IND_STR -%token INT -%token FLOAT -%token PATH HPATH SPATH PATH_END -%token URI -%token IF THEN ELSE ASSERT WITH LET IN REC INHERIT EQ NEQ AND OR IMPL OR_KW -%token DOLLAR_CURLY /* == ${ */ -%token IND_STRING_OPEN IND_STRING_CLOSE -%token ELLIPSIS - -%right IMPL -%left OR -%left AND -%nonassoc EQ NEQ -%nonassoc '<' '>' LEQ GEQ -%right UPDATE -%left NOT -%left '+' '-' -%left '*' '/' -%right CONCAT -%nonassoc '?' -%nonassoc NEGATE - -%% - -start: expr { state->result = $1; $$ = 0; }; - -expr: expr_function; - -expr_function - : ID ':' expr_function - { $$ = new ExprLambda(CUR_POS, state->symbols.create($1), nullptr, unp($3)); } - | '{' formals '}' ':' expr_function - { if (auto e = state->validateFormals($2)) THROW(*e); - $$ = new ExprLambda(CUR_POS, unp($2), unp($5)); - } - | '{' formals '}' '@' ID ':' expr_function - { - auto arg = state->symbols.create($5); - if (auto e = state->validateFormals($2, CUR_POS, arg)) THROW(*e, $2, $7); - $$ = new ExprLambda(CUR_POS, arg, unp($2), unp($7)); - } - | ID '@' '{' formals '}' ':' expr_function - { - auto arg = state->symbols.create($1); - if (auto e = state->validateFormals($4, CUR_POS, arg)) THROW(*e, $4, $7); - $$ = new ExprLambda(CUR_POS, arg, unp($4), unp($7)); - } - | ASSERT expr ';' expr_function - { $$ = new ExprAssert(CUR_POS, unp($2), unp($4)); } - | WITH expr ';' expr_function - { $$ = new ExprWith(CUR_POS, unp($2), unp($4)); } - | LET binds IN expr_function - { if (!$2->dynamicAttrs.empty()) - THROW(ParseError({ - .msg = HintFmt("dynamic attributes not allowed in let"), - .pos = state->positions[CUR_POS] - }), $2, $4); - $$ = new ExprLet(unp($2), unp($4)); - } - | expr_if - ; - -expr_if - : IF expr THEN expr ELSE expr { $$ = new ExprIf(CUR_POS, unp($2), unp($4), unp($6)); } - | expr_op - ; - -expr_op - : '!' expr_op %prec NOT { $$ = new ExprOpNot(unp($2)); } - | '-' expr_op %prec NEGATE { $$ = new ExprCall(CUR_POS, std::make_unique(state->s.sub), vec(std::make_unique(0), unp($2))); } - | expr_op EQ expr_op { $$ = new ExprOpEq(unp($1), unp($3)); } - | expr_op NEQ expr_op { $$ = new ExprOpNEq(unp($1), unp($3)); } - | expr_op '<' expr_op { $$ = new ExprCall(state->at(@2), std::make_unique(state->s.lessThan), vec($1, $3)); } - | expr_op LEQ expr_op { $$ = new ExprOpNot(std::make_unique(state->at(@2), std::make_unique(state->s.lessThan), vec($3, $1))); } - | expr_op '>' expr_op { $$ = new ExprCall(state->at(@2), std::make_unique(state->s.lessThan), vec($3, $1)); } - | expr_op GEQ expr_op { $$ = new ExprOpNot(std::make_unique(state->at(@2), std::make_unique(state->s.lessThan), vec($1, $3))); } - | expr_op AND expr_op { $$ = new ExprOpAnd(state->at(@2), unp($1), unp($3)); } - | expr_op OR expr_op { $$ = new ExprOpOr(state->at(@2), unp($1), unp($3)); } - | expr_op IMPL expr_op { $$ = new ExprOpImpl(state->at(@2), unp($1), unp($3)); } - | expr_op UPDATE expr_op { $$ = new ExprOpUpdate(state->at(@2), unp($1), unp($3)); } - | expr_op '?' attrpath { $$ = new ExprOpHasAttr(unp($1), std::move(*$3)); delete $3; } - | expr_op '+' expr_op - { $$ = new ExprConcatStrings(state->at(@2), false, vec>>(std::pair(state->at(@1), unp($1)), std::pair(state->at(@3), unp($3)))); } - | expr_op '-' expr_op { $$ = new ExprCall(state->at(@2), std::make_unique(state->s.sub), vec($1, $3)); } - | expr_op '*' expr_op { $$ = new ExprCall(state->at(@2), std::make_unique(state->s.mul), vec($1, $3)); } - | expr_op '/' expr_op { $$ = new ExprCall(state->at(@2), std::make_unique(state->s.div), vec($1, $3)); } - | expr_op CONCAT expr_op { $$ = new ExprOpConcatLists(state->at(@2), unp($1), unp($3)); } - | expr_app - ; - -expr_app - : expr_app expr_select { - if (auto e2 = dynamic_cast($1)) { - e2->args.emplace_back($2); - $$ = $1; - } else - $$ = new ExprCall(CUR_POS, unp($1), vec(unp($2))); - } - | expr_select - ; - -expr_select - : expr_simple '.' attrpath - { $$ = new ExprSelect(CUR_POS, unp($1), std::move(*$3), nullptr); delete $3; } - | expr_simple '.' attrpath OR_KW expr_select - { $$ = new ExprSelect(CUR_POS, unp($1), std::move(*$3), unp($5)); delete $3; } - | /* Backwards compatibility: because Nixpkgs has a rarely used - function named ‘or’, allow stuff like ‘map or [...]’. */ - expr_simple OR_KW - { $$ = new ExprCall(CUR_POS, unp($1), vec(std::make_unique(CUR_POS, state->s.or_))); } - | expr_simple - ; - -expr_simple - : ID { - std::string_view s = "__curPos"; - if ($1.l == s.size() && strncmp($1.p, s.data(), s.size()) == 0) - $$ = new ExprPos(CUR_POS); - else - $$ = new ExprVar(CUR_POS, state->symbols.create($1)); - } - | INT { $$ = new ExprInt($1); } - | FLOAT { $$ = new ExprFloat($1); } - | '"' string_parts '"' { $$ = $2; } - | IND_STRING_OPEN ind_string_parts IND_STRING_CLOSE { - $$ = state->stripIndentation(CUR_POS, std::move(*$2)).release(); - delete $2; - } - | path_start PATH_END - | path_start string_parts_interpolated PATH_END { - $2->emplace($2->begin(), state->at(@1), $1); - $$ = new ExprConcatStrings(CUR_POS, false, std::move(*$2)); - delete $2; - } - | SPATH { - std::string path($1.p + 1, $1.l - 2); - $$ = new ExprCall(CUR_POS, - std::make_unique(state->s.findFile), - vec(std::make_unique(state->s.nixPath), - std::make_unique(std::move(path)))); - } - | URI { - static bool noURLLiterals = experimentalFeatureSettings.isEnabled(Xp::NoUrlLiterals); - if (noURLLiterals) - THROW(ParseError({ - .msg = HintFmt("URL literals are disabled"), - .pos = state->positions[CUR_POS] - })); - $$ = new ExprString(std::string($1)); - } - | '(' expr ')' { $$ = $2; } - /* Let expressions `let {..., body = ...}' are just desugared - into `(rec {..., body = ...}).body'. */ - | LET '{' binds '}' - { $3->recursive = true; $$ = new ExprSelect(noPos, unp($3), state->s.body); } - | REC '{' binds '}' - { $3->recursive = true; $$ = $3; } - | '{' binds '}' - { $$ = $2; } - | '[' expr_list ']' { $$ = $2; } - ; - -string_parts - : STR { $$ = new ExprString(std::string($1)); } - | string_parts_interpolated - { $$ = new ExprConcatStrings(CUR_POS, true, std::move(*$1)); - delete $1; - } - | { $$ = new ExprString(""); } - ; - -string_parts_interpolated - : string_parts_interpolated STR - { $$ = $1; $1->emplace_back(state->at(@2), new ExprString(std::string($2))); } - | string_parts_interpolated DOLLAR_CURLY expr '}' { $$ = $1; $1->emplace_back(state->at(@2), $3); } - | DOLLAR_CURLY expr '}' { $$ = new std::vector>>; $$->emplace_back(state->at(@1), $2); } - | STR DOLLAR_CURLY expr '}' { - $$ = new std::vector>>; - $$->emplace_back(state->at(@1), new ExprString(std::string($1))); - $$->emplace_back(state->at(@2), $3); - } - ; - -path_start - : PATH { - Path path(absPath({$1.p, $1.l}, state->basePath.path.abs())); - /* add back in the trailing '/' to the first segment */ - if ($1.p[$1.l-1] == '/' && $1.l > 1) - path += "/"; - $$ = new ExprPath(path); - } - | HPATH { - if (evalSettings.pureEval) { - THROW(Error( - "the path '%s' can not be resolved in pure mode", - std::string_view($1.p, $1.l) - )); - } - Path path(getHome() + std::string($1.p + 1, $1.l - 1)); - $$ = new ExprPath(path); - } - ; - -ind_string_parts - : ind_string_parts IND_STR { $$ = $1; $1->emplace_back(state->at(@2), $2); } - | ind_string_parts DOLLAR_CURLY expr '}' { $$ = $1; $1->emplace_back(state->at(@2), unp($3)); } - | { $$ = new std::vector, StringToken>>>; } - ; - -binds - : binds attrpath '=' expr ';' - { $$ = $1; - if (auto e = state->addAttr($$, std::move(*$2), unp($4), state->at(@2))) THROW(*e, $1, $2); - delete $2; - } - | binds INHERIT attrs ';' - { $$ = $1; - for (auto & [i, iPos] : *$3) { - if ($$->attrs.find(i.symbol) != $$->attrs.end()) - THROW(state->dupAttr(i.symbol, iPos, $$->attrs[i.symbol].pos), $1); - $$->attrs.emplace( - i.symbol, - ExprAttrs::AttrDef(std::make_unique(iPos, i.symbol), iPos, ExprAttrs::AttrDef::Kind::Inherited)); - } - delete $3; - } - | binds INHERIT '(' expr ')' attrs ';' - { $$ = $1; - if (!$$->inheritFromExprs) - $$->inheritFromExprs = std::make_unique>>(); - $$->inheritFromExprs->push_back(unp($4)); - for (auto & [i, iPos] : *$6) { - if ($$->attrs.find(i.symbol) != $$->attrs.end()) - THROW(state->dupAttr(i.symbol, iPos, $$->attrs[i.symbol].pos), $1); - auto from = std::make_unique(state->at(@4), $$->inheritFromExprs->size() - 1); - $$->attrs.emplace( - i.symbol, - ExprAttrs::AttrDef( - std::make_unique(iPos, std::move(from), i.symbol), - iPos, - ExprAttrs::AttrDef::Kind::InheritedFrom)); - } - delete $6; - } - | { $$ = new ExprAttrs(state->at(@0)); } - ; - -attrs - : attrs attr { $$ = $1; $1->emplace_back(AttrName(state->symbols.create($2)), state->at(@2)); } - | attrs string_attr - { $$ = $1; - ExprString * str = dynamic_cast($2); - if (str) { - $$->emplace_back(AttrName(state->symbols.create(str->s)), state->at(@2)); - delete str; - } else - THROW(ParseError({ - .msg = HintFmt("dynamic attributes not allowed in inherit"), - .pos = state->positions[state->at(@2)] - }), $1, $2); - } - | { $$ = new std::vector>; } - ; - -attrpath - : attrpath '.' attr { $$ = $1; $1->push_back(AttrName(state->symbols.create($3))); } - | attrpath '.' string_attr - { $$ = $1; - ExprString * str = dynamic_cast($3); - if (str) { - $$->push_back(AttrName(state->symbols.create(str->s))); - delete str; - } else - $$->emplace_back(unp($3)); - } - | attr { $$ = new std::vector; $$->push_back(AttrName(state->symbols.create($1))); } - | string_attr - { $$ = new std::vector; - ExprString *str = dynamic_cast($1); - if (str) { - $$->push_back(AttrName(state->symbols.create(str->s))); - delete str; - } else - $$->emplace_back(unp($1)); - } - ; - -attr - : ID - | OR_KW { $$ = {"or", 2}; } - ; - -string_attr - : '"' string_parts '"' { $$ = $2; } - | DOLLAR_CURLY expr '}' { $$ = $2; } - ; - -expr_list - : expr_list expr_select { $$ = $1; $1->elems.emplace_back($2); /* !!! dangerous */ } - | { $$ = new ExprList; } - ; - -formals - : formal ',' formals - { $$ = $3; $$->formals.emplace_back(std::move(*$1)); delete $1; } - | formal - { $$ = new Formals; $$->formals.emplace_back(std::move(*$1)); $$->ellipsis = false; delete $1; } - | - { $$ = new Formals; $$->ellipsis = false; } - | ELLIPSIS - { $$ = new Formals; $$->ellipsis = true; } - ; - -formal - : ID { $$ = new Formal{CUR_POS, state->symbols.create($1), nullptr}; } - | ID '?' expr { $$ = new Formal{CUR_POS, state->symbols.create($1), unp($3)}; } - ; - -%% - -#include "eval.hh" - - -namespace nix { - -Expr * parseExprFromBuf( - char * text, - size_t length, - Pos::Origin origin, - const SourcePath & basePath, - SymbolTable & symbols, - PosTable & positions, - const Expr::AstSymbols & astSymbols) -{ - yyscan_t scanner; - ParserState state { - .symbols = symbols, - .positions = positions, - .basePath = basePath, - .origin = positions.addOrigin(origin, length), - .s = astSymbols, - }; - - yylex_init(&scanner); - Finally _destroy([&] { yylex_destroy(scanner); }); - - yy_scan_buffer(text, length, scanner); - yyparse(scanner, &state); - if (state.error) { - delete state.result; - throw *state.error; - } - - return state.result; -} - - -} diff --git a/src/libexpr/parser/change_head.hh b/src/libexpr/parser/change_head.hh new file mode 100644 index 000000000..490c2a0f0 --- /dev/null +++ b/src/libexpr/parser/change_head.hh @@ -0,0 +1,65 @@ +#pragma once + +#include + +namespace nix::parser { + +// modified copy of change_state, as the manual suggest for more involved +// state manipulation. we want to change only the first state parameter, +// and we care about the *initial* position of a rule application (not the +// past-the-end position as pegtl change_state provides) +template +struct change_head : tao::pegtl::maybe_nothing +{ + template< + typename Rule, + tao::pegtl::apply_mode A, + tao::pegtl::rewind_mode M, + template class Action, + template class Control, + typename ParseInput, + typename State, + typename... States + > + [[nodiscard]] static bool match(ParseInput & in, State && st, States &&... sts) + { + const auto begin = in.iterator(); + + if constexpr (std::is_constructible_v) { + NewState s(st, sts...); + if (tao::pegtl::match(in, s, sts...)) { + if constexpr (A == tao::pegtl::apply_mode::action) { + _success>(0, begin, in, s, st, sts...); + } + return true; + } + return false; + } else if constexpr (std::is_default_constructible_v) { + NewState s; + if (tao::pegtl::match(in, s, sts...)) { + if constexpr (A == tao::pegtl::apply_mode::action) { + _success>(0, begin, in, s, st, sts...); + } + return true; + } + return false; + } else { + static_assert(decltype(sizeof(NewState))(), "unable to instantiate new state"); + } + } + + template + static void _success(void *, auto & begin, ParseInput & in, S & ... sts) + { + const typename ParseInput::action_t at(begin, in); + Target::success(at, sts...); + } + + template + static void _success(decltype(Target::success0(std::declval()...), 0), auto &, auto &, S & ... sts) + { + Target::success0(sts...); + } +}; + +} diff --git a/src/libexpr/parser/grammar.hh b/src/libexpr/parser/grammar.hh new file mode 100644 index 000000000..e7224b4ef --- /dev/null +++ b/src/libexpr/parser/grammar.hh @@ -0,0 +1,706 @@ +#pragma once + +#include "tao/pegtl.hpp" +#include +#include + +#include + +// NOTE +// nix line endings are \n, \r\n, \r. the grammar does not use eol or +// eolf rules in favor of reproducing the old flex lexer as faithfully as +// possible, and deferring calculation of positions to downstream users. + +namespace nix::parser::grammar { + +using namespace tao::pegtl; +namespace p = tao::pegtl; + +// character classes +namespace c { + +struct path : sor< + ranges<'a', 'z', 'A', 'Z', '0', '9'>, + one<'.', '_', '-', '+'> +> {}; +struct path_sep : one<'/'> {}; + +struct id_first : ranges<'a', 'z', 'A', 'Z', '_'> {}; +struct id_rest : sor< + ranges<'a', 'z', 'A', 'Z', '0', '9'>, + one<'_', '\'', '-'> +> {}; + +struct uri_scheme_first : ranges<'a', 'z', 'A', 'Z'> {}; +struct uri_scheme_rest : sor< + ranges<'a', 'z', 'A', 'Z', '0', '9'>, + one<'+', '-', '.'> +> {}; +struct uri_sep : one<':'> {}; +struct uri_rest : sor< + ranges<'a', 'z', 'A', 'Z', '0', '9'>, + one<'%', '/', '?', ':', '@', '&', '=', '+', '$', ',', '-', '_', '.', '!', '~', '*', '\''> +> {}; + +} + +// "tokens". PEGs don't really care about tokens, we merely use them as a convenient +// way of writing down keywords and a couple complicated syntax rules. +namespace t { + +struct _extend_as_path : seq< + star, + not_at, + not_at, + c::path_sep, + sor +> {}; +struct _extend_as_uri : seq< + star, + c::uri_sep, + c::uri_rest +> {}; + +// keywords might be extended to identifiers, paths, or uris. +// NOTE this assumes that keywords are a-zA-Z only, otherwise uri schemes would never +// match correctly. +// NOTE not a simple seq<...> because this would report incorrect positions for +// keywords used inside must<> if a prefix of the keyword matches. +template +struct _keyword : sor< + seq< + S, + not_at, + not_at<_extend_as_path>, + not_at<_extend_as_uri> + >, + failure +> {}; + +struct kw_if : _keyword {}; +struct kw_then : _keyword {}; +struct kw_else : _keyword {}; +struct kw_assert : _keyword {}; +struct kw_with : _keyword {}; +struct kw_let : _keyword {}; +struct kw_in : _keyword {}; +struct kw_rec : _keyword {}; +struct kw_inherit : _keyword {}; +struct kw_or : _keyword {}; + +// `-` can be a unary prefix op, a binary infix op, or the first character +// of a path or -> (ex 1->1--1) +// `/` can be a path leader or an operator (ex a?a /a) +struct op_minus : seq, not_at'>>, not_at<_extend_as_path>> {}; +struct op_div : seq, not_at> {}; + +// match a rule, making sure we are not matching it where a keyword would match. +// using minus like this is a lot faster than flipping the order and using seq. +template +struct _not_at_any_keyword : minus< + seq, + sor< + TAO_PEGTL_STRING("inherit"), + TAO_PEGTL_STRING("assert"), + TAO_PEGTL_STRING("else"), + TAO_PEGTL_STRING("then"), + TAO_PEGTL_STRING("with"), + TAO_PEGTL_STRING("let"), + TAO_PEGTL_STRING("rec"), + TAO_PEGTL_STRING("if"), + TAO_PEGTL_STRING("in"), + TAO_PEGTL_STRING("or") + > +> {}; + +// identifiers are kind of horrid: +// +// - uri_scheme_first ⊂ id_first +// - uri_scheme_first ⊂ uri_scheme_rest ⊂ path +// - id_first ⊂ id_rest ∖ { ' } ⊂ path +// - id_first ∩ (path ∖ uri_scheme_first) = { _ } +// - uri_sep ∉ ⋃ { id_first, id_rest, uri_scheme_first, uri_scheme_rest, path } +// - path_sep ∉ ⋃ { id_first, id_rest, uri_scheme_first, uri_scheme_rest } +// +// and we want, without reading the input more than once, a string that +// matches (id_first id_rest*) and is not followed by any number of +// characters such that the extended string matches path or uri rules. +// +// since the first character must be either _ or a uri scheme character +// we can ignore path-like bits at the beginning. uri_sep cannot appear anywhere +// in an identifier, so it's only needed in lookahead checks at the uri-like +// prefix. likewise path_sep cannot appear anywhere in the idenfier, so it's +// only needed in lookahead checks in the path-like prefix. +// +// in total that gives us a decomposition of +// +// (uri-scheme-like? (?! continues-as-uri) | _) +// (path-segment-like? (?! continues-as-path)) +// id_rest* +struct identifier : _not_at_any_keyword< + // we don't use (at, ...) matches here because identifiers are + // a really hot path and rewinding as needed by at<> isn't entirely free. + sor< + seq< + c::uri_scheme_first, + star>, + not_at<_extend_as_uri> + >, + one<'_'> + >, + star, one<'_', '-'>>>, + not_at<_extend_as_path>, + star +> {}; + +// floats may extend ints, thus these rules are very similar. +struct integer : seq< + sor< + seq, star, not_at>>, + seq, not_at, digit>, star> + >, + not_at<_extend_as_path> +> {}; + +struct floating : seq< + sor< + seq, star, one<'.'>, star>, + seq>, one<'.'>, plus> + >, + opt, opt>, plus>, + not_at<_extend_as_path> +> {}; + +struct uri : seq< + c::uri_scheme_first, + star, + c::uri_sep, + plus +> {}; + +struct sep : sor< + plus>, + seq, star>>, + seq, until>> +> {}; + +} + + + +using seps = star; + + +// marker for semantic rules. not handling one of these in an action that cares about +// semantics is probably an error. +struct semantic {}; + + +struct expr; + +struct _string { + template + struct literal : semantic, seq {}; + struct cr_lf : semantic, seq, opt>> {}; + struct interpolation : semantic, seq< + p::string<'$', '{'>, seps, + must, seps, + must> + > {}; + struct escape : semantic, must {}; +}; +struct string : _string, seq< + one<'"'>, + star< + sor< + _string::literal>>, + _string::cr_lf, + _string::interpolation, + _string::literal, opt>>, + seq, _string::escape> + > + >, + must> +> {}; + +struct _ind_string { + template + struct literal : semantic, seq {}; + struct interpolation : semantic, seq< + p::string<'$', '{'>, seps, + must, seps, + must> + > {}; + struct escape : semantic, must {}; +}; +struct ind_string : _ind_string, seq< + TAO_PEGTL_STRING("''"), + opt>, one<'\n'>>, + star< + sor< + _ind_string::literal< + true, + plus< + sor< + not_one<'$', '\''>, + seq, not_one<'{', '\''>>, + seq, not_one<'\'', '$'>> + > + > + >, + _ind_string::interpolation, + _ind_string::literal>, + _ind_string::literal, not_at>>, + seq, _ind_string::literal>>, + seq< + p::string<'\'', '\''>, + sor< + _ind_string::literal>, + seq, _ind_string::escape> + > + > + > + >, + must +> {}; + +struct _path { + // legacy lexer rules. extra l_ to avoid reserved c++ identifiers. + struct _l_PATH : seq, plus>, opt> {}; + struct _l_PATH_SEG : seq, c::path_sep> {}; + struct _l_HPATH : seq, plus>, opt> {}; + struct _l_HPATH_START : TAO_PEGTL_STRING("~/") {}; + struct _path_str : sor<_l_PATH, _l_PATH_SEG, plus> {}; + // modern rules + template + struct literal : semantic, seq {}; + struct interpolation : semantic, seq< + p::string<'$', '{'>, seps, + must, seps, + must> + > {}; + struct anchor : semantic, sor< + _l_PATH, + seq<_l_PATH_SEG, at> + > {}; + struct home_anchor : semantic, sor< + _l_HPATH, + seq<_l_HPATH_START, at> + > {}; + struct searched_path : semantic, list, c::path_sep> {}; + struct forbid_prefix_triple_slash : sor, failure> {}; + struct forbid_prefix_double_slash_no_interp : sor< + not_at, not_at>, + failure + > {}; + // legacy parser rules + struct _str_rest : seq< + must, + opt>, + must, + star< + sor< + literal<_path_str>, + interpolation + > + > + > {}; +}; +struct path : _path, sor< + seq< + sor<_path::anchor, _path::home_anchor>, + _path::_str_rest + >, + seq, _path::searched_path, one<'>'>> +> {}; + +struct _formal { + struct name : semantic, t::identifier {}; + struct default_value : semantic, must {}; +}; +struct formal : semantic, _formal, seq< + _formal::name, + opt, seps, _formal::default_value> +> {}; + +struct _formals { + struct ellipsis : semantic, p::ellipsis {}; +}; +struct formals : semantic, _formals, seq< + one<'{'>, seps, + // formals and attrsets share a two-token head sequence ('{' ). + // this rule unrolls the formals list a bit to provide better error messages than + // "expected '='" at the first ',' if formals are incorrect. + sor< + one<'}'>, + seq<_formals::ellipsis, seps, must>>, + seq< + formal, seps, + if_then_else< + at>, + seq< + star, seps, formal, seps>, + opt, seps, opt<_formals::ellipsis, seps>>, + must> + >, + one<'}'> + > + > + > +> {}; + +struct _attr { + struct simple : semantic, sor {}; + struct string : semantic, seq {}; + struct expr : semantic, seq< + TAO_PEGTL_STRING("${"), seps, + must, seps, + must> + > {}; +}; +struct attr : _attr, sor< + _attr::simple, + _attr::string, + _attr::expr +> {}; + +struct attrpath : list, t::sep> {}; + +struct _inherit { + struct from : semantic, must {}; + struct attrs : list {}; +}; +struct inherit : _inherit, seq< + t::kw_inherit, seps, + opt, seps, _inherit::from, seps, must>, seps>, + opt<_inherit::attrs, seps>, + must> +> {}; + +struct _binding { + struct path : semantic, attrpath {}; + struct equal : one<'='> {}; + struct value : semantic, must {}; +}; +struct binding : _binding, seq< + _binding::path, seps, + must<_binding::equal>, seps, + _binding::value, seps, + must> +> {}; + +struct bindings : opt, seps>> {}; + +struct op { + enum class kind { + // NOTE non-associativity is *NOT* handled in the grammar structure. + // handling it in the grammar itself instead of in semantic actions + // slows down the parser significantly and makes the rules *much* + // harder to read. maybe this will be different at some point when + // ! does not sit between two binary precedence levels. + nonAssoc, + leftAssoc, + rightAssoc, + unary, + }; + template + struct _op : Rule { + static constexpr unsigned precedence = Precedence; + static constexpr op::kind kind = Kind; + }; + + struct unary_minus : _op {}; + + // treating this like a unary postfix operator is sketchy, but that's + // the most reasonable way to implement the operator precedence set forth + // by the language way back. it'd be much better if `.` and `?` had the same + // precedence, but alas. + struct has_attr : _op, seps, must>, 4> {}; + + struct concat : _op {}; + struct mul : _op, 6> {}; + struct div : _op {}; + struct plus : _op, 7> {}; + struct minus : _op {}; + struct not_ : _op, 8, kind::unary> {}; + struct update : _op {}; + struct less_eq : _op {}; + struct greater_eq : _op="), 10, kind::nonAssoc> {}; + struct less : _op, 10, kind::nonAssoc> {}; + struct greater : _op'>, 10, kind::nonAssoc> {}; + struct equals : _op {}; + struct not_equals : _op {}; + struct and_ : _op {}; + struct or_ : _op {}; + struct implies : _op"), 14, kind::rightAssoc> {}; +}; + +struct _expr { + template class OpenMod = seq, typename... Init> + struct _attrset : seq< + Init..., + OpenMod>, seps, + bindings, seps, + must> + > {}; + + struct select; + + struct id : semantic, t::identifier {}; + struct int_ : semantic, t::integer {}; + struct float_ : semantic, t::floating {}; + struct string : semantic, seq {}; + struct ind_string : semantic, seq {}; + struct path : semantic, seq {}; + struct uri : semantic, t::uri {}; + struct ancient_let : semantic, _attrset {}; + struct rec_set : semantic, _attrset {}; + struct set : semantic, _attrset<> {}; + + struct _list { + struct entry : semantic, seq {}; + struct as_app_or : semantic, t::kw_or {}; + }; + struct _app { + struct first_arg : semantic, seq {}; + // can be used to stash a position of the application head node + struct select_or_fn : seq