lix/src/libexpr/symbol-table.hh
pennae 8775be3393 store Symbols in a table as well, like positions
this slightly increases the amount of memory used for any given symbol, but this
increase is more than made up for if the symbol is referenced more than once in
the EvalState that holds it. on average every symbol should be referenced at
least twice (once to introduce a binding, once to use it), so we expect no
increase in memory on average.

symbol tables are limited to 2³² entries like position tables, and similar
arguments apply to why overflow is not likely: 2³² symbols would require as many
string instances (at 24 bytes each) and map entries (at 24 bytes or more each,
assuming that the map holds on average at most one item per bucket as the docs
say). a full symbol table would require at least 192GB of memory just for
symbols, which is well out of reach. (an ofborg eval of nixpkgs today creates
less than a million symbols!)
2022-04-21 21:56:31 +02:00

107 lines
2.4 KiB
C++

#pragma once
#include <list>
#include <map>
#include <unordered_map>
#include "types.hh"
namespace nix {
/* A Symbol is the stored text of one identifier or attribute name used
   by the parser and evaluator. SymbolTable::create() turns a string
   into a SymbolIdx handle and returns the already existing handle when
   the same string is seen again, so handles can be compared by their
   integer ids instead of by comparing the strings themselves. */
class Symbol
{
    friend class SymbolTable;

private:
    // Owned copy of the symbol's text.
    std::string s;

public:
    /* Builds a symbol holding a copy of `text`. */
    Symbol(std::string_view text) : s(text) { }

    // FIXME: remove
    /* Character-wise comparison of the stored text against `other`. */
    bool operator == (std::string_view other) const
    {
        return std::string_view(s) == other;
    }

    /* Implicit access to the stored string; the reference points into
       this object. */
    operator const std::string & () const
    {
        return s;
    }

    /* Implicit non-owning view of the stored string; the view points
       into this object. */
    operator const std::string_view () const
    {
        return std::string_view(s);
    }

    // Stream output; defined outside of this class.
    friend std::ostream & operator << (std::ostream & str, const Symbol & sym);
};
/* Small integer handle naming an entry of a SymbolTable. The id 0 is
   reserved to mean "no symbol": a default-constructed SymbolIdx has id 0
   and converts to false. Non-zero handles can only be made by
   SymbolTable, which is a friend of this class. */
class SymbolIdx
{
    friend class SymbolTable;

private:
    uint32_t id;

    // Only reachable from SymbolTable (and from the default constructor).
    explicit SymbolIdx(uint32_t rawId) : id(rawId) { }

public:
    /* Creates the "no symbol" handle (id 0). */
    SymbolIdx() : SymbolIdx(0) { }

    /* True when this handle is not the "no symbol" handle. */
    explicit operator bool() const { return id != 0; }

    /* Ordering and equality are defined purely on the numeric id. */
    bool operator<(const SymbolIdx rhs) const { return rhs.id > id; }
    bool operator==(const SymbolIdx rhs) const { return rhs.id == id; }
    bool operator!=(const SymbolIdx rhs) const { return !(*this == rhs); }
};
/* Maps strings to SymbolIdx handles and back. Each distinct string is
   added to `store` once; asking for the same string again yields the
   handle that was returned the first time. */
class SymbolTable
{
private:
    /* Index from symbol text to (address of the stored Symbol, its
       zero-based position in `store`). The string_view keys are built
       from the stored Symbol, so they view text owned by `store`.
       NOTE(review): this relies on ChunkedVector never relocating its
       elements; that is not visible from this file -- confirm. */
    std::unordered_map<std::string_view, std::pair<const Symbol *, uint32_t>> symbols;

    /* Backing storage for all symbols, addressed by zero-based position. */
    ChunkedVector<Symbol, 8192> store{16};

public:
    /* Returns the handle for `s`, adding `s` to the table if it is not
       present yet. Handles are the zero-based store position plus one,
       which keeps id 0 free for the "no symbol" SymbolIdx. */
    SymbolIdx create(std::string_view s)
    {
        // Most symbols are looked up more than once, so we trade off insertion performance
        // for lookup performance.
        // TODO: could probably be done more efficiently with transparent Hash and Equals
        // on the original implementation using unordered_set
        if (const auto known = symbols.find(s); known != symbols.end()) {
            const uint32_t knownPos = known->second.second;
            return SymbolIdx(knownPos + 1);
        }

        // First sighting: store the symbol, then index it under the
        // text of the stored Symbol rather than under the caller's `s`.
        const auto & [stored, pos] = store.add(s);
        symbols.emplace(stored, std::make_pair(&stored, pos));
        return SymbolIdx(pos + 1);
    }

    /* Resolves a handle to its Symbol. Aborts the process when `s` is
       the "no symbol" handle (id 0) or its id exceeds store.size(). */
    const Symbol & operator[](SymbolIdx s) const
    {
        const bool inRange = s.id != 0 && s.id <= store.size();
        if (!inRange)
            abort();
        // Handles are one-based, store positions zero-based.
        return store[s.id - 1];
    }

    /* Number of symbols in the table, as reported by the store. */
    size_t size() const
    {
        return store.size();
    }

    /* Declared here, defined elsewhere. */
    size_t totalSize() const;

    /* Passes `callback` to the store's forEach. */
    template<typename T>
    void dump(T callback) const
    {
        store.forEach(callback);
    }
};
}