lix/src/libstore/references.cc

#include "references.hh"
#include "hash.hh"
#include "util.hh"
#include "archive.hh"

#include <map>
#include <cstdlib>


namespace nix {


/* Length, in characters, of the hash part that is searched for.
   Declared constexpr because nothing in this file ever writes to it. */
static constexpr unsigned int refLength = 32; /* characters */


static void search(const unsigned char * s, size_t len,
    StringSet & hashes, StringSet & seen)
{
    static bool initialised = false;
    static bool isBase32[256];
    if (!initialised) {
        for (unsigned int i = 0; i < 256; ++i) isBase32[i] = false;
        for (unsigned int i = 0; i < base32Chars.size(); ++i)
            isBase32[(unsigned char) base32Chars[i]] = true;
        initialised = true;
    }

    for (size_t i = 0; i + refLength <= len; ) {
        int j;
        bool match = true;
        for (j = refLength - 1; j >= 0; --j)
            if (!isBase32[(unsigned char) s[i + j]]) {
                i += j + 1;
                match = false;
                break;
            }
        if (!match) continue;
        string ref((const char *) s + i, refLength);
        if (hashes.erase(ref)) {
            debug(format("found reference to '%1%' at offset '%2%'")
                  % ref % i);
            seen.insert(ref);
        }
        ++i;
    }
}


/* Sink that feeds every fragment it receives to a SHA-256 HashSink
   and also scans the data for the candidate strings in `hashes`. */
struct RefScanSink : Sink
{
    /* Receives every fragment unchanged (see operator ()). */
    HashSink hashSink;
    /* Candidates still being looked for; search() erases an entry
       once it has been found. */
    StringSet hashes;
    /* Candidates that search() has found so far. */
    StringSet seen;

    /* Trailing bytes (at most refLength) of the data seen so far,
       kept so that a candidate straddling two fragments is still
       detected. */
    string tail;

    RefScanSink() : hashSink(htSHA256) { }

    void operator () (const unsigned char * data, size_t len);
};


/* Consume one fragment: hash it, scan it for candidates, and remember
   its trailing bytes for the next call. */
void RefScanSink::operator () (const unsigned char * data, size_t len)
{
    hashSink(data, len);

    const char * bytes = (const char *) data;

    /* Number of bytes of this fragment that take part in the boundary
       handling: the whole fragment, capped at refLength. */
    const size_t edge = len > refLength ? refLength : len;

    /* A reference may straddle the previous and the current fragment,
       so first scan the remembered tail glued to the start of the
       current fragment. */
    string joined(tail);
    joined.append(bytes, edge);
    search((const unsigned char *) joined.data(), joined.size(), hashes, seen);

    /* Then scan the fragment on its own. */
    search(data, len, hashes, seen);

    /* New tail: the last (refLength - edge) bytes of the old tail (or
       all of it, if shorter) followed by the last `edge` bytes of the
       current fragment. */
    const size_t carry = refLength - edge;
    const size_t dropCount = tail.size() < carry ? 0 : tail.size() - carry;
    string next = tail.substr(dropCount);
    next.append(bytes + len - edge, edge);
    tail.swap(next);
}


/* Dump `path` through a RefScanSink and return the subset of `refs`
   whose hash part occurs in the dump.  `hash` receives the result of
   the sink's HashSink.  Throws Error for a reference whose base name
   contains no '-'. */
PathSet scanForReferences(const string & path,
    const PathSet & refs, HashResult & hash)
{
    RefScanSink sink;

    /* Hash part -> full reference, to translate hits back. */
    std::map<string, Path> backMap;

    /* Only the hash part of each file name is searched for, which is
       cheaper and gives a higher hit rate.  (This assumes that all
       references have the form `HASH-bla'). */
    for (auto & ref : refs) {
        auto name = std::string(baseNameOf(ref));
        auto dash = name.find('-');
        if (dash == string::npos)
            throw Error("bad reference '%1%'", ref);
        string hashPart = name.substr(0, dash);
        assert(hashPart.size() == refLength);
        assert(backMap.find(hashPart) == backMap.end());
        sink.hashes.insert(hashPart);
        backMap[hashPart] = ref;
    }

    /* Scan the NAR dump of the path for the hash parts. */
    dumpPath(path, sink);

    /* Translate every hash part that was seen back to its reference. */
    PathSet found;
    for (auto & hashPart : sink.seen) {
        auto entry = backMap.find(hashPart);
        if (entry == backMap.end()) abort();
        found.insert(entry->second);
    }

    hash = sink.hashSink.finish();

    return found;
}


/* Set up a sink that replaces `from` by `to` before handing data on
   to `nextSink`. */
RewritingSink::RewritingSink(const std::string & from, const std::string & to, Sink & nextSink)
    : from(from)
    , to(to)
    , nextSink(nextSink)
{
    /* The replacement has to be exactly as long as the text it
       replaces. */
    assert(to.size() == from.size());
}

/* Consume one fragment: replace every occurrence of `from` by `to`,
   record the absolute offset of each occurrence in `matches`, and
   forward the result to `nextSink`.

   The last from.size() - 1 bytes are held back in `prev` (and not yet
   forwarded or counted in `pos`) because they may be the beginning of
   an occurrence that is completed by the next fragment.  flush()
   forwards whatever is still held back. */
void RewritingSink::operator () (const unsigned char * data, size_t len)
{
    std::string s(prev);
    s.append((const char *) data, len);

    /* An empty `from` would match at every offset, so there is nothing
       meaningful to rewrite or record in that case. */
    if (!from.empty()) {
        size_t j = 0;
        while ((j = s.find(from, j)) != string::npos) {
            /* `pos` counts the bytes already forwarded and `s` starts
               with the held-back bytes, so pos + j is the offset in
               the overall stream. */
            matches.push_back(pos + j);
            s.replace(j, from.size(), to);
            /* Step past the start of this occurrence.  When `to`
               differs from `from` this finds exactly the same matches
               as restarting at j, but it also terminates when the two
               strings are identical. */
            ++j;
        }
    }

    /* Number of trailing bytes that could still start an occurrence. */
    const size_t keep = from.empty() ? 0 : from.size() - 1;
    prev = s.size() <= keep ? s : std::string(s, s.size() - keep);

    auto consumed = s.size() - prev.size();

    pos += consumed;

    if (consumed) nextSink((unsigned char *) s.data(), consumed);
}

/* Forward the bytes that operator () is still holding back in `prev`,
   add them to `pos`, and empty `prev`.  Does nothing when no bytes
   are held back. */
void RewritingSink::flush()
{
    if (!prev.empty()) {
        pos += prev.size();
        nextSink((unsigned char *) prev.data(), prev.size());
        prev.clear();
    }
}

/* Hash with algorithm `ht`, after every occurrence of `modulus` in the
   input has been replaced by an equally long run of NUL bytes. */
HashModuloSink::HashModuloSink(HashType ht, const std::string & modulus)
    : hashSink(ht)
    , rewritingSink(
          modulus,
          std::string(modulus.size(), '\0'),
          hashSink)
{
}

/* Forward the fragment to rewritingSink, which the constructor set up
   with hashSink as its downstream sink. */
void HashModuloSink::operator () (const unsigned char * data, size_t len)
{
    rewritingSink(data, len);
}

/* Finish hashing.  Returns the hash produced by hashSink together
   with the byte count tracked by rewritingSink. */
HashResult HashModuloSink::finish()
{
    /* Push out the bytes the rewriting sink is still holding back. */
    rewritingSink.flush();

    /* Also hash the offset of every self-reference, so that a NAR
       with self-references and a NAR in which some of them were
       already zeroed out do not produce a hash collision.
       FIXME: proof. */
    for (const auto & matchPos : rewritingSink.matches) {
        auto encoded = fmt("|%d", matchPos);
        hashSink((unsigned char *) encoded.data(), encoded.size());
    }

    auto digest = hashSink.finish();
    return {digest.first, rewritingSink.pos};
}

}
* Use a proper namespace. * Optimise header file usage a bit. * Compile the parser as C++. 2006-09-04 21:06:23 +00:00			`#include "references.hh"`
			`#include "hash.hh"`
			`#include "util.hh"`
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`#include "archive.hh"`
* Use a proper namespace. * Optimise header file usage a bit. * Compile the parser as C++. 2006-09-04 21:06:23 +00:00
* The new normaliser now passes the unit tests. 2003-07-15 21:24:05 +00:00			`#include <map>`
2009-03-30 19:35:55 +00:00			`#include <cstdlib>`
* The new normaliser now passes the unit tests. 2003-07-15 21:24:05 +00:00
* Use a proper namespace. * Optimise header file usage a bit. * Compile the parser as C++. 2006-09-04 21:06:23 +00:00
			`namespace nix {`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00

* Did something useful while waiting at IAD: reference scanning is now much faster. 2005-11-16 08:27:06 +00:00			`static unsigned int refLength = 32; /* characters */`


Fix some random -Wconversion warnings 2018-05-02 11:56:34 +00:00			`static void search(const unsigned char * s, size_t len,`
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`StringSet & hashes, StringSet & seen)`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00			`{`
* Did something useful while waiting at IAD: reference scanning is now much faster. 2005-11-16 08:27:06 +00:00			`static bool initialised = false;`
			`static bool isBase32[256];`
			`if (!initialised) {`
			`for (unsigned int i = 0; i < 256; ++i) isBase32[i] = false;`
			`for (unsigned int i = 0; i < base32Chars.size(); ++i)`
			`isBase32[(unsigned char) base32Chars[i]] = true;`
			`initialised = true;`
			`}`
OCD: foreach -> C++11 ranged for 2015-07-17 17:24:28 +00:00
Fix some random -Wconversion warnings 2018-05-02 11:56:34 +00:00			`for (size_t i = 0; i + refLength <= len; ) {`
* Did something useful while waiting at IAD: reference scanning is now much faster. 2005-11-16 08:27:06 +00:00			`int j;`
			`bool match = true;`
			`for (j = refLength - 1; j >= 0; --j)`
			`if (!isBase32[(unsigned char) s[i + j]]) {`
			`i += j + 1;`
			`match = false;`
			`break;`
			`}`
			`if (!match) continue;`
* Don't allocate the buffer twice. 2006-09-22 11:28:23 +00:00			`string ref((const char *) s + i, refLength);`
OCD performance fix: {find,count}+insert => insert 2019-10-09 13:51:52 +00:00			`if (hashes.erase(ref)) {`
Replace Unicode quotes in user-facing strings by ASCII Relevant RFC: NixOS/rfcs#4 $ ag -l \| xargs sed -i -e "/\"/s/’/'/g;/\"/s/‘/'/g" 2017-07-30 11:27:57 +00:00			`debug(format("found reference to '%1%' at offset '%2%'")`
* Did something useful while waiting at IAD: reference scanning is now much faster. 2005-11-16 08:27:06 +00:00			`% ref % i);`
			`seen.insert(ref);`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00			`}`
* Did something useful while waiting at IAD: reference scanning is now much faster. 2005-11-16 08:27:06 +00:00			`++i;`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00			`}`
			`}`


* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`struct RefScanSink : Sink`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00			`{`
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`HashSink hashSink;`
			`StringSet hashes;`
			`StringSet seen;`
* Input sources should be in the set of all referenceable paths too. 2005-02-11 16:03:47 +00:00
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`string tail;`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00
Revert the `enum struct` change Not a regular git revert as there have been many merges and things. 2020-06-18 22:09:22 +00:00			`RefScanSink() : hashSink(htSHA256) { }`
OCD: foreach -> C++11 ranged for 2015-07-17 17:24:28 +00:00
* Refactoring: move sink/source buffering into separate classes. * Buffer the HashSink. This speeds up hashing a bit because it prevents lots of calls to the hash update functions (e.g. nix-hash went from 9.3s to 8.7s of user time on the closure of my /var/run/current-system). 2011-12-15 16:19:53 +00:00			`void operator () (const unsigned char * data, size_t len);`
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`};`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00

* Refactoring: move sink/source buffering into separate classes. * Buffer the HashSink. This speeds up hashing a bit because it prevents lots of calls to the hash update functions (e.g. nix-hash went from 9.3s to 8.7s of user time on the closure of my /var/run/current-system). 2011-12-15 16:19:53 +00:00			`void RefScanSink::operator () (const unsigned char * data, size_t len)`
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`{`
			`hashSink(data, len);`
* Don't allocate more than SIZE_MAX bytes. 2006-09-22 11:13:35 +00:00
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`/* It's possible that a reference spans the previous and current`
			`fragment, so search in the concatenation of the tail of the`
			`previous fragment and the start of the current fragment. */`
			`string s = tail + string((const char *) data, len > refLength ? refLength : len);`
Use data() instead of c_str() where appropriate 2012-02-09 17:27:45 +00:00			`search((const unsigned char *) s.data(), s.size(), hashes, seen);`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`search(data, len, hashes, seen);`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00
Fix some random -Wconversion warnings 2018-05-02 11:56:34 +00:00			`size_t tailLen = len <= refLength ? len : refLength;`
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`tail =`
			`string(tail, tail.size() < refLength - tailLen ? 0 : tail.size() - (refLength - tailLen)) +`
			`string((const char *) data + len - tailLen, tailLen);`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00			`}`


* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`PathSet scanForReferences(const string & path,`
* Store the size of a store path in the database (to be precise, the size of the NAR serialisation of the path, i.e., `nix-store --dump PATH'). This is useful for Hydra. 2010-11-16 17:11:46 +00:00			`const PathSet & refs, HashResult & hash)`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00			`{`
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`RefScanSink sink;`
* Use a proper namespace. * Optimise header file usage a bit. * Compile the parser as C++. 2006-09-04 21:06:23 +00:00			`std::map<string, Path> backMap;`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00
			`/* For efficiency (and a higher hit rate), just search for the`
			`hash part of the file name. (This assumes that all references`
			have the form `HASH-bla'). */
OCD: foreach -> C++11 ranged for 2015-07-17 17:24:28 +00:00			`for (auto & i : refs) {`
Make the Store API more type-safe Most functions now take a StorePath argument rather than a Path (which is just an alias for std::string). The StorePath constructor ensures that the path is syntactically correct (i.e. it looks like <store-dir>/<base32-hash>-<name>). Similarly, functions like buildPaths() now take a StorePathWithOutputs, rather than abusing Path by adding a '!<outputs>' suffix. Note that the StorePath type is implemented in Rust. This involves some hackery to allow Rust values to be used directly in C++, via a helper type whose destructor calls the Rust type's drop() function. The main issue is the dynamic nature of C++ move semantics: after we have moved a Rust value, we should not call the drop function on the original value. So when we move a value, we set the original value to bitwise zero, and the destructor only calls drop() if the value is not bitwise zero. This should be sufficient for most types. Also lots of minor cleanups to the C++ API to make it more modern (e.g. using std::optional and std::string_view in some places). 2019-12-05 18:11:09 +00:00			`auto baseName = std::string(baseNameOf(i));`
* 64-bit compatibility fixes (for problems revealed by building on an Athlon 64 running 64-bit SUSE). A patched ATerm library is required to run Nix succesfully. 2006-05-11 02:19:43 +00:00			`string::size_type pos = baseName.find('-');`
* Shorten SHA-256 hashes used in store path name generation to 160 bits, then encode them in a radix-32 representation (using digits and letters except e, o, u, and t). This produces store paths like /nix/store/4i0zb0z7f88mwghjirkz702a71dcfivn-aterm-2.3.1. The nice thing about this is that the hash part of the file name is still 32 characters, as before with MD5. (Of course, shortening SHA-256 to 160 bits makes it no better than SHA-160 in theory, but hopefully it's a bit more resistant to attacks; it's certainly a lot slower.) 2005-01-14 16:04:03 +00:00			`if (pos == string::npos)`
remove 'format' from Error constructor calls 2020-04-21 23:07:07 +00:00			`throw Error("bad reference '%1%'", i);`
* Shorten SHA-256 hashes used in store path name generation to 160 bits, then encode them in a radix-32 representation (using digits and letters except e, o, u, and t). This produces store paths like /nix/store/4i0zb0z7f88mwghjirkz702a71dcfivn-aterm-2.3.1. The nice thing about this is that the hash part of the file name is still 32 characters, as before with MD5. (Of course, shortening SHA-256 to 160 bits makes it no better than SHA-160 in theory, but hopefully it's a bit more resistant to attacks; it's certainly a lot slower.) 2005-01-14 16:04:03 +00:00			`string s = string(baseName, 0, pos);`
* Did something useful while waiting at IAD: reference scanning is now much faster. 2005-11-16 08:27:06 +00:00			`assert(s.size() == refLength);`
			`assert(backMap.find(s) == backMap.end());`
Revert the `enum struct` change Not a regular git revert as there have been many merges and things. 2020-06-18 22:09:22 +00:00			`// parseHash(htSHA256, s);`
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`sink.hashes.insert(s);`
OCD: foreach -> C++11 ranged for 2015-07-17 17:24:28 +00:00			`backMap[s] = i;`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00			`}`

* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`/* Look for the hashes in the NAR dump of the path. */`
			`dumpPath(path, sink);`
* The new normaliser now passes the unit tests. 2003-07-15 21:24:05 +00:00
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`/* Map the hashes found back to their store paths. */`
* Did something useful while waiting at IAD: reference scanning is now much faster. 2005-11-16 08:27:06 +00:00			`PathSet found;`
OCD: foreach -> C++11 ranged for 2015-07-17 17:24:28 +00:00			`for (auto & i : sink.seen) {`
* Use a proper namespace. * Optimise header file usage a bit. * Compile the parser as C++. 2006-09-04 21:06:23 +00:00			`std::map<string, Path>::iterator j;`
OCD: foreach -> C++11 ranged for 2015-07-17 17:24:28 +00:00			`if ((j = backMap.find(i)) == backMap.end()) abort();`
* Did something useful while waiting at IAD: reference scanning is now much faster. 2005-11-16 08:27:06 +00:00			`found.insert(j->second);`
* The new normaliser now passes the unit tests. 2003-07-15 21:24:05 +00:00			`}`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00			`hash = sink.hashSink.finish();`
OCD: foreach -> C++11 ranged for 2015-07-17 17:24:28 +00:00
* The new normaliser now passes the unit tests. 2003-07-15 21:24:05 +00:00			`return found;`
* After building, scan for actual file system references as opposed to declared references. This prunes the reference graph, thus allowing better garbage collection and more efficient derivate distribution. 2003-07-14 10:23:11 +00:00			`}`
* Use a proper namespace. * Optimise header file usage a bit. * Compile the parser as C++. 2006-09-04 21:06:23 +00:00
* Scan for references and compute the SHA-256 hash of the output in one pass. This halves the amount of I/O. 2009-03-28 20:51:33 +00:00
Allow content-addressable paths to have references This adds a command 'nix make-content-addressable' that rewrites the specified store paths into content-addressable paths. The advantage of such paths is that 1) they can be imported without signatures; 2) they can enable deduplication in cases where derivation changes do not cause output changes (apart from store path hashes). For example, $ nix make-content-addressable -r nixpkgs.cowsay rewrote '/nix/store/g1g31ah55xdia1jdqabv1imf6mcw0nb1-glibc-2.25-49' to '/nix/store/48jfj7bg78a8n4f2nhg269rgw1936vj4-glibc-2.25-49' ... rewrote '/nix/store/qbi6rzpk0bxjw8lw6azn2mc7ynnn455q-cowsay-3.03+dfsg1-16' to '/nix/store/iq6g2x4q62xp7y7493bibx0qn5w7xz67-cowsay-3.03+dfsg1-16' We can then copy the resulting closure to another store without signatures: $ nix copy --trusted-public-keys '' ---to ~/my-nix /nix/store/iq6g2x4q62xp7y7493bibx0qn5w7xz67-cowsay-3.03+dfsg1-16 In order to support self-references in content-addressable paths, these paths are hashed "modulo" self-references, meaning that self-references are zeroed out during hashing. Somewhat annoyingly, this means that the NAR hash stored in the Nix database is no longer necessarily equal to the output of "nix hash-path"; for content-addressable paths, you need to pass the --modulo flag: $ nix path-info --json /nix/store/iq6g2x4q62xp7y7493bibx0qn5w7xz67-cowsay-3.03+dfsg1-16 \| jq -r .[].narHash sha256:0ri611gdilz2c9rsibqhsipbfs9vwcqvs811a52i2bnkhv7w9mgw $ nix hash-path --type sha256 --base32 /nix/store/iq6g2x4q62xp7y7493bibx0qn5w7xz67-cowsay-3.03+dfsg1-16 1ggznh07khq0hz6id09pqws3a8q9pn03ya3c03nwck1kwq8rclzs $ nix hash-path --type sha256 --base32 /nix/store/iq6g2x4q62xp7y7493bibx0qn5w7xz67-cowsay-3.03+dfsg1-16 --modulo iq6g2x4q62xp7y7493bibx0qn5w7xz67 0ri611gdilz2c9rsibqhsipbfs9vwcqvs811a52i2bnkhv7w9mgw 2018-03-29 22:56:13 +00:00			`RewritingSink::RewritingSink(const std::string & from, const std::string & to, Sink & nextSink)`
			`: from(from), to(to), nextSink(nextSink)`
			`{`
			`assert(from.size() == to.size());`
			`}`

			`void RewritingSink::operator () (const unsigned char * data, size_t len)`
			`{`
			`std::string s(prev);`
			`s.append((const char *) data, len);`

			`size_t j = 0;`
			`while ((j = s.find(from, j)) != string::npos) {`
			`matches.push_back(pos + j);`
			`s.replace(j, from.size(), to);`
			`}`

			`prev = s.size() < from.size() ? s : std::string(s, s.size() - from.size() + 1, from.size() - 1);`

			`auto consumed = s.size() - prev.size();`

			`pos += consumed;`

			`if (consumed) nextSink((unsigned char *) s.data(), consumed);`
			`}`

			`void RewritingSink::flush()`
			`{`
			`if (prev.empty()) return;`
			`pos += prev.size();`
			`nextSink((unsigned char *) prev.data(), prev.size());`
			`prev.clear();`
			`}`

			`HashModuloSink::HashModuloSink(HashType ht, const std::string & modulus)`
			`: hashSink(ht)`
			`, rewritingSink(modulus, std::string(modulus.size(), 0), hashSink)`
			`{`
			`}`

			`void HashModuloSink::operator () (const unsigned char * data, size_t len)`
			`{`
			`rewritingSink(data, len);`
			`}`

			`HashResult HashModuloSink::finish()`
			`{`
			`rewritingSink.flush();`

			`/* Hash the positions of the self-references. This ensures that a`
			`NAR with self-references and a NAR with some of the`
			`self-references already zeroed out do not produce a hash`
			`collision. FIXME: proof. */`
			`for (auto & pos : rewritingSink.matches) {`
			`auto s = fmt("\|%d", pos);`
			`hashSink((unsigned char *) s.data(), s.size());`
			`}`

			`auto h = hashSink.finish();`
			`return {h.first, rewritingSink.pos};`
			`}`

* Use a proper namespace. * Optimise header file usage a bit. * Compile the parser as C++. 2006-09-04 21:06:23 +00:00			`}`