* Scan for references and compute the SHA-256 hash of the output in one pass.  This halves the amount of I/O.
This commit is contained in:
Eelco Dolstra 2009-03-28 20:51:33 +00:00
parent c7152c8f97
commit 6e946c8e72
3 changed files with 51 additions and 90 deletions

View file

@ -1864,15 +1864,17 @@ void DerivationGoal::computeClosure()
/* Get rid of all weird permissions. */ /* Get rid of all weird permissions. */
canonicalisePathMetaData(path); canonicalisePathMetaData(path);
/* For this output path, find the references to other paths contained /* For this output path, find the references to other paths
in it. */ contained in it. Compute the SHA-256 NAR hash at the same
PathSet references = scanForReferences(path, allPaths); time. The hash is stored in the database so that we can
verify later on whether nobody has messed with the store. */
Hash hash;
PathSet references = scanForReferences(path, allPaths, hash);
contentHashes[path] = hash;
/* For debugging, print out the referenced and unreferenced /* For debugging, print out the referenced and unreferenced
paths. */ paths. */
for (PathSet::iterator i = inputPaths.begin(); foreach (PathSet::iterator, i, inputPaths) {
i != inputPaths.end(); ++i)
{
PathSet::iterator j = references.find(*i); PathSet::iterator j = references.find(*i);
if (j == references.end()) if (j == references.end())
debug(format("unreferenced input: `%1%'") % *i); debug(format("unreferenced input: `%1%'") % *i);
@ -1892,12 +1894,6 @@ void DerivationGoal::computeClosure()
if (allowed.find(*i) == allowed.end()) if (allowed.find(*i) == allowed.end())
throw BuildError(format("output is not allowed to refer to path `%1%'") % *i); throw BuildError(format("output is not allowed to refer to path `%1%'") % *i);
} }
/* Hash the contents of the path. The hash is stored in the
database so that we can verify later on whether nobody has
messed with the store. !!! inefficient: it would be nice
if we could combine this with filterReferences(). */
contentHashes[path] = hashPath(htSHA256, path);
} }
/* Register each output path as valid, and register the sets of /* Register each output path as valid, and register the sets of

View file

@ -1,18 +1,10 @@
#include "references.hh" #include "references.hh"
#include "hash.hh" #include "hash.hh"
#include "util.hh" #include "util.hh"
#include "archive.hh"
#include <cerrno>
#include <cstring>
#include <cstdlib>
#include <map> #include <map>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <dirent.h>
#include <fcntl.h>
namespace nix { namespace nix {
@ -20,8 +12,8 @@ namespace nix {
static unsigned int refLength = 32; /* characters */ static unsigned int refLength = 32; /* characters */
static void search(size_t len, const unsigned char * s, static void search(const unsigned char * s, unsigned int len,
StringSet & ids, StringSet & seen) StringSet & hashes, StringSet & seen)
{ {
static bool initialised = false; static bool initialised = false;
static bool isBase32[256]; static bool isBase32[256];
@ -43,93 +35,60 @@ static void search(size_t len, const unsigned char * s,
} }
if (!match) continue; if (!match) continue;
string ref((const char *) s + i, refLength); string ref((const char *) s + i, refLength);
if (ids.find(ref) != ids.end()) { if (hashes.find(ref) != hashes.end()) {
debug(format("found reference to `%1%' at offset `%2%'") debug(format("found reference to `%1%' at offset `%2%'")
% ref % i); % ref % i);
seen.insert(ref); seen.insert(ref);
ids.erase(ref); hashes.erase(ref);
} }
++i; ++i;
} }
} }
void checkPath(const string & path, struct RefScanSink : Sink
StringSet & ids, StringSet & seen)
{ {
checkInterrupt(); HashSink hashSink;
StringSet hashes;
StringSet seen;
debug(format("checking `%1%'") % path); string tail;
struct stat st; RefScanSink() : hashSink(htSHA256) { }
if (lstat(path.c_str(), &st))
throw SysError(format("getting attributes of path `%1%'") % path);
if (S_ISDIR(st.st_mode)) { void operator () (const unsigned char * data, unsigned int len);
Strings names = readDirectory(path); };
for (Strings::iterator i = names.begin(); i != names.end(); i++) {
search(i->size(), (const unsigned char *) i->c_str(), ids, seen);
checkPath(path + "/" + *i, ids, seen);
}
}
else if (S_ISREG(st.st_mode)) {
AutoCloseFD fd = open(path.c_str(), O_RDONLY); void RefScanSink::operator () (const unsigned char * data, unsigned int len)
if (fd == -1) throw SysError(format("opening file `%1%'") % path); {
hashSink(data, len);
size_t bufSize = 1024 * 1024; /* It's possible that a reference spans the previous and current
assert(refLength <= bufSize); fragment, so search in the concatenation of the tail of the
unsigned char * buf = new unsigned char[bufSize]; previous fragment and the start of the current fragment. */
string s = tail + string((const char *) data, len > refLength ? refLength : len);
search((const unsigned char *) s.c_str(), s.size(), hashes, seen);
size_t left = st.st_size; search(data, len, hashes, seen);
bool firstBlock = true;
while (left > 0) { unsigned int tailLen = len <= refLength ? len : refLength;
checkInterrupt(); tail =
string(tail, tail.size() < refLength - tailLen ? 0 : tail.size() - (refLength - tailLen)) +
size_t read = left > bufSize ? bufSize : left; string((const char *) data + len - tailLen, tailLen);
size_t copiedBytes = 0;
if (!firstBlock) {
/* Move the last (refLength - 1) bytes from the last
block to the start of the buffer to deal with
references that cross block boundaries. */
copiedBytes = refLength - 1;
if (read + copiedBytes > bufSize)
read -= copiedBytes;
memcpy(buf, buf + (bufSize - copiedBytes), copiedBytes);
}
firstBlock = false;
readFull(fd, buf + copiedBytes, read);
left -= read;
search(copiedBytes + read, buf, ids, seen);
}
delete[] buf; /* !!! autodelete */
}
else if (S_ISLNK(st.st_mode)) {
string target = readLink(path);
search(target.size(), (const unsigned char *) target.c_str(), ids, seen);
}
else throw Error(format("unknown file type: %1%") % path);
} }
PathSet scanForReferences(const string & path, const PathSet & paths) PathSet scanForReferences(const string & path,
const PathSet & refs, Hash & hash)
{ {
RefScanSink sink;
std::map<string, Path> backMap; std::map<string, Path> backMap;
StringSet ids;
StringSet seen;
/* For efficiency (and a higher hit rate), just search for the /* For efficiency (and a higher hit rate), just search for the
hash part of the file name. (This assumes that all references hash part of the file name. (This assumes that all references
have the form `HASH-bla'). */ have the form `HASH-bla'). */
for (PathSet::const_iterator i = paths.begin(); i != paths.end(); i++) { foreach (PathSet::const_iterator, i, refs) {
string baseName = baseNameOf(*i); string baseName = baseNameOf(*i);
string::size_type pos = baseName.find('-'); string::size_type pos = baseName.find('-');
if (pos == string::npos) if (pos == string::npos)
@ -138,19 +97,23 @@ PathSet scanForReferences(const string & path, const PathSet & paths)
assert(s.size() == refLength); assert(s.size() == refLength);
assert(backMap.find(s) == backMap.end()); assert(backMap.find(s) == backMap.end());
// parseHash(htSHA256, s); // parseHash(htSHA256, s);
ids.insert(s); sink.hashes.insert(s);
backMap[s] = *i; backMap[s] = *i;
} }
checkPath(path, ids, seen); /* Look for the hashes in the NAR dump of the path. */
dumpPath(path, sink);
/* Map the hashes found back to their store paths. */
PathSet found; PathSet found;
for (StringSet::iterator i = seen.begin(); i != seen.end(); i++) { foreach (StringSet::iterator, i, sink.seen) {
std::map<string, Path>::iterator j; std::map<string, Path>::iterator j;
if ((j = backMap.find(*i)) == backMap.end()) abort(); if ((j = backMap.find(*i)) == backMap.end()) abort();
found.insert(j->second); found.insert(j->second);
} }
hash = sink.hashSink.finish();
return found; return found;
} }

View file

@ -2,10 +2,12 @@
#define __REFERENCES_H #define __REFERENCES_H
#include "types.hh" #include "types.hh"
#include "hash.hh"
namespace nix { namespace nix {
PathSet scanForReferences(const Path & path, const PathSet & refs); PathSet scanForReferences(const Path & path, const PathSet & refs,
Hash & hash);
} }