* Scan for references and compute the SHA-256 hash of the output in one pass.  This halves the amount of I/O.
This commit is contained in:
Eelco Dolstra 2009-03-28 20:51:33 +00:00
parent c7152c8f97
commit 6e946c8e72
3 changed files with 51 additions and 90 deletions

View file

@ -1864,15 +1864,17 @@ void DerivationGoal::computeClosure()
/* Get rid of all weird permissions. */ /* Get rid of all weird permissions. */
canonicalisePathMetaData(path); canonicalisePathMetaData(path);
/* For this output path, find the references to other paths contained /* For this output path, find the references to other paths
in it. */ contained in it. Compute the SHA-256 NAR hash at the same
PathSet references = scanForReferences(path, allPaths); time. The hash is stored in the database so that we can
verify later on whether nobody has messed with the store. */
Hash hash;
PathSet references = scanForReferences(path, allPaths, hash);
contentHashes[path] = hash;
/* For debugging, print out the referenced and unreferenced /* For debugging, print out the referenced and unreferenced
paths. */ paths. */
for (PathSet::iterator i = inputPaths.begin(); foreach (PathSet::iterator, i, inputPaths) {
i != inputPaths.end(); ++i)
{
PathSet::iterator j = references.find(*i); PathSet::iterator j = references.find(*i);
if (j == references.end()) if (j == references.end())
debug(format("unreferenced input: `%1%'") % *i); debug(format("unreferenced input: `%1%'") % *i);
@ -1892,12 +1894,6 @@ void DerivationGoal::computeClosure()
if (allowed.find(*i) == allowed.end()) if (allowed.find(*i) == allowed.end())
throw BuildError(format("output is not allowed to refer to path `%1%'") % *i); throw BuildError(format("output is not allowed to refer to path `%1%'") % *i);
} }
/* Hash the contents of the path. The hash is stored in the
database so that we can verify later on whether nobody has
messed with the store. !!! inefficient: it would be nice
if we could combine this with filterReferences(). */
contentHashes[path] = hashPath(htSHA256, path);
} }
/* Register each output path as valid, and register the sets of /* Register each output path as valid, and register the sets of

View file

@ -1,18 +1,10 @@
#include "references.hh" #include "references.hh"
#include "hash.hh" #include "hash.hh"
#include "util.hh" #include "util.hh"
#include "archive.hh"
#include <cerrno>
#include <cstring>
#include <cstdlib>
#include <map> #include <map>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <dirent.h>
#include <fcntl.h>
namespace nix { namespace nix {
@ -20,8 +12,8 @@ namespace nix {
static unsigned int refLength = 32; /* characters */ static unsigned int refLength = 32; /* characters */
static void search(size_t len, const unsigned char * s, static void search(const unsigned char * s, unsigned int len,
StringSet & ids, StringSet & seen) StringSet & hashes, StringSet & seen)
{ {
static bool initialised = false; static bool initialised = false;
static bool isBase32[256]; static bool isBase32[256];
@ -43,93 +35,60 @@ static void search(size_t len, const unsigned char * s,
} }
if (!match) continue; if (!match) continue;
string ref((const char *) s + i, refLength); string ref((const char *) s + i, refLength);
if (ids.find(ref) != ids.end()) { if (hashes.find(ref) != hashes.end()) {
debug(format("found reference to `%1%' at offset `%2%'") debug(format("found reference to `%1%' at offset `%2%'")
% ref % i); % ref % i);
seen.insert(ref); seen.insert(ref);
ids.erase(ref); hashes.erase(ref);
} }
++i; ++i;
} }
} }
void checkPath(const string & path, struct RefScanSink : Sink
StringSet & ids, StringSet & seen)
{ {
checkInterrupt(); HashSink hashSink;
StringSet hashes;
debug(format("checking `%1%'") % path);
struct stat st;
if (lstat(path.c_str(), &st))
throw SysError(format("getting attributes of path `%1%'") % path);
if (S_ISDIR(st.st_mode)) {
Strings names = readDirectory(path);
for (Strings::iterator i = names.begin(); i != names.end(); i++) {
search(i->size(), (const unsigned char *) i->c_str(), ids, seen);
checkPath(path + "/" + *i, ids, seen);
}
}
else if (S_ISREG(st.st_mode)) {
AutoCloseFD fd = open(path.c_str(), O_RDONLY);
if (fd == -1) throw SysError(format("opening file `%1%'") % path);
size_t bufSize = 1024 * 1024;
assert(refLength <= bufSize);
unsigned char * buf = new unsigned char[bufSize];
size_t left = st.st_size;
bool firstBlock = true;
while (left > 0) {
checkInterrupt();
size_t read = left > bufSize ? bufSize : left;
size_t copiedBytes = 0;
if (!firstBlock) {
/* Move the last (refLength - 1) bytes from the last
block to the start of the buffer to deal with
references that cross block boundaries. */
copiedBytes = refLength - 1;
if (read + copiedBytes > bufSize)
read -= copiedBytes;
memcpy(buf, buf + (bufSize - copiedBytes), copiedBytes);
}
firstBlock = false;
readFull(fd, buf + copiedBytes, read);
left -= read;
search(copiedBytes + read, buf, ids, seen);
}
delete[] buf; /* !!! autodelete */
}
else if (S_ISLNK(st.st_mode)) {
string target = readLink(path);
search(target.size(), (const unsigned char *) target.c_str(), ids, seen);
}
else throw Error(format("unknown file type: %1%") % path);
}
PathSet scanForReferences(const string & path, const PathSet & paths)
{
std::map<string, Path> backMap;
StringSet ids;
StringSet seen; StringSet seen;
string tail;
RefScanSink() : hashSink(htSHA256) { }
void operator () (const unsigned char * data, unsigned int len);
};
/* Sink callback: consume one fragment of the byte stream.  The
fragment is passed on to `hashSink' (constructed with htSHA256) and
scanned for the hashes still present in `hashes'; search() moves
every hash it finds from `hashes' to `seen'.

data: start of the fragment.
len:  number of bytes in the fragment. */
void RefScanSink::operator () (const unsigned char * data, unsigned int len)
{
/* Feed the fragment to the hash sink before scanning it. */
hashSink(data, len);
/* It's possible that a reference spans the previous and current
fragment, so search in the concatenation of the tail of the
previous fragment and the start of the current fragment. */
/* Only the first min(len, refLength) bytes of the current fragment
are appended here; the fragment as a whole is scanned separately
below. */
string s = tail + string((const char *) data, len > refLength ? refLength : len);
search((const unsigned char *) s.c_str(), s.size(), hashes, seen);
/* Scan the fragment itself. */
search(data, len, hashes, seen);
/* Update `tail' for the next call.  tailLen = min(len, refLength) is
the number of trailing bytes taken from this fragment. */
unsigned int tailLen = len <= refLength ? len : refLength;
/* New tail = the last (refLength - tailLen) bytes of the old tail
(or the whole old tail if it is shorter than that, hence start
offset 0) followed by the last tailLen bytes of this fragment.
`tail' therefore never grows beyond refLength bytes, and short
fragments accumulate instead of discarding earlier context. */
tail =
string(tail, tail.size() < refLength - tailLen ? 0 : tail.size() - (refLength - tailLen)) +
string((const char *) data + len - tailLen, tailLen);
}
PathSet scanForReferences(const string & path,
const PathSet & refs, Hash & hash)
{
RefScanSink sink;
std::map<string, Path> backMap;
/* For efficiency (and a higher hit rate), just search for the /* For efficiency (and a higher hit rate), just search for the
hash part of the file name. (This assumes that all references hash part of the file name. (This assumes that all references
have the form `HASH-bla'). */ have the form `HASH-bla'). */
for (PathSet::const_iterator i = paths.begin(); i != paths.end(); i++) { foreach (PathSet::const_iterator, i, refs) {
string baseName = baseNameOf(*i); string baseName = baseNameOf(*i);
string::size_type pos = baseName.find('-'); string::size_type pos = baseName.find('-');
if (pos == string::npos) if (pos == string::npos)
@ -138,19 +97,23 @@ PathSet scanForReferences(const string & path, const PathSet & paths)
assert(s.size() == refLength); assert(s.size() == refLength);
assert(backMap.find(s) == backMap.end()); assert(backMap.find(s) == backMap.end());
// parseHash(htSHA256, s); // parseHash(htSHA256, s);
ids.insert(s); sink.hashes.insert(s);
backMap[s] = *i; backMap[s] = *i;
} }
checkPath(path, ids, seen); /* Look for the hashes in the NAR dump of the path. */
dumpPath(path, sink);
/* Map the hashes found back to their store paths. */
PathSet found; PathSet found;
for (StringSet::iterator i = seen.begin(); i != seen.end(); i++) { foreach (StringSet::iterator, i, sink.seen) {
std::map<string, Path>::iterator j; std::map<string, Path>::iterator j;
if ((j = backMap.find(*i)) == backMap.end()) abort(); if ((j = backMap.find(*i)) == backMap.end()) abort();
found.insert(j->second); found.insert(j->second);
} }
hash = sink.hashSink.finish();
return found; return found;
} }

View file

@ -2,10 +2,12 @@
#define __REFERENCES_H #define __REFERENCES_H
#include "types.hh" #include "types.hh"
#include "hash.hh"
namespace nix { namespace nix {
PathSet scanForReferences(const Path & path, const PathSet & refs); PathSet scanForReferences(const Path & path, const PathSet & refs,
Hash & hash);
} }