From a84f503d863fd77de9b6ecf149399c2ca7642b75 Mon Sep 17 00:00:00 2001 From: wmertens Date: Sat, 10 May 2014 15:53:01 +0200 Subject: [PATCH 1/4] Shortcut already-hardlinked files If an inode in the Nix store has more than 1 link, it probably means that it was linked into .links/ by us. If so, skip. There's a possibility that something else hardlinked the file, so it would be nice to be able to override this. Also, by looking at the number of hardlinks for each of the files in .links/, you can get deduplication numbers and space savings. --- src/libstore/optimise-store.cc | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/libstore/optimise-store.cc b/src/libstore/optimise-store.cc index d833f3aa0..1b81f6407 100644 --- a/src/libstore/optimise-store.cc +++ b/src/libstore/optimise-store.cc @@ -71,6 +71,16 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path) return; } + stats.totalFiles++; + + /* If a store inode has 2 or more links we presume that it was + already linked by us */ + /* TODO: allow overriding this behavior */ + if (st.st_nlink > 1) { + printMsg(lvlDebug, format("`%1%' is already linked, with %2% other file(s).") % path % (st.st_nlink - 2)); + return; + } + /* Hash the file. Note that hashPath() returns the hash over the NAR serialisation, which includes the execute bit on the file. Thus, executable and non-executable files with the same @@ -81,7 +91,6 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path) contents of the symlink (i.e. the result of readlink()), not the contents of the target (which may not even exist). */ Hash hash = hashPath(htSHA256, path).first; - stats.totalFiles++; printMsg(lvlDebug, format("`%1%' has hash `%2%'") % path % printHash(hash)); /* Check if this is a known hash. */ From e974f20c9811c3efe09cfca9bda7816f9091c0d5 Mon Sep 17 00:00:00 2001 From: Wout Mertens Date: Tue, 13 May 2014 23:10:06 +0200 Subject: [PATCH 2/4] Preload linked hashes to speed up lookups By preloading all inodes in the /nix/store/.links directory, we can quickly determine of a hardlinked file was already linked to the hashed links. This is tolerant of removing the .links directory, it will simply recalculate all hashes in the store. --- src/libstore/local-store.hh | 14 ++++++++++++- src/libstore/optimise-store.cc | 37 +++++++++++++++++++++++++--------- 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/src/libstore/local-store.hh b/src/libstore/local-store.hh index 09639e74c..71229f7a6 100644 --- a/src/libstore/local-store.hh +++ b/src/libstore/local-store.hh @@ -6,6 +6,11 @@ #include "util.hh" #include "pathlocks.hh" +#if HAVE_TR1_UNORDERED_SET +#include +#endif + + class sqlite3; class sqlite3_stmt; @@ -303,7 +308,14 @@ private: void checkDerivationOutputs(const Path & drvPath, const Derivation & drv); - void optimisePath_(OptimiseStats & stats, const Path & path); +#if HAVE_TR1_UNORDERED_SET + typedef std::tr1::unordered_set Hashes; +#else + typedef std::set Hashes; +#endif + + void loadHashes(Hashes & hashes); + void optimisePath_(OptimiseStats & stats, const Path & path, Hashes & hashes); // Internal versions that are not wrapped in retry_sqlite. bool isValidPath_(const Path & path); diff --git a/src/libstore/optimise-store.cc b/src/libstore/optimise-store.cc index 1b81f6407..78174e177 100644 --- a/src/libstore/optimise-store.cc +++ b/src/libstore/optimise-store.cc @@ -39,8 +39,22 @@ struct MakeReadOnly } }; +// TODO Make this a map and keep count and size stats, for giggles +void LocalStore::loadHashes(Hashes & hashes) +{ + printMsg(lvlDebug, "loading hash inodes in memory"); + Strings names = readDirectory(linksDir); + foreach (Strings::iterator, i, names) { + struct stat st; + string path = linksDir + "/" + *i; + if (lstat(path.c_str(), &st)) + throw SysError(format("getting attributes of path `%1%'") % path); + hashes.insert(st.st_ino); + } + printMsg(lvlDebug, format("loaded %1% hashes") % hashes.size()); +} -void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path) +void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path, Hashes & hashes) { checkInterrupt(); @@ -51,7 +65,7 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path) if (S_ISDIR(st.st_mode)) { Strings names = readDirectory(path); foreach (Strings::iterator, i, names) - optimisePath_(stats, path + "/" + *i); + optimisePath_(stats, path + "/" + *i, hashes); return; } @@ -73,10 +87,7 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path) stats.totalFiles++; - /* If a store inode has 2 or more links we presume that it was - already linked by us */ - /* TODO: allow overriding this behavior */ - if (st.st_nlink > 1) { + if (st.st_nlink > 1 && hashes.count(st.st_ino)) { printMsg(lvlDebug, format("`%1%' is already linked, with %2% other file(s).") % path % (st.st_nlink - 2)); return; } @@ -98,7 +109,10 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path) if (!pathExists(linkPath)) { /* Nope, create a hard link in the links directory. */ - if (link(path.c_str(), linkPath.c_str()) == 0) return; + if (link(path.c_str(), linkPath.c_str()) == 0) { + hashes.insert(st.st_ino); + return; + } if (errno != EEXIST) throw SysError(format("cannot link `%1%' to `%2%'") % linkPath % path); /* Fall through if another process created ‘linkPath’ before @@ -169,12 +183,15 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path) void LocalStore::optimiseStore(OptimiseStats & stats) { PathSet paths = queryAllValidPaths(); + Hashes hashes; + + loadHashes(hashes); foreach (PathSet::iterator, i, paths) { addTempRoot(*i); if (!isValidPath(*i)) continue; /* path was GC'ed, probably */ startNest(nest, lvlChatty, format("hashing files in `%1%'") % *i); - optimisePath_(stats, *i); + optimisePath_(stats, *i, hashes); } } @@ -182,7 +199,9 @@ void LocalStore::optimiseStore(OptimiseStats & stats) void LocalStore::optimisePath(const Path & path) { OptimiseStats stats; - if (settings.autoOptimiseStore) optimisePath_(stats, path); + Hashes hashes; + + if (settings.autoOptimiseStore) optimisePath_(stats, path, hashes); } From d73ffc552f78e0d9048e3bcc1e84452d1e8d2ede Mon Sep 17 00:00:00 2001 From: Wout Mertens Date: Wed, 14 May 2014 22:52:10 +0200 Subject: [PATCH 3/4] Use the inodes given by readdir directly --- src/libstore/local-store.hh | 8 +++---- src/libstore/optimise-store.cc | 38 +++++++++++++++++++--------------- 2 files changed, 25 insertions(+), 21 deletions(-) diff --git a/src/libstore/local-store.hh b/src/libstore/local-store.hh index 71229f7a6..b335092a2 100644 --- a/src/libstore/local-store.hh +++ b/src/libstore/local-store.hh @@ -309,13 +309,13 @@ private: void checkDerivationOutputs(const Path & drvPath, const Derivation & drv); #if HAVE_TR1_UNORDERED_SET - typedef std::tr1::unordered_set Hashes; + typedef std::tr1::unordered_set InodeHash; #else - typedef std::set Hashes; + typedef std::set InodeHash; #endif - void loadHashes(Hashes & hashes); - void optimisePath_(OptimiseStats & stats, const Path & path, Hashes & hashes); + InodeHash loadInodeHash(); + void optimisePath_(OptimiseStats & stats, const Path & path, InodeHash & inodeHash); // Internal versions that are not wrapped in retry_sqlite. bool isValidPath_(const Path & path); diff --git a/src/libstore/optimise-store.cc b/src/libstore/optimise-store.cc index 78174e177..ed4180190 100644 --- a/src/libstore/optimise-store.cc +++ b/src/libstore/optimise-store.cc @@ -39,22 +39,28 @@ struct MakeReadOnly } }; -// TODO Make this a map and keep count and size stats, for giggles -void LocalStore::loadHashes(Hashes & hashes) +LocalStore::InodeHash LocalStore::loadInodeHash() { printMsg(lvlDebug, "loading hash inodes in memory"); - Strings names = readDirectory(linksDir); - foreach (Strings::iterator, i, names) { - struct stat st; - string path = linksDir + "/" + *i; - if (lstat(path.c_str(), &st)) - throw SysError(format("getting attributes of path `%1%'") % path); - hashes.insert(st.st_ino); + InodeHash hashes; + + AutoCloseDir dir = opendir(linksDir.c_str()); + if (!dir) throw SysError(format("opening directory `%1%'") % linksDir); + + struct dirent * dirent; + while (errno = 0, dirent = readdir(dir)) { /* sic */ + checkInterrupt(); + // We don't care if we hit non-hash files, anything goes + hashes.insert(dirent->d_ino); } - printMsg(lvlDebug, format("loaded %1% hashes") % hashes.size()); + if (errno) throw SysError(format("reading directory `%1%'") % linksDir); + + printMsg(lvlInfo, format("loaded %1% hash inodes") % hashes.size()); + + return hashes; } -void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path, Hashes & hashes) +void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path, InodeHash & hashes) { checkInterrupt(); @@ -183,15 +189,13 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path, Hashes void LocalStore::optimiseStore(OptimiseStats & stats) { PathSet paths = queryAllValidPaths(); - Hashes hashes; - - loadHashes(hashes); + InodeHash inodeHash = loadInodeHash(); foreach (PathSet::iterator, i, paths) { addTempRoot(*i); if (!isValidPath(*i)) continue; /* path was GC'ed, probably */ startNest(nest, lvlChatty, format("hashing files in `%1%'") % *i); - optimisePath_(stats, *i, hashes); + optimisePath_(stats, *i, inodeHash); } } @@ -199,9 +203,9 @@ void LocalStore::optimiseStore(OptimiseStats & stats) void LocalStore::optimisePath(const Path & path) { OptimiseStats stats; - Hashes hashes; + InodeHash inodeHash; - if (settings.autoOptimiseStore) optimisePath_(stats, path, hashes); + if (settings.autoOptimiseStore) optimisePath_(stats, path, inodeHash); } From 3b9ea8452f102595874826e349fa38f85c00aa39 Mon Sep 17 00:00:00 2001 From: Wout Mertens Date: Thu, 15 May 2014 09:02:22 +0200 Subject: [PATCH 4/4] Shortcut store files before lstat readdir() already returns the inode numbers, so we don't need to call lstat to know if a file was already linked or not. --- src/libstore/local-store.hh | 1 + src/libstore/optimise-store.cc | 45 +++++++++++++++++++++++++++------- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/src/libstore/local-store.hh b/src/libstore/local-store.hh index b335092a2..487bb711e 100644 --- a/src/libstore/local-store.hh +++ b/src/libstore/local-store.hh @@ -315,6 +315,7 @@ private: #endif InodeHash loadInodeHash(); + Strings readDirectoryIgnoringInodes(const Path & path, const InodeHash & inodeHash, OptimiseStats & stats); void optimisePath_(OptimiseStats & stats, const Path & path, InodeHash & inodeHash); // Internal versions that are not wrapped in retry_sqlite. diff --git a/src/libstore/optimise-store.cc b/src/libstore/optimise-store.cc index ed4180190..3615bc300 100644 --- a/src/libstore/optimise-store.cc +++ b/src/libstore/optimise-store.cc @@ -42,7 +42,7 @@ struct MakeReadOnly LocalStore::InodeHash LocalStore::loadInodeHash() { printMsg(lvlDebug, "loading hash inodes in memory"); - InodeHash hashes; + InodeHash inodeHash; AutoCloseDir dir = opendir(linksDir.c_str()); if (!dir) throw SysError(format("opening directory `%1%'") % linksDir); @@ -51,16 +51,42 @@ LocalStore::InodeHash LocalStore::loadInodeHash() while (errno = 0, dirent = readdir(dir)) { /* sic */ checkInterrupt(); // We don't care if we hit non-hash files, anything goes - hashes.insert(dirent->d_ino); + inodeHash.insert(dirent->d_ino); } if (errno) throw SysError(format("reading directory `%1%'") % linksDir); - printMsg(lvlInfo, format("loaded %1% hash inodes") % hashes.size()); + printMsg(lvlInfo, format("loaded %1% hash inodes") % inodeHash.size()); - return hashes; + return inodeHash; } -void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path, InodeHash & hashes) +Strings LocalStore::readDirectoryIgnoringInodes(const Path & path, const InodeHash & inodeHash, OptimiseStats & stats) +{ + Strings names; + + AutoCloseDir dir = opendir(path.c_str()); + if (!dir) throw SysError(format("opening directory `%1%'") % path); + + struct dirent * dirent; + while (errno = 0, dirent = readdir(dir)) { /* sic */ + checkInterrupt(); + + if (inodeHash.count(dirent->d_ino)) { + printMsg(lvlDebug, format("`%1%' is already linked") % dirent->d_name); + stats.totalFiles++; + continue; + } + + string name = dirent->d_name; + if (name == "." || name == "..") continue; + names.push_back(name); + } + if (errno) throw SysError(format("reading directory `%1%'") % path); + + return names; +} + +void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path, InodeHash & inodeHash) { checkInterrupt(); @@ -69,9 +95,9 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path, InodeHa throw SysError(format("getting attributes of path `%1%'") % path); if (S_ISDIR(st.st_mode)) { - Strings names = readDirectory(path); + Strings names = readDirectoryIgnoringInodes(path, inodeHash, stats); foreach (Strings::iterator, i, names) - optimisePath_(stats, path + "/" + *i, hashes); + optimisePath_(stats, path + "/" + *i, inodeHash); return; } @@ -93,7 +119,8 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path, InodeHa stats.totalFiles++; - if (st.st_nlink > 1 && hashes.count(st.st_ino)) { + /* This can still happen on top-level files */ + if (st.st_nlink > 1 && inodeHash.count(st.st_ino)) { printMsg(lvlDebug, format("`%1%' is already linked, with %2% other file(s).") % path % (st.st_nlink - 2)); return; } @@ -116,7 +143,7 @@ void LocalStore::optimisePath_(OptimiseStats & stats, const Path & path, InodeHa if (!pathExists(linkPath)) { /* Nope, create a hard link in the links directory. */ if (link(path.c_str(), linkPath.c_str()) == 0) { - hashes.insert(st.st_ino); + inodeHash.insert(st.st_ino); return; } if (errno != EEXIST)