From dcf0fb4a0135c91412dee7d19f09a8789db2c1e8 Mon Sep 17 00:00:00 2001 From: Niklas Beisert Date: Sun, 17 Mar 2024 14:10:20 +0100 Subject: [PATCH 1/2] data comparison methods moved to Fileinfo --- Fileinfo.cc | 47 +++++++++++++++++++++++++++++++++++++ Fileinfo.hh | 21 +++++++++++++++++ Rdutil.cc | 67 ++++++++--------------------------------------------- 3 files changed, 78 insertions(+), 57 deletions(-) diff --git a/Fileinfo.cc b/Fileinfo.cc index 46d05e9..a714eaf 100644 --- a/Fileinfo.cc +++ b/Fileinfo.cc @@ -12,6 +12,7 @@ #include //for strerror #include //for file reading #include //for cout etc +#include //for comparison // os #include //for file info @@ -319,3 +320,49 @@ Fileinfo::static_makehardlink(Fileinfo& A, const Fileinfo& B) { return A.makehardlink(B); } + +bool +Fileinfo::cmpSize(const Fileinfo& a, const Fileinfo& b) +{ + return a.size() < b.size(); +} + +bool +Fileinfo::cmpDeviceInode(const Fileinfo& a, const Fileinfo& b) +{ + return std::make_tuple(a.device(), a.inode()) < + std::make_tuple(b.device(), b.inode()); +} + +bool +Fileinfo::cmpDepthName(const Fileinfo& a, const Fileinfo& b) +{ + // inefficient, make it a reference. 
+ return std::make_tuple(a.depth(), a.name()) < + std::make_tuple(b.depth(), b.name()); +} + +bool +Fileinfo::cmpBuffers(const Fileinfo& a, const Fileinfo& b) +{ + return std::memcmp(a.getbyteptr(), b.getbyteptr(), a.getbuffersize()) < 0; +} + +bool +Fileinfo::hasEqualBuffers(const Fileinfo& a, const Fileinfo& b) +{ + return std::memcmp(a.getbyteptr(), b.getbyteptr(), a.getbuffersize()) == 0; +} + +bool +Fileinfo::cmpSizeBuffers(const Fileinfo& a, const Fileinfo& b) +{ + return (a.size() < b.size()) || (a.size() == b.size() && cmpBuffers(a, b)); +} + +bool +Fileinfo::cmpRank(const Fileinfo& a, const Fileinfo& b) +{ + return std::make_tuple(a.get_cmdline_index(), a.depth(), a.getidentity()) < + std::make_tuple(b.get_cmdline_index(), b.depth(), b.getidentity()); +} diff --git a/Fileinfo.hh b/Fileinfo.hh index 73c89f7..3ec14ce 100644 --- a/Fileinfo.hh +++ b/Fileinfo.hh @@ -100,6 +100,27 @@ public: // deletes file A, that is a duplicate of B static int static_deletefile(Fileinfo& A, const Fileinfo& B); + // compares file size + static bool cmpSize(const Fileinfo& a, const Fileinfo& b); + + // compares file device and inode + static bool cmpDeviceInode(const Fileinfo& a, const Fileinfo& b); + + // compares depth and name + static bool cmpDepthName(const Fileinfo& a, const Fileinfo& b); + + // compares buffers + static bool cmpBuffers(const Fileinfo& a, const Fileinfo& b); + + // compares buffers + static bool hasEqualBuffers(const Fileinfo& a, const Fileinfo& b); + + // compares file size then buffers + static bool cmpSizeBuffers(const Fileinfo& a, const Fileinfo& b); + + // compares rank + static bool cmpRank(const Fileinfo& a, const Fileinfo& b); + // sets the deleteflag void setdeleteflag(bool flag) { m_delete = flag; } diff --git a/Rdutil.cc b/Rdutil.cc index f1f2ed7..aaf54fb 100644 --- a/Rdutil.cc +++ b/Rdutil.cc @@ -188,53 +188,6 @@ Rdutil::markitems() } namespace { -bool -cmpDeviceInode(const Fileinfo& a, const Fileinfo& b) -{ - return std::make_tuple(a.device(), 
a.inode()) < - std::make_tuple(b.device(), b.inode()); -} -// compares rank as described in RANKING on man page. -bool -cmpRank(const Fileinfo& a, const Fileinfo& b) -{ - return std::make_tuple(a.get_cmdline_index(), a.depth(), a.getidentity()) < - std::make_tuple(b.get_cmdline_index(), b.depth(), b.getidentity()); -} -bool -cmpDepthName(const Fileinfo& a, const Fileinfo& b) -{ - // inefficient, make it a reference. - return std::make_tuple(a.depth(), a.name()) < - std::make_tuple(b.depth(), b.name()); -} -// compares buffers -bool -cmpBuffers(const Fileinfo& a, const Fileinfo& b) -{ - return std::memcmp(a.getbyteptr(), b.getbyteptr(), a.getbuffersize()) < 0; -} - -#if !defined(NDEBUG) -bool -hasEqualBuffers(const Fileinfo& a, const Fileinfo& b) -{ - return std::memcmp(a.getbyteptr(), b.getbyteptr(), a.getbuffersize()) == 0; -} -#endif - -// compares file size -bool -cmpSize(const Fileinfo& a, const Fileinfo& b) -{ - return a.size() < b.size(); -} -bool -cmpSizeThenBuffer(const Fileinfo& a, const Fileinfo& b) -{ - return (a.size() < b.size()) || (a.size() == b.size() && cmpBuffers(a, b)); -} - /** * goes through first to last, finds ranges of equal elements (determined by * cmp) and invokes callback on each subrange. @@ -286,7 +239,7 @@ int Rdutil::sortOnDeviceAndInode() { - std::sort(m_list.begin(), m_list.end(), cmpDeviceInode); + std::sort(m_list.begin(), m_list.end(), Fileinfo::cmpDeviceInode); return 0; } @@ -296,14 +249,14 @@ Rdutil::sort_on_depth_and_name(std::size_t index_of_first) assert(index_of_first <= m_list.size()); auto it = std::begin(m_list) + static_cast(index_of_first); - std::sort(it, std::end(m_list), cmpDepthName); + std::sort(it, std::end(m_list), Fileinfo::cmpDepthName); } std::size_t Rdutil::removeIdenticalInodes() { // sort list on device and inode. 
- auto cmp = cmpDeviceInode; + auto cmp = Fileinfo::cmpDeviceInode; std::sort(m_list.begin(), m_list.end(), cmp); // loop over ranges of adjacent elements @@ -312,7 +265,7 @@ Rdutil::removeIdenticalInodes() m_list.begin(), m_list.end(), cmp, [](Iterator first, Iterator last) { // let the highest-ranking element not be deleted. do this in order, to be // cache friendly. - auto best = std::min_element(first, last, cmpRank); + auto best = std::min_element(first, last, Fileinfo::cmpRank); std::for_each(first, best, [](Fileinfo& f) { f.setdeleteflag(true); }); best->setdeleteflag(false); std::for_each(best + 1, last, [](Fileinfo& f) { f.setdeleteflag(true); }); @@ -324,7 +277,7 @@ std::size_t Rdutil::removeUniqueSizes() { // sort list on size - auto cmp = cmpSize; + auto cmp = Fileinfo::cmpSize; std::sort(m_list.begin(), m_list.end(), cmp); // loop over ranges of adjacent elements @@ -346,10 +299,10 @@ std::size_t Rdutil::removeUniqSizeAndBuffer() { // sort list on size - const auto cmp = cmpSize; + const auto cmp = Fileinfo::cmpSize; std::sort(m_list.begin(), m_list.end(), cmp); - const auto bufcmp = cmpBuffers; + const auto bufcmp = Fileinfo::cmpBuffers; // loop over ranges of adjacent elements using Iterator = decltype(m_list.begin()); @@ -377,7 +330,7 @@ Rdutil::removeUniqSizeAndBuffer() void Rdutil::markduplicates() { - const auto cmp = cmpSizeThenBuffer; + const auto cmp = Fileinfo::cmpSizeBuffers; assert(std::is_sorted(m_list.begin(), m_list.end(), cmp)); // loop over ranges of adjacent elements @@ -391,7 +344,7 @@ Rdutil::markduplicates() assert(std::distance(first, last) >= 2); // the one with the lowest rank is the original - auto orig = std::min_element(first, last, cmpRank); + auto orig = std::min_element(first, last, Fileinfo::cmpRank); assert(orig != last); // place it first, so later stages will find the original first. 
std::iter_swap(first, orig); @@ -399,7 +352,7 @@ Rdutil::markduplicates() // make sure they are all duplicates assert(last == find_if_not(first, last, [orig](const Fileinfo& a) { - return orig->size() == a.size() && hasEqualBuffers(*orig, a); + return orig->size() == a.size() && Fileinfo::hasEqualBuffers(*orig, a); })); // mark the files with the appropriate tag. From dbf27a1118a8741fd9353ecf756b117ebc8054b4 Mon Sep 17 00:00:00 2001 From: Niklas Beisert Date: Sun, 17 Mar 2024 14:14:39 +0100 Subject: [PATCH 2/2] add mtime, mode and uid/gid distinction --- Fileinfo.cc | 75 ++++++++++++++++++++++++++++++++++++++++++++++++++++- Fileinfo.hh | 21 ++++++++++++++- Rdutil.cc | 6 ++--- rdfind.cc | 4 +-- 4 files changed, 99 insertions(+), 7 deletions(-) diff --git a/Fileinfo.cc b/Fileinfo.cc index a714eaf..81b1440 100644 --- a/Fileinfo.cc +++ b/Fileinfo.cc @@ -103,7 +103,7 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype, } bool -Fileinfo::readfileinfo() +Fileinfo::readfileinfo(bool use_time, bool use_mode, bool use_ids) { struct stat info; m_info.is_file = false; @@ -118,6 +118,11 @@ Fileinfo::readfileinfo() m_info.stat_size = 0; m_info.stat_ino = 0; m_info.stat_dev = 0; + m_info.stat_mtim.tv_sec = 0; + m_info.stat_mtim.tv_nsec = 0; + m_info.stat_mode = 0; + m_info.stat_uid = 0; + m_info.stat_gid = 0; std::cerr << "readfileinfo.cc:Something went wrong when reading file " "info from \"" << m_filename << "\" :" << std::strerror(errno) << std::endl; @@ -129,6 +134,26 @@ Fileinfo::readfileinfo() m_info.stat_ino = info.st_ino; m_info.stat_dev = info.st_dev; + if (use_time) { + m_info.stat_mtim.tv_sec = info.st_mtim.tv_sec; + m_info.stat_mtim.tv_nsec = info.st_mtim.tv_nsec; + } else { + m_info.stat_mtim.tv_sec = 0; + m_info.stat_mtim.tv_nsec = 0; + } + if (use_mode) { + m_info.stat_mode = info.st_mode; + } else { + m_info.stat_mode = 0; + } + if (use_ids) { + m_info.stat_uid = info.st_uid; + m_info.stat_gid = info.st_gid; + } else { + m_info.stat_uid = 0; + 
m_info.stat_gid = 0; + } + m_info.is_file = S_ISREG(info.st_mode); m_info.is_directory = S_ISDIR(info.st_mode); return true; @@ -160,6 +185,11 @@ Fileinfo::Fileinfostat::Fileinfostat() stat_size = 99999; stat_ino = 99999; stat_dev = 99999; + stat_mtim.tv_sec = 99999; + stat_mtim.tv_nsec = 99999; + stat_mode = 99999; + stat_uid = 99999; + stat_gid = 99999; is_file = false; is_directory = false; } @@ -327,6 +357,36 @@ Fileinfo::cmpSize(const Fileinfo& a, const Fileinfo& b) return a.size() < b.size(); } +bool +Fileinfo::cmpMeta(const Fileinfo& a, const Fileinfo& b) +{ + return std::make_tuple(a.m_info.stat_mtim.tv_sec, + a.m_info.stat_mtim.tv_nsec, + a.m_info.stat_mode, + a.m_info.stat_uid, + a.m_info.stat_gid) < + std::make_tuple(b.m_info.stat_mtim.tv_sec, + b.m_info.stat_mtim.tv_nsec, + b.m_info.stat_mode, + b.m_info.stat_uid, + b.m_info.stat_gid); +} + +bool +Fileinfo::hasEqualMeta(const Fileinfo& a, const Fileinfo& b) +{ + return std::make_tuple(a.m_info.stat_mtim.tv_sec, + a.m_info.stat_mtim.tv_nsec, + a.m_info.stat_mode, + a.m_info.stat_uid, + a.m_info.stat_gid) == + std::make_tuple(b.m_info.stat_mtim.tv_sec, + b.m_info.stat_mtim.tv_nsec, + b.m_info.stat_mode, + b.m_info.stat_uid, + b.m_info.stat_gid); +} + bool Fileinfo::cmpDeviceInode(const Fileinfo& a, const Fileinfo& b) { @@ -354,12 +414,25 @@ Fileinfo::hasEqualBuffers(const Fileinfo& a, const Fileinfo& b) return std::memcmp(a.getbyteptr(), b.getbyteptr(), a.getbuffersize()) == 0; } +bool +Fileinfo::cmpSizeMeta(const Fileinfo& a, const Fileinfo& b) +{ + return (a.size() < b.size()) || (a.size() == b.size() && cmpMeta(a, b)); +} + bool Fileinfo::cmpSizeBuffers(const Fileinfo& a, const Fileinfo& b) { return (a.size() < b.size()) || (a.size() == b.size() && cmpBuffers(a, b)); } +bool +Fileinfo::cmpSizeMetaBuffers(const Fileinfo& a, const Fileinfo& b) +{ + return (a.size() < b.size()) || (a.size() == b.size() && cmpMeta(a, b)) || + (a.size() == b.size() && hasEqualMeta(a, b) && cmpBuffers(a,b)); +} + bool 
Fileinfo::cmpRank(const Fileinfo& a, const Fileinfo& b) { diff --git a/Fileinfo.hh b/Fileinfo.hh index 3ec14ce..bc4a2c5 100644 --- a/Fileinfo.hh +++ b/Fileinfo.hh @@ -73,9 +73,12 @@ public: /** * reads info about the file, by querying the filesystem. + * @param use_time record mtime so it takes part in comparisons (else stored as 0) + * @param use_mode record st_mode so it takes part in comparisons (else stored as 0) + * @param use_ids record uid and gid so they take part in comparisons (else stored as 0) * @return false if it was not possible to get the information. */ - bool readfileinfo(); + bool readfileinfo(bool use_time, bool use_mode, bool use_ids); duptype getduptype() const { return m_duptype; } @@ -103,6 +106,12 @@ public: // compares file size static bool cmpSize(const Fileinfo& a, const Fileinfo& b); + // orders by metadata (mtime, mode, uid, gid) + static bool cmpMeta(const Fileinfo& a, const Fileinfo& b); + + // tests metadata (mtime, mode, uid, gid) for equality + static bool hasEqualMeta(const Fileinfo& a, const Fileinfo& b); + // compares file device and inode static bool cmpDeviceInode(const Fileinfo& a, const Fileinfo& b); @@ -115,9 +124,15 @@ public: // compares buffers static bool hasEqualBuffers(const Fileinfo& a, const Fileinfo& b); + // compares size, then metadata + static bool cmpSizeMeta(const Fileinfo& a, const Fileinfo& b); + // compares file size then buffers static bool cmpSizeBuffers(const Fileinfo& a, const Fileinfo& b); + // compares size, then metadata, then buffers + static bool cmpSizeMetaBuffers(const Fileinfo& a, const Fileinfo& b); + // compares rank static bool cmpRank(const Fileinfo& a, const Fileinfo& b); @@ -183,6 +198,10 @@ private: filesizetype stat_size; // size unsigned long stat_ino; // inode unsigned long stat_dev; // device + timespec stat_mtim; // modification time + unsigned long stat_mode; // st_mode (0 when use_mode is off) + unsigned long stat_uid; // user id + unsigned long stat_gid; // group id bool is_file; bool is_directory; Fileinfostat(); diff --git a/Rdutil.cc b/Rdutil.cc index aaf54fb..ce40327 100644 --- a/Rdutil.cc +++ b/Rdutil.cc @@ -277,7 +277,7 @@ std::size_t Rdutil::removeUniqueSizes() { // sort 
list on size - auto cmp = Fileinfo::cmpSize; + auto cmp = Fileinfo::cmpSizeMeta; std::sort(m_list.begin(), m_list.end(), cmp); // loop over ranges of adjacent elements @@ -299,7 +299,7 @@ std::size_t Rdutil::removeUniqSizeAndBuffer() { // sort list on size - const auto cmp = Fileinfo::cmpSize; + const auto cmp = Fileinfo::cmpSizeMeta; std::sort(m_list.begin(), m_list.end(), cmp); const auto bufcmp = Fileinfo::cmpBuffers; @@ -330,7 +330,7 @@ Rdutil::removeUniqSizeAndBuffer() void Rdutil::markduplicates() { - const auto cmp = Fileinfo::cmpSizeBuffers; + const auto cmp = Fileinfo::cmpSizeMetaBuffers; assert(std::is_sorted(m_list.begin(), m_list.end(), cmp)); // loop over ranges of adjacent elements diff --git a/rdfind.cc b/rdfind.cc index 23f9a2b..b7e4730 100644 --- a/rdfind.cc +++ b/rdfind.cc @@ -258,7 +258,7 @@ report(const std::string& path, const std::string& name, int depth) std::string expandedname = path.empty() ? name : (path + "/" + name); Fileinfo tmp(std::move(expandedname), current_cmdline_index, depth); - if (tmp.readfileinfo()) { + if (tmp.readfileinfo(true, true, true)) { if (tmp.isRegularFile()) { const auto size = tmp.size(); if (size >= global_options->minimumfilesize && @@ -349,7 +349,7 @@ main(int narg, const char* argv[]) gswd.totalsize(std::cout) << std::endl; std::cout << "Removed " << gswd.removeUniqueSizes() - << " files due to unique sizes from list. "; + << " files due to unique size and metadata from list. "; std::cout << filelist.size() << " files left." << std::endl; // ok. we now need to do something stronger to disambiguate the duplicate