Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added file distinction based on metadata (mtime, mode, owner) #155

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
122 changes: 121 additions & 1 deletion Fileinfo.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include <cstring> //for strerror
#include <fstream> //for file reading
#include <iostream> //for cout etc
#include <tuple> //for comparison

// os
#include <sys/stat.h> //for file info
Expand Down Expand Up @@ -102,7 +103,7 @@ Fileinfo::fillwithbytes(enum readtobuffermode filltype,
}

bool
Fileinfo::readfileinfo()
Fileinfo::readfileinfo(bool use_time, bool use_mode, bool use_ids)
{
struct stat info;
m_info.is_file = false;
Expand All @@ -117,6 +118,11 @@ Fileinfo::readfileinfo()
m_info.stat_size = 0;
m_info.stat_ino = 0;
m_info.stat_dev = 0;
m_info.stat_mtim.tv_sec = 0;
m_info.stat_mtim.tv_nsec = 0;
m_info.stat_mode = 0;
m_info.stat_uid = 0;
m_info.stat_gid = 0;
std::cerr << "readfileinfo.cc:Something went wrong when reading file "
"info from \""
<< m_filename << "\" :" << std::strerror(errno) << std::endl;
Expand All @@ -128,6 +134,26 @@ Fileinfo::readfileinfo()
m_info.stat_ino = info.st_ino;
m_info.stat_dev = info.st_dev;

if (use_time) {
m_info.stat_mtim.tv_sec = info.st_mtim.tv_sec;
m_info.stat_mtim.tv_nsec = info.st_mtim.tv_nsec;
} else {
m_info.stat_mtim.tv_sec = 0;
m_info.stat_mtim.tv_nsec = 0;
}
if (use_mode) {
m_info.stat_mode = info.st_mode;
} else {
m_info.stat_mode = 0;
}
if (use_ids) {
m_info.stat_uid = info.st_uid;
m_info.stat_gid = info.st_gid;
} else {
m_info.stat_uid = 0;
m_info.stat_gid = 0;
}

m_info.is_file = S_ISREG(info.st_mode);
m_info.is_directory = S_ISDIR(info.st_mode);
return true;
Expand Down Expand Up @@ -159,6 +185,11 @@ Fileinfo::Fileinfostat::Fileinfostat()
stat_size = 99999;
stat_ino = 99999;
stat_dev = 99999;
stat_mtim.tv_sec = 99999;
stat_mtim.tv_nsec = 99999;
stat_mode = 99999;
stat_uid = 99999;
stat_gid = 99999;
is_file = false;
is_directory = false;
}
Expand Down Expand Up @@ -319,3 +350,92 @@ Fileinfo::static_makehardlink(Fileinfo& A, const Fileinfo& B)
{
return A.makehardlink(B);
}

// Ordering predicate on file size: true when a is strictly smaller than b.
bool
Fileinfo::cmpSize(const Fileinfo& a, const Fileinfo& b)
{
  const auto lhs_size = a.size();
  const auto rhs_size = b.size();
  return lhs_size < rhs_size;
}

bool
Fileinfo::cmpMeta(const Fileinfo& a, const Fileinfo& b)
{
return std::make_tuple(a.m_info.stat_mtim.tv_sec,
a.m_info.stat_mtim.tv_nsec,
a.m_info.stat_mode,
a.m_info.stat_uid,
a.m_info.stat_gid) <
std::make_tuple(b.m_info.stat_mtim.tv_sec,
b.m_info.stat_mtim.tv_nsec,
b.m_info.stat_mode,
b.m_info.stat_uid,
b.m_info.stat_gid);
}

// Equality test on the stored metadata of two files: true only when the
// modification time (seconds and nanoseconds), mode, uid and gid all match.
bool
Fileinfo::hasEqualMeta(const Fileinfo& a, const Fileinfo& b)
{
  const auto& lhs = a.m_info;
  const auto& rhs = b.m_info;
  return lhs.stat_mtim.tv_sec == rhs.stat_mtim.tv_sec &&
         lhs.stat_mtim.tv_nsec == rhs.stat_mtim.tv_nsec &&
         lhs.stat_mode == rhs.stat_mode &&
         lhs.stat_uid == rhs.stat_uid &&
         lhs.stat_gid == rhs.stat_gid;
}

// Ordering predicate on (device, inode): device is compared first, inode
// breaks ties. Returns true when a orders before b.
bool
Fileinfo::cmpDeviceInode(const Fileinfo& a, const Fileinfo& b)
{
  const auto dev_a = a.device();
  const auto ino_a = a.inode();
  const auto dev_b = b.device();
  const auto ino_b = b.inode();
  return std::tie(dev_a, ino_a) < std::tie(dev_b, ino_b);
}

// Ordering predicate on (depth, name): depth is compared first, name breaks
// ties. Returns true when a orders before b.
//
// The names are compared in place. Packing them with std::make_tuple stores
// decayed copies, so every comparison made a copy of both names.
bool
Fileinfo::cmpDepthName(const Fileinfo& a, const Fileinfo& b)
{
  const auto depth_a = a.depth();
  const auto depth_b = b.depth();
  if (depth_a < depth_b) {
    return true;
  }
  if (depth_b < depth_a) {
    return false;
  }
  // equal depth: the name decides
  return a.name() < b.name();
}

// Ordering predicate on the byte buffers of two files: true when memcmp
// ranks a's buffer before b's.
// NOTE(review): only a's buffer size is used as the length; presumably both
// buffers have the same size - confirm.
bool
Fileinfo::cmpBuffers(const Fileinfo& a, const Fileinfo& b)
{
  const auto nbytes = a.getbuffersize();
  const int order = std::memcmp(a.getbyteptr(), b.getbyteptr(), nbytes);
  return order < 0;
}

// Equality test on the byte buffers of two files: true when memcmp reports
// no difference.
// NOTE(review): only a's buffer size is used as the length; presumably both
// buffers have the same size - confirm.
bool
Fileinfo::hasEqualBuffers(const Fileinfo& a, const Fileinfo& b)
{
  const auto nbytes = a.getbuffersize();
  const int order = std::memcmp(a.getbyteptr(), b.getbyteptr(), nbytes);
  return order == 0;
}

// Ordering predicate on size first, then metadata (see cmpMeta) when the
// sizes are equal. Returns true when a orders before b.
bool
Fileinfo::cmpSizeMeta(const Fileinfo& a, const Fileinfo& b)
{
  if (a.size() < b.size()) {
    return true;
  }
  if (!(a.size() == b.size())) {
    return false;
  }
  // same size: metadata decides
  return cmpMeta(a, b);
}

// Ordering predicate on size first, then buffer contents (see cmpBuffers)
// when the sizes are equal. Returns true when a orders before b.
bool
Fileinfo::cmpSizeBuffers(const Fileinfo& a, const Fileinfo& b)
{
  if (a.size() < b.size()) {
    return true;
  }
  if (!(a.size() == b.size())) {
    return false;
  }
  // same size: buffer contents decide
  return cmpBuffers(a, b);
}

// Ordering predicate on size, then metadata, then buffer contents. Each
// later key is only consulted when all earlier keys are equal. Returns true
// when a orders before b.
bool
Fileinfo::cmpSizeMetaBuffers(const Fileinfo& a, const Fileinfo& b)
{
  if (a.size() < b.size()) {
    return true;
  }
  if (!(a.size() == b.size())) {
    return false;
  }
  // same size: metadata is the next key
  if (cmpMeta(a, b)) {
    return true;
  }
  if (!hasEqualMeta(a, b)) {
    return false;
  }
  // same size and metadata: buffer contents decide
  return cmpBuffers(a, b);
}

// Ordering predicate on rank: command line index first, then depth, then
// identity. Returns true when a orders before b.
bool
Fileinfo::cmpRank(const Fileinfo& a, const Fileinfo& b)
{
  const auto index_a = a.get_cmdline_index();
  const auto depth_a = a.depth();
  const auto ident_a = a.getidentity();

  const auto index_b = b.get_cmdline_index();
  const auto depth_b = b.depth();
  const auto ident_b = b.getidentity();

  return std::tie(index_a, depth_a, ident_a) <
         std::tie(index_b, depth_b, ident_b);
}
42 changes: 41 additions & 1 deletion Fileinfo.hh
Original file line number Diff line number Diff line change
Expand Up @@ -73,9 +73,12 @@ public:

/**
* reads info about the file, by querying the filesystem.
* @param use_time use timestamp for comparison
* @param use_mode use mode for comparison
* @param use_ids use uid and gid for comparison
* @return false if it was not possible to get the information.
*/
bool readfileinfo();
bool readfileinfo(bool use_time, bool use_mode, bool use_ids);

duptype getduptype() const { return m_duptype; }

Expand All @@ -100,6 +103,39 @@ public:
// deletes file A, that is a duplicate of B
static int static_deletefile(Fileinfo& A, const Fileinfo& B);

// compares file size
static bool cmpSize(const Fileinfo& a, const Fileinfo& b);

// compares metadata
static bool cmpMeta(const Fileinfo& a, const Fileinfo& b);

// returns true if a and b have equal metadata (mtime, mode, uid, gid)
static bool hasEqualMeta(const Fileinfo& a, const Fileinfo& b);

// compares file device and inode
static bool cmpDeviceInode(const Fileinfo& a, const Fileinfo& b);

// compares depth and name
static bool cmpDepthName(const Fileinfo& a, const Fileinfo& b);

// compares buffers
static bool cmpBuffers(const Fileinfo& a, const Fileinfo& b);

// returns true if the buffers of a and b compare equal
static bool hasEqualBuffers(const Fileinfo& a, const Fileinfo& b);

// compares size and metadata
static bool cmpSizeMeta(const Fileinfo& a, const Fileinfo& b);

// compares file size then buffers
static bool cmpSizeBuffers(const Fileinfo& a, const Fileinfo& b);

// compares size then Meta then buffers
static bool cmpSizeMetaBuffers(const Fileinfo& a, const Fileinfo& b);

// compares rank
static bool cmpRank(const Fileinfo& a, const Fileinfo& b);

// sets the deleteflag
void setdeleteflag(bool flag) { m_delete = flag; }

Expand Down Expand Up @@ -162,6 +198,10 @@ private:
filesizetype stat_size; // size
unsigned long stat_ino; // inode
unsigned long stat_dev; // device
timespec stat_mtim; // modification time
unsigned long stat_mode; // access flags
unsigned long stat_uid; // user id
unsigned long stat_gid; // group id
bool is_file;
bool is_directory;
Fileinfostat();
Expand Down
67 changes: 10 additions & 57 deletions Rdutil.cc
Original file line number Diff line number Diff line change
Expand Up @@ -188,53 +188,6 @@ Rdutil::markitems()
}

namespace {
// Ordering predicate on (device, inode): device is compared first, inode
// breaks ties.
bool
cmpDeviceInode(const Fileinfo& a, const Fileinfo& b)
{
  const auto dev_a = a.device();
  const auto ino_a = a.inode();
  const auto dev_b = b.device();
  const auto ino_b = b.inode();
  return std::tie(dev_a, ino_a) < std::tie(dev_b, ino_b);
}
// compares rank as described in RANKING on man page: command line index
// first, then depth, then identity.
bool
cmpRank(const Fileinfo& a, const Fileinfo& b)
{
  const auto index_a = a.get_cmdline_index();
  const auto depth_a = a.depth();
  const auto ident_a = a.getidentity();

  const auto index_b = b.get_cmdline_index();
  const auto depth_b = b.depth();
  const auto ident_b = b.getidentity();

  return std::tie(index_a, depth_a, ident_a) <
         std::tie(index_b, depth_b, ident_b);
}
// Ordering predicate on (depth, name): depth is compared first, name breaks
// ties. The names are compared in place; packing them with std::make_tuple
// stores decayed copies, so every comparison copied both names.
bool
cmpDepthName(const Fileinfo& a, const Fileinfo& b)
{
  const auto depth_a = a.depth();
  const auto depth_b = b.depth();
  if (depth_a < depth_b) {
    return true;
  }
  if (depth_b < depth_a) {
    return false;
  }
  // equal depth: the name decides
  return a.name() < b.name();
}
// compares buffers: true when memcmp ranks a's buffer before b's.
// NOTE(review): only a's buffer size is used as the length; presumably both
// buffers have the same size - confirm.
bool
cmpBuffers(const Fileinfo& a, const Fileinfo& b)
{
  const auto nbytes = a.getbuffersize();
  const int order = std::memcmp(a.getbyteptr(), b.getbyteptr(), nbytes);
  return order < 0;
}

#if !defined(NDEBUG)
// true when memcmp reports no difference between the buffers of a and b.
bool
hasEqualBuffers(const Fileinfo& a, const Fileinfo& b)
{
  const auto nbytes = a.getbuffersize();
  const int order = std::memcmp(a.getbyteptr(), b.getbyteptr(), nbytes);
  return order == 0;
}
#endif

// compares file size: true when a is strictly smaller than b.
bool
cmpSize(const Fileinfo& a, const Fileinfo& b)
{
  const auto lhs_size = a.size();
  const auto rhs_size = b.size();
  return lhs_size < rhs_size;
}
// Ordering predicate on size first, then buffer contents (see cmpBuffers)
// when the sizes are equal.
bool
cmpSizeThenBuffer(const Fileinfo& a, const Fileinfo& b)
{
  if (a.size() < b.size()) {
    return true;
  }
  if (!(a.size() == b.size())) {
    return false;
  }
  // same size: buffer contents decide
  return cmpBuffers(a, b);
}

/**
* goes through first to last, finds ranges of equal elements (determined by
* cmp) and invokes callback on each subrange.
Expand Down Expand Up @@ -286,7 +239,7 @@ int
Rdutil::sortOnDeviceAndInode()
{

std::sort(m_list.begin(), m_list.end(), cmpDeviceInode);
std::sort(m_list.begin(), m_list.end(), Fileinfo::cmpDeviceInode);
return 0;
}

Expand All @@ -296,14 +249,14 @@ Rdutil::sort_on_depth_and_name(std::size_t index_of_first)
assert(index_of_first <= m_list.size());

auto it = std::begin(m_list) + static_cast<std::ptrdiff_t>(index_of_first);
std::sort(it, std::end(m_list), cmpDepthName);
std::sort(it, std::end(m_list), Fileinfo::cmpDepthName);
}

std::size_t
Rdutil::removeIdenticalInodes()
{
// sort list on device and inode.
auto cmp = cmpDeviceInode;
auto cmp = Fileinfo::cmpDeviceInode;
std::sort(m_list.begin(), m_list.end(), cmp);

// loop over ranges of adjacent elements
Expand All @@ -312,7 +265,7 @@ Rdutil::removeIdenticalInodes()
m_list.begin(), m_list.end(), cmp, [](Iterator first, Iterator last) {
// let the highest-ranking element not be deleted. do this in order, to be
// cache friendly.
auto best = std::min_element(first, last, cmpRank);
auto best = std::min_element(first, last, Fileinfo::cmpRank);
std::for_each(first, best, [](Fileinfo& f) { f.setdeleteflag(true); });
best->setdeleteflag(false);
std::for_each(best + 1, last, [](Fileinfo& f) { f.setdeleteflag(true); });
Expand All @@ -324,7 +277,7 @@ std::size_t
Rdutil::removeUniqueSizes()
{
// sort list on size
auto cmp = cmpSize;
auto cmp = Fileinfo::cmpSizeMeta;
std::sort(m_list.begin(), m_list.end(), cmp);

// loop over ranges of adjacent elements
Expand All @@ -346,10 +299,10 @@ std::size_t
Rdutil::removeUniqSizeAndBuffer()
{
// sort list on size
const auto cmp = cmpSize;
const auto cmp = Fileinfo::cmpSizeMeta;
std::sort(m_list.begin(), m_list.end(), cmp);

const auto bufcmp = cmpBuffers;
const auto bufcmp = Fileinfo::cmpBuffers;

// loop over ranges of adjacent elements
using Iterator = decltype(m_list.begin());
Expand Down Expand Up @@ -377,7 +330,7 @@ Rdutil::removeUniqSizeAndBuffer()
void
Rdutil::markduplicates()
{
const auto cmp = cmpSizeThenBuffer;
const auto cmp = Fileinfo::cmpSizeMetaBuffers;
assert(std::is_sorted(m_list.begin(), m_list.end(), cmp));

// loop over ranges of adjacent elements
Expand All @@ -391,15 +344,15 @@ Rdutil::markduplicates()
assert(std::distance(first, last) >= 2);

// the one with the lowest rank is the original
auto orig = std::min_element(first, last, cmpRank);
auto orig = std::min_element(first, last, Fileinfo::cmpRank);
assert(orig != last);
// place it first, so later stages will find the original first.
std::iter_swap(first, orig);
orig = first;

// make sure they are all duplicates
assert(last == find_if_not(first, last, [orig](const Fileinfo& a) {
return orig->size() == a.size() && hasEqualBuffers(*orig, a);
return orig->size() == a.size() && Fileinfo::hasEqualBuffers(*orig, a);
}));

// mark the files with the appropriate tag.
Expand Down
4 changes: 2 additions & 2 deletions rdfind.cc
Original file line number Diff line number Diff line change
Expand Up @@ -258,7 +258,7 @@ report(const std::string& path, const std::string& name, int depth)
std::string expandedname = path.empty() ? name : (path + "/" + name);

Fileinfo tmp(std::move(expandedname), current_cmdline_index, depth);
if (tmp.readfileinfo()) {
if (tmp.readfileinfo(true, true, true)) {
if (tmp.isRegularFile()) {
const auto size = tmp.size();
if (size >= global_options->minimumfilesize &&
Expand Down Expand Up @@ -349,7 +349,7 @@ main(int narg, const char* argv[])
gswd.totalsize(std::cout) << std::endl;

std::cout << "Removed " << gswd.removeUniqueSizes()
<< " files due to unique sizes from list. ";
<< " files due to unique size and metadata from list. ";
std::cout << filelist.size() << " files left." << std::endl;

// ok. we now need to do something stronger to disambiguate the duplicate
Expand Down