-
Notifications
You must be signed in to change notification settings - Fork 4
/
finddup
executable file
·265 lines (200 loc) · 6.13 KB
/
finddup
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
#! /usr/bin/env python
#
# Script to find duplicate files
# Uses multiprocessing to scan files in parallel.
#
# Sudhi Herle <[email protected]>
# GPLv2
#
import os, sys, os.path
import re, zlib, mmap, signal
import multiprocessing as m
from os.path import basename, dirname, join, isfile, isdir
from optparse import Option, OptionParser, OptionValueError
try:
import hashlib
md5sum = hashlib.md5
except:
import md5
md5sum = md5.md5
# Binary size units, each 1024 times the previous one.
KB = 1 << 10
MB = KB << 10
GB = MB << 10
TB = GB << 10
PB = TB << 10

# (suffix, size-in-bytes) pairs ordered from the largest unit to the
# smallest, so that a linear scan finds the biggest unit that fits.
Divisors = [
    ('PB', PB),
    ('TB', TB),
    ('GB', GB),
    ('MB', MB),
    ('kB', KB),
]
# Command line handling: positional arguments are the directories to
# scan; with none given, the current directory is scanned.
usage = """%s dir [dir...]
%s - Find duplicate files in one or more directories
""" % (sys.argv[0], sys.argv[0])

parser = OptionParser(usage)
parser.add_option(
    "-s", "--shell",
    dest="shell_cmds",
    action="store_true",
    default=False,
    help="Generate shell commands to delete duplicate files [%default]",
)

opt, args = parser.parse_args()
if not args:
    args.append('.')
# Regular expressions (searched anywhere in the full path) naming files
# that must never be considered. Raw strings keep the backslashes
# literal without relying on invalid string escapes.
Ignore_re = (
    r'\.svn',
    r'.*~$',
    r'\.*sw.$',     # NOTE(review): presumably meant for editor swap files
    r'\.CVS',
    r'\.hg',
)

# Must be a real list: on Python 3 map() returns a one-shot iterator, which
# would be used up by the first lookups and silently match nothing afterwards.
Ignore_re_list = [re.compile(x) for x in Ignore_re]
def block_sigs():
    """Ignore SIGINT, SIGPIPE and SIGHUP in the calling process.

    Passed as the worker initializer of the multiprocessing pool, so
    that the workers do not react to these signals themselves.

    Signals that the platform's signal module does not define (e.g.
    SIGPIPE and SIGHUP on Windows) are skipped instead of raising
    AttributeError.
    """
    for name in ("SIGINT", "SIGPIPE", "SIGHUP"):
        signum = getattr(signal, name, None)
        if signum is not None:
            signal.signal(signum, signal.SIG_IGN)
def ignore(name):
    """Tell whether the path 'name' must be skipped.

    Returns True as soon as one of the compiled patterns in
    Ignore_re_list is found anywhere in 'name', False if none is.
    """
    return any(pat.search(name) is not None for pat in Ignore_re_list)
class bundle:
    """Plain record object: every keyword argument becomes an attribute."""

    def __init__(self, **kwargs):
        for attr, value in kwargs.items():
            setattr(self, attr, value)
class cksum_db:
    """Checksum files in a pool of worker processes and group the results.

    'cksum' is a callable that takes a file name and returns an object
    carrying at least a 'cksum' attribute. It is invoked through the
    pool's apply_async(), once per file passed to addfile().
    """

    def __init__(self, cksum):
        self.w = {}     # file name -> pending async result
        self.m = m.Pool(processes=None, initializer=block_sigs)
        self.cksum = cksum

    def addfile(self, fn):
        """Queue 'fn' for checksumming; the result is collected by dups()."""
        self.w[fn] = self.m.apply_async(self.cksum, args=(fn,))

    def dups(self):
        """Return a dict of duplicate files. Each entry in the dict
        is an array of files - whose contents are identical according to
        the checksum criteria of the class.

        The dict is keyed by checksum and only holds groups of two or
        more results. A file whose checksum job raised (for instance
        because it could not be opened) is reported on stderr and left
        out, rather than aborting the whole scan.
        """
        db = {}
        for fn, pending in self.w.items():
            try:
                z = pending.get()
            except Exception as ex:
                # get() re-raises whatever the worker raised for this file
                sys.stderr.write("Skipping %s: %s\n" % (fn, ex))
                continue
            db.setdefault(z.cksum, []).append(z)

        return {k: v for k, v in db.items() if len(v) > 1}
def mmap_gen(fn, chunksize=None):
    """Generator that yields mmap'd chunks of memory and length

    The file 'fn' is mapped read-only, one chunk after the other, and
    each step yields a (mapping, length) pair. An empty file yields
    nothing. The mapping is closed as soon as the generator is resumed
    or closed, so the consumer must be done with it before asking for
    the next chunk.

    'chunksize' is the maximum number of bytes per mapping and defaults
    to 2 GB. Since it is also the step between mmap offsets, it should
    be a multiple of mmap.ALLOCATIONGRANULARITY whenever the file is
    larger than one chunk.

    Raises ValueError for a non-positive chunksize and Exception (with
    the file name and size in the message) when a chunk can't be mapped.
    """
    if chunksize is None:
        chunksize = 2 * GB
    if chunksize <= 0:
        raise ValueError("chunksize must be positive, not %r" % (chunksize,))

    # 'with' guarantees the file is closed even if mapping fails or the
    # consumer stops iterating early.
    with open(fn, 'rb') as fd:
        fdn = fd.fileno()
        remaining = os.fstat(fdn).st_size
        off = 0
        while remaining > 0:
            z = min(remaining, chunksize)
            try:
                mm = mmap.mmap(fdn, z, access=mmap.ACCESS_READ, offset=off)
            except Exception as ex:
                ss = "can't mmap %d bytes of %s: %s" % (z, fn, str(ex))
                raise Exception(ss)

            try:
                yield mm, z
            finally:
                mm.close()

            remaining -= z
            off += z
def cksum_slow(filename):
    """Return the strong checksum of 'filename'.

    The file is fed chunk by chunk (as produced by mmap_gen) into the
    md5sum hash. The result is a bundle with 'size' (bytes hashed),
    'fname' (the name passed in) and 'cksum' (the hex digest).
    """
    digest = md5sum()
    sz = 0
    for mm, n in mmap_gen(filename):
        # Hash straight from the mapping: mm.read(n) would first copy
        # the entire chunk into a bytes object.
        digest.update(mm)
        sz += n

    return bundle(size=sz, fname=filename, cksum=digest.hexdigest())
def cksum_quick(filename):
    """Return the quick (adler32) checksum of 'filename'.

    The running adler32 value starts at 0 and is updated with every
    chunk produced by mmap_gen. The result is a bundle with 'size'
    (bytes summed), 'fname' (the name passed in) and 'cksum' (the
    checksum formatted as a '0x...' hex string).
    """
    v = 0
    sz = 0
    for mm, n in mmap_gen(filename):
        # Checksum straight from the mapping: mm.read(n) would first
        # copy the entire chunk into a bytes object.
        v = zlib.adler32(mm, v)
        sz += n

    # Older Pythons could return a signed value; keep the key non-negative.
    v = abs(v)
    return bundle(size=sz, fname=filename, cksum="%#x" % v)
def descend(db, dn):
    """Walk the tree rooted at 'dn' and hand its files to 'db'.

    Every entry that is a regular file and is not rejected by ignore()
    is queued with db.addfile(), using its path joined onto the
    directory it was found in.
    """
    for root, _dirs, files in os.walk(dn, topdown=True):
        for entry in files:
            path = join(root, entry)
            if isfile(path) and not ignore(path):
                db.addfile(path)
def human(n):
    """Return human readable size for n bytes

    The largest unit in Divisors that is not bigger than 'n' is used,
    with two decimals (e.g. "1.00 kB" for 1024). Anything smaller than
    the smallest unit is returned as a plain byte count.
    """
    for unit, sz in Divisors:
        # '>=' so that an exact multiple (1024, 1024*1024, ...) is shown
        # in that unit instead of falling through to the next smaller one.
        if n >= sz:
            return "%4.2f %s" % (float(n) / sz, unit)

    return "%lu" % n
class shell:
    """Abstraction to print shell commands to remove dups

    The output is a /bin/sh script: for each group of duplicates, a
    comment line with checksum and sizes, a commented-out 'rm' for the
    file that is kept, and one 'rm -f' per file to remove.
    """

    def __init__(self):
        self.tot = 0    # total bytes that running the script would free
        print("#! /bin/sh\n")

    @staticmethod
    def _quote(path):
        """Return 'path' single-quoted so that /bin/sh takes it literally."""
        # A single quote can't appear inside single quotes: close the
        # quoting, emit an escaped quote, and reopen.
        return "'%s'" % path.replace("'", "'\\''")

    def dups(self, k, keep, rm):
        """Print the commands for one group: 'keep' stays, 'rm' files go."""
        waste = keep.size * len(rm)
        self.tot += waste

        s = '\n'.join(["rm -f %s" % self._quote(x.fname) for x in rm])

        # The command for the kept file is only a comment; make sure a
        # newline inside the name can't start an uncommented line.
        kept = self._quote(keep.fname).replace("\n", "\n#")
        print("# %s: %s, saving %s\n#rm -f %s\n%s\n" %
              (k, human(keep.size), human(waste), kept, s))

    def finish(self):
        """Print the total amount of space saved as a trailing comment."""
        print("# %s saved" % human(self.tot))
class plain:
    """Abstraction to simply print the duplicated files"""

    def __init__(self):
        self.tot = 0    # total bytes occupied by redundant copies
        print("Report of duplicate files\n")

    def dups(self, k, keep, rm):
        """Print one group: checksum and sizes, the kept file, the copies."""
        waste = keep.size * len(rm)
        self.tot += waste

        s = '\n\t'.join([x.fname for x in rm])
        # The tab before the last field indents the first copy like the
        # ones that follow it (the join only indents from the second on).
        print("%s: %s, wasted %s\n\tKEEP %s\n\t%s" %
              (k, human(keep.size), human(waste), keep.fname, s))

    def finish(self):
        """Print the total amount of wasted space."""
        print("\n%s total wasted space" % human(self.tot))
def sighandler(a, b):
    """Signal handler: leave the program with exit status 1.

    Both arguments (as passed to a signal handler) are ignored.
    """
    raise SystemExit(1)
def main():
    """Scan the directories in 'args' and report the duplicate files.

    Dups are detected in two stages. In the first stage, a quick
    checksum coarsely distinguishes the files. Two files with the same
    quick checksum may be identical or just a checksum collision, so in
    the second stage only those files are checksummed again with the
    strong checksum to disambiguate.
    """
    # start off by installing signal handlers
    signal.signal(signal.SIGINT, sighandler)

    # Stage 1: quick checksum of every file below the given directories
    db = cksum_db(cksum_quick)
    for d in args:
        if not isdir(d):
            print("Skipping non-directory %s" % d, file=sys.stderr)
            continue
        descend(db, d)

    # Stage 2: slow checksum for the files that may be dups
    db2 = cksum_db(cksum_slow)
    for maybe in db.dups().values():
        for f in maybe:
            db2.addfile(f.fname)

    dups = db2.dups()

    # The first file of each group is kept, the others are reported
    pr = shell() if opt.shell_cmds else plain()
    for k, v in dups.items():
        pr.dups(k, v[0], v[1:])

    pr.finish()


# The worker pools are created by main(). Without this guard, a worker
# started with the "spawn" method re-imports this file and would try to
# run the whole scan (and create pools) again.
if __name__ == "__main__":
    main()
# vim: expandtab:sw=4:ts=4:tw=72:notextmode: