-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfdups.py
executable file
·89 lines (74 loc) · 2.05 KB
/
fdups.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3
import argparse
import hashlib
import os
from threading import BoundedSemaphore
import shelve
from multiprocessing import Pool
import re
def calculate_md5(fname):
    """Return ``(md5_hex, fname)`` for *fname*, or ``None`` if already recorded.

    A file counts as already recorded when its path is a key of the
    module-level ``fnames_checked`` mapping; such files are not read again.

    Args:
        fname: Path of the file to hash.

    Returns:
        A ``(hex digest, fname)`` tuple, or ``None`` when *fname* is already
        a key of ``fnames_checked``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    if fname in fnames_checked:
        return None
    digest = hashlib.md5()
    # Hash in fixed-size chunks so a large file is never held in memory
    # whole; ``with`` guarantees the handle is closed even if a read fails.
    with open(fname, 'rb') as stream:
        for chunk in iter(lambda: stream.read(1 << 20), b''):
            digest.update(chunk)
    return (digest.hexdigest(), fname)
def register_sum(results):
    """Record one hashing result in the module-level tables.

    Marks the file as checked in ``fnames_checked`` and appends its path to
    the list stored under its digest in ``md5_hash``.

    Args:
        results: A ``(md5_hex, fname)`` tuple as returned by
            ``calculate_md5``, or ``None``, in which case nothing is done.
    """
    if results is None:
        return
    md5, fname = results
    # ``with`` releases the semaphore even if one of the stores raises;
    # a bare acquire()/release() pair would leave it held forever.
    with dict_sema:
        fnames_checked[fname] = ""
        # Read, extend, then assign back: the explicit assignment is what
        # makes the mapping store the extended list.
        md5_hash[md5] = md5_hash.get(md5, []) + [fname]
# Command line: options are declared below; every argument the parser does
# not recognise ends up in ``unknown`` and is treated as a directory to scan.
parser = argparse.ArgumentParser()
# NOTE(review): ``all_configs`` is parsed but never read in this script, and
# its help text does not describe duplicate finding -- looks like a leftover.
parser.add_argument("-a", "--all",
                    action="store",
                    dest="all_configs",
                    nargs="*",
                    default=None,
                    help="Build all configs for the given project")
parser.add_argument("-j",
                    action="store",
                    dest="j",
                    type=int,  # reject non-numeric values at parse time
                    default=32,
                    help="Use that many parallel threads")
parser.add_argument("--match",
                    action="store",
                    dest="match",
                    default=None,
                    help="Match only specific files")
args, unknown = parser.parse_known_args()

used_threads = 0
dict_sema = BoundedSemaphore()
# Persistent state kept in the current working directory:
#   md5_hash       digest -> list of file paths with that digest
#   fnames_checked file path -> marker for "already hashed"
md5_hash = shelve.open(".fdups_fdups.db")
fnames_checked = shelve.open(".fdups_flist.db")

# NOTE(review): there is no ``if __name__ == "__main__":`` guard, so this
# script relies on a multiprocessing start method that does not re-import
# the main module in the workers -- confirm for the target platform.
try:
    # Forget files that no longer exist. The key is deleted (rather than
    # kept with a placeholder value) because calculate_md5 treats every key
    # that is present as already hashed. Iterate over a snapshot of the keys
    # since the shelf is modified inside the loop.
    for f in list(fnames_checked):
        if not os.path.isfile(f):
            del fnames_checked[f]

    # Drop vanished paths from each digest's list. A shelf opened without
    # writeback hands out a fresh copy of the stored list on every lookup,
    # so the pruned list has to be assigned back to be persisted.
    for f in list(md5_hash):
        recorded = md5_hash[f]
        surviving = [fyle for fyle in recorded if os.path.isfile(fyle)]
        if len(surviving) == len(recorded):
            continue
        if surviving:
            md5_hash[f] = surviving
        else:
            del md5_hash[f]

    # Compile the optional filter once instead of once per file name.
    pattern = None if args.match is None else re.compile(args.match)

    # Hash every matching file below the given directories in worker
    # processes; register_sum is passed as the completion callback.
    # Exceptions raised inside a worker are not reported, because no
    # error_callback is supplied.
    with Pool(args.j) as p:
        for f in unknown:
            for root, dirs, files in os.walk(f):
                for fyle in files:
                    if pattern is None or pattern.search(fyle):
                        p.apply_async(calculate_md5,
                                      args=(os.path.join(root, fyle),),
                                      callback=register_sum)
        p.close()
        p.join()

    # Report every digest shared by more than one file.
    for key, names in md5_hash.items():
        if len(names) > 1:
            print(key, names)
finally:
    # Always flush and close the shelves, even if scanning failed.
    fnames_checked.close()
    md5_hash.close()