-
Notifications
You must be signed in to change notification settings - Fork 71
/
extract_all.py
40 lines (35 loc) · 1.23 KB
/
extract_all.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
# Extract the latest archive of every extension into one directory tree so that all the sources are easy to grep through.
import os
import shlex
from tqdm import tqdm
import shutil
import subprocess
from subprocess import STDOUT, check_output
from extstats.CONSTS import CRX_DIRECTORY as DIR
from distutils.version import LooseVersion
# Path template for one extension's unpacked sources; the placeholders are
# filled in with str.format(ext_id=..., version=...).
DESTINATION = 'crawled/sources/{ext_id}/{version}'
def sort_semverfiles(files):
    """Return *files* as a new list sorted in ascending version order.

    Every occurrence of ``.zip`` is removed from a name and the remainder is
    split into components: runs of digits, runs of lowercase letters, and
    whatever text lies between them (dots are only separators).  Digit runs
    compare numerically, so ``1.10.zip`` sorts after ``1.9.zip``.

    Numeric components sort before textual ones, which gives a total order
    even when versions mix the two (``1.0.1`` vs ``1.0.beta``) instead of
    raising ``TypeError`` on an int/str comparison.

    Args:
        files: iterable of file names such as ``'1.2.3.zip'``.

    Returns:
        A list with the same names, oldest version first.  Names with equal
        keys keep their input order because ``sorted`` is stable.
    """
    # Local import keeps the function self-contained; ``re`` is stdlib.
    import re

    component_re = re.compile(r'(\d+|[a-z]+|\.)')

    def keyfunc(filename):
        version = filename.replace('.zip', '')
        key = []
        for part in component_re.split(version):
            # re.split with a capturing group also yields the separators and
            # empty strings between adjacent matches; neither carries meaning.
            if not part or part == '.':
                continue
            try:
                # Tag 0: numeric component, compared by integer value.
                key.append((0, int(part), ''))
            except ValueError:
                # Tag 1: textual component, compared as a string and always
                # ordered after any numeric component in the same position.
                key.append((1, 0, part))
        return key

    return sorted(files, key=keyfunc)
# For every extension directory under DIR, unpack its highest-versioned
# archive into DESTINATION so all sources live in one searchable tree.
for ext in os.listdir(DIR):
    # os.path.join gives the same path as plain concatenation when DIR ends
    # with a separator, and still works if it does not.
    ext_dir = os.path.join(DIR, ext)
    files = os.listdir(ext_dir)
    if not files:
        # Nothing to pick as "latest"; indexing [-1] on an empty list would
        # raise IndexError and abort the remaining extensions.
        print('skipping (no archives):', ext_dir)
        continue
    latest = sort_semverfiles(files)[-1]
    fullpath = os.path.join(ext_dir, latest)
    size = os.path.getsize(fullpath)
    if size > 100000000:  # 100mb
        continue
    print(fullpath, size)
    dest = DESTINATION.format(ext_id=ext, version=latest.replace('.zip', ''))
    # Remove any previous extraction so the destination starts out empty.
    try:
        shutil.rmtree(dest)
    except FileNotFoundError:
        pass
    os.makedirs(dest, exist_ok=True)
    # Pass an argv list instead of a shell string: no quoting is needed and
    # the timeout kills unzip itself rather than an intermediate shell.
    command = ['unzip', fullpath, '-d', dest]
    try:
        check_output(command, timeout=60)
    except subprocess.CalledProcessError as e:
        # Render the argv as a shell-quoted string for a readable log line.
        print('error:', e.returncode, ' - ', ' '.join(shlex.quote(arg) for arg in e.cmd))
    except subprocess.TimeoutExpired:
        # Report the archive that took too long instead of dropping it
        # silently; whatever was extracted so far is left in dest.
        print('timeout:', fullpath)