#!/usr/bin/env python
import argparse
from collections import defaultdict
import csv
from fnmatch import fnmatch
import logging
import os
import re

import pandas as pd

from filenames import DEFAULT_PATTERN
from oscar import *
# libraries.io only looks for `from ...`:
# https://github.com/librariesio/pydeps/blob/master/pydeps.rb
# {pn} = package name pattern
# `from X import ...` never has commas before `import`, but allowing them
# everywhere keeps the pattern simple
IMPORT_PATTERN = re.compile(
    r"^\s*(?:from|import)\s+({pn}(?:\s*,\s*{pn})*)".format(pn=r"[a-zA-Z0-9._]*"),
    re.M)
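# A quick illustration of what the pattern captures (hypothetical snippet,
# checked against the regex above): each match is the raw module list;
# commas are split in blob_imports() and dotted paths are reduced to their
# top namespace by top_namespace().
#     IMPORT_PATTERN.findall("import os, sys\nfrom django.db import models")
#     => ['os, sys', 'django.db']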
# Obtained using ghd.pypi.get_builtins() from
# https://docs.python.org/2/library/index.html
# https://docs.python.org/3/library/index.html
BUILTINS = {
'', 'AL', 'BaseHTTPServer', 'Bastion', 'CGIHTTPServer', 'ColorPicker',
'ConfigParser', 'Cookie', 'DEVICE', 'DocXMLRPCServer', 'EasyDialogs', 'FL',
'FrameWork', 'GL', 'HTMLParser', 'MacOS', 'MimeWriter', 'MiniAEFrame',
'Queue', 'SUNAUDIODEV', 'ScrolledText', 'SimpleHTTPServer',
'SimpleXMLRPCServer', 'SocketServer', 'StringIO', 'Tix', 'Tkinter',
'UserDict', 'UserList', 'UserString', '__builtin__', '__future__',
'__main__', '_dummy_thread', '_thread', '_winreg', 'abc', 'aepack',
'aetools', 'aetypes', 'aifc', 'al', 'and', 'anydbm', 'argparse', 'array',
'ast', 'asynchat', 'asyncio', 'asyncore', 'atexit', 'audioop', 'autoGIL',
'base64', 'bdb', 'binascii', 'binhex', 'bisect', 'bsddb', 'buffer',
'builtins', 'bytearray', 'bytes', 'bz2', 'cPickle', 'cStringIO',
'calendar', 'cd', 'cgi', 'cgitb', 'chunk', 'cmath', 'cmd', 'code',
'codecs', 'codeop', 'collections', 'colorsys', 'commands', 'compileall',
'complex', 'concurrent', 'configparser', 'contextlib', 'cookielib', 'copy',
'copy_reg', 'copyreg', 'crypt', 'csv', 'ctypes', 'curses', 'datetime',
'dbhash', 'dbm', 'decimal', 'dict', 'difflib', 'dircache', 'dis',
'distutils', 'dl', 'doctest', 'dumbdbm', 'dummy_thread', 'dummy_threading',
'email', 'ensurepip', 'enum', 'errno', 'faulthandler', 'fcntl', 'filecmp',
'fileinput', 'findertools', 'fl', 'float', 'flp', 'fm', 'fnmatch',
'formatter', 'fpectl', 'fpformat', 'fractions', 'frozenset', 'ftplib',
'functools', 'future_builtins', 'gc', 'gdbm', 'gensuitemodule', 'getopt',
'getpass', 'gettext', 'gl', 'glob', 'grp', 'gzip', 'hashlib', 'heapq',
'hmac', 'hotshot', 'html', 'htmlentitydefs', 'htmllib', 'http', 'httplib',
'ic', 'imageop', 'imaplib', 'imgfile', 'imghdr', 'imp', 'import',
'importlib', 'imputil', 'inspect', 'int', 'io', 'ioctl', 'ipaddress',
'itertools', 'jpeg', 'json', 'keyword', 'linecache', 'list', 'locale',
'logging', 'long', 'lzma', 'macostools', 'macpath', 'mailbox', 'mailcap',
'marshal', 'math', 'md5', 'memoryview', 'mhlib', 'mimetools', 'mimetypes',
'mimify', 'mmap', 'modulefinder', 'msilib', 'msvcrt', 'multifile',
'multiprocessing', 'mutex', 'netrc', 'new', 'nis', 'nntplib', 'not',
'numbers', 'operator', 'optparse', 'or', 'os', 'ossaudiodev', 'parser',
'pathlib', 'pdb', 'pickle', 'pickletools', 'pip', 'pipes', 'pkgutil',
'platform', 'plistlib', 'popen2', 'poplib', 'posix', 'posixfile', 'pprint',
'pty', 'pwd', 'py_compile', 'pyclbr', 'pydoc', 'queue', 'quopri', 'random',
'range', 're', 'readline', 'repr', 'reprlib', 'resource', 'rexec', 'rfc822',
'rlcompleter', 'robotparser', 'runpy', 'sched', 'secrets', 'select',
'selectors', 'set', 'sets', 'sgmllib', 'sha', 'shelve', 'shlex', 'shutil',
'signal', 'site', 'smtpd', 'smtplib', 'sndhdr', 'socket', 'socketserver',
'spwd', 'sqlite3', 'ssl', 'stat', 'statistics', 'statvfs', 'str', 'string',
'stringprep', 'struct', 'subprocess', 'sunau', 'sunaudiodev', 'symbol',
'symtable', 'sys', 'sysconfig', 'syslog', 'tabnanny', 'tarfile',
'telnetlib', 'tempfile', 'termios', 'test', 'textwrap', 'thread',
'threading', 'time', 'timeit', 'tkinter', 'token', 'tokenize', 'trace',
'traceback', 'tracemalloc', 'ttk', 'tty', 'tuple', 'turtle', 'types',
'typing', 'unicode', 'unicodedata', 'unittest', 'urllib', 'urllib2',
'urlparse', 'user', 'uu', 'uuid', 'venv', 'warnings', 'wave', 'weakref',
'webbrowser', 'whichdb', 'winreg', 'winsound', 'with', 'wsgiref', 'xdrlib',
'xmlrpc', 'xmlrpclib', 'xrange', 'zipapp', 'zipfile', 'zipimport', 'zlib'
}
def top_namespace(namespace):
""" Get the top level namespace
For relative imports, an empty string is returned.
>>> top_namespace('matplotlib.pyplot')
'matplotlib'
>>> top_namespace('pandas')
'pandas'
>>> top_namespace('.utils')
''
"""
return namespace.split('.', 1)[0]
def blob_imports(blob_sha, max_size=4096):
""" Mine import statements in a Python file.
Notes:
        - it only returns top-level dependencies
          (e.g. `from x import y` will consider `x` only, not `x.y`)
- it also returns builtins
(it will include csv if there is an `import csv` statement)
- it doesn't handle `importlib` magic
- it doesn't check if the code is commented out or unreachable
How it works:
look for lines `import X [as Y]` and `from X import Y`
return list of X-es
        A package name may contain:
            lower- and upper-case letters, digits, and underscores
            (it cannot start with a digit; a leading underscore is fine)
        It cannot contain hyphens; dots appear only as separators in dotted
            module paths, which are reduced to the top namespace later.
Special case: multiple imports:
`import csv, re`
:param blob_sha: sha of the blob to use
:param max_size: max number of data bytes to consider
:return: generator of dependencies as strings
# https://github.com/django/django/tree/42eb0c09
>>> files = Commit('42eb0c09bcf062b9336d1f1a728813e4a599ad47').tree.files
>>> list(blob_imports(files['scripts/manage_translations.py']))
['os', 'argparse', 'subprocess', 'django.core.management']
# https://github.com/tornadoweb/tornado/tree/5e7e0577
>>> files = Commit('5e7e05773913221bc168f4dd3a24bcee22d63bef').tree.files
>>> list(blob_imports(files['setup.py'])) # doctest: +NORMALIZE_WHITESPACE
['os', 'platform', 'sys', 'warnings', 'setuptools', 'setuptools',
'distutils.core', 'distutils.core', 'distutils.command.build_ext']
# https://github.com/block-cat/zm_bom/blob/master/__init__.py
>>> files = Commit('28993f161ac3b0c22968664ca0e617d3ce9c2d70').tree.files
>>> list(blob_imports(files['__init__.py']))
['zm_bom', 'zm_bom_line']
KNOWN BUG, too expensive to fix: line continuations are not handled.
E.g.:
from bla.blah.blah \
import foo
'foo' will be counted as a separate import.
Live example:
Project cms-sw_cmssw,
Commit 902d319c4ffa26721a783f0efe6197f08752c9d8,
File RecoTauTag/Configuration/python/RecoPFTauTag_cff.py
Blob 9310647d843b322e83236bb94edc113398201c08
        The effect of this bug is negligible compared to how much handling
        it would increase parsing time.
"""
# 2m without doing anything
# 3m with data read only
# 4m with multiline re
# ?? multiline re + split on commas
# 20m per 166 projects for the full cycle split+match by line
import_statements = IMPORT_PATTERN.findall(Blob(blob_sha).data[:max_size])
# now, split multiple imports, e.g. import os, sys
for import_statement in import_statements:
for namespace in import_statement.split(","):
            # empty imports (a syntax error that happens sometimes)
            # will be filtered out by BUILTINS
yield namespace.strip()
def importable_paths(path):
""" Get a list of modules that could be imported locally
    Python 3 doesn't require __init__.py files to treat a folder as a module,
    so any folder with Python files is also included.
>>> importable_paths('my_module/utils.py')
['my_module', 'utils']
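    >>> importable_paths('scripts/2to3/helper.py')  # '2to3' starts with a digit
    ['scripts', 'helper']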
"""
chunks = [chunk
for chunk in path.split("/")
if chunk and (chunk[0].isalpha() or chunk[0] == "_")]
if chunks:
chunks[-1] = chunks[-1].rsplit('.', 1)[0]
return chunks
def commit_imports(commit, imports_cache=None, pattern=DEFAULT_PATTERN):
# type: (Commit, dict, str) -> dict
""" Get commit imports
How it works:
        - collect all blob imports
        - take only top namespaces (i.e. django out of django.foo.bar)
        - remove local imports (i.e. my_module if my_module.py is present)
        - remove builtins (csv, multiprocessing, etc.)
Args:
commit (oscar.Commit): a commit to analyze
imports_cache (dict): a dictionary of blob_sha: set(top namespaces)
pattern (str): filename pattern to consider, *.py by default
Returns:
dict: blob_sha: set of top namespaces
"""
cache = imports_cache or {}
if not commit:
return {}
# paths: file path: blob sha
paths = {path: blob_sha for path, blob_sha in commit.tree.files.items()
if fnmatch(path, pattern) and 'test' not in path}
filenames = set().union(*(importable_paths(path) for path in paths))
# blob imports: blob sha: set(top namespaces)
imports = {blob_sha: cache.get(blob_sha,
{top_namespace(ns) for ns in blob_imports(blob_sha)}
- filenames - BUILTINS)
for blob_sha in paths.values()}
return imports
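# Illustrative example (hypothetical repo layout): for a tree containing
# my_module/utils.py with the single line `import csv, requests, my_module`,
# commit_imports() maps that blob's sha to {'requests'}: 'csv' is dropped as
# a builtin and 'my_module' as a locally importable name.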
def commits_fp_monthly(commits):
""" Filter out Project.commits_fp to only leave latest commit in a month
This method is concieved as performance optimization - since we aggregate
usage by month, it makes sense to only view last commit in a month.
Commits with invalid dates (dead CMOS battery, invalid data etc) have None
as authored date and will be ignored.
"""
    month = None
    for commit in commits:
        if not commit.authored_at:
            # invalid authored date - skip, as promised in the docstring
            continue
        if month != commit.authored_at.strftime("%Y-%m") \
                or not commit.parent_shas:
            month = commit.authored_at.strftime("%Y-%m")
            yield commit
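# E.g. (hypothetical dates) for first-parent commits authored, newest first,
# on 2017-03-20, 2017-03-05 and 2017-02-28, only the 2017-03-20 and
# 2017-02-28 commits are yielded; a root commit (no parent_shas) is always
# yielded so the chain has a terminator.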
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Count Python namespace usage given a list of projects")
    parser.add_argument('pattern', default=DEFAULT_PATTERN, nargs="?",
                        help='Filename pattern of files to process')
parser.add_argument('-i', '--input', default="-",
type=argparse.FileType('r'),
help='Input filename, "-" or skip for stdin')
parser.add_argument('-o', '--output', default="-",
type=argparse.FileType('w'),
help='Output filename, "-" or skip for stdout')
    parser.add_argument('-d', '--date-format', default="%Y-%m", type=str,
                        help='Date format, %%Y-%%m by default')
    parser.add_argument('-S', '--snapshots-dir', type=str, nargs="?",
                        help='Directory path for intermediate snapshots')
parser.add_argument('-s', '--snapshots-interval', default=10000, type=int,
help='Snapshots interval, every processed N files')
parser.add_argument('-v', '--verbose', action='store_true',
help="Log progress to stderr")
args = parser.parse_args()
output_fields = ('project', 'date', 'added', 'removed', 'commit', 'parent')
writer = csv.DictWriter(args.output, output_fields)
writer.writeheader()
    if args.snapshots_dir and not os.path.isdir(args.snapshots_dir):
        parser.exit(1, "Snapshot dir does not exist\n")
logging.basicConfig(format='%(asctime)s %(message)s',
level=logging.INFO if args.verbose else logging.WARNING)
projects = args.input
# projects = ['user2589_minicms', 'YeonjuGo_cmssw']
    # 57 bytes of RAM to store a 20-char bin_sha
    # bool: 24 bytes
    # dict overhead: 48 bytes per key-value pair
    # Total: 57 + 24 + 48 = 129 bytes per commit
    # worst case: 1B commits = 129 GB of RAM
    # 700+ GB available, so it should work
processed_commits = {} # bin_sha: bool(terminal)
stats = defaultdict(
lambda: defaultdict(int)) # [namespace][month] = increment
# terminal commit stats, [month] = number
commit_stats = {
'total': defaultdict(int),
'terminal': defaultdict(int)
}
counter = 0
def snapshot():
if not args.snapshots_dir:
return
# saving usage
pd.DataFrame(stats).T.fillna(0).astype(int).to_csv(
os.path.join(args.snapshots_dir, "usage_snapshot_%d.csv" % counter))
# saving commit stats
pd.DataFrame(commit_stats).T.fillna(0).astype(int).to_csv(
os.path.join(args.snapshots_dir, "commit_stats_%d.csv" % counter))
        # saving processed_commits would take a few GB per snapshot, nah
# ~800M projects in total, ~1M (projected) use Python
for counter, project_name in enumerate(projects):
project_name = project_name.rstrip("\r\n")
if project_name == 'EMPTY': # special value
continue
project = Project(project_name)
commits = tuple(project.commits_fp)
full_length = len(commits)
commits = tuple(commits_fp_monthly(commits))
reduced_length = len(commits)
        logging.info("#%d: %s (%d commits, %d after monthly filtering)",
                     counter, project_name, full_length, reduced_length)
imports = None
cum_imports = set()
for i, commit in enumerate(commits):
date = commit.authored_at.strftime(args.date_format)
if commit.bin_sha in processed_commits:
                # we have seen this commit before.
                # if we got here mid-chain (i.e. not at a project head),
                # unmark it as terminal
if imports is not None:
if processed_commits[commit.bin_sha]:
# i.e. if this commit was marked as terminal before
commit_stats['terminal'][date] -= 1
processed_commits[commit.bin_sha] = False
                # in either case, stop processing this project - all
                # commits from this point on have been seen already
break
# this is a new commit
logging.debug("Processing %s", commit.sha)
commit_stats['total'][date] += 1
commit_stats['terminal'][date] += imports is None
if i + 1 < len(commits):
parent = commits[i + 1]
else:
parent = None
# mark commit as processed
processed_commits[commit.bin_sha] = imports is None
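            # hypothetical example: project A's first-parent chain is
            # c3 -> c2 -> c1 and fork B's is b4 -> c3 -> c2 -> c1.
            # Processing A marks c3 as terminal; processing B later reaches
            # c3, unmarks it (it is mid-chain for B) and stops, so c2 and c1
            # are never counted twice.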
if imports is None: # starting from project head commit
                try:  # handle missing Tree objects
                    # it is better to ignore such a commit than to treat it
                    # as empty: an empty tree would be recorded as a removal
                    # of all dependencies
imports = commit_imports(commit, {}, args.pattern)
except ObjectNotFound:
continue
cum_imports = set().union(*imports.values())
# else we've got imports from the prev iteration already
try: # similar to the imports above
parent_imports = commit_imports(parent, imports, args.pattern)
except ObjectNotFound:
continue
cum_parent_imports = set().union(*parent_imports.values())
deleted = cum_parent_imports - cum_imports
added = cum_imports - cum_parent_imports
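            # hypothetical example: cum_parent_imports = {'flask', 'six'}
            # and cum_imports = {'flask', 'requests'} mean this commit
            # removed 'six' and added 'requests'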
for dep in deleted:
stats[dep][date] -= 1
for dep in added:
stats[dep][date] += 1
if added or deleted:
writer.writerow({
'project': project_name,
'added': ",".join(added),
'removed': ",".join(deleted),
'date': date,
'commit': commit.sha,
'parent': parent and parent.sha
})
args.output.flush()
imports = parent_imports
cum_imports = cum_parent_imports
if counter and not counter % args.snapshots_interval:
snapshot()
snapshot()