-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathubase_control.py
executable file
·113 lines (95 loc) · 4.3 KB
/
ubase_control.py
1
#!/usr/bin/env python
"""Control trial for ubase.py: collect the same first-use statistics via
relations, to cross-check the accuracy of the primary data collection.

Reads filenames (one per line) from stdin or ``-i``, walks each file's
commit history, and records per project the earliest commit in which each
imported top-level namespace appears.
"""
import argparse
import csv
import logging
import os
from collections import defaultdict

import pandas as pd

from oscar import *
from ubase import blob_imports, top_namespace, BUILTINS

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Do the same as ubase.py but using relations. "
                    "This is intended to be a control trial to check the "
                    "accuracy of data collection")
    parser.add_argument('-i', '--input', default="-",
                        type=argparse.FileType('r'),
                        help='Input filename, "-" or skip for stdin')
    # NOTE: '%' must be doubled in argparse help strings; the original
    # unescaped '%Y-%m' made `--help` raise a formatting error.
    parser.add_argument('-d', '--date-format', default="%Y-%m", type=str,
                        help='Date format, %%Y-%%m by default')
    parser.add_argument('-S', '--snapshots-dir', type=str,
                        help='Directory path to for intermediate snapshots')
    parser.add_argument('-s', '--snapshots-interval', default=1000000,
                        type=int,
                        help='Snapshots interval, every processed N files')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help="Log progress to stderr")
    args = parser.parse_args()

    # Only validate the snapshots dir when one was actually requested;
    # the original passed None to os.path.isdir and crashed with TypeError
    # whenever -S was omitted.
    if args.snapshots_dir and not os.path.isdir(args.snapshots_dir):
        parser.exit(1, "Snapshot dir does not exist")

    logging.basicConfig(
        format='%(asctime)s %(message)s',
        level=logging.INFO if args.verbose else logging.WARNING)

    # One filename per input line; strip the trailing newline that file
    # iteration leaves on every entry (the original fed "foo.py\n" to File).
    files = (line.strip() for line in args.input if line.strip())

    # project_stats[project][namespace] = authored_at of the earliest commit
    # in which the project used the namespace
    project_stats = defaultdict(dict)
    # commit_stats[project][namespace] = sha of the commit that introduced
    # the namespace (the original comment wrongly described a month->projects
    # mapping copied from project_stats)
    commit_stats = defaultdict(dict)
    counter = 0

    def snapshot():
        """Dump intermediate project/commit stats as CSV; no-op without -S.

        Filenames embed ``counter`` (number of files processed so far) so
        successive snapshots do not overwrite each other.
        """
        if not args.snapshots_dir:
            return
        # saving project stats: format datetimes with the requested pattern
        pd.DataFrame(project_stats).T.applymap(
            lambda x: x.strftime(args.date_format)).to_csv(
            os.path.join(args.snapshots_dir,
                         "proj_snapshot_%d.csv" % counter))
        # saving commit stats: values are sha strings, so fill gaps with ""
        # (the original .fillna(0).astype(int) raised on any non-empty frame)
        pd.DataFrame(commit_stats).T.fillna("").to_csv(
            os.path.join(args.snapshots_dir,
                         "commit_stats_%d.csv" % counter))

    # Algorithm:
    # - iterate filenames
    # - get commits changing those files
    # - get namespaces used by these commits
    # - get list of projects having this commit
    # - add all these projects to project_stats from the date of commit on
    # - if there is at least one new project, increase stats from commit date
    # Deficiencies:
    # - dropped dependencies are ignored
    # - local imports are (mostly) ignored
    # - ... name your own
    for counter, filename in enumerate(files):
        print(filename)
        logging.info("#%d: %s", counter, filename)
        commits = sorted(File(filename).commits, key=lambda c: c.authored_at)

        # filter out (some) local imports;
        # it will not filter out files imported from the same folder
        chunks = filename.split("/")
        # foo.py => foo
        chunks[-1] = chunks[-1].rsplit(".", 1)[0]
        local_imports = set(chunks)

        for commit in commits:
            # taking min to get a stable, fork-independent project name
            project = min(commit.project_names)
            print("\t%s: %s" % (commit.sha, project))
            blob_namespaces = (
                (top_namespace(ns) for ns in blob_imports(bs))
                for bs in commit.blob_shas_rel)
            namespaces = set().union(*blob_namespaces)
            print("\t" + ",".join(namespaces))
            for namespace in namespaces - BUILTINS - local_imports:
                # taking min here to get rid of forks
                # just in case this namespace was used in this project by
                # another filename
                if namespace not in project_stats[project] or \
                        project_stats[project][namespace] > commit.authored_at:
                    project_stats[project][namespace] = commit.authored_at
                    commit_stats[project][namespace] = commit.sha

        # enumerate rebinds counter each pass; +1 makes it the count of
        # fully processed files, used for snapshot cadence and filenames
        counter += 1
        if counter and not counter % args.snapshots_interval:
            snapshot()

    snapshot()