-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_miner.py
92 lines (76 loc) · 2.94 KB
/
data_miner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
import re
class DataMiner(object):
    """
    Extracts summary statistics from a textual git log.

    Every method expects the prettified output from:
    git log --pretty=format:'[%h] %an %ad %s' --date=short --numstat
    That is, one "[hash] author date subject" header line per commit,
    followed by one "added deleted path" numstat line per file it touched.
    """

    # Commit header, something like:
    # [cef94a2] mjc23 2017-07-10 Added DataMiner class
    # As names can contain spaces the author runs up to the date. The group
    # is non-greedy so it stops at the FIRST date on the line (the commit
    # date); a date quoted later in the subject must not be swallowed into
    # the author name. It also accepts single-character names.
    _COMMIT_HEADER_RE = re.compile(
        r"\s*\[\w*\]\s+(\S.*?)\s+\d{4}-\d{2}-\d{2}(?:\s|$)")

    # Numstat line, something like:
    # 3 0 data_miner.py
    # or, for binary files:
    # - - some.png
    # Each count is either digits or a single dash. The path is everything
    # after the second count (trailing whitespace dropped), so paths that
    # contain spaces are kept whole instead of being cut at the first space.
    _NUMSTAT_RE = re.compile(r"\s*(?:\d+|-)\s+(?:\d+|-)\s+(\S.*?)\s*$")

    @staticmethod
    def _split_lines(raw_data):
        """
        Splits the raw log into lines, ignoring leading/trailing whitespace.
        :param raw_data: the output of the git log
        :return: list of lines
        """
        return raw_data.strip().split("\n")

    def extract_number_commits(self, raw_data):
        """
        Returns the number of commits contained in the data.
        Used with prettified output from:
        git log --pretty=format:'[%h] %an %ad %s' --date=short --numstat
        :param raw_data: the output of the git log
        :return: number of commits in the output
        """
        # Only commit header lines start with "[" (the abbreviated hash).
        return sum(
            1 for line in self._split_lines(raw_data)
            if line.strip().startswith("["))

    def extract_number_authors(self, raw_data):
        """
        Returns the number of distinct authors who have committed commits.
        Used with prettified output from:
        git log --pretty=format:'[%h] %an %ad %s' --date=short --numstat
        :param raw_data: the output of the git log
        :return: number of committing authors
        """
        authors = set()
        for line in self._split_lines(raw_data):
            m = self._COMMIT_HEADER_RE.match(line)
            if m is not None:
                authors.add(m.group(1))
        return len(authors)

    def extract_number_entities_changed(self, raw_data):
        """
        Returns the number of times files have had changes committed.
        Used with prettified output from:
        git log --pretty=format:'[%h] %an %ad %s' --date=short --numstat
        :param raw_data: the output of the git log
        :return: number of times files changed
        """
        # Total of the per-file change counts.
        return sum(self.extract_changes_per_file(raw_data).values())

    def extract_number_entities(self, raw_data):
        """
        Returns the number of files that have been changed at least once.
        :param raw_data: the output of the git log
        :return: number of files changed
        """
        return len(self.extract_changes_per_file(raw_data))

    def extract_changes_per_file(self, raw_data):
        """
        Returns the number of times each file has changed.
        The path is used exactly as it appears on the numstat line.
        :param raw_data: the output of the git log
        :return: dict of filename containing number of changes
        """
        files = {}
        for line in self._split_lines(raw_data):
            m = self._NUMSTAT_RE.match(line)
            if m is not None:
                name = m.group(1)
                files[name] = files.get(name, 0) + 1
        return files