forked from liuchaoss/codematcher-demo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
reranking.py
152 lines (128 loc) · 4.18 KB
/
reranking.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import util
import operator
import re
def matcher_name(words, line, cmd):
    """Compute the method-name score S_name (Equation 1 of the paper).

    The score is the product of two ratios:

    * word usage -- number of keywords in ``cmd`` divided by the number of
      query words;
    * line coverage -- total number of characters in those keywords divided
      by the length of ``line`` (newlines removed).

    :param words: list of query words
    :param line: the method text of a search hit
    :param cmd: regular expression used for the ES query; its keywords are
        the pieces separated by ``.*``
    :return: the S_name value, or ``0.0`` when ``words`` or ``line`` is empty
    """
    keywords = str(cmd).replace('.*', ' ').strip().split(' ')
    line = str(line).replace('\n', '')
    # An empty query or an empty method text cannot be matched; give it the
    # lowest score instead of failing with ZeroDivisionError.
    if not words or not line:
        return 0.0
    word_usage = len(keywords) / len(words)
    line_coverage = len(''.join(keywords)) / len(line)
    return word_usage * line_coverage
def matcher_api(query, line, jdk):
    """Score how well the API list of a search hit matches the query words.

    The score is the product of four factors:

    * word usage -- query words found in ``line`` / number of query words;
    * line coverage -- characters of ``line`` covered by query-word matches
      / length of ``line``;
    * max sequence -- ``len(sequence(...))`` / number of query words;
    * JDK percent -- among the comma-separated entries of ``line`` that
      contain a ``.``, the share found in ``jdk`` (entries containing ``(``,
      ``[`` or ``<`` are cut at their last ``.`` before the lookup).

    :param query: list of query words
    :param line: content of the ``parsed`` field of a search hit
    :param jdk: object deserialized from the JDK file; only ``in`` membership
        tests are performed on it
    :return: the API score, or ``0.0`` when ``query`` or ``line`` is empty
    """
    line = str(line).replace('\n', '').lower()
    # Nothing can match an empty query or an empty API list; return the
    # lowest score instead of failing with ZeroDivisionError below.
    if not query or not line:
        return 0.0

    positions = []      # one list of match offsets per matched query word
    matched_chars = 0
    for word in query:
        # Match the word literally: the frequency count below assumes every
        # match is exactly len(word) characters long, and an unescaped word
        # containing regex metacharacters (e.g. "c++") would raise re.error.
        pattern = re.compile(re.escape(word.lower()))
        starts = [m.start() for m in pattern.finditer(line)]
        if starts:
            matched_chars += len(starts) * len(word)
            positions.append(starts)

    word_usage = len(positions) / len(query)
    line_coverage = matched_chars / len(line)
    # NOTE(review): sequence() appears to return one slot per entry of its
    # input, which would make this ratio equal to word_usage -- confirm
    # whether the length of the longest ordered match was intended instead.
    max_sequence = len(sequence(positions)) / len(query)

    api_count = 0
    jdk_count = 0
    for api in line.split(','):
        if '.' not in api:
            continue
        api_count += 1
        if '(' in api or '[' in api or '<' in api:
            api = api[:api.rfind('.')]
        # NOTE(review): ``line`` was lower-cased above, so this lookup
        # presumably expects lower-case entries in ``jdk`` -- confirm.
        if api in jdk:
            jdk_count += 1
    jdk_percent = jdk_count / api_count if api_count > 0 else 0

    return word_usage * line_coverage * max_sequence * jdk_percent
def sequence(seq):
    """Count ascending position chains that can be built from ``seq``.

    Each ``seq[i]`` is a list of comparable positions. Every position starts
    a chain of length 1. While processing ``seq[i]``, each chain collected so
    far is extended by every strictly greater position taken from the later
    lists ``seq[i + 1:]``. Chains are not de-duplicated.

    NOTE(review): the loop nesting was reconstructed; this is the nesting
    for which ``len(chain)`` never exceeds ``len(seq)`` -- confirm against
    the intended algorithm.

    :param seq: list of lists of positions
    :return: list with ``len(seq)`` entries; entry ``k - 1`` is the number of
        collected chains of length ``k``
    """
    orders = []
    scores = []
    for i, positions in enumerate(seq):
        scores.append(0)
        orders.extend([pos] for pos in positions)
        # Iterate over a snapshot: chains appended during this pass must not
        # be extended again within the same pass.
        for chain in list(orders):
            tail = chain[-1]
            for later in seq[i + 1:]:
                for pos in later:
                    if tail < pos:
                        orders.append(chain + [pos])
    for chain in orders:
        scores[len(chain) - 1] += 1
    return scores
def _rewrite_for_headers(source):
    """Insert ``'@ '`` markers into every ``for (int`` loop header of *source*.

    For each header the loop variable is taken as the text before the first
    ``=``; the header is split on that name and re-assembled with ``'@ '``
    placed in front of the name, except where the name sits between two
    alphanumeric characters (i.e. inside a longer identifier).

    NOTE(review): the slicing assumes one character (normally a space) before
    both ``=`` and ``{`` -- confirm the formatting of the indexed sources.
    """
    token = 'for (int'
    head, *segments = source.split(token)
    pieces = [head]
    for segment in segments:
        brace = segment.find('{')
        header = segment[:brace - 1]
        key = header[:header.find('=') - 1].strip()
        if not key:
            # No loop variable could be extracted (e.g. "for (int i=0;...").
            # Splitting on '' would raise ValueError, so keep the loop as is.
            pieces.append(token + segment)
            continue
        parts = header.split(key)
        marked_key = '@ ' + key
        rewritten = ''
        for m in range(1, len(parts) - 1):
            # Slices instead of indexing: an empty fragment yields '' whose
            # isalnum() is False, where [-1] / [0] would raise IndexError.
            if parts[m - 1][-1:].isalnum() and parts[m][:1].isalnum():
                rewritten += key + parts[m]
            else:
                rewritten += marked_key + parts[m]
        rewritten += parts[-1]
        pieces.append(token + ' ' + key + rewritten + segment[brace:])
    return ''.join(pieces)


def reranking(query_parse, data, cmds, jdk):
    """Re-rank fuzzy search hits and return the sources shown to the user.

    Hits are first scored with ``matcher_name`` and the 100 best are kept;
    those are then scored with ``matcher_api`` and sorted by (name score,
    API score), both descending. The ``source`` text of the first 10 hits is
    returned after its ``for (int`` loop headers have been rewritten.

    :param query_parse: list whose first element is the processed list of
        query words and whose second element is the importance of the words
        (only the first element is used here)
    :param data: list of fuzzy-search hits; each hit provides
        ``hit['_source']`` with the keys ``method``, ``parsed`` and ``source``
    :param cmds: query regular expressions, ``cmds[j]`` belonging to
        ``data[j]``
    :param jdk: JDK vocabulary object, passed through to ``matcher_api``
    :return: list of at most 10 source strings, best match first
    """
    query = query_parse[0]

    # Stage 1: rank every hit by its method-name score, keep the top 100.
    scores = [
        [j, matcher_name(query, hit['_source']['method'], cmds[j])]
        for j, hit in enumerate(data)
    ]
    scores.sort(key=operator.itemgetter(1), reverse=True)
    scores = scores[:100]

    # Stage 2: add the API score and use it to break ties on the name score.
    for entry in scores:
        parsed = data[entry[0]]['_source']['parsed']
        entry.append(matcher_api(query, parsed, jdk))
    scores.sort(key=operator.itemgetter(1, 2), reverse=True)

    lines = []
    for entry in scores[:10]:
        source = str(data[entry[0]]['_source']['source'])
        lines.append(_rewrite_for_headers(source))
    return lines