SourceFileToVector.py
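"""Parse the Java source files of each project version into token vectors.

Every file is parsed with javalang; the names of the AST nodes whose types are
listed in class_selec (expected to come from the star imports below) build a
shared vocabulary, and each file is turned into a zero-padded integer vector
(interVector). The traditional metric features and the binary label for each
file are then read from the per-version CSV files.
"""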
import glob
import os.path
from collections import OrderedDict
import javalang
import pygments
import xmltodict
from pygments.lexers import JavaLexer
from pygments.token import Token
import pandas as pd
import csv
import numpy as np
from assets import *
from datasets import *


class ProjectVersion:
    __slots__ = ['name', 'version', 'src_files']

    def __init__(self, name, version):
        self.name = name
        self.version = version
        self.src_files = OrderedDict()


class SourceFile:
    __slots__ = ['path_file', 'version', 'interVector', 'tradFeature', 'label', 'tradgene']

    def __init__(self, path_file):
        self.path_file = path_file
        self.version = None
        self.interVector = None
        self.tradFeature = None
        self.label = None
        self.tradgene = None


class Parser:
    """Parse the source files of every version of one project."""
    __slots__ = ['name', 'root', 'versions', 'src', 'mapdata', 'labeldata']

    def __init__(self, project):
        self.name = project.name
        self.root = project.root
        self.versions = project.versions
        self.src = project.src
        self.mapdata = project.mapdata
        self.labeldata = project.labeldata

    def Parser(self):
        Project = OrderedDict()
        for version in self.versions:
            Project[version] = ProjectVersion(self.name, version)
        vocab = []
        maxlen = 0
        # Build the vocabulary over all versions (one map CSV per version).
        for version in self.mapdata:
            with open(str(version), 'r') as csv_file:
                reader = csv.reader(csv_file, delimiter=',')
                next(reader)  # skip the header row
                for line in reader:
                    # Get the path of each source file.
                    path_file = os.path.normpath((str(self.root) + line[0][27:-24]).replace("\\", "/"))
                    with open(path_file, encoding='cp1256') as file:
                        src = file.read()
                    parser_tree = None
                    try:
                        parser_tree = javalang.parse.parse(src)
                        for path, node in parser_tree:
                            if str(type(node)) in class_selec:
                                if node.name not in vocab:
                                    vocab.append(node.name)
                    except Exception:
                        # Skip files that javalang cannot parse.
                        pass
        # First pass over each version: find the length of the longest token vector (maxlen).
        for version, versionmap, versionlabel in zip(self.versions, self.mapdata, self.labeldata):
            print(version)
            print(versionmap)
            print(versionlabel)
            token_matrix = []
            with open(str(versionmap), 'r') as csv_file:
                reader = csv.reader(csv_file, delimiter=',')
                next(reader)  # skip the header row
                for line in reader:
                    # Get the path of each source file.
                    path_file = os.path.normpath((str(self.root) + line[0][27:-24]).replace("\\", "/"))
                    with open(path_file, encoding='cp1256') as file:
                        src = file.read()
                    token_vector = []
                    parser_tree = None
                    try:
                        parser_tree = javalang.parse.parse(src)
                        for path, node in parser_tree:
                            if str(type(node)) in class_selec:
                                token_vector.append(node.name)
                    except Exception:
                        pass
                    # Get the package declaration if it exists.
                    if parser_tree and parser_tree.package:
                        package_name = parser_tree.package.name
                    else:
                        package_name = None
                    if maxlen < len(token_vector):
                        maxlen = len(token_vector)
        # Second pass: build the SourceFile objects, integer token vectors, and labels.
        for version, versionmap, versionlabel in zip(self.versions, self.mapdata, self.labeldata):
            token_matrix = []
            with open(str(versionmap), 'r') as csv_file:
                reader = csv.reader(csv_file, delimiter=',')
                next(reader)  # skip the header row
                for line in reader:
                    # Get the path of each source file.
                    path_file = os.path.normpath((str(self.root) + line[0][27:-24]).replace("\\", "/"))
                    with open(path_file, encoding='cp1256') as file:
                        src = file.read()
                    token_vector = []
                    parser_tree = None
                    try:
                        parser_tree = javalang.parse.parse(src)
                        for path, node in parser_tree:
                            if str(type(node)) in class_selec:
                                token_vector.append(node.name)
                    except Exception:
                        pass
                    # Get the package declaration if it exists.
                    if parser_tree and parser_tree.package:
                        package_name = parser_tree.package.name
                    else:
                        package_name = None
                    # If the source file has a package declaration, qualify the file name with it.
                    if package_name:
                        src_id = package_name + '.' + os.path.basename(path_file)
                    else:
                        src_id = os.path.basename(path_file)
                    Project[version].src_files[src_id] = SourceFile(path_file)
                    token_matrix.append(token_vector)
            # Turn each token_vector into an integer vector padded with zeros to maxlen.
            for src_id, token_vector in zip(Project[version].src_files, token_matrix):
                vectori = []
                for para in token_vector:
                    vectori.append(vocab.index(para) + 1)
                for _ in range(maxlen - len(vectori)):
                    vectori.append(0)
                Project[version].src_files[src_id].interVector = vectori
                # print(Project[version].src_files[src_id].interVector)
            # Get the traditional features and the binary label for each file.
            with open(str(versionlabel), 'r') as csv_file:
                reader = csv.reader(csv_file, delimiter=',')
                next(reader)  # skip the header row
                for line in reader:
                    src_id = line[2] + ".java"
                    if src_id in Project[version].src_files:
                        Project[version].src_files[src_id].tradFeature = [float(i) for i in line[3:-1]]
                        Project[version].src_files[src_id].label = 0 if int(line[-1]) == 0 else 1
                        Project[version].src_files[src_id].version = line[1]
                    else:
                        continue
        return Project


def test():
    import datasets
    parser = Parser(datasets.camel)
    t = parser.Parser()
    i = 0
    # Print the traditional features of the first three files of version 1.4.
    for src_id in t[1.4].src_files:
        print(src_id, t[1.4].src_files[src_id].tradFeature)
        i = i + 1
        if i == 3:
            break


if __name__ == "__main__":
    test()
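
# A minimal usage sketch, assuming a project configuration object shaped like
# datasets.camel. Every name and path below is a hypothetical placeholder used
# only to illustrate the attributes Parser.__init__ reads; the real objects
# live in the datasets module.
#
#     from types import SimpleNamespace
#
#     demo_project = SimpleNamespace(
#         name='camel',
#         root='path/to/camel/sources',            # prefix prepended to each mapped file path
#         versions=[1.4, 1.6],                     # keys of the returned OrderedDict
#         src='path/to/camel/sources',             # stored by __init__, unused in Parser()
#         mapdata=['map_1.4.csv', 'map_1.6.csv'],  # one file-to-path map CSV per version
#         labeldata=['label_1.4.csv', 'label_1.6.csv'],  # features and labels per version
#     )
#     project = Parser(demo_project).Parser()
#     # project[1.4].src_files maps 'package.File.java' -> SourceFile with
#     # interVector, tradFeature, and label filled in.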