-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathLoadData.py
105 lines (78 loc) · 2.89 KB
/
LoadData.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import re
import numpy as np
import sys
def LoadData(Filename, dataset):
    # type: (str, str) -> dict
    """Read a learning-to-rank text file and z-score normalise its features.

    Every line must look like ``<label> qid:<query> <idx>:<value> ...``,
    optionally followed by a ``#docid = ...`` trailer whose shape depends on
    ``dataset``.  Features are standardised per column over *all* documents
    in the file (not per query): the column mean is subtracted and the result
    is divided by the population standard deviation (divisor N).  Columns
    with zero deviation are divided by 1 so they come out as all zeros.

    :param Filename: path of the text file to read.
    :param dataset: dataset name.  A leading ``MQ`` (any case) selects the
        pattern with the ``#docid``/``inc``/``prob`` trailer, a leading ``MS``
        the pattern without a trailer, anything else the pattern with a
        ``#docid``-only trailer.
    :return: dict mapping each query id (string) to
        ``{'label': 1-D int array, 'feature': 2-D float array}``.  Rows of a
        query are ordered by ascending label, then by position in the file.
        An input without any lines yields an empty dict.
    :raises IOError: if the file cannot be opened.
    :raises SystemExit: with code -2 as soon as a line does not match the
        expected pattern (this includes blank lines).
    :raises ValueError: if a feature value is not a float or the rows do not
        all carry the same number of features.
    """
    print(Filename)

    # judge the dataset
    matchMQ = re.match(r'^([Mm][Qq]+)', dataset)
    matchMS = re.match(r'^([Mm][Ss]+)', dataset)
    # Same text the old space-separated print produced, valid on Python 2 and 3.
    print(' '.join(str(item) for item in (dataset, matchMS, matchMQ)))

    # NOTE: every pattern ends in ``.$``, so exactly one trailing character of
    # each line (presumably a space or '\r' in the data files -- confirm) is
    # consumed outside the captured groups.
    if matchMQ:  # MQ2007 MQ2008
        pattern = r'^(\d+) qid\:([^\s]+) (.*?) \#docid = ([^\s]+) inc = ([^\s]+) prob = ([^\s]+).$'
    elif matchMS:  # MS
        pattern = r'^(\d+) qid\:([^\s]+) (.*?).$'
    else:  # OHSUMED
        pattern = r'^(\d+) qid\:([^\s]+) (.*?) \#docid = ([^\s]+).$'
    line_re = re.compile(pattern)

    # load the data: three parallel lists, one entry per document
    labels = []
    query_ids = []
    features = []
    # ``with`` closes the file on every exit path, including sys.exit below.
    with open(Filename, 'r') as handle:
        for line in handle:
            m = line_re.match(line.strip('\n'))
            if not m:
                print('Error to parse Feature at line \n')
                sys.exit(-2)
            labels.append(int(m.group(1)))
            query_ids.append(m.group(2))
            # split() (not split(' ')) tolerates repeated blanks between tokens
            features.append(
                [float(token.split(':')[1]) for token in m.group(3).split()]
            )

    if not features:
        return {}

    # The feature width is taken from the data instead of being fixed at 136;
    # ragged rows make this conversion raise ValueError.
    matrix = np.asarray(features, dtype=float)

    # Statistics use the true document count N.  The previous code divided by
    # the index of the last document (N - 1), which skewed the mean and
    # divided by zero for a single-document file.
    mean = matrix.mean(axis=0)
    sigma = matrix.std(axis=0)
    sigma[sigma == 0] = 1  # constant columns: avoid 0/0, keep them at zero
    normalised = (matrix - mean) / sigma

    # query id -> label -> row indices (file order)
    grouped = {}
    for row, (queryid, label) in enumerate(zip(query_ids, labels)):
        grouped.setdefault(queryid, {}).setdefault(label, []).append(row)

    Data = {}
    for queryid, by_label in grouped.items():
        order = []
        for label in sorted(by_label):
            order.extend(by_label[label])
        Data[queryid] = {
            'label': np.asarray([labels[row] for row in order]),
            'feature': normalised[order],
        }
    return Data