-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhw2.py
127 lines (101 loc) · 2.95 KB
/
hw2.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
#!/usr/bin/env python
from collections import defaultdict
import numpy as np
import math
import sys
import random
def read_data_file(filename):
    """Parse a "docId wordId count" file into {docId: {wordId: total_count}}.

    Each line holds three whitespace-separated integers. Repeated
    (docId, wordId) pairs have their counts accumulated.
    """
    docs = {}
    # `with` guarantees the file is closed even if a line fails to parse.
    with open(filename) as data_file:
        for line in data_file:
            # int(), not eval(): eval would execute arbitrary expressions
            # embedded in the data file.
            doc_idx, word_idx, count = (int(tok) for tok in line.split())
            if doc_idx not in docs:
                docs[doc_idx] = defaultdict(int)
            docs[doc_idx][word_idx] += count
    return docs
def read_data_label(filename):
    """Parse a label file with one integer group id per line.

    Returns:
        counts: {groupId: number of documents carrying that label}
        doc_to_group: {1-based line number (document id): groupId}
    """
    counts = defaultdict(int)
    doc_to_group = defaultdict(int)
    with open(filename) as data_file:
        # enumerate replaces the hand-rolled line counter; documents are
        # identified by their 1-based position in the label file.
        for doc_id, line in enumerate(data_file, start=1):
            group_id = int(line)  # int(), not eval(): no code execution
            counts[group_id] += 1
            doc_to_group[doc_id] = group_id
    return counts, doc_to_group
# Build the vocabulary index: one word per line, mapped to a 1-based id.
# NOTE(review): keys keep their trailing newline, but only len(voc_dict)
# is consumed below, so this is harmless here.
voc_file = open("data/vocabulary.txt")
voc_dict = defaultdict(int)
count = 0
for line in voc_file:
    count += 1
    voc_dict[line] = count
voc_file.close()
train_data = read_data_file("data/train.data")
train_label, train_map = read_data_label("data/train.label")
group_num = len(train_label)
doc_num = len(train_map)
voc_num = len(voc_dict)
# Class priors P(group): fraction of training documents per newsgroup.
# Hard-codes 20 groups (range(1, 21)); assumes the dataset has exactly 20.
pi = [train_label[groupId] * 1.0 / doc_num for groupId in range(1, 21)]
# smoothing
# Laplace / add-one smoothing: start every (group, word) count at 1 so no
# word ever gets probability zero.
p = np.ones((group_num, voc_num))
for docId in train_data:
    for vId in train_data[docId]:
        # docId/word ids are 1-based in the data files; the matrix is 0-based.
        p[train_map[docId] - 1][vId - 1] += train_data[docId][vId]
# Normalize each row in place into the conditional P(word | group).
for groupId in range(len(p)):
    group_sum = sum(p[groupId])
    for vId in range(len(p[groupId])):
        p[groupId][vId] = p[groupId][vId] / group_sum
# routine groupId: 1 - 20
def helper(data, groupId):
    """Log-posterior (up to a shared constant) of a document under one group.

    `data` maps word id -> count; `groupId` is 1-based. Reads the
    module-level prior `pi` and smoothed word probabilities `p`.
    """
    g = groupId - 1
    score = math.log(pi[g])
    for word_id, cnt in data.items():
        score += cnt * math.log(p[g][word_id - 1])
    return score
def choose(data):
    """Return the group id (1-20) whose log-posterior for `data` is highest.

    Ties keep the lowest group id (first strict improvement wins).
    """
    # float("-inf") replaces `-sys.maxint - 1`: sys.maxint was removed in
    # Python 3, and -inf is a true lower bound for any log-probability.
    best_score = float("-inf")
    best_group = 0
    for groupId in range(1, 21):
        score = helper(data, groupId)
        if score > best_score:
            best_score = score
            best_group = groupId
    return best_group
# Evaluate the classifier on the held-out test split.
test_data = read_data_file("data/test.data")
test_label, test_map = read_data_label("data/test.label")
test_num = len(test_map)
error_num = 0
# Count test documents assigned to the wrong newsgroup.
for docId in test_map:
    if choose(test_data[docId]) != test_map[docId]:
        error_num += 1
# `* 1.0` forces float division under Python 2 integer semantics.
error_rate = error_num * 1.0 / test_num
print "error_rate " + str(error_rate)
# 0.21892071952
# better model
# Hold out 20% of the training documents as a validation split.
# `//` keeps integer division (identical to `/` on Python 2 ints, and
# avoids passing a float sample size under Python 3).
train_indexs = random.sample(range(1, doc_num + 1), doc_num * 8 // 10)
train_indexs.sort()
# Membership set: testing `docId in train_indexs` against the list was an
# O(n) scan inside an O(n) loop (accidental O(n^2)); a set makes it O(1).
train_index_set = set(train_indexs)
# validation_indexs = [i for i in range(1, doc_num + 1) if i not in train_indexs]
t_data = {}
t_label = defaultdict(int)
t_map = {}
v_data = {}
v_label = defaultdict(int)
v_map = {}
for docId in train_data:
    if docId in train_index_set:
        t_data[docId] = train_data[docId]
        t_map[docId] = train_map[docId]
        t_label[train_map[docId]] += 1
    else:
        v_data[docId] = train_data[docId]
        v_map[docId] = train_map[docId]
        v_label[train_map[docId]] += 1
# False, True, 40000 0.131322094055
# False, True, 45000 0.130434782609
# False, True, 50000 0.132209405501