-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnaive_attempt.py
146 lines (122 loc) · 4.34 KB
/
naive_attempt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
"""
Understanding the data set
"""
import sys
import getopt
import nltk.probability
import random
from nltk.corpus.reader import CategorizedPlaintextCorpusReader
from nltk.model import NgramModel
from itertools import cycle
def process_plaintext(dir_path):
reader = CategorizedPlaintextCorpusReader(dir_path,
r'.*\.txt', cat_pattern=r'.+_.+_(.*)\.txt')
facilitator_files = reader.fileids(categories='facilitator')
participant_files = reader.fileids(categories='participant')
print facilitator_files, participant_files
#print reader.categories()
#print len(reader.words())
#print len(reader.sents())
fac_words = [word for word in reader.words(facilitator_files)]
par_words = [word for word in reader.words(participant_files)]
fac_words = edit_tokens(fac_words)
par_words = edit_tokens(par_words)
speakers = (
[(word, 'facilitator') for word in reader.words(facilitator_files)] +
[(word, 'participant') for word in reader.words(participant_files)]
)
features = get_features(speakers)
size = int(len(features) * 0.3)
nb_train = features[size:]
nb_test = features[:size]
classifier = nltk.NaiveBayesClassifier.train(nb_train)
print "Classifier labels:", classifier.labels()
print classifier.show_most_informative_features()
print "Clasify test:", nltk.classify.accuracy(classifier, nb_test)
#print classifier.classify(get_features(["Yolo", "bag", "sp"], False))
#random.shuffle(speakers)
three_quarters = int(len(speakers) * 0.75)
train = speakers[:three_quarters]
test = speakers[three_quarters:]
est = lambda fdist, bins: nltk.probability.LaplaceProbDist(fdist)
un_lm = NgramModel(1, train, estimator=est)
bi_lm = NgramModel(2, train, estimator=est)
tr_lm = NgramModel(3, train, estimator=est)
qu_lm = NgramModel(4, train, estimator=est)
pe_lm = NgramModel(5, train, estimator=est)
print un_lm
print bi_lm
print tr_lm
print qu_lm
print pe_lm
print "1 gram Perplexity:", un_lm.perplexity(test)
print "2 gram Perplexity:", bi_lm.perplexity(test)
print "3 gram Perplexity:", tr_lm.perplexity(test)
print "4 gram Perplexity:", qu_lm.perplexity(test)
print "5 gram Perplexity:", pe_lm.perplexity(test)
print bi_lm.generate(10, ["uh", "sp"])
fd_fac = nltk.FreqDist(fac_words)
vocab_fac = fd_fac.keys()
fd_par = nltk.FreqDist(par_words)
vocab_par = fd_par.keys()
print "Fac Vocab: " , len(vocab_fac)
print "Fac Tokens: " , len(fac_words)
print vocab_fac[:20]
print "Par Vocab: " , len(vocab_par)
print "Par Tokens: " , len(par_words)
print vocab_par[:20]
fd_par.plot(50)
def get_features(arr, known=True):
features = []
if known:
for i, (w, subject) in enumerate(arr):
print i, w
features.append( (prev_features(arr, i), subject) )
else:
for i, w in enumerate(arr):
print i, w
features.append( (prev_features(arr, i)) )
return features
def prev_features(words, i):
features = {}
if i == 0:
features["prev-word"] = "<S>"
else:
features["prev-word"] = words[i-1]
if i == 0 or i == 1:
features["prev-word-2"] = "<S>"
else:
features["prev-word-2"] = words[i-2]
return features
def edit_tokens (input_words):
"""
Merge "{", ".+", "}" into one token
"""
length = len(input_words)
for i, w in enumerate(input_words):
if (w == "{"):
if (i+2 < length):
if (input_words[i+2] == "}"):
input_words[i:i+3] = [''.join(input_words[i:i+3])]
return input_words
def main(argv):
# parse command line options
try:
# list of argv from below,
# string of one letter options e.g. -h, -w
# list of long options, e.g. --help
opts, args = getopt.getopt(argv, "h", ["help"])
except getopt.error, msg:
print msg
print "for help use --help"
sys.exit(2)
# process options
for o, a in opts:
if o in ("-h", "--help"):
print __doc__
sys.exit(0)
process_plaintext('./ps16_dev_data/plain_text/AS1_split_fac_part_plain_text')
if __name__ == "__main__":
# first argv is the script name, which we don't care about
# so just pass the rest of it to main
main(sys.argv[1:])