-
Notifications
You must be signed in to change notification settings - Fork 21
/
sentistrength_id.py
245 lines (219 loc) · 11.1 KB
/
sentistrength_id.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# coding: utf-8
import re
from collections import OrderedDict
import numpy as np
class sentistrength:
    """Lexicon-based sentiment scorer for Indonesian text.

    The constructor loads six word lists from the current working directory
    (negating words, question words, sentiment words, emoticons, idioms and
    booster words).  ``main`` then scores a text term by term and returns the
    strongest positive / negative score together with a class label.
    """

    # Names of the boolean switches read from the ``config`` mapping.
    _CONFIG_KEYS = (
        "negation", "booster", "ungkapan", "consecutive", "repeated",
        "emoticon", "question", "exclamation", "punctuation",
    )

    def __init__(self, config=None):
        """Load the lexicon files and store the feature switches.

        ``config`` maps the names in ``_CONFIG_KEYS`` to booleans.  A switch
        that is missing (or a ``config`` of ``None``) is treated as enabled.
        """
        # A fresh dict per call; a ``dict()`` default would be shared.
        config = {} if config is None else config

        self.negasi = self._read_lines("negatingword.txt")
        self.tanya = self._read_lines("questionword.txt")

        # Sentiment words, one "term:score" entry per line.
        self.sentiwords_txt = self._read_pairs("sentiwords_id.txt", ":")
        self.sentiwords_dict = self._to_scores(self.sentiwords_txt)
        # Emoticons, one "term | score" entry per line.
        self.emoticon_txt = self._read_pairs("emoticon_id.txt", " | ")
        self.emoticon_dict = self._to_scores(self.emoticon_txt)
        # Idioms, one "term:score" entry per line.
        self.idioms_txt = self._read_pairs("idioms_id.txt", ":")
        self.idioms_dict = self._to_scores(self.idioms_txt)
        # Booster words, one "term:score" entry per line.
        self.boosterwords_txt = self._read_pairs("boosterwords_id.txt", ":")
        self.boosterwords_dict = self._to_scores(self.boosterwords_txt)

        for key in self._CONFIG_KEYS:
            setattr(self, key + "_conf", config.get(key, True))
        self.mean_conf = False

    @staticmethod
    def _read_lines(path):
        """Return the lines of ``path`` without line terminators."""
        # ``with`` closes the handle; the bare open().read() leaked it.
        with open(path) as handle:
            return handle.read().splitlines()

    @classmethod
    def _read_pairs(cls, path, separator):
        """Return every line of ``path`` split on ``separator``."""
        return [line.split(separator) for line in cls._read_lines(path)]

    @staticmethod
    def _to_scores(pairs):
        """Build an ordered ``term -> int score`` mapping from split lines."""
        scores = OrderedDict()
        for fields in pairs:
            scores[fields[0]] = int(fields[1])
        return scores

    def senti(self, term):
        """Return the sentiment-word score of ``term`` (0 when unknown)."""
        return self.sentiwords_dict.get(term, 0)

    def emosikon(self, term):
        """Return the emoticon score of ``term`` (0 when unknown)."""
        return self.emoticon_dict.get(term, 0)

    def ungkapan(self, term):
        """Return the idiom score of ``term`` (0 when unknown)."""
        return self.idioms_dict.get(term, 0)

    def booster(self, term):
        """Return the booster score of ``term`` (0 when unknown)."""
        return self.boosterwords_dict.get(term, 0)

    def cek_negationword(self, prev_term, prev_term2):
        """Flip the sign of ``self.score`` when preceded by a negating word.

        Either ``prev_term`` alone or the two-word phrase
        ``prev_term2 + " " + prev_term`` has to be listed in ``self.negasi``.
        """
        if prev_term in self.negasi or prev_term2 + " " + prev_term in self.negasi:
            self.score = -abs(self.score) if self.score > 0 else abs(self.score)

    def cek_boosterword(self, term):
        """Strengthen ``self.score`` by the booster score of ``term``."""
        booster_score = self.booster(term)
        # NOTE(review): two independent checks on purpose (not ``elif``) so
        # the original behaviour is kept when a booster changes the sign.
        if booster_score != 0 and self.score > 0:
            self.score += booster_score
        if booster_score != 0 and self.score < 0:
            self.score -= booster_score

    def cek_consecutive_term(self, prev_term):
        """Add one point of strength when a strong term follows a term of
        the same polarity.  ``prev_term`` is unused; ``self.prev_score`` is
        what gets compared.
        """
        if self.prev_score > 0 and self.score >= 3:
            self.score += 1
        if self.prev_score < 0 and self.score <= -3:
            self.score -= 1

    def cek_ungkapan(self, bigram, trigram, i):
        """Override the current score when term ``i`` completes an idiom.

        ``bigram`` / ``trigram`` are lists of terms ending at position ``i``.
        On a match the idiom score replaces ``self.score``, the contribution
        of the previous term is undone, and its ``[score]`` annotation is
        removed from ``self.sentence_score``.
        """
        ungkapan_score = self.ungkapan(' '.join(bigram))
        # A trigram needs two real predecessors; for i < 2 the caller has no
        # term at i-2, so it is never looked up.
        if ungkapan_score == 0 and i >= 2:
            ungkapan_score = self.ungkapan(' '.join(trigram))
        if ungkapan_score == 0:
            return

        self.score = ungkapan_score
        self.prev_score = 0
        self.pre_max_pos[i-1] = 1
        self.pre_max_neg[i-1] = -1
        # Roll the running maxima back to their value before the previous
        # term was scored.
        # NOTE(review): for i == 1 this resets to the neutral 1 / -1, exactly
        # as before, which also drops the exclamation baseline of 2 - confirm
        # whether that is intended.
        if i >= 2:
            self.max_pos = self.pre_max_pos[i-2]
            self.max_neg = self.pre_max_neg[i-2]
        else:
            self.max_pos = 1
            self.max_neg = -1
        # ``-?\d+`` also strips negative and multi-digit annotations, which
        # the former ``\d`` pattern left behind.
        self.sentence_score[i-1] = re.sub(r'\[-?\d+\]', '', self.sentence_score[i-1])

    def cek_repeated_punctuation(self, next_term):
        """Add one point of strength to a strong score that is followed by
        two or more exclamation marks.
        """
        if re.search(r'!{2,}', next_term) and self.score >= 3:
            self.score += 1
        if re.search(r'!{2,}', next_term) and self.score <= -3:
            self.score -= 1

    def remove_extra_repeated_char(self, term):
        """Collapse any letter repeated three or more times to one letter."""
        return re.sub(r'([A-Za-z])\1{2,}', r'\1', term)

    def plural_to_singular(self, term):
        """Turn a reduplicated form such as ``kata-kata`` into ``kata``."""
        return re.sub(r'([A-Za-z]+)\-\1', r'\1', term)

    def classify(self):
        """Return "positive", "negative" or "neutral" for the last text
        processed by ``main``.
        """
        result = "neutral"
        try:
            if self.mean_conf:
                mean_p = np.mean(self.mean_pos)
                mean_n = np.mean(self.mean_neg)
                print("{} {}".format(mean_p, mean_n))
                if mean_p > mean_n:
                    result = "positive"
                elif mean_p < mean_n and not self.is_tanya:
                    result = "negative"
                elif mean_p < mean_n and self.is_tanya:
                    result = "neutral"
            else:
                strongest_pos = abs(self.sentences_max_pos)
                strongest_neg = abs(self.sentences_max_neg)
                if strongest_pos > strongest_neg:
                    result = "positive"
                elif strongest_pos < strongest_neg:
                    result = "negative"
                else:
                    result = "neutral"
        except Exception:
            # Best effort: report the problem and fall back to the label
            # computed so far instead of raising.
            print("error  {} {}".format(
                getattr(self, "sentences_max_pos", None),
                getattr(self, "sentences_max_neg", None)))
        return result

    def cek_neutral_term(self, terms, i):
        """Reduce the score to 1 when a neighbouring term is a conditional
        word listed in ``self.neutral_term``.
        """
        if terms[i-1] in self.neutral_term or terms[i+1] in self.neutral_term:
            self.score = 1

    def main(self, sentence):
        """Score ``sentence`` and return a result dict.

        The text is split into sentences on "." and into terms on
        whitespace.  The returned dict has the keys ``classified_text``,
        ``tweet_text``, ``sentence_score``, ``max_positive``,
        ``max_negative`` and ``kelas`` (the class label).
        """
        self.neutral_term = ['jika', 'kalau']
        sentences = sentence.split('.')
        self.sentences_max_neg = -1
        self.sentences_max_pos = 1
        self.sentences_score = []
        self.sentences_text = []

        for sentence in sentences:
            self.max_neg = -1
            self.max_pos = 1
            self.mean_neg = [1]
            self.mean_pos = [1]
            self.sentence_score = []
            terms = sentence.split()
            terms_length = len(terms)
            self.is_tanya = False
            self.sentence_text = ''
            # Every sentence containing an exclamation mark starts with a
            # positive strength of at least 2.
            if self.exclamation_conf and re.search('!', sentence):
                self.max_pos = 2
            self.prev_score = 0
            self.pre_max_pos = []
            self.pre_max_neg = []

            for i, term in enumerate(terms):
                is_extra_char = False
                plural = ''
                self.score = 0

                if re.search(r'([A-Za-z])\1{3,}', term):
                    is_extra_char = True
                    term = self.remove_extra_repeated_char(term)
                if re.search(r'([A-Za-z]+)\-\1', term):
                    plural = term
                    term = self.plural_to_singular(term)

                # Neighbouring terms.  Empty strings stand in for positions
                # before the sentence start, so a negative index can never
                # wrap around to the end of the sentence.
                prev_term = terms[i-1] if i > 0 else ''
                prev_term2 = terms[i-2] if i > 1 else ''
                has_next = i < terms_length - 1

                # Sentiment score of the term itself.
                self.score = self.senti(term)

                # Negation handler.
                if self.negation_conf and self.score != 0 and i > 0:
                    self.cek_negationword(prev_term, prev_term2)

                # Booster words before and after the term.
                if self.booster_conf and self.score != 0 and i > 0:
                    self.cek_boosterword(prev_term)
                if self.booster_conf and self.score != 0 and has_next:
                    self.cek_boosterword(terms[i+1])

                # Idiom handler.
                if self.ungkapan_conf and i > 0:
                    self.cek_ungkapan([prev_term, term],
                                      [prev_term2, prev_term, term], i)

                # Consecutive sentiment words.
                if self.consecutive_conf and i > 0 and self.score != 0:
                    self.cek_consecutive_term(prev_term)

                # Repeated characters: one extra point of strength for a
                # scored term, a score of 2 for an unscored one.
                if self.repeated_conf and is_extra_char:
                    if self.score > 0:
                        self.score += 1
                    elif self.score < 0:
                        self.score -= 1
                    else:
                        self.score = 2

                if self.punctuation_conf and has_next:
                    self.cek_repeated_punctuation(terms[i+1])

                # Remember whether the sentence contains a question word or
                # a question mark.
                if self.question_conf and (term in self.tanya or re.search(r'\?', term)):
                    self.is_tanya = True

                # Conditional words next to the term.
                if self.score != 0 and i > 1 and i < (terms_length - 2):
                    self.cek_neutral_term(terms, i)

                if self.emoticon_conf and self.score == 0:
                    self.score = self.emosikon(term)

                self.prev_score = self.score
                if self.mean_conf and self.score > 0:
                    self.mean_pos.append(self.score)
                if self.mean_conf and self.score < 0:
                    self.mean_neg.append(abs(self.score))

                # Track the strongest positive / negative score so far and
                # keep a per-term history for the idiom handler.
                self.max_pos = max(self.max_pos, self.score)
                self.max_neg = min(self.max_neg, self.score)
                self.pre_max_pos.append(self.max_pos)
                self.pre_max_neg.append(self.max_neg)

                if plural != '':
                    term = plural
                self.sentence_text += ' {}'.format(term)
                if self.score != 0:
                    term = "{} [{}]".format(term, self.score)
                self.sentence_score.append(term)

            self.sentences_text.append(self.sentence_text)
            self.sentences_score.append(" ".join(self.sentence_score))
            # The negative strength of a question is discarded.
            if self.is_tanya:
                self.max_neg = -1
            self.sentences_max_pos = max(self.sentences_max_pos, self.max_pos)
            self.sentences_max_neg = min(self.sentences_max_neg, self.max_neg)

        sentence_result = self.classify()
        return {
            "classified_text": ". ".join(self.sentences_score),
            "tweet_text": ". ".join(self.sentences_text),
            "sentence_score": self.sentences_score,
            "max_positive": self.sentences_max_pos,
            "max_negative": self.sentences_max_neg,
            "kelas": sentence_result,
        }
# Demo run executed at module level: enable every scoring rule, build one
# scorer and print the result for a sample sentence.
config = {
    "negation": True,
    "booster": True,
    "ungkapan": True,
    "consecutive": True,
    "repeated": True,
    "emoticon": True,
    "question": True,
    "exclamation": True,
    "punctuation": True,
}
senti = sentistrength(config)
# Single-argument print(...) parses and behaves the same on Python 2 and 3;
# the bare print statement is a SyntaxError on Python 3.
print(senti.main("agnezmo pintar dan cantik sekali tetapi lintah darat :)"))