forked from PaddlePaddle/PaddleHub
-
Notifications
You must be signed in to change notification settings - Fork 0
/
processor.py
62 lines (56 loc) · 1.83 KB
/
processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
# -*- coding:utf-8 -*-
import io
import numpy as np
def load_vocab(file_path):
    """
    Load a tab-separated vocabulary file into a token -> id dict.

    Each non-empty line is expected to be "<token>\t<id>". Blank or
    malformed lines (fewer than two tab-separated fields) are skipped
    instead of raising IndexError. An "<unk>" out-of-vocabulary entry is
    added with id == current vocab size, unless the file already
    provides one (the original code clobbered a file-provided id).

    Args:
        file_path: path to the utf-8 encoded vocabulary file.

    Returns:
        dict mapping token (str) to integer id.
    """
    vocab = {}
    with io.open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            parts = line.rstrip().split('\t')
            if len(parts) < 2:
                continue  # skip blank/trailing-newline or malformed lines
            vocab[parts[0]] = int(parts[1])
    # Reserve an OOV id; keep the file's own "<unk>" entry if present.
    vocab.setdefault("<unk>", len(vocab))
    return vocab
def preprocess(lac, texts, word_dict, use_gpu=False, batch_size=1):
    """
    Segment the input texts with the lac module, then map every word of
    each segmentation to its vocabulary id for the senta model.

    Args:
        lac: lexical-analysis module exposing
            lexical_analysis(data=..., use_gpu=..., batch_size=...).
        texts: list of raw input strings.
        word_dict: token -> id mapping; must contain an "<unk>" entry
            used for out-of-vocabulary words.
        use_gpu: forwarded to lac.lexical_analysis. Default False.
        batch_size: forwarded to lac.lexical_analysis. Default 1.

    Returns:
        list of dicts, one per input text:
        {'origin': original text, 'processed': list of int token ids}.
    """
    processed = lac.lexical_analysis(
        data={'text': texts}, use_gpu=use_gpu, batch_size=batch_size)
    unk_id = word_dict["<unk>"]
    # dict.get with a default replaces the manual membership test, and a
    # comprehension replaces the append loop; zip pairs each segmentation
    # result with its source text (same pairing as enumerate+index).
    return [
        {
            'origin': text,
            'processed': [word_dict.get(word, unk_id) for word in data['word']],
        }
        for text, data in zip(texts, processed)
    ]
def postprocess(predict_out, texts):
    """
    Convert the model's output tensor into per-text sentiment results.

    Args:
        predict_out: model output exposing as_ndarray(); row i is assumed
            to hold class probabilities for texts[i], with column 1 the
            positive-class probability — TODO confirm against the model.
        texts: list of dicts as produced by preprocess (each has 'origin').

    Returns:
        list of dicts with keys 'text', 'sentiment_label',
        'sentiment_key', 'positive_probs', 'negative_probs'.
    """
    scores = predict_out.as_ndarray()
    results = []
    # Pair each input text with its probability row (one row per text).
    for entry, row in zip(texts, scores):
        label = int(np.argmax(row))
        pos = row[1]
        results.append({
            'text': entry['origin'],
            'sentiment_label': label,
            'sentiment_key': 'negative' if label == 0 else 'positive',
            # Truncate probabilities to 4 decimal places for display.
            'positive_probs': float('%.4f' % pos),
            'negative_probs': float('%.4f' % (1 - pos)),
        })
    return results