forked from PaddlePaddle/PaddleHub
-
Notifications
You must be signed in to change notification settings - Fork 0
/
processor.py
60 lines (51 loc) · 1.91 KB
/
processor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
# -*- coding: utf-8 -*-
import io
def load_vocab(file_path):
    """
    Load a vocabulary file into a word -> id dictionary.

    Every non-blank line of the file is expected to hold a word and its
    integer id separated by a tab character. Blank lines are skipped. Once
    all lines are read, the "<unk>" entry is set to the size of the
    dictionary at that moment (replacing an "<unk>" id read from the file,
    if there was one).

    Args:
        file_path (str): Path of the UTF-8 encoded vocabulary file.

    Returns:
        dict: Mapping from word to integer id; it always contains "<unk>".

    Raises:
        IndexError: If a non-blank line has no tab-separated id column.
        ValueError: If the id column cannot be parsed as an integer.
    """
    vocab = {}
    with io.open(file_path, 'r', encoding='utf8') as f:
        for line in f:
            line = line.rstrip()
            # A blank line (typically a trailing newline at the end of the
            # file) carries no entry; skip it instead of failing on parts[1].
            if not line:
                continue
            parts = line.split('\t')
            vocab[parts[0]] = int(parts[1])
    vocab["<unk>"] = len(vocab)
    return vocab
# Dictionary keys under which the first and second text of each pair are
# stored, both in the input data and in the results built by this module.
text_a_key = 'text_1'
text_b_key = 'text_2'
def _words_to_ids(lac_result, word_dict, unk_id):
    """Map each word of one lexical-analysis result to its id, using unk_id for unknown words."""
    return [word_dict.get(word, unk_id) for word in lac_result['word']]


def preprocess(lac, word_dict, data_dict, use_gpu=False, batch_size=1):
    """
    Segment both texts of every pair and convert the words to word ids.

    No padding is applied here; each id list is as long as the number of
    words produced for that text.

    Args:
        lac: Object exposing ``lexical_analysis(data=..., use_gpu=...,
            batch_size=...)``; each element of its return value is read
            through its ``'word'`` entry.
        word_dict (dict): Mapping from word to id; must contain ``'<unk>'``,
            whose id is used for words missing from the mapping.
        data_dict (dict): Holds the list of first texts under ``text_a_key``
            and the list of second texts under ``text_b_key``.
        use_gpu (bool): Forwarded to both ``lexical_analysis`` calls.
        batch_size (int): Forwarded to both ``lexical_analysis`` calls.

    Returns:
        dict: ``{text_a_key: [...], text_b_key: [...]}`` where each list item
        is ``{'processed': <list of word ids>, 'origin': <original text>}``.
    """
    texts_a = data_dict[text_a_key]
    texts_b = data_dict[text_b_key]
    # Both sides go through the analyzer with the same settings, so
    # batch_size is passed to the second call as well as the first.
    processed_a = lac.lexical_analysis(data={'text': texts_a}, use_gpu=use_gpu, batch_size=batch_size)
    processed_b = lac.lexical_analysis(data={'text': texts_b}, use_gpu=use_gpu, batch_size=batch_size)

    unk_id = word_dict['<unk>']
    result = {text_a_key: [], text_b_key: []}
    for index, (analysed_a, analysed_b) in enumerate(zip(processed_a, processed_b)):
        result[text_a_key].append({
            'processed': _words_to_ids(analysed_a, word_dict, unk_id),
            'origin': texts_a[index],
        })
        result[text_b_key].append({
            'processed': _words_to_ids(analysed_b, word_dict, unk_id),
            'origin': texts_b[index],
        })
    return result
def postprocess(pred, data_info):
    """
    Pair every prediction with the original texts it belongs to.

    Args:
        pred: Indexable model output; ``pred[i][0]`` is read as the score of
            the i-th text pair.
        data_info (dict): Holds, under ``text_a_key`` and ``text_b_key``,
            lists of dicts that each carry the original text in ``'origin'``.

    Returns:
        list: One dict per text pair with the two original texts (under
        ``text_a_key`` and ``text_b_key``) and ``'similarity'``, the score
        rounded to four decimal places.
    """
    result = []
    for index, item_a in enumerate(data_info[text_a_key]):
        item_b = data_info[text_b_key][index]
        result.append({
            text_a_key: item_a['origin'],
            text_b_key: item_b['origin'],
            # Formatting with four decimals and parsing back rounds the score
            # and yields a built-in float.
            'similarity': float('%.4f' % pred[index][0]),
        })
    return result