# tokenizer.py (forked from PaddlePaddle/PaddleHub)
import os

import numpy as np

from paddlehub.common.logger import logger


class Tokenizer(object):
    """Base tokenizer class.
    """

    def __init__(self):
        pass

    def tokenize(self, text):
        raise NotImplementedError
class SimpleTokenizer(Tokenizer):
    """Simple FMM (Forward Maximum Matching) word tokenizer. This tokenizer is only
    meant for the topic model demo, not for real business application scenarios.

    Note: this tokenizer can only recognize the words in the corresponding vocab file.
    """

    def __init__(self, vocab_path):
        super().__init__()
        self.__max_word_len = 0
        self.__vocab = set()
        self.__load_vocab(vocab_path)
    def tokenize(self, text):
        """Tokenize the input string `text` and return the tokenization result.
        """
        text_len = len(text)
        result = []
        i = 0
        while i < text_len:
            word = found_word = ""
            # Deal with English characters: consume the whole alphabetic run in lower case.
            if self.__is_eng_char(text[i]):
                for j in range(i, text_len + 1):
                    if j < text_len and self.__is_eng_char(text[j]):
                        word += self.__tolower(text[j])
                    else:
                        # Forward matching by character granularity: keep the run
                        # only if it appears in the vocab.
                        if word in self.__vocab:
                            result.append(word)
                        i = j - 1
                        break
            else:
                # Forward maximum matching: keep the longest prefix starting at `i`
                # that appears in the vocab.
                for j in range(i, min(i + self.__max_word_len, text_len)):
                    word += text[j]
                    if word in self.__vocab:
                        found_word = word
                if len(found_word) > 0:
                    result.append(found_word)
                    i += len(found_word) - 1
            i += 1
        return result
    def contains(self, word):
        """Check whether the word is in the vocabulary.
        """
        return word in self.__vocab

    def __load_vocab(self, vocab_path):
        """Load the word dictionary.
        """
        with open(vocab_path, 'r', encoding='utf-8') as fin:
            vocab_size = 0
            for line in fin.readlines():
                # Each line holds at least two tab-separated fields; the word is the second one.
                fields = line.strip().split('\t')
                assert len(fields) >= 2
                word = fields[1]
                self.__max_word_len = max(self.__max_word_len, len(word))
                self.__vocab.add(word)
                vocab_size += 1

    def __is_eng_char(self, c):
        """Check whether char c is an English character.
        """
        return (c >= 'A' and c <= 'Z') or (c >= 'a' and c <= 'z')

    def __tolower(self, c):
        """Return the lowercase form of the character, or the original character
        if it has no lowercase form.
        """
        return c.lower()
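
# Example usage (a minimal sketch; "vocab.txt" is a hypothetical path whose lines
# follow the tab-separated layout read by __load_vocab, with the word in the
# second field):
#
#     tokenizer = SimpleTokenizer("vocab.txt")
#     print(tokenizer.tokenize("hello world"))  # only words present in the vocab are returned
#     print(tokenizer.contains("hello"))
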

class LACTokenizer(Tokenizer):
    """Tokenizer that segments text with the PaddleHub LAC (lexical analysis) module
    and keeps only the words found in the corresponding vocab file.
    """

    def __init__(self, vocab_path, lac):
        super().__init__()
        self.__max_word_len = 0
        self.__vocab = set()
        self.__lac = lac
        self.__load_vocab(vocab_path)

    def __load_vocab(self, vocab_path):
        """Load the word dictionary.
        """
        with open(vocab_path, 'r', encoding='utf-8') as fin:
            vocab_size = 0
            for line in fin.readlines():
                fields = line.strip().split('\t')
                assert len(fields) >= 2
                word = fields[1]
                self.__max_word_len = max(self.__max_word_len, len(word))
                self.__vocab.add(word)
                vocab_size += 1
    def tokenize(self, text):
        """Segment `text` with LAC and return the words that appear in the vocabulary.
        """
        results = self.__lac.lexical_analysis(texts=[text], use_gpu=False, batch_size=1, return_tag=True)
        # Lower-case each word and keep only the words that appear in the vocab.
        words = results[0]["word"]
        result = []
        for word in words:
            word = word.lower()
            if word in self.__vocab:
                result.append(word)
        return result

    def contains(self, word):
        """Check whether the word is in the vocabulary.
        """
        return word in self.__vocab
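
# Example usage (a minimal sketch; assumes the PaddleHub "lac" module is installed,
# e.g. via `hub install lac`, and that "vocab.txt" is a hypothetical vocab file in
# the tab-separated format read by __load_vocab):
#
#     import paddlehub as hub
#
#     lac = hub.Module(name="lac")
#     tokenizer = LACTokenizer("vocab.txt", lac)
#     print(tokenizer.tokenize("今天天气不错"))  # words from the LAC segmentation that are kept by the vocab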