forked from zhongbin1/bert_tokenization_for_java
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFullTokenizer.java
41 lines (30 loc) · 1.04 KB
/
FullTokenizer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
package bert;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
/**
 * Tokenizer that chains a {@link BasicTokenizer} and a {@link WordpieceTokenizer}:
 * text is first split by the basic tokenizer, then every resulting token is
 * split again by the wordpiece tokenizer built over the supplied vocabulary.
 * Also converts tokens to their vocabulary ids.
 */
public class FullTokenizer {

    /**
     * Vocabulary entry used as a fallback id for tokens missing from the vocabulary.
     * NOTE(review): assumes the vocabulary follows the usual BERT convention of
     * containing an "[UNK]" entry, and that this matches the unknown token used by
     * WordpieceTokenizer -- confirm against that class.
     */
    private static final String UNKNOWN_TOKEN = "[UNK]";

    private final Map<String, Integer> vocab;
    private final BasicTokenizer basicTokenizer;
    private final WordpieceTokenizer wordpieceTokenizer;

    /**
     * Creates a tokenizer over the given vocabulary.
     *
     * @param vocab mapping from token text to token id; must not be {@code null}.
     *              The map is kept by reference, not copied.
     * @throws NullPointerException if {@code vocab} is {@code null}
     */
    public FullTokenizer(Map<String, Integer> vocab) {
        // Fail at construction time instead of at the first lookup.
        if (vocab == null) {
            throw new NullPointerException("vocab");
        }
        this.vocab = vocab;
        this.basicTokenizer = new BasicTokenizer();
        this.wordpieceTokenizer = new WordpieceTokenizer(vocab);
    }

    /**
     * Splits text into sub-tokens by running the basic tokenizer and then the
     * wordpiece tokenizer on each token it produces.
     *
     * @param text the text to tokenize
     * @return a new mutable list holding the sub-tokens in the order produced
     */
    public List<String> tokenize(String text) {
        List<String> splitTokens = new ArrayList<String>();
        for (String token : basicTokenizer.tokenize(text)) {
            for (String subToken : wordpieceTokenizer.tokenize(token)) {
                splitTokens.add(subToken);
            }
        }
        return splitTokens;
    }

    /**
     * Maps each token to its id in the vocabulary, preserving order.
     *
     * <p>A token that has no id in the vocabulary is mapped to the id of the
     * {@code [UNK]} entry, so the returned list never contains {@code null}
     * (a {@code null} element would otherwise fail later, on unboxing, far from
     * the actual cause).
     *
     * @param tokens the tokens to convert
     * @return a new list with one id per input token
     * @throws IllegalArgumentException if a token is missing from the vocabulary
     *                                  and the vocabulary has no {@code [UNK]} entry
     */
    public List<Integer> convertTokensToIds(List<String> tokens) {
        List<Integer> outputIds = new ArrayList<Integer>(tokens.size());
        // Looked up once; stays null when the vocabulary has no [UNK] entry.
        Integer unknownId = this.vocab.get(UNKNOWN_TOKEN);
        for (String token : tokens) {
            Integer id = this.vocab.get(token);
            if (id == null) {
                id = unknownId;
            }
            if (id == null) {
                throw new IllegalArgumentException(
                        "Token is not in the vocabulary and there is no " + UNKNOWN_TOKEN
                                + " entry to fall back on: " + token);
            }
            outputIds.add(id);
        }
        return outputIds;
    }
}