-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathPreprocess.java
73 lines (54 loc) · 1.75 KB
/
Preprocess.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
package bert;
import java.io.*;
import java.util.*;
/**
 * Text preprocessing helpers: loading a line-per-token file into a
 * token-to-index map and normalizing full-width characters to half-width.
 */
public class Preprocess {

    /** First full-width ASCII variant (full-width exclamation mark, U+FF01). */
    private static final char FULL_WIDTH_FIRST = '\uFF01';

    /** Last full-width ASCII variant (full-width tilde, U+FF5E). */
    private static final char FULL_WIDTH_LAST = '\uFF5E';

    /** Distance between a full-width ASCII variant and its ASCII counterpart. */
    private static final int FULL_WIDTH_OFFSET = 0xFEE0;

    /** Ideographic (full-width) space, U+3000. */
    private static final char IDEOGRAPHIC_SPACE = '\u3000';

    public Preprocess() {
    }

    /**
     * Reads a UTF-8 text file line by line and maps each line to its
     * zero-based line number. If the same line occurs more than once, the
     * index of the last occurrence wins.
     *
     * <p>Reading is best-effort: on any failure the error is reported on
     * {@code System.err} and the entries read so far (possibly none) are
     * returned.
     *
     * @param filePath path of the file to read
     * @return a mutable map from line content to line index; never {@code null}
     */
    public Map<String, Integer> load(String filePath) {
        Map<String, Integer> map = new HashMap<String, Integer>();
        // try-with-resources closes the reader even when readLine() fails.
        try (BufferedReader br = new BufferedReader(
                new InputStreamReader(new FileInputStream(new File(filePath)), "UTF-8"))) {
            int index = 0;
            String token;
            while ((token = br.readLine()) != null) {
                map.put(token, index);
                index += 1;
            }
        } catch (IOException | RuntimeException e) {
            System.err.println("read errors :" + e);
        }
        return map;
    }

    /**
     * Converts full-width characters to their half-width equivalents.
     *
     * <p>Full-width ASCII variants (U+FF01..U+FF5E) are mapped to the
     * corresponding ASCII characters (U+0021..U+007E) and the ideographic
     * space (U+3000) is mapped to an ASCII space. Every other character,
     * including surrogate halves of supplementary characters, is copied
     * unchanged.
     *
     * @param QJstr the text to normalize; may be {@code null}
     * @return the normalized text, or {@code null} if {@code QJstr} is {@code null}
     */
    public String full2HalfChange(String QJstr) {
        if (QJstr == null) {
            return null;
        }
        StringBuilder out = new StringBuilder(QJstr.length());
        for (int i = 0; i < QJstr.length(); i++) {
            char c = QJstr.charAt(i);
            if (c == IDEOGRAPHIC_SPACE) {
                out.append(' ');
            } else if (c >= FULL_WIDTH_FIRST && c <= FULL_WIDTH_LAST) {
                // Only this range has a one-to-one ASCII counterpart; other
                // code points in the U+FFxx block must not be shifted.
                out.append((char) (c - FULL_WIDTH_OFFSET));
            } else {
                out.append(c);
            }
        }
        return out.toString();
    }
}