-
Notifications
You must be signed in to change notification settings - Fork 1
/
tokenize.cpp
127 lines (107 loc) · 3 KB
/
tokenize.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#include <list>
#include <string>
#include <sstream>
using namespace std;
// Forward declaration of the file-local substring replacement helper that is
// defined further down in this file, so that tokenize() can call it.
//   s    - string that is edited in place
//   s1   - substring to search for
//   s2   - text substituted for each occurrence of s1
//   skip - optional guard character, defaulting to 0 here; when a guard is
//          given, an occurrence of s1 directly preceded by that character
//          is left as it is
static void
replace(string & s, const string & s1, const string & s2, const char skip = 0);
// Split the sentence `s1` into tokens and store them, in order, in `lt`.
//
// Works on a private copy of `s1`: punctuation, quotes, brackets and English
// contractions are surrounded with blanks by a fixed sequence of textual
// substitutions, and the result is finally split on whitespace.
// NOTE(review): the rules look like a port of the Penn Treebank tokenizer
// sed script (`` / '' quotes, "n't" splitting, "gon na", ...) - confirm.
//
//   s1 - input text, treated as a single sentence (only the final period is
//        separated from the preceding word)
//   lt - output list; any previous content is discarded
//
// The order of the substitutions matters: later rules rely on the blanks
// inserted by earlier ones.
void
tokenize(const string & s1, list<string> & lt)
{
    lt.clear();
    string s(s1);

    // Opening double quotes (at the very start, or right after a blank or an
    // opening bracket) become the `` token.
    if (!s.empty() && s[0] == '"') s.replace(0, 1, "`` ");
    replace(s, " \"", " `` ");
    replace(s, "(\"", "( `` ");
    replace(s, "[\"", "[ `` ");
    replace(s, "{\"", "{ `` ");
    replace(s, "<\"", "< `` ");

    // Ellipsis and punctuation that is always a token of its own.
    replace(s, "...", " ... ");
    replace(s, ",", " , ");
    replace(s, ";", " ; ");
    replace(s, ":", " : ");
    replace(s, "@", " @ ");
    replace(s, "#", " # ");
    replace(s, "$", " $ ");
    replace(s, "%", " % ");
    replace(s, "&", " & ");

    // Sentence-final period: walk back from the end over trailing blanks and
    // then over brackets / quotes. If the character reached is a period that
    // is not itself preceded by another period (i.e. not part of an
    // ellipsis), detach it from the word in front of it.
    int pos = static_cast<int>(s.size()) - 1;
    while (pos > 0 && s[pos] == ' ') pos--;
    while (pos > 0) {
        const char c = s[pos];
        if (c == '[' || c == ']' || c == ')' || c == '}' || c == '>' ||
            c == '"' || c == '\'') {
            pos--;
            continue;
        }
        break;
    }
    if (pos >= 0 && s[pos] == '.' && !(pos > 0 && s[pos-1] == '.')) s.replace(pos, 1, " .");

    // Remaining punctuation, brackets and the double dash.
    replace(s, "?", " ? ");
    replace(s, "!", " ! ");
    replace(s, "[", " [ ");
    replace(s, "]", " ] ");
    replace(s, "(", " ( ");
    replace(s, ")", " ) ");
    replace(s, "{", " { ");
    replace(s, "}", " } ");
    replace(s, "<", " < ");
    replace(s, ">", " > ");
    replace(s, "--", " -- ");

    // Pad both ends with a blank so that the blank-delimited patterns below
    // also match the first and the last word.
    s.replace(string::size_type(0), 0, " ");
    s.replace(s.size(), 0, " ");

    // Every double quote still present is a closing quote.
    replace(s, "\"", " '' ");

    // A single quote followed by a blank is detached, unless it directly
    // follows another single quote.
    replace(s, "' ", " ' ", '\'');

    // Clitics and contractions are split off the word they are attached to.
    replace(s, "'s ", " 's ");
    replace(s, "'S ", " 'S ");
    replace(s, "'m ", " 'm ");
    replace(s, "'M ", " 'M ");
    replace(s, "'d ", " 'd ");
    replace(s, "'D ", " 'D ");
    replace(s, "'ll ", " 'll ");
    replace(s, "'re ", " 're ");
    replace(s, "'ve ", " 've ");
    replace(s, "n't ", " n't ");
    replace(s, "'LL ", " 'LL ");
    replace(s, "'RE ", " 'RE ");
    replace(s, "'VE ", " 'VE ");
    replace(s, "N'T ", " N'T ");

    // Fused multi-word forms are split into their parts.
    replace(s, " Cannot ", " Can not ");
    replace(s, " cannot ", " can not ");
    replace(s, " D'ye ", " D' ye ");
    replace(s, " d'ye ", " d' ye ");
    replace(s, " Gimme ", " Gim me ");
    replace(s, " gimme ", " gim me ");
    replace(s, " Gonna ", " Gon na ");
    replace(s, " gonna ", " gon na ");
    replace(s, " Gotta ", " Got ta ");
    replace(s, " gotta ", " got ta ");
    replace(s, " Lemme ", " Lem me ");
    replace(s, " lemme ", " lem me ");
    replace(s, " More'n ", " More 'n ");
    replace(s, " more'n ", " more 'n ");
    replace(s, "'Tis ", " 'T is ");
    replace(s, "'tis ", " 't is ");
    replace(s, "'Twas ", " 'T was ");
    replace(s, "'twas ", " 't was ");
    replace(s, " Wanna ", " Wan na ");
    // Split the lower-case form the same way as the capitalised one above.
    replace(s, " wanna ", " wan na ");

    // Whatever is separated by whitespace now is one token.
    istringstream is(s);
    string t;
    while (is >> t) {
        lt.push_back(t);
    }
}
// Replace, from left to right, every occurrence of `s1` in `s` by `s2`.
//
//   s    - string edited in place
//   s1   - substring to look for; an empty pattern leaves `s` unchanged
//   s2   - replacement text; it is never rescanned, so it may contain `s1`
//   skip - guard character: an occurrence of `s1` that is directly preceded
//          by `skip` is left as it is. The value 0 (the default given in the
//          forward declaration of this function) means "no guard".
static void
replace(string & s, const string & s1, const string & s2, const char skip)
{
    // An empty pattern is found at every position, so the loop below would
    // never terminate (and `s` would keep growing); treat it as "no match".
    if (s1.empty()) return;

    string::size_type pos = 0;
    while ((pos = s.find(s1, pos)) != string::npos) {
        // Only compare against the guard when one was really requested;
        // otherwise a match that follows an embedded NUL byte would be
        // skipped by accident.
        if (skip != 0 && pos > 0 && s[pos - 1] == skip) {
            ++pos;  // step past this occurrence and keep searching
            continue;
        }
        s.replace(pos, s1.size(), s2);
        pos += s2.size();  // resume after the inserted text
    }
}