-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathTokenizer.php
116 lines (86 loc) · 2.61 KB
/
Tokenizer.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
<?php
require_once('Token.php');
require_once('Tokens.php');
/**
 * Splits raw text into a flat list of string tokens (words plus separate
 * punctuation tokens) and wraps the result in Token / Tokens objects.
 *
 * All helpers work on plain, zero-based lists of strings; only wrapTokens()
 * touches the Token and Tokens classes, which are defined outside this class.
 */
class Tokenizer
{
    /**
     * Merges a token that starts with the literal text `$[$` with the token
     * directly after it, provided that following token ends with the literal
     * text `$]$`. The two are glued together with a single space.
     *
     * Only immediately adjacent pairs are merged; a bracket expression that
     * spans three or more tokens is left untouched.
     *
     * @param array $tokens Zero-based list of string tokens.
     * @return array New list with matching pairs merged into one token.
     */
    private static function joinSquareBrackets($tokens)
    {
        $joined = array();
        $total = count($tokens);
        $pos = 0;

        while ($pos < $total) {
            $current = $tokens[$pos];
            // From here on $pos points at the token following $current.
            $pos++;

            if (isset($tokens[$pos]) && preg_match('/^\$\[\$/', $current) && preg_match('/\$\]\$$/', $tokens[$pos])) {
                $joined[] = $current . ' ' . $tokens[$pos];
                // The follower has been consumed as part of the merged token.
                $pos++;
                continue;
            }

            $joined[] = $current;
        }

        return $joined;
    }

    /**
     * Breaks every token apart at each match of $regex and emits $string as
     * a token of its own in place of every match.
     *
     * Empty fragments (produced when a match sits at the start or end of a
     * token, or when two matches are adjacent) are dropped. Tokens that do
     * not match $regex at all are copied through unchanged, including empty
     * ones.
     *
     * @param array  $tokens List of string tokens.
     * @param string $string Token emitted for each match of $regex.
     * @param string $regex  PCRE pattern (with delimiters) to split at.
     * @return array New list of tokens.
     */
    private static function splitAtCharacter($tokens, $string, $regex)
    {
        $result = array();

        foreach ($tokens as $piece) {
            if (!preg_match($regex, $piece)) {
                $result[] = $piece;
                continue;
            }

            $fragments = preg_split($regex, $piece);
            $last_index = count($fragments) - 1;

            foreach ($fragments as $index => $fragment) {
                if ($fragment !== '') {
                    $result[] = $fragment;
                }
                // One separator sits between each pair of neighbouring fragments.
                if ($index < $last_index) {
                    $result[] = $string;
                }
            }
        }

        return $result;
    }

    /**
     * Turns every comma into a token of its own.
     *
     * @param array $tokens List of string tokens.
     * @return array New list of tokens.
     */
    private static function splitAtComma($tokens)
    {
        $separator = ',';

        return self::splitAtCharacter($tokens, $separator, '/,/');
    }

    /**
     * Turns every full stop into a token of its own.
     *
     * @param array $tokens List of string tokens.
     * @return array New list of tokens.
     */
    private static function splitAtDots($tokens)
    {
        $separator = '.';

        return self::splitAtCharacter($tokens, $separator, '/\./');
    }

    /**
     * Turns each of the quotation marks „ “ » « into a token of its own.
     * The marks are processed one after another in exactly that order.
     *
     * Note: tokenize() in this class does not call this method.
     *
     * @param array $tokens List of string tokens.
     * @return array New list of tokens.
     */
    private static function splitAtQuotes($tokens)
    {
        // Quotation mark => pattern that matches it.
        $quote_patterns = array(
            '„' => '/„/',
            '“' => '/“/',
            '»' => '/»/',
            '«' => '/«/',
        );

        foreach ($quote_patterns as $quote => $pattern) {
            $tokens = self::splitAtCharacter($tokens, $quote, $pattern);
        }

        return $tokens;
    }

    /**
     * Turns every semicolon into a token of its own.
     *
     * @param array $tokens List of string tokens.
     * @return array New list of tokens.
     */
    private static function splitAtSemicolons($tokens)
    {
        $separator = ';';

        return self::splitAtCharacter($tokens, $separator, '/;/');
    }

    /**
     * Splits text into whitespace-separated tokens.
     *
     * The text is first reduced to its non-empty words, re-joined with a
     * single space and then split again at every whitespace character. Any
     * empty piece left by that second split is replaced with a one-space
     * token; for empty or whitespace-only input this yields a single ' '
     * token.
     *
     * @param string $text Raw input text.
     * @return array Zero-based list of string tokens.
     */
    private static function splitAtWhitespaces($text)
    {
        // Collapse leading, trailing and repeated whitespace.
        $words = preg_split('/\s/', $text, -1, PREG_SPLIT_NO_EMPTY);
        $normalized = implode(' ', $words);

        // Second pass keeps empty pieces so they can be made visible below.
        $pieces = preg_split('/\s/', $normalized);

        foreach ($pieces as $index => $piece) {
            if ($piece === '') {
                $pieces[$index] = ' ';
            }
        }

        return $pieces;
    }

    /**
     * Tokenizes text: whitespace split first, then commas, full stops and
     * semicolons become separate tokens, then `$[$ … $]$` pairs are merged,
     * and finally the strings are wrapped via wrapTokens().
     *
     * @param string $text Raw input text.
     * @return Tokens Collection object built by wrapTokens().
     */
    public static function tokenize($text)
    {
        $pieces = self::splitAtSemicolons(
            self::splitAtDots(
                self::splitAtComma(
                    self::splitAtWhitespaces($text)
                )
            )
        );

        return self::wrapTokens(self::joinSquareBrackets($pieces));
    }

    /**
     * Wraps each entry of $tokens in a Token object (keys are kept) and
     * passes the resulting array to a new Tokens object.
     *
     * @param array $tokens Values handed one by one to the Token constructor.
     * @return Tokens
     */
    public static function wrapTokens($tokens)
    {
        $wrapped = array();

        foreach ($tokens as $key => $raw) {
            $wrapped[$key] = new Token($raw);
        }

        return new Tokens($wrapped);
    }
}