-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexer.go
218 lines (179 loc) · 4.44 KB
/
lexer.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
package parserlexer
import (
"fmt"
"strings"
"unicode"
"unicode/utf8"
)
// Parse tokenizes text into alternating text and whitespace segments,
// applies the package's reverse helper to each text segment, and
// returns the rebuilt string. If lexing fails, the offending segment
// is reported in the returned error.
func Parse(text string) (string, error) {
	p := parser{lex: lex(text)}
	p.parse()
	if p.errItem == nil {
		return p.result, nil
	}
	return "", fmt.Errorf("error processing the following %q", p.errItem.value)
}
// parser drains the lexer's token stream and accumulates the final
// output string.
type parser struct {
	// result holds the rebuilt output once an itemEOF is seen.
	result string
	// lex produces the token stream consumed by parse.
	lex *lexer
	// errItem records the token that carried a lexing error, if any;
	// nil means parsing succeeded.
	errItem *item
}
// parse consumes tokens from the lexer until EOF or an error token
// arrives. Text tokens are reversed before being appended; whitespace
// tokens are appended verbatim.
func (p *parser) parse() {
	var out strings.Builder
	for tok := range p.lex.items {
		switch tok.kind {
		case itemText:
			out.WriteString(reverse(tok.value))
		case itemWhiteSpace:
			out.WriteString(tok.value)
		case itemError:
			p.errItem = &tok
			return
		case itemEOF:
			p.result = out.String()
			return
		}
	}
}
// itemKind classifies the data accumulated in an item.
type itemKind int

const (
	// itemWhiteSpace is a run of space characters.
	itemWhiteSpace itemKind = iota
	// itemEOF marks the end of the input.
	itemEOF
	// itemText is a run of non-space characters.
	itemText
	// itemError signals a lexing failure; the item's value holds the message.
	itemError
)
// item is accumulated while lexing the provided input, and emitted over a
// channel to the parser. Items could also be called tokens as we tokenize the
// input.
type item struct {
	// position is the byte offset in the input where this item's data began.
	position int
	// kind signals how we've classified the data we have accumulated while
	// scanning the string.
	kind itemKind
	// value is the segment of data we've accumulated.
	value string
}
// eof is the sentinel rune returned by next when the input is exhausted;
// it is negative so it can never collide with a real rune.
const eof = -1

// stateFn is a function that is specific to a state within the string.
// It does that state's work and returns the next state function, or nil
// to end the scan.
type stateFn func(*lexer) stateFn
// lex creates a lexer and starts scanning the provided input.
func lex(input string) *lexer {
l := &lexer{
input: input,
state: lexText,
items: make(chan item, 1),
}
go l.scan()
return l
}
// lexer is created to manage an individual scanning/parsing operation.
type lexer struct {
	input    string    // we'll store the string being parsed
	start    int       // the byte position we started scanning the current segment
	position int       // the current byte position of our scan
	width    int       // byte width of the rune most recently read; runes can be multi-byte
	state    stateFn   // the current state function
	items    chan item // the channel we'll use to communicate between the lexer and the parser
}
// emit packages the input accumulated since start into an item of kind k
// and sends it over the channel so the parser can collect and manage
// each segment.
func (l *lexer) emit(k itemKind) {
	l.items <- item{
		position: l.start,
		kind:     k,
		value:    l.input[l.start:l.position],
	}
	// Reset our scanner now that we've dispatched a segment.
	l.ignore()
}
// nextItem pulls an item from the lexer's result channel, blocking until
// the scanning goroutine produces one.
// NOTE(review): not called anywhere in this file (parse ranges over
// l.items directly) — presumably kept for external callers; confirm.
func (l *lexer) nextItem() item {
	return <-l.items
}
// ignore resets the start position to the current scan position,
// effectively discarding any input accumulated since the last emit.
func (l *lexer) ignore() {
	l.start = l.position
}
// next consumes and returns the next rune of the input, advancing the
// scan position by the rune's byte width. It returns eof (and records a
// zero width) once the input is exhausted.
func (l *lexer) next() rune {
	if len(l.input) <= l.position {
		l.width = 0
		return eof
	}
	r, w := utf8.DecodeRuneInString(l.input[l.position:])
	l.width = w
	l.position += w
	return r
}
// backup allows us to step back one rune which is helpful when you've
// crossed a boundary from one state to another.
//
// We must rewind by the byte width of the rune most recently read by
// next — not by a fixed single byte. Runes outside ASCII occupy two to
// four bytes (e.g. the non-breaking space U+00A0), and rewinding by one
// byte would leave the scanner in the middle of a multi-byte sequence,
// corrupting every token emitted afterwards. backup may only be called
// once per call to next.
func (l *lexer) backup() {
	l.position -= l.width
}
// scan will step through the provided text and execute state functions
// as state changes are observed in the provided input. One state
// function returns another until nil is returned to signal the end of
// our process.
//
// Scanning begins from the lexer's configured initial state (set by
// lex) instead of hard-coding lexText, so the state field is the single
// source of truth for where scanning starts. The items channel is
// closed afterwards so the parser's range loop terminates.
func (l *lexer) scan() {
	for fn := l.state; fn != nil; {
		fn = fn(l)
	}
	close(l.items)
}
// errorf reports a lexing failure by emitting an itemError carrying the
// formatted message, then returns nil so the scan loop halts.
func (l *lexer) errorf(format string, args ...interface{}) stateFn {
	l.items <- item{
		kind:  itemError,
		value: fmt.Sprintf(format, args...),
	}
	return nil
}
// lexEOF emits the accumulated data classified by the provided itemKind and
// signals that we've reached the end of our lexing by returning `nil` instead
// of a state function.
func (l *lexer) lexEOF(k itemKind) stateFn {
	// l.backup()
	// NOTE(review): l.start only ever trails or equals l.position (emit/ignore
	// set start = position; next only moves position forward), so this `>`
	// guard looks unreachable — possibly a leftover paired with the
	// commented-out backup above. Confirm the intent before removing.
	if l.start > l.position {
		l.ignore()
	}
	// Flush whatever has accumulated (possibly an empty value, e.g. for empty
	// input), then emit the EOF marker the parser uses to finish.
	l.emit(k)
	l.emit(itemEOF)
	return nil
}
// lexText scans a run of non-space characters, emitting it as an
// itemText segment once whitespace or the end of input is reached.
func lexText(l *lexer) stateFn {
	for {
		switch r := l.next(); {
		case r == eof:
			return l.lexEOF(itemText)
		case unicode.IsSpace(r):
			// The space belongs to the next segment; put it back.
			l.backup()
			// Emit any text we've accumulated.
			if l.start < l.position {
				l.emit(itemText)
			}
			return lexWhitespace
		}
	}
}
// lexWhitespace scans a run of space characters, emitting it as an
// itemWhiteSpace segment once a non-space rune or the end of input is
// reached.
func lexWhitespace(l *lexer) stateFn {
	for {
		switch r := l.next(); {
		case r == eof:
			return l.lexEOF(itemWhiteSpace)
		case !unicode.IsSpace(r):
			// The rune belongs to the next text segment; put it back.
			l.backup()
			if l.start < l.position {
				l.emit(itemWhiteSpace)
			}
			return lexText
		}
	}
}