-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathparserCombinator.js
189 lines (182 loc) · 6.21 KB
/
parserCombinator.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
let errManager = {
restTokens: -1,
errorInfo: ''
}
// (let ((y 2)) ((lambda (x) (+ x y)) 1))
// 首先定义 Parser, 它是一个接受 tokens 返回 Result 的函数
// Parser := Tokens => Result | null
// tokens 就定义为数组好了
// Tokens := Array
// Result 是一个数组,第一个元素是解析结果数组,第二元素是剩下的tokens, 如果为 null 则为解析失败
// 然后我们来定义第一个 parser
// 这个 parser 对任何一个 token 都解析成功,并吃掉它,相当于 G -> ε
const Successed = tokens => [{type: 'any', value: tokens[0].token}, tokens.slice(1)]
// 一个高阶函数,用于创建标识符解析器, 比如说 G -> s 解析 s 终结符
const ID = (id, regex = false) => tokens => {
let r = null
if (tokens[0] && regex)
r = RegExp(id).test(tokens[0].token) ? [{type: 'identifier', value: tokens[0].token}, tokens.slice(1)] : null
else if (tokens[0])
r = id === tokens[0].token ? [{type: 'identifier', value: tokens[0].token}, tokens.slice(1)] : null
// console.log('match token: \t'+ id + '\t --- \t'+tokens.join(' '))
if (r) return r
// 如果出现语法错误,以剩下 tokens 最少的为最优 parser 结果,并报错
if (tokens.length <= errManager.restTokens.length) {
const index = errManager.restTokens[0].strIndex
const input = errManager.restTokens[0].input
const token = errManager.restTokens[0].token
errManager.restTokens = tokens
errManager.errorInfo = `syntax error on '${tokens[0] === undefined ? 'empty string' : tokens[0].token}'.\n'${input.slice(0, index) + ' ' + token}' expected an '${id}'.`
}
return r
}
// 只要有一个解析器解析成功就是解析成功, 相当文法中的 | 符号,比如 G -> A | B | C
const OR = (...parsers) => tokens => {
for (const p of parsers) {
const result = p(tokens)
if (result) return result
}
return null
}
// 只有全部解析器都解析成功才成功, 相当于文法的连接
// 比如对于文法 G -> A B C
// 只有 A B C 都解析成功, G 才解析成功
const SEQ = (...parsers) => tokens => {
let result = []
let rest = tokens
for (const p of parsers) {
const r = p(rest)
if (r) {
result = result.concat(r[0])
rest = r[1]
} else return null
}
return [result, rest]
}
// 对 tokens 使用一个 parser 解析任意次,直到解析失败,将结果返回,max 设置为 -1 相当于正则里的 *
const REP = (parser, max = -1) => tokens => {
let count = 0;
let result = []
let rest = tokens
while(count >= max) {
const r = parser(rest)
if (r) {
result = result.concat(r[0])
rest = r[1]
count++
} else break
}
return [result, rest]
}
const NUM = tokens => {
const r = ID(/\d/g, true)(tokens)
if (r) {
r[0].type = 'number'
r[0].value = Number(r[0].value)
}
return r
}
const BOOL = tokens => {
const t = ID('#t')(tokens)
const f = ID('#f')(tokens)
return t ? [{type: 'bool', value: true}, t[1]] : f ? [{type: 'bool', value: false}, f[1]] : null
}
const STR = tokens => {
const r = ID(/^"(.*)"$/g, true)(tokens)
if (r) return r[0].type = 'string'
return null
}
// lisp 语法定义, 参考 R5RS
// <EXP> -> <VAR> | <LITERAL> | PRODCALL | LAMBDA | LET
const EXP = tokens => {
if (tokens.length == 0) return null
return OR(VAR, LITERAL, PRODCALL, LAMBDA, LET)(tokens)
}
// <SELFEVAL> -> <BOOL> | <STR> | <NUM>
const SELFEVAL = tokens => {
const r = OR(BOOL, STR, NUM)(tokens)
if (r) r[0].type = 'selfeval'
return r
}
// <syntactic keyword> -> let | def | lambda | if ...
const SYNTAXKEYWORD = tokens => {
const r = OR(ID('('), ID(')'),ID('let'), ID('def'), ID('lambda'), ID('if'))(tokens)
if (r) r[0].type = 'syntaxkeyword'
return r
}
// <VAR> -> <any <identifier> that is not also a <syntactic keyword>
const VAR = tokens => {
const OPERATOR = OR(ID('+'), ID('-'), ID('*'), ID('/'))
const NOTSYNTAXKEYWORD = tokens => {
if (!SYNTAXKEYWORD(tokens)&&!SELFEVAL(tokens)&&!!tokens[0].token) {
return Successed(tokens)
}
return null
}
const r = OR(OPERATOR, NOTSYNTAXKEYWORD)(tokens)
if (r) r[0].type = 'var'
return r
}
// <LITERAL> -> <SELFEVAL>
const LITERAL = SELFEVAL
// <OPERATOR> -> EXP
const OPERATOR = tokens => EXP(tokens)
// <PRODCALL> -> (<OPERATOR> <EXP>*)
const PRODCALL = tokens => {
const r = SEQ(ID('('), OPERATOR, REP(EXP), ID(')'))(tokens)
// 去掉 ()
return r ? [{type: 'procedurecall', value: r[0].slice(1, -1)}, r[1]] : null
}
// <BINDSPEC> -> (<VAR> <EXP>)
const BINDSPEC = tokens => {
const r = SEQ(ID('('), VAR, EXP, ID(')'))(tokens)
if (r) return [{type: 'bindspec', value: [r[0][1], r[0][2]]}, r[1]]
return r
}
// <DEFINE> -> (define <VAR> <EXP>)
const DEFINE = tokens => {
const r = SEQ(ID('('), ID('define'), VAR, EXP, ID(')'))(tokens)
if (r) return [{type: 'define', value: [r[0][2].value, r[0][3]]},r[1]]
return r
}
// <BODY> -> <DEFINE>* EXP <EXP>*
const BODY = tokens => {
const r = SEQ(REP(DEFINE), EXP, REP(EXP))(tokens)
if (r) return [{type: 'body', value: r[0]}, r[1]]
return r
}
// <LET> -> (let (<BINDSPEC>*) <BODY>) | (let <VAR> (<BINDSPEC>*) <BODY>)
const LET = tokens => {
const r = OR(SEQ(ID('('), ID('let'), ID('('), REP(BINDSPEC), ID(')'), BODY, ID(')')), SEQ(ID('('), ID('let'), VAR, ID('('), REP(BINDSPEC), ID(')'), BODY, ID(')')))(tokens)
if (r) return [{type: 'let', value: {
var: r[0][2].type == 'var' ? r[0][2] : null,
bindspecs: r[0].filter(i => i.type == 'bindspec'),
body: r[0].find(i => i.type == 'body')
}}, r[1]]
return r
}
// <FORMALS> -> (<VAR>*)
const FORMALS = tokens => {
const r = SEQ(ID('('),REP(VAR), ID(')'))(tokens)
if (r) return [{type: 'formals', value: r[0].slice(1, -1)}, r[1]]
return r
}
// <LAMBDA> -> (lambda <FORMALS> <BODY>)
const LAMBDA = tokens => {
const r = SEQ(ID('('), ID('lambda'), FORMALS, BODY, ID(')'))(tokens)
if (r) return [{type: 'lambda', formals: r[0][2], body:r[0][3]}, r[1]]
return r
}
// <COMMAND> ->
const COMMAND = tokens => EXP(tokens)
const COMMANDORDEF = tokens => OR(COMMAND, DEFINE)(tokens)
// <PROGRAM> -> <command or definition>*
const PROGRAM = tokens => REP(COMMANDORDEF)(tokens)
function program(tokens, err) {
errManager = { ...err }
return {
ast: EXP(tokens),
err: errManager
}
}
module.exports = program