-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathsimp.scm
398 lines (333 loc) · 14.3 KB
/
simp.scm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
;;; Olin Shivers, June 1998
;;; Copyright (c) 1998 by the Scheme Underground.
;;; One export: (simplify-regexp re) -> re
;;; Regexp simplifier
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; (| (in c1 ...) (in c2 ...) re ...) => (| (in c1 ... c2 ...) re ...)
;;; (| (not-in c1 ...) (not-in c2 ...)) => (| (not-in [intersect (c1 ...)
;;; (c2 ...)])
;;; A run of BOS's or a run of EOS's in a sequence may be elided.
;;; Nested exponents can be collapsed (*, +, ?) -- multiply the "from's"
;;; together; multiply the "to's" together.
;;; Exponent range [1,1] simplifies, as does [0,0].
;;; Uniquify branches
;;; Adjacent literals in a sequence can be collapsed
;;; A singleton-char char class can be collapsed into a constant
;;; Nested choices can be collapsed
;;; Nested sequences can be collapsed
;;; An empty sequence (:) can be turned into an empty-string match "".
;;; Singleton choices and sequences can be reduced to their body.
;;;
;;; The simplifier is carefully written so that it won't blow up
;;; when applied to a dynamic regexp -- that is,
;;; - a chunk of Scheme code that produces a regexp instead of
;;; an actual regexp value;
;;; - a repeat regexp whose FROM or TO fields are chunks of Scheme code
;;; rather than integers;
;;; - a char-set regexp whose CSET field is a chunk of Scheme code rather
;;; than an actual char-set value.
;;; This is useful because the RX macro can build such a regexp as part
;;; of its expansion process.
(define (simplify-regexp re)
(if (and (regexp? re)
(number? (re-tsm re)))
(receive (simp-re pre-dsm) (simp-re re)
(re-dsm simp-re pre-dsm (- (re-tsm re) (+ (re-tsm simp-re) pre-dsm))))
re))
(define (simp-re re)
(cond
((re-string? re) (values re 0))
((re-seq? re) (simp-seq re))
((re-choice? re) (simp-choice re))
;; Singleton char-sets reduce to the character.
;; Bear in mind the cset field might be Scheme code instead
;; of an actual char set if the regexp is dynamic.
((re-char-set? re)
(values (let ((cs (re-char-set:cset re)))
(if (and (char-set? cs)
(= 1 (char-set-size cs)))
(make-re-string (string (car (char-set->list cs))))
re))
0))
((re-repeat? re) (simp-repeat re))
((re-submatch? re) (simp-submatch re))
((re-dsm? re) (simp-dsm re))
(else (values re 0))))
;;; If the body of a submatch is the empty re, reduce it to the empty re.
(define (simp-submatch re)
(let ((tsm (re-submatch:tsm re))
(pre-dsm (re-submatch:pre-dsm re)))
(receive (body1 pre-dsm1) (simp-re (re-submatch:body re))
(if (re-empty? body1)
(values re-empty tsm)
(values (make-re-submatch/tsm body1 (+ pre-dsm pre-dsm1) tsm)
0)))))
;;; - Flatten nested DSM's.
;;; - Return pre-dsm field and body field as the two return values.
(define (simp-dsm re)
(receive (body pre-dsm1) (simp-re (re-dsm:body re))
(values body (+ (re-dsm:pre-dsm re) pre-dsm1))))
;;; Simplifying sequences
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; - Collapse nested sequences and DSM's.
;;; - Merge adjacent strings, identical adjacent anchors (bos, eos, etc.).
;;; - Bubble DSM's forwards past elts that don't contain live submatches.
;;; (Going past live submatches would switch the submatch indexes around,
;;; which would be an error). This helps to coalesce DSMs and if we bring
;;; them all the way to the front, we can pop them off and make them a
;;; pre-dsm for the entire seq record.
;;; - If an elt is the re-empty, reduce the whole re to the empty re.
;;; - Reduce singleton and empty seq.
(define (simp-seq re)
(let ((tsm (re-seq:tsm re))
(elts (map simplify-regexp (re-seq:elts re))))
(cond
((null? elts)
(values re-trivial 0)) ; Empty seq
((number? tsm)
(call-with-current-continuation
(lambda (abort)
(receive (pre-dsm head tail) (simp-seq1 elts abort tsm)
(values (if (pair? tail)
(make-re-seq/tsm (cons head tail) (- tsm pre-dsm))
head) ; Singleton seq
pre-dsm)))))
(else (values re tsm))))) ; dynamic components
;;; Simplify the non-empty sequence ELTS.
;;; - Return the result split out into three values:
;;; [head-elt-pre-dsm, head-elt, tail].
;;; - If any elt is the empty (impossible) re, abort by calling
;;; (abort elt tsm). TSM is otherwise unused.
;;; - If any elt is dynamic, abort as well.
(define (simp-seq1 elts abort tsm)
(let recur ((elt (car elts)) (elts (cdr elts)))
(receive (elt pre-dsm) (open-dsm elt)
(cond
((re-seq? elt) ; Flatten nested seqs.
(let ((sub-elts (re-seq:elts elt)))
(recur (re-dsm (car sub-elts) pre-dsm 0)
(append (cdr sub-elts) elts))))
((re-empty? elt) (abort elt tsm)) ; Bomb out on the empty
; (impossible) re.
((pair? elts)
(receive (next-pre-dsm next tail) ; Simplify the tail,
(recur (car elts) (cdr elts)) ; then think about
; the head:
;; This guy is called when we couldn't find any other
;; simplification. If ELT contains live submatches, then
;; there really is nothing to be done at this step -- just
;; assemble the pieces together and return them. If ELT
;; *doesn't* contain any live submatches, do the same, but
;; bubble its following next-pre-dsm submatches forwards.
(define (no-simp)
(if (has-live-submatches? elt)
(values pre-dsm elt (cons (re-dsm next next-pre-dsm 0) tail))
(values (+ pre-dsm next-pre-dsm) elt (cons next tail))))
;; Coalesces two adjacent bol's, two adjacent eol's, etc.
(define (coalesce-anchor anchor?)
(if (and (anchor? elt) (anchor? next))
(values (+ pre-dsm next-pre-dsm) elt tail)
(no-simp)))
(cond
((re-trivial? elt) ; Drop trivial re's.
(values (+ pre-dsm next-pre-dsm) next tail))
;; Coalesce adjacent strings
((re-string? elt)
(if (re-string? next)
(values (+ pre-dsm next-pre-dsm)
(make-re-string (string-append (re-string:chars elt)
(re-string:chars next)))
tail)
(no-simp)))
;; Coalesce adjacent bol/eol/bos/eos's.
((re-bol? elt) (coalesce-anchor re-bol?))
((re-eol? elt) (coalesce-anchor re-eol?))
((re-bos? elt) (coalesce-anchor re-bos?))
((re-eos? elt) (coalesce-anchor re-eos?))
(else (no-simp)))))
(else (values pre-dsm elt '()))))))
;;; Simplifying choices
;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
;;; - Collapse nested choices and DSM's.
;;; - Delete re-empty's.
;;; - Merge sets; merge identical anchors (bos, eos, etc.).
;;; But you can't merge across an element that contains a live submatch,
;;; see below.
;;; - A singleton string "c" is included into the char-set merge as a
;;; singleton set.
;;; - Bubble DSM's forwards past elts that don't contain live submatches.
;;; (Going past live submatches would switch the submatch indexes around,
;;; which would be an error). This helps to coalesce DSMs and if we bring
;;; them all the way to the front, we can pop them off and make them a
;;; pre-dsm for the entire seq record.
;;; - Reduce singleton and empty choice.
;;;
;;; You have to be careful simplifying choices -- you can't merge two sets
;;; that appear on different sides of an element containing a live submatch.
;;; The problem is that the assignment of submatches breaks ties left-to-right.
;;; So these aren't the same:
;;; (| (submatch "x") any) (| any (submatch "x"))
;;; The first assigns the submatch, the second doesn't -- the ANY gets credit.
;;; We want to collapse multiple char-sets, bos's, and eos's, but we have
;;; to deal with this issue. So
;;; - When we coalesce anchors, we retain the *leftmost* one.
;;; - We coalesce sets that appear between live-submatch boundaries.
;;; When we do this, we subtract from the set any char that was in
;;; an earlier coalesced char-set. If this gets us down to the empty set,
;;; we drop it. If it gets us down to a singleton set, we convert it into
;;; a singleton string.
;;; Whew. I had to think about this one.
(define (simp-choice re)
(let ((tsm (re-choice:tsm re)))
(receive (pre-dsm cset bos? eos? bol? eol? tail)
(simp-choice1 (map simplify-regexp (re-choice:elts re)))
(let ((tail (assemble-boundary-tail char-set:empty cset
bos? eos? bol? eol?
#f #f #f #f
tail)))
(values (if (pair? tail)
(if (pair? (cdr tail))
(make-re-choice/tsm tail (- tsm pre-dsm))
(car tail)) ; Singleton choice
re-empty) ; Empty choice
pre-dsm)))))
;;; Given the return values from simp-choice1, this tacks all
;;; the various pieces (CSET, BOS?, EOS?, etc.) onto the front of
;;; TAIL. However, elements are not added onto TAIL that are already
;;; described by PREV-CSET, PREV-BOS?, etc. -- they will be added onto
;;; some earlier bit of the final result.
(define (assemble-boundary-tail prev-cset cset
bos? eos? bol? eol?
prev-bos? prev-eos?
prev-bol? prev-eol?
tail)
(let* ((cset (char-set-difference cset prev-cset))
(numchars (char-set-size cset))
(tail (if (and eos? (not prev-eos?)) (cons re-eos tail) tail))
(tail (if (and eol? (not prev-eol?)) (cons re-eol tail) tail))
(tail (if (and bol? (not prev-bol?)) (cons re-bol tail) tail))
(tail (if (and bos? (not prev-bos?)) (cons re-bos tail) tail))
(tail (cond
((zero? numchars) tail) ; Drop empty char set.
((= 1 numchars) ; {c} => "c"
(cons (make-re-string (string (car (char-set->list cset))))
tail))
(else (cons (make-re-char-set cset) tail)))))
tail))
;;; Simplify the non-empty list of choices ELTS.
;;; Return the result split out into the values
;;; [pre-dsm, cset, bos?, eos?, bol?, eol?, tail]
(define (simp-choice1 elts)
(let recur ((elts elts)
(prev-cset char-set:empty) ; Chars we've already seen.
(prev-bos? #f) (prev-eos? #f) ; These flags say if we've
(prev-bol? #f) (prev-eol? #f)) ; already seen one of these anchors.
(if (pair? elts)
(let ((elt (car elts))
(elts (cdr elts)))
(receive (elt pre-dsm) (open-dsm elt)
(if (re-choice? elt)
;; Flatten nested choices.
(let ((sub-elts (re-choice:elts elt)))
(receive (tail-pre-dsm cset bos? eos? bol? eol? tail)
(recur (append sub-elts elts)
prev-cset
prev-bos? prev-eos?
prev-bol? prev-eol?)
(values (+ pre-dsm tail-pre-dsm)
cset bos? eos? bol? eol? tail)))
;; Simplify the tail, then think about the head.
(receive (tail-pre-dsm cset bos? eos? bol? eol? tail)
(recur elts
(cond
((and (re-string? elt)
(= 1 (string-length (re-string:chars elt))))
(char-set-union prev-cset
(string->char-set (re-string:chars elt))))
;; The cset might be a Scheme exp.
((and (re-char-set? elt)
(char-set? (re-char-set:cset elt)))
(char-set-union prev-cset
(re-char-set:cset elt)))
(else prev-cset))
(or prev-bos? (re-bos? elt))
(or prev-eos? (re-eos? elt))
(or prev-bol? (re-bol? elt))
(or prev-eol? (re-eol? elt)))
;; This guy is called when we couldn't find any other
;; simplification. If ELT contains live submatches, then we
;; are at a merge boundary, and have to take all the
;; TAIL-PRE-DSM, CSET, BOS?, EOS?, ... stuff we've collected
;; and tack them onto TAIL as elements, then put ELT on
;; front. Otherwise, we can commute TAIL-PRE-DSM, CSET,
;; BOS?, etc. with ELT, since it contains no live
;; submatches, so just tack ELT onto TAIL.
(define (no-simp)
(if (has-live-submatches? elt)
(let ((tail (assemble-boundary-tail prev-cset cset
bos? eos?
bol? eol?
prev-bos? prev-eos?
prev-bol? prev-eol?
tail)))
(values pre-dsm char-set:empty #f #f #f #f
(if (pair? tail)
;; Tack tail-pre-dsm onto
;; TAIL's first elt.
(cons elt
(cons (re-dsm (car tail)
tail-pre-dsm 0)
(cdr tail)))
;; Squirrel case: TAIL is empty, so use
;; TAIL-PRE-DSM as ELT's post-dsm.
(list (re-dsm elt 0 tail-pre-dsm)))))
;; ELT has no live submatches, so we can commute all
;; the recursion state forwards past it.
(values (+ pre-dsm tail-pre-dsm)
cset bos? eos? bol? eol?
(cons elt tail))))
(cond
((and (re-char-set? elt)
(char-set? (re-char-set:cset elt))) ; Might be Scheme code
(values (+ pre-dsm tail-pre-dsm)
(char-set-union cset (re-char-set:cset elt))
bos? eos? bol? eol? tail))
;; Treat a singleton string "c" as a singleton set {c}.
((and (re-string? elt) (= 1 (string-length (re-string:chars elt))))
(values (+ pre-dsm tail-pre-dsm)
(char-set-union cset (string->char-set (re-string:chars elt)))
bos? eos? bol? eol? tail))
;; Coalesce bol/eol/bos/eos's.
((re-bos? elt) (values (+ pre-dsm tail-pre-dsm) cset
#t eos? bol? eol? tail))
((re-eos? elt) (values (+ pre-dsm tail-pre-dsm) cset
bos? #t bol? eol? tail))
((re-bol? elt) (values (+ pre-dsm tail-pre-dsm) cset
bos? eos? #t eol? tail))
((re-eol? elt) (values (+ pre-dsm tail-pre-dsm) cset
bos? eos? bol? #t tail))
(else (no-simp)))))))
(values 0 char-set:empty #f #f #f #f '()))))
(define (simp-repeat re)
(let ((from (re-repeat:from re))
(to (re-repeat:to re))
(body (re-repeat:body re)))
(receive (simp-body pre-dsm) (simp-re body) ; Simplify body.
;; The fancy reductions are all handled by REDUCE-REPEAT.
(reduce-repeat from to simp-body pre-dsm))))
;;; Does RE contain a live submatch?
;;; If RE is dynamic, we can't tell, so we err conservatively,
;;; which means we say "yes."
(define (has-live-submatches? re)
(or (re-submatch? re)
(cond ((re-seq? re) (every has-live-submatches? (re-seq:elts re)))
((re-choice? re) (every has-live-submatches? (re-choice:elts re)))
((re-repeat? re) (has-live-submatches? (re-repeat:body re)))
((re-dsm? re) (has-live-submatches? (re-dsm:body re)))
;; If it's not one of these things, then this isn't a regexp -- it's
;; a chunk of Scheme code producing a regexp, and we conservatively
;; return #T -- the expression *might* produce a regexp containing
;; a live submatch:
(else (not (or (re-char-set? re) (re-string? re)
(re-bos? re) (re-eos? re)
(re-bol? re) (re-eol? re)))))))