-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathinterpreter.ml
755 lines (655 loc) · 31.5 KB
/
interpreter.ml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
(** * Bytecode Interpreter *)
(* A VM-based interpreter of the bytecode that handles thread priority *)
open Bytecode
open Regex
open Map
open Array
open Format
open Oracle
open Compiler
open Cdn
open Anchors
open Regs
open Charclasses
open Flags
module type INTERP = sig
val regs_name : unit -> string
val build_oracle : compiled_regex -> string -> oracle
val build_capture : compiled_regex -> string -> oracle -> (int Array.t) option
val matcher : compiled_regex -> string -> (int Array.t) option
val full_match : raw_regex -> string -> (int Array.t) option
val get_linear_result : raw_regex -> string -> string
end
(* I might move the last two functions out of this signature *)
module Interpreter (Regs:REGS) : INTERP = struct
let regs_name () = Regs.name
(** * Direction *)
(* In our algorithm, the interpreter can traverse the string in a forward way or in a backward way *)
(* Backward is used when building the oracle for lookaheads *)
(* Or when building capture groups for lookbehinds *)
let print_direction (d:direction) : string =
match d with
| Forward -> "Forward"
| Backward -> "Backward"
let oracle_direction (l:lookaround) : direction =
match l with
| Lookahead | NegLookahead -> Backward
| Lookbehind | NegLookbehind -> Forward
let capture_direction (l:lookaround) : direction =
match l with
| Lookahead -> Forward
| Lookbehind -> Backward
| _ -> failwith "No capture on negative lookarounds"
(* negative lookarounds don't capture anything *)
let capture_type (l:lookaround) : bool =
match l with
| Lookahead | Lookbehind -> true
| NegLookahead | NegLookbehind -> false
(* increments the current position in the string depending on the direction *)
let incr_cp (cp:int) (dir:direction) : int =
match dir with
| Forward -> cp + 1
| Backward -> cp - 1
let init_cp (dir:direction) (str_size:int) : int =
match dir with
| Forward -> 0
| Backward -> str_size
(* there are (str_size+1) distinct positions: before and after the string count *)
(* when writing to the oracle, if we've been going backward we need an offset *)
let cp_offset (dir:direction) : int =
match dir with
| Forward -> 0
| Backward -> 1
(** * Capture Registers *)
(* just printing the contents of an Array for debugging purposes *)
let debug_regs (regs:int Array.t) : string =
let s = ref "" in
for c = 0 to (Array.length regs)-1 do
s := !s ^ string_of_int c ^ ": " ^ string_of_int (regs.(c)) ^ " | "
done;
!s
(* each thread stores capture registers for the capture groups it has matched so far *)
(* module Regs = (List_Regs : REGS) *)
(** * String Manipulation *)
let get_char (str:string) (cp:int) : char option =
try Some (String.get str cp) with
| Invalid_argument _ -> None (* when reaching the end of the string *)
(** * Threads *)
(* each thread has a program counter in the bytecode and a map of capture registers *)
type thread =
{
mutable pc: int;
mutable capture_regs: Regs.regs; (* cp and clock for each capture group *)
mutable look_regs: Regs.regs; (* cp and clock for each lookaround *)
mutable quant_regs: Regs.regs; (* cp (if nulled) and clock for each quantifier *)
mutable exit_allowed : bool; (* are we allowed to exit the current loop *)
}
let init_thread (initcap:Regs.regs) (initlook:Regs.regs) (initquant:Regs.regs): thread =
{ pc = 0; capture_regs = initcap; look_regs = initlook; quant_regs = initquant; exit_allowed = false }
(** * PC Sets *)
(* for each PC of the bytecode, we want to track if, in a step of the interpreter, this PC has already been processed *)
(* this prevents us from handling the same pc twice: only the top priority thread with that pc should be handled *)
(* We use a mutable array of booleans, since we know the size of the bytecode *)
type pcset = bool Array.t
let init_pcset (bytecode_size: int) : pcset =
assert (bytecode_size > 0);
Array.make bytecode_size false
(* add a pc to the set after it has been processed *)
let pc_add (pcs:pcset) (pc:label) : unit =
pcs.(pc) <- true
(* check if a pc has been processed already *)
let pc_mem (pcs:pcset) (pc:label) : bool =
pcs.(pc)
(* adds a thread and char at the head of a blocked list only if it's not already in *)
(* modifies the pcset in place *)
let add_thread (t:thread) (x:char_expectation) (current:(thread*char_expectation) list) (inset:pcset) : (thread*char_expectation) list =
if (pc_mem inset t.pc) then current
else begin
pc_add inset t.pc;
(t,x)::current
end
(** * Boolean PC Sets *)
(* For each PC of the bytecode, remembers if it has been handled for each possible state of the exit_allowed boolean *)
type bpcset =
{
true_set: pcset;
false_set: pcset;
}
let init_bpcset (bytecode_size: int) : bpcset =
{ true_set = init_pcset(bytecode_size); false_set = init_pcset(bytecode_size) }
let bpc_add (bpcs:bpcset) (pc:label) (exit_bool:bool) : unit =
match exit_bool with
| true -> pc_add bpcs.true_set pc
| false -> pc_add bpcs.false_set pc
let bpc_mem (bpcs:bpcset) (pc:label) (exit_bool:bool) : bool =
match exit_bool with
| true -> pc_mem bpcs.true_set pc
| false -> pc_mem bpcs.false_set pc
(** * Interpreter States *)
(* The interpreter alternates between two different steps to keep threads synchronized:
- first it advances all of its states through epsilon transitions, until they reach a Consume instruction
- then it reads the next input character of the string at the current position and advances or kill each thread
*)
type interpreter_state =
{
mutable cp: int; (* current position in the input string *)
mutable active: thread list; (* ordered list of threads. high to low priority *)
mutable processed: bpcset; (* already processed pcs. Similar to isPcProcessed in Experimental *)
(* mutable because we reset it each step, but its modifications during a step are done in-place *)
mutable blocked: (thread*char_expectation) list; (* threads stuck at a Consume instruction. low to high priority. *)
mutable isblocked: pcset; (* already blocked pcs, to avoid duplicates *)
mutable bestmatch: thread option; (* best match found so far, but there might be a higher priotity one still *)
mutable context: char_context; (* prev and next character at the current position *)
mutable clock: int; (* global clock *)
mutable cdn: cdn_table; (* nullability table for cdn + *)
}
(* initializing the context *)
(* when starting the interpreter at any position, possibly in the middle of the string *)
let cp_context (cp:int) (str:string) (dir:direction) : char_context =
let nextop = get_char str cp in
let prevop = get_char str (cp-1) in
match dir with
| Forward -> { prevchar = prevop ; nextchar = nextop }
| Backward -> { prevchar = nextop ; nextchar = prevop }
let init_state (c:code) (initcp:int) (initcap:Regs.regs) (initlook:Regs.regs) (initquant:Regs.regs) (initclk:int) (initctx:char_context) : interpreter_state =
{ cp = initcp;
active = [init_thread initcap initlook initquant];
processed = init_bpcset (size c);
blocked = [];
isblocked = init_pcset (size c);
bestmatch = None;
context = initctx;
clock = initclk;
cdn = init_cdn();
}
(** * Debugging Utilities *)
let print_exit_allowed (b:bool) : string =
if b then "\027[32m✔\027[0m" else "\027[31m✘\027[0m"
let print_thread (t:thread) : string = string_of_int t.pc ^ print_exit_allowed t.exit_allowed
let print_active (l:thread list) : string =
" ACTIVE: " ^ List.fold_left (fun s t -> if (s = "") then (print_thread t) else (print_thread t) ^ ", " ^ s) "" l
let print_blk (b:thread * char_expectation) : string =
"(" ^ print_thread (fst b) ^ ":" ^ expectation_to_string (snd b) ^ ")"
let print_blocked (l:(thread*char_expectation) list) : string =
" BLOCKED: " ^ List.fold_left (fun s b -> if (s = "") then (print_blk b) else (print_blk b) ^ ", " ^ s) "" l ^ "\n"
let print_cp (cp:int) : string =
" CP: " ^ string_of_int cp ^ "\n"
let print_match (b:thread option) =
match b with
| None -> "None\n"
| Some t -> print_thread t ^ "\n" ^ Regs.to_string t.capture_regs
let print_bestmatch (b:thread option) =
" BEST: " ^ print_match b ^ "\n"
(** * Printing Results *)
(* once the registers have been transformed to an array *)
let get_op (c:int Array.t) (reg:int) : int option =
let value = c.(reg) in
if (value < 0) then None else Some value
(* extracting a capture group slice given its registers *)
let print_slice (str:string) (startreg:int option) (endreg:int option) : string =
match startreg with
| None -> "Undefined"
| Some startv ->
begin match endreg with
| None -> failwith "startreg is set but not endreg"
| Some endv ->
if (endv >= startv) then
String.sub str startv (endv - startv)
else
String.sub str endv (startv - endv)
(* capture groups defined in lookbehind have their capture regs reversed *)
end
(* printing all capture groups *)
let print_cap_regs (c:int Array.t) (max_groups:int) (str:string) : string =
let s = ref "" in
for i = 0 to max_groups do
s := !s ^ "#" ^ string_of_int i ^ ":";
let startr = get_op c (start_reg i) in
let endr = get_op c (end_reg i) in
s := !s ^ print_slice str startr endr ^ "\n"
done;
!s
let print_cap_option (c:(int Array.t) option) (max_groups:int) (str:string) : string =
match c with
| None -> "NoMatch\n"
| Some ca -> print_cap_regs ca max_groups str
let print_result (r:regex) (str:string) (c:(int Array.t) option) : string =
let s = ref "" in
if !verbose then
s := !s ^ ("Result of matching " ^ print_regex r ^ " on string " ^ str ^ " : \n");
let max = max_group r in
!s ^ print_cap_option c max str ^ "\n"
(** * Filtering For Capture Reset *)
(* At the end of the algorithm, we get many capture groups that should be reset *)
(* We use the quantifier registers to filter out those that are too old *)
(* modifies regs in-place *)
let rec filter_capture (r:regex) (cap_regs:int Array.t) (cap_clocks: int Array.t) (look_clocks:int Array.t) (quant_clocks:int Array.t) (maxclock:int) : unit =
match r with
| Re_empty | Re_character _ | Re_anchor _ -> ()
| Re_alt (r1,r2) -> filter_capture r1 cap_regs cap_clocks look_clocks quant_clocks maxclock;
filter_capture r2 cap_regs cap_clocks look_clocks quant_clocks maxclock
| Re_con (r1,r2) -> filter_capture r1 cap_regs cap_clocks look_clocks quant_clocks maxclock;
filter_capture r2 cap_regs cap_clocks look_clocks quant_clocks maxclock
| Re_quant (nul, qid, quant, r1) ->
let quant_val = quant_clocks.(qid) in (* the last time we went in *)
if (quant_val < maxclock) then
(* the last repetition of the inner quantifier happened before the last repetition of the outer one *)
filter_all r1 cap_regs
else
filter_capture r1 cap_regs cap_clocks look_clocks quant_clocks quant_val
| Re_capture (cid, r1) ->
let start = cap_clocks.(start_reg cid) in
if (start < 0) then
filter_all r1 cap_regs (* there is already no value for this capture group, we can clear everything inside *)
else begin
if (start < maxclock) (* cleaning the value of group cid (and everything inside) if its value is too old *)
then begin cap_regs.(start_reg cid) <- -1; filter_all r1 cap_regs end
else filter_capture r1 cap_regs cap_clocks look_clocks quant_clocks maxclock
end
| Re_lookaround (lid, l, r1) ->
let look_val = look_clocks.(lid) in (* the last time we needed the lookaround to hold *)
if (look_val < 0) then
filter_all r1 cap_regs (* we didn't need the lookaround: clear everything inside *)
else begin
if (look_val < maxclock) (* cleaning everything inside the lookaround since it's too old *)
then filter_all r1 cap_regs
else filter_capture r1 cap_regs cap_clocks look_clocks quant_clocks (-1)
(* resetting the maxclock to -1: lookaround clocks are reset *)
end
and filter_all (r:regex) (regs:int Array.t) : unit = (* clearing all capture group inside a regex *)
match r with
| Re_empty | Re_character _ | Re_anchor _ -> ()
| Re_alt (r1,r2) -> filter_all r1 regs; filter_all r2 regs
| Re_con (r1,r2) -> filter_all r1 regs; filter_all r2 regs
| Re_quant (nul, qid, quant, r1) ->
filter_all r1 regs
| Re_capture (cid, r1) ->
regs.(start_reg cid) <- -1; filter_all r1 regs
| Re_lookaround (lid, l, r1) -> filter_all r1 regs
(* we transform the registers to an Array with constant-time access and insertion when filtering *)
let filter_reset (r:regex) (capture:Regs.regs) (look:Regs.regs) (quant:Regs.regs) (maxclock:int) : int Array.t =
let (cap_regs, cap_clocks) = Regs.to_arrays capture in
let (_, look_clocks) = Regs.to_arrays look in
let (_, quant_clocks) = Regs.to_arrays quant in
filter_capture r cap_regs cap_clocks look_clocks quant_clocks maxclock;
cap_regs
(** * Interpreter *)
(* modifies the state by advancing all threads along epsilon transitions *)
(* calls itself recursively until there are no more active threads *)
(* the direction is only used to evaluate anchors *)
let rec advance_epsilon (c:code) (s:interpreter_state) (o:oracle) (dir:direction) : unit =
if !debug then Printf.printf "%s\n%!" ("Clock "^string_of_int s.clock^"|Epsilon active: " ^ print_active s.active);
match s.active with
| [] -> () (* done advancing epsilon transitions *)
| t::ac -> (* t: highest priority active thread *)
let i = get_instr c t.pc in
if (bpc_mem s.processed t.pc t.exit_allowed) then (* killing the lower priority thread if it has already been processed *)
begin s.active <- ac; advance_epsilon c s o dir end
else begin
s.clock <- s.clock + 1; (* augmenting the global clock *)
bpc_add s.processed t.pc t.exit_allowed; (* adding the current pc being handled to the set of proccessed pcs *)
match i with
| Consume ce -> (* adding the thread to the list of blocked thread if it isn't already there *)
s.blocked <- add_thread t ce s.blocked s.isblocked; (* also updates isblocked *)
s.active <- ac;
advance_epsilon c s o dir
| Accept -> (* updates the best match and don't consider the remain active threads *)
s.active <- [];
s.bestmatch <- Some t;
() (* no recursive call *)
| Jmp x ->
t.pc <- x;
advance_epsilon c s o dir
| Fork (x,y) -> (* x has higher priority *)
t.pc <- y;
s.active <- {pc = x;
capture_regs = Regs.copy t.capture_regs;
look_regs = Regs.copy t.look_regs;
quant_regs = Regs.copy t.quant_regs;
exit_allowed = t.exit_allowed}::s.active;
advance_epsilon c s o dir
| SetRegisterToCP r ->
(* modifying the capture regs of the current thread *)
t.capture_regs <- Regs.set_reg t.capture_regs r (Some s.cp) s.clock;
t.pc <- t.pc + 1;
advance_epsilon c s o dir
| SetQuantToClock (q,b) ->
(* saving the current cp if we are nulling a + *)
let ocp = if b then (Some s.cp) else None in
(* adding the last iteration clock *)
t.quant_regs <- Regs.set_reg t.quant_regs q ocp s.clock;
t.pc <- t.pc + 1;
advance_epsilon c s o dir
| CheckOracle l ->
if (get_oracle o s.cp l)
then begin
t.pc <- t.pc + 1; (* keeping the thread alive *)
(* remembering the cp where we last needed the oracle *)
t.look_regs <- Regs.set_reg t.look_regs l (Some s.cp) s.clock;
end
else s.active <- ac; (* killing the thread *)
advance_epsilon c s o dir
| NegCheckOracle l ->
if (get_oracle o s.cp l)
then s.active <- ac (* killing the thread *)
else t.pc <- t.pc + 1;(* keeping the thread alive *)
advance_epsilon c s o dir
| WriteOracle l ->
(* we reached a match but we want to write that into the oracle. we don't discard lower priority threads *)
s.active <- ac; (* no need to consider that thread anymore *)
set_oracle o s.cp l; (* writing to the oracle *)
advance_epsilon c s o dir (* we keep searching for more matches *)
| BeginLoop ->
(* we need to set exit_allowed to false: now exiting a loop is forbidden according to JS semantics *)
t.exit_allowed <- false;
t.pc <- t.pc + 1;
advance_epsilon c s o dir
| EndLoop ->
(* this transition is only possible if we didn't begin this loop during this epsilon transition phase *)
begin match t.exit_allowed with
| true -> t.pc <- t.pc+1; advance_epsilon c s o dir
| false -> s.active <- ac; advance_epsilon c s o dir (* killing the current thread *)
end
| CheckNullable qid ->
if (cdn_get s.cdn qid)
then t.pc <- t.pc+1 (* keeping the thread alive *)
else s.active <- ac; (* killing the thread *)
advance_epsilon c s o dir
| AnchorAssertion a ->
if (is_satisfied a s.context dir)
then t.pc <- t.pc+1 (* keeping the thread alive *)
else s.active <- ac; (* killing the thread *)
advance_epsilon c s o dir
| Fail ->
s.active <- ac; (* killing the current thread *)
advance_epsilon c s o dir
end
(* modifies the state by consuming the next character *)
(* calls itself recursively until there are no more blocked threads *)
let rec consume (s:interpreter_state): unit =
match s.blocked with
| [] -> ()
| (t,ce)::blocked' ->
s.blocked <- blocked';
if (is_accepted s.context.nextchar ce) then
begin t.exit_allowed <- true; t.pc <- t.pc + 1; s.active <- t::s.active end;
(* adding t to the list of active threads *)
(* since t just consumed something, we set its exit_allowed flag to true *)
consume s
(** * Null interpreter *)
(* an interpreter that does not read the string, but instead simply follows epsilon transitions *)
(* this is used to reconstruct the capture groups of last nulled plus iteration at the end of a match *)
(* the direction is only used to evaluate anchors *)
(* This function expects that s contains an up-to-date cp, context and cdn table *)
let null_interp (c:code) (s:interpreter_state) (o:oracle) (dir:direction): thread option =
if !verbose then Printf.printf "%s CP%d\n" ("\n\027[36mNull Interpreter:\027[0m ") (s.cp);
if !verbose then Printf.printf "%s\n" (print_code c);
if !debug then
begin
Printf.printf "%s" (print_cp s.cp);
Printf.printf "%s" (print_active s.active);
Printf.printf "%s%!" (print_bestmatch s.bestmatch);
end;
if !debug then
begin
Printf.printf "At CP%d, CDN table:%s\n" (s.cp) (print_cdn_table s.cdn);
end;
(* follow epsilon transitions *)
advance_epsilon c s o dir;
if !debug then
begin
Printf.printf "%s\n%!" (print_blocked s.blocked);
end;
s.bestmatch
(** * Finding the top priority match in a bytecode automaton *)
(* This functions assumes that s.context already contains the correct characters *)
(* this does not yet reconstruct any plus, simply alternates advance epsilon and consume *)
let rec find_match (c:code) (str:string) (s:interpreter_state) (o:oracle) (dir:direction) (cdn:cdns): thread option =
if !debug then
begin
Printf.printf "%s" (print_cp s.cp);
Printf.printf "%s" (print_active s.active);
Printf.printf "%s%!" (print_bestmatch s.bestmatch);
end;
(* building the CDN table *)
s.cdn <- build_cdn cdn s.cp o s.context dir;
if !debug then
begin
Printf.printf "At CP%d, CDN table:%s\n" (s.cp) (print_cdn_table s.cdn);
end;
(* follow epsilon transitions *)
advance_epsilon c s o dir;
if !debug then
begin
Printf.printf "%s\n%!" (print_blocked s.blocked);
end;
(* checking if there are still surviving threads *)
match s.blocked, s.context.nextchar with
| [], _ -> s.bestmatch (* no more surviving threads *)
| _, None -> s.bestmatch (* we reached the end of the string *)
| _, _ ->
(* advancing blocked threads *)
consume s;
(* resetting the processed, blocked sets and the CDN table *)
s.processed <- init_bpcset (size c);
s.isblocked <- init_pcset (size c);
s.cdn <- init_cdn();
(* advancing the current position *)
s.cp <- incr_cp s.cp dir;
(* updating the context *)
let newchar = get_char str (s.cp - cp_offset dir) in
update_context s.context newchar;
(* recursive call *)
find_match c str s o dir cdn
(** * Reconstructing Nullable + Values *)
(* when the winning thread of a match decided to go through the nullable path of a +, we might need to reconstruct any groups set during that nullable path *)
(* for this, we need the bytecode of every nullable plus, and the AST of the regex we previously matched *)
(* so that we can reconstruct exactly the plusses that are defined inside that AST *)
let reconstruct_plus_groups (thread:thread) (ast:regex) (plus_bc:code Array.t) (s:string) (o:oracle) (dir:direction): thread =
let capture = ref thread.capture_regs in
let look = ref thread.look_regs in
let quant = ref thread.quant_regs in
(* goes through the regex, if it encounters a nulled +, it calls the null interpreter *)
let rec nulled_plus (reg:regex) : unit =
match reg with
| Re_empty | Re_character _ -> ()
| Re_alt (r1, r2) | Re_con (r1, r2) ->
nulled_plus r1; nulled_plus r2
| Re_capture (_,r1) -> nulled_plus r1
| Re_lookaround (lid,lk,r1) -> ()
| Re_anchor _ -> ()
(* from shallowest to deepest plus: *)
| Re_quant (nul,qid,quanttype,body) ->
begin match (Regs.get_cp !quant qid) with
| None -> nulled_plus body (* recursive call: an inner + may have been nulled *)
| Some start_cp -> (* the last iteration of the plus was nulling *)
let start_clock = int_of_opt (Regs.get_clock !quant qid) in
if !debug then Printf.printf ("QID: %d | start_clock: %d\n") qid start_clock;
let bytecode = plus_bc.(qid) in
let ctx = cp_context start_cp s dir in
let inits = (init_state bytecode start_cp !capture !look !quant start_clock ctx) in
let subcdn = compile_cdns body in
let subtable = build_cdn subcdn start_cp o ctx dir in
inits.cdn <- subtable;
let result = null_interp bytecode inits o dir in
begin match result with
| None -> failwith "expected a nullable plus"
| Some w -> (* there's a winning thread when nulling *)
(* updating all registers *)
capture := w.capture_regs;
look := w.look_regs;
quant := w.quant_regs;
end;
nulled_children body subtable start_cp
end
(* goes through the subregex when a plus above was nulled *)
(* for all its children that got nulled while nulling the parent plus, *)
(* the CDN table can be shared *)
and nulled_children (reg:regex) (cdnt:cdn_table) (cp:int) : unit =
match reg with
| Re_empty | Re_character _ -> ()
| Re_alt (r1, r2) | Re_con (r1, r2) ->
nulled_children r1 cdnt cp; nulled_children r2 cdnt cp
| Re_capture (_,r1) -> nulled_children r1 cdnt cp
| Re_lookaround (lid,lk,r1) -> ()
| Re_anchor _ -> ()
| Re_quant (nul,qid,quanttype,body) ->
begin match (Regs.get_cp !quant qid) with
| None -> nulled_children body cdnt cp
| Some start_cp ->
if (start_cp = cp) then begin
(* otherwise we don't have to reconstruct, it was nulled in a previous iteration *)
let start_clock = int_of_opt (Regs.get_clock !quant qid) in
let bytecode = plus_bc.(qid) in
let ctx = cp_context cp s dir in
let inits = (init_state bytecode cp !capture !look !quant start_clock ctx) in
inits.cdn <- cdnt;
let result = null_interp bytecode inits o dir in
begin match result with
| None -> failwith "expected a nullable children plus"
| Some w -> (* there's a winning thread when nulling *)
(* updating all registers *)
capture := w.capture_regs;
look := w.look_regs;
quant := w.quant_regs;
end;
nulled_children body cdnt cp
end
else ()
end
in
nulled_plus ast;
{pc = thread.pc; capture_regs = !capture; look_regs = !look; quant_regs = !quant; exit_allowed = thread.exit_allowed}
(** * Finds a match in an a bytecode automaton AND reconstructs the corresponding plus groups *)
(* running the interpreter on some code, with a particular initial interpreter state *)
(* also reconstructs the + groups *)
let find_match_plus (c:code) (ast:regex) (plus_bc:code Array.t) (s:string) (o:oracle) (dir:direction) (start_cp:int) (capture:Regs.regs) (look:Regs.regs) (quant:Regs.regs) (start_clock:int) (cdn:cdns): thread option =
if !verbose then Printf.printf "%s - %s\n" ("\n\027[36mInterpreter:\027[0m "^s) (print_direction dir);
if !verbose then Printf.printf "%s\n" (print_code c);
if !verbose then Printf.printf "%s\n" (print_cdns cdn);
if !verbose then Printf.printf "%s\n" (print_context (cp_context start_cp s dir));
let initstate = init_state c start_cp capture look quant start_clock (cp_context start_cp s dir) in
let result = find_match c s initstate o dir cdn in
(* reconstruct + groups *)
let full_result =
match result with
| None -> None
| Some thread -> Some (reconstruct_plus_groups thread ast plus_bc s o dir)
in
if !verbose then Printf.printf "%s\n" ("\027[36mResult:\027[0m "^(print_match full_result));
full_result
(* (\* running the interpreter using the default initial state *\)
* let interp_default_init (cr:compiled_regex) (s:string) (o:oracle) (dir:direction) (cdn:cdns): thread option =
* let maxcap = max_group cr.ast in
* let maxlook = max_lookaround cr.ast in
* let maxquant = max_quant cr.ast in
* let capture = Regs.init_regs (2*maxcap+2) in
* let look = Regs.init_regs (maxlook+1) in
* let quant = Regs.init_regs (maxquant+1) in
* interp cr s o dir (init_cp dir (String.length s)) capture look quant 0 cdn
*
* (\* for tests, sometimes we only want to know if there is a match *\)
* let boolean_interp (cr:compiled_regex) (s:string) (o:oracle) (dir:direction) (cdn:cdns): bool =
* match (interp_default_init cr s o dir cdn) with
* | None -> false
* | _ -> true *)
(* the following functions work on the main regex, in its compiled_regex form *)
(** * Bulding the Oracle *)
(* we consider lookarounds by reverse order of their identifiers *)
(* we do not need to do this for the main regex *)
let build_oracle (cr:compiled_regex) (str:string): oracle =
let maxlook = max_lookaround cr.main_ast in
let maxcap = max_group cr.main_ast in
let maxquant = max_quant cr.main_ast in
let o = create_oracle (String.length str) (maxlook + 1) in
for lid = maxlook downto 1 do
let bytecode = cr.look_build_bc.(lid) in
let looktype = cr.look_types.(lid) in
let direction = oracle_direction looktype in
let lookcdn = cr.look_cdns.(lid) in
let initcp = init_cp direction (String.length str) in
let initctx = cp_context initcp str direction in
(* TODO: we could reuse capture, lookmem and quants instead of reallocating for each lookaround *)
let capture = Regs.init_regs (2*maxcap+2) in
let lookmem = Regs.init_regs (maxlook+1) in
let quant = Regs.init_regs (maxquant+1) in
let initstate = init_state bytecode initcp capture lookmem quant 0 initctx in
if !verbose then Printf.printf "%s\n" (print_code bytecode);
if !verbose then Printf.printf "%s\n" (print_cdns lookcdn);
(* no need to call find_match_plus, we don't care about any capture groups *)
(* inside lookarounds in the oracle building phase *)
ignore (find_match bytecode str initstate o direction lookcdn)
done;
o (* returning the modified oracle *)
(** * Finding the main match and reconstructing lookaround capture groups *)
(* returns the register array if there is a match *)
(* also filters the return value for capture reset *)
let build_capture (cr:compiled_regex) (str:string) (o:oracle): (int Array.t) option =
let max_look = max_lookaround cr.main_ast in
let max_cap = max_group cr.main_ast in
let max_quant = max_quant cr.main_ast in
let capture = Regs.init_regs(2*max_cap+2) in
let look = Regs.init_regs (max_look+1) in
let quant = Regs.init_regs (max_quant+1) in
let main_bytecode = cr.main_bc in
let main_cdn = cr.main_cdns in
(* performing the match of the main expression, with plus group reconstruction *)
let main_result =
find_match_plus main_bytecode cr.main_ast cr.plus_bc str o Forward 0 capture look quant 0 main_cdn in
match main_result with
| None -> None
| Some thread ->
(* we have a match and want to rebuild capture groups in lookarounds*)
let capture = ref thread.capture_regs in
let look = ref thread.look_regs in
let quant = ref thread.quant_regs in
for lid=1 to max_look do
match (Regs.get_cp !look lid) with
| None -> () (* the lookaround wasn't needed in the match *)
| Some cp -> (* the lookaround had a match at cp *)
let looktype = cr.look_types.(lid) in
if (capture_type looktype) then (* not for negative lookarounds *)
let bytecode = cr.look_capture_bc.(lid) in
let direction = capture_direction looktype in
let lookcdn = cr.look_cdns.(lid) in
let lookast = cr.look_ast.(lid) in
let result = find_match_plus bytecode lookast cr.plus_bc str o direction cp !capture !look !quant 0 lookcdn in
begin match result with
| None -> failwith "result expected from the oracle"
| Some t ->
(* updating all registers *)
capture := t.capture_regs;
look := t.look_regs;
quant := t.quant_regs
end
done;
if !debug then
begin
Printf.printf "regs: %s\n%!" (Regs.to_string !capture);
let (prefilter,preclocks) = Regs.to_arrays(!capture) in
let (_,quantclocks) = Regs.to_arrays(!quant) in
Printf.printf "pre-filtering regs: %s\n%!" (debug_regs prefilter);
Printf.printf "pre-filtering clocks: %s\n%!" (debug_regs preclocks);
Printf.printf "pre-filtering quant clocks: %s\n%!" (debug_regs quantclocks);
end;
let match_capture = filter_reset cr.main_ast !capture !look !quant (-1) in (* filtering old values *)
if !debug then Printf.printf "filtered regs: %s\n%!" (debug_regs match_capture);
Some (match_capture)
(** * The Full matcher *)
(* builds the oracle, then matches the main expression and rebuilds missing capture groups, and filters *)
let matcher (cr:compiled_regex) (str:string) : (int Array.t) option =
let o = build_oracle cr str in
if !debug then
Printf.printf "%s\n" (print_oracle o);
let ca = build_capture cr str o in
if !verbose then
Printf.printf "%s\n" (print_result cr.main_ast str ca);
ca
let full_match (raw:raw_regex) (str:string) : (int Array.t) option =
let re = annotate raw in
let cr = full_compilation re in
matcher cr str
let get_linear_result (raw:raw_regex) (str:string) : string =
let capop = full_match raw str in
print_result (annotate raw) str capop
end