# automata.py -- forked from pjkundert/cpppo
#
# Cpppo -- Communication Protocol Python Parser and Originator
#
# Copyright (c) 2013, Hard Consulting Corporation.
#
# Cpppo is free software: you can redistribute it and/or modify it under the
# terms of the GNU General Public License as published by the Free Software
# Foundation, either version 3 of the License, or (at your option) any later
# version. See the LICENSE file at the top of the source tree.
#
# Cpppo is distributed in the hope that it will be useful, but WITHOUT ANY
# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
# A PARTICULAR PURPOSE. See the GNU General Public License for more details.
#
from __future__ import absolute_import
from __future__ import print_function
from __future__ import division
import array
import logging
import struct
import sys
import threading
import traceback
import greenery.lego
import greenery.fsm
from . import misc
__author__ = "Perry Kundert"
__email__ = "[email protected]"
__copyright__ = "Copyright (c) 2013 Hard Consulting Corporation"
__license__ = "Dual License: GPLv3 (or later) and Commercial (see LICENSE)"
log = logging.getLogger( __package__ )
log_cfg = {
"level": logging.WARNING,
"datefmt": '%m-%d %H:%M:%S',
"format": '%(asctime)s.%(msecs).03d %(threadName)10.10s %(name)-8.8s %(levelname)-8.8s %(funcName)-10.10s %(message)s',
}
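#
# 'log_cfg' is a set of keyword defaults suitable for logging.basicConfig; applying it is left
# to the importing application. An illustrative sketch (not part of the original module; assumes
# this module is importable via its usual 'cpppo' package):
#
#     import logging
#     import cpppo
#     logging.basicConfig( **cpppo.log_cfg )
#     logging.getLogger( "myapp" ).warning( "logging configured" )
#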
# Python2/3 compatibility types, for ascii/unicode str type
# Types produced by iterators over various input stream types
type_bytes_iter = str if sys.version_info[0] < 3 else int
type_str_iter = str
# The base class of string types
type_str_base = basestring if sys.version_info[0] < 3 else str
# The array.array typecode for iterated items of various input stream types
type_unicode_array_symbol = 'u'
type_str_array_symbol = 'c' if sys.version_info[0] < 3 else 'u'
type_bytes_array_symbol = 'c' if sys.version_info[0] < 3 else 'B'
# Various default data path contexts/extensions
path_ext_input = '.input' # default destination input
# If we have a greenery.fsm (which generally has an alphabet of str symbols), and we
# want to use it on a binary stream of those symbols, we need to encode the
# symbols from the str to the corresponding binary symbol(s). This will
# basically be a no-op in Python2 (bytes is synonymous with str), but on Python3
# will properly encode the Unicode str symbols to bytes, and then return an
# iterable over the result.
type_unicode_encoder = lambda s: ( b for b in s.encode( 'utf-8' ))
type_str_encoder = None if sys.version_info[0] < 3 else type_unicode_encoder
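#
# An illustrative sketch of the encoders above (not part of the original module): on Python3,
# type_unicode_encoder yields the int byte values of the UTF-8 encoding of a str, suitable for
# matching transitions established over a binary (bytes) alphabet:
#
#     assert list( type_unicode_encoder( 'ab' )) == [97, 98]
#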
#
# peekable/peeking
# chainable/chaining
#
# Iterator wrappers with the ability to peek ahead and push back unused
# input, and the ability to chain further input iterables to an existing
# chainable iterator.
#
# So, user code can simply use the peekable and chainable types. These will
# detect the required iterator features, and if not present, return an instance
# of the appropriate peeking or chaining class over the provided iterator.
#
# BACKGROUND
#
# When __new__ returns an instance of its cls, standard Python object
# creation invokes its __init__; we don't want to do this, if we detect a
# compatible x-ing iterator (including, specifically, an instance of x-ing
# itself). Therefore, we cannot simply implement the __new__ classmethod on the
# x-ing class; we must create a different class, called x-able which checks if
# the provided iterator has the required capabilities of an x-ing, and returns
# the original (already compatible) iterator if so. Otherwise, it creates an
# instance of the required x-ing class over the supplied iterator, and returns
# it. Since Python detects that x-able's __new__ returned an x-ing instance or
# something like it (certainly not an x-able instance), Python doesn't invoke
# its __init__.
#
class peekable( object ):
"""Checks that the supplied iterable has (at least) the peek/sent methods, and
returns it if so. Otherwise, creates a peeking iterator with it."""
def __new__( cls, iterable=None ):
if hasattr( iterable, 'peek' ) and hasattr( iterable, 'sent' ):
return iterable
return peeking( iterable=iterable )
class chainable( object ):
"""Checks if the supplied iterable is already chaining, and returns it. If
not, creates a chaining iterator with it. This is used by methods expecting
an iterable with chaining capabilities, to avoid re-wrapping if provided."""
def __new__( cls, iterable=None ):
if isinstance( iterable, chaining ):
return iterable
return chaining( iterable=iterable )
class rememberable( object ):
"""Checks if the supplied iterable is already remembering, and returns it. If
not, creates a remembering iterator with it."""
def __new__( cls, iterable=None ):
if isinstance( iterable, remembering ):
return iterable
return remembering( iterable=iterable )
class peeking( object ):
"""An iterator with peek and push, allowing inspection of the upcoming
object, and push back of arbitrary numbers of objects. Also remembers
how many objects have been sent (net of any pushed back)."""
def __init__( self, iterable=None ):
self._iter = iter( [] if iterable is None else iterable )
self._back = [] # stack of input objects
self._sent = 0 # how many objects returned (net)
@property
def sent( self ):
return self._sent
def __iter__( self ):
return self
def push( self, item ):
self._back.append( item )
self._sent -= 1
def peek( self ):
"""Returns the next item (if any), otherwise None."""
if not self._back:
try:
self.push( next( self ))
except StopIteration:
return None
return self._back[-1]
def next( self ):
"""Ensure we invoke the proper super-class __next__ for Python 2/3
compatibility (ie. cannot simply use 'next = __next__')
"""
return self.__next__()
def __next__( self ):
"""Returns any items restored by a previous push, then any available
from the current iterator."""
try:
item = self._back.pop() if self._back else next( self._iter )
except:
raise
else:
self._sent += 1
return item
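#
# An illustrative sketch of peekable/peeking (not part of the original module): peek at the
# next symbol without consuming it, push symbols back, and track the net number sent:
#
#     source = peekable( 'abc' )
#     assert source.peek() == 'a' and source.sent == 0
#     assert next( source ) == 'a' and source.sent == 1
#     source.push( 'a' )                    # restore it; .sent drops back to 0
#     assert source.sent == 0
#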
class chaining( peeking ):
"""An peekable iterator also allowing the chaining of iterables for input.
Will continuously raise StopIteration at the end of the last iterable; chain
a non-iterable to terminate any consumers with a TypeError when no more
input is available to chain."""
def __init__( self, iterable=None ):
super( chaining, self ).__init__( iterable=iterable )
self._chain = [] # queue of iterators
def chain( self, iterable ):
self._chain.insert( 0, iterable )
def __next__( self ):
"""Returns any items restored by a previous push, then any available
from the current iterator, then attempts to queue up the next iterator(s)
and return an item from it. Will continue to raise StopIteration while
no more iterables are available. Load a non-iterable (eg. None) to
terminate any user gaining input from next( self ) with a TypeError,
including self.peek(). The failing non-iterable will persist."""
try:
result = self._back.pop() if self._back else next( self._iter )
except StopIteration:
# Try next chained iterable 'til we find one with something to return
while self._chain:
self._iter = iter( self._chain[-1] )
self._chain.pop() # iff no exception; else non-iterable persists!
try:
result = next( self._iter )
except StopIteration:
continue
else:
self._sent += 1
return result
# We've run out of iterables, and still no items; re-raise StopIteration
raise
else:
self._sent += 1
return result
class remembering( chaining ):
"""A chaining iterator that remembers its yielded symbols. Can confirm whether items pushed back
are correct, as well as forget. Access self.memory property to see previously delivered
symbols, and call self.forget() to clear."""
def __init__( self, *args, **kwds ):
super( remembering, self ).__init__( *args, **kwds )
self.memory = []
def forget( self ):
self.memory = []
def __next__( self ):
result = super( remembering, self ).__next__()
self.memory.append( result )
return result
def push( self, item ):
"""If we're employing push, it'd better be consistent with our memory!"""
if self.memory:
assert self.memory.pop() == item
super( remembering, self ).push( item )
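#
# An illustrative sketch of chainable/chaining and rememberable/remembering (not part of the
# original module): extra input may be chained behind the current iterable, and a remembering
# iterator records the symbols it has delivered:
#
#     source = chainable( 'ab' )
#     source.chain( 'cd' )
#     assert ''.join( source ) == 'abcd'
#
#     hist = rememberable( 'xy' )
#     assert next( hist ) == 'x' and hist.memory == [ 'x' ]
#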
class decide( object ):
"""A type of object that may be supplied as a state transition target, instead of a state. It must
be able to represent itself as a str, and it must have a .state property, and it must be
callable with machine,source,path,data arguments and return a state or None.
Decides on a target state if the predicate on machine, path, source and data context is True.
If the predicate evaluates to not True, or the result of execute is None, the transition
is deemed to *not* have been taken, and the next state/decide (if any) is used."""
def __init__( self, name, state=None, predicate=None ):
self.name = name
self.state = state
if predicate is None:
predicate = lambda machine=None, source=None, path=None, data=None: True
self.predicate = predicate
def __str__( self ):
return "%s?%s" % ( self.name, self.state )
def __repr__( self ):
return '<%s>' % ( self )
def __call__( self, machine=None, source=None, path=None, data=None ):
return self.execute(
truth=self.predicate( machine=machine, source=source, path=path, data=data ),
machine=machine, source=source, path=path, data=data )
def execute( self, truth, machine=None, source=None, path=None, data=None ):
target = self.state if truth else None
#log.info( "%s %-13.13s -> %10s w/ data: %r", machine.name_centered(), self, target, data )
return target
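#
# An illustrative sketch (not part of the original module) of using decide as a transition
# target with the 'state' class defined below; 'data' is assumed to be a dict-like artifact
# carrying a 'count' value. Storing several targets for the same symbol builds the
# <decide>, ..., <decide>[, <state>] sequence evaluated at transition time:
#
#     even = state( "even", terminal=True )
#     odd = state( "odd", terminal=True )
#     start = state( "start" )
#     start[True] = decide( "is_even", state=even,
#         predicate=lambda machine=None, source=None, path=None, data=None: data.get( 'count', 0 ) % 2 == 0 )
#     start[True] = odd                     # fallback state, used if the decide yields None
#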
class NonTerminal( Exception ):
"""A state machine has been forced to terminate in a non-terminal state"""
pass
class state( dict ):
"""The foundation state class.
On entry, processes the pending source symbol (if implemented), and then reports the outgoing
transition state (if any) based on the next source symbol. If a state requires a source symbol
to process, it must be available (and acceptable) to process when the state is entered.
On exit, a state may issue non-transition events (machine, None) when no input symbol is
available, or when no outgoing transitions are available for the pending source symbol.
The base 'state' is a Null (no-input) state, which neither tests nor consumes an input value,
but simply transitions to the next appropriate state if a matching transition exists. Useful
for decision points, eg:
q = state( "Quoted String" )
q['"'] = quoted_double
q["'"] = quoted_single
Implements the basic state transition generator structure, returning non-transition events
(machine,None) 'til an input that the state accepts/validates is available and processed, and
then an input matching an outgoing edge in the underlying dictionary is seen.
This allows composition of state machines; the higher level machinery runs the lower level
machine transition generator, passing through state changes 'til done (StopIteration), watching
for non-transition events. When one occurs, the higher level machinery must change the
environment (consume the next source symbol using its own state machinery, chain fresh input to
the source symbol generator, discard the sub-machines and supply the remaining input to a new
one, ...)
If no transition can be determined based on the next input symbol, more complex transitions can
be configured, which may access the entire machine state, as well as the input symbol 'source',
and the 'data' artifact.
If a limit on the number of possible input symbols is known (or will be known at run-time), it
can be provided as limit. Protocols will often include a length field specifying the size of
the upcoming portion of the protocol; this can be provided to the dfa as a limit, preventing the
machine from consuming more input than it should. The ending symbol is computed after this
state processes its input symbol (if any). The incoming symbol source iterator's .sent property
is tested before transitioning, and transitioning is terminated if the computed ending symbol
has been reached."""
def __init__( self, name, terminal=False, alphabet=None, context=None, extension=None,
encoder=None, typecode=None, greedy=True, limit=None ):
if isinstance( name, state ):
# Initialization from another state; copy transition dict/recognizers
other = name
super( state, self ).__init__( other )
self.recognizers = list( other.recognizers )
self._name = other.name + "'"
self._name_centered = None
self._terminal = other._terminal if terminal is None else terminal
self.alphabet = other.alphabet if alphabet is None else alphabet
self._context = other._context if context is None else context
self._extension = other._extension if extension is None else extension
self.encoder = other.encoder if encoder is None else encoder
self.typecode = other.typecode if typecode is None else typecode
self.greedy = other.greedy if greedy is None else greedy
self.limit = other.limit if limit is None else limit
else:
super( state, self ).__init__()
self.recognizers = []
self._name = name
self._name_centered = None
self._terminal = terminal
self.alphabet = alphabet # None, type, container or predicate
self._context = context # Context added to path with '.'
self._extension = extension # plus extension, to place output in data
self.encoder = encoder
self.typecode = typecode # Unused in base, but part of standard interface
self.greedy = greedy
self.limit = limit
# Any state evaluates to True (to easily distinguish from None), even if its dict is empty.
def __nonzero__( self ):
return True
__bool__ = __nonzero__ # Python3
# Make us hashable for use in sets, etc. (and don't define equality in terms of content), even
# though we're derived from a dict.
def __hash__( self ):
return id( self )
def __eq__( self, other ):
return self is other
def __ne__( self, other ):
return self is not other
# Support with ..., to enforce mutual exclusion if necessary; the base state retains no
# information specific to the operation of any one state machine, so may be in simultaneous use.
def __enter__( self ):
"""May be used by multiple machines simultaneously."""
return self
def __exit__( self, typ, val, tbk ):
return False # do not suppress any exceptions
def safe( self ):
"""Confirm that the instance is safe to mutate."""
pass
# Support alternative representations in derived classes
@property
def name( self ):
"""String representation."""
return self._name
def name_centered( self ):
"""Return the 'str' representation of the state centered in a field of whitespace around an axis
character if any. Derived classes may enhance the implementation of self.name to include
other data.
"""
if self._name_centered is None:
self._name_centered = misc.centeraxis(
self, width=40, clip=True, left_right=lambda w: (w*3//4, w - w*3//4) )
return self._name_centered
def __str__( self ):
return ( '(' + ( '(' if self.terminal else ' ' )
+ self.name
+ ( ')' if self.terminal else ' ' ) + ')' )
def __repr__( self ):
return '<%s>' % ( self )
@property
def terminal( self ):
"""Property indicating whether we are in a terminal state; derived classes may have more complex
procedures for determining this, eg. they may reflect the terminal condition of some other
state machine."""
return self._terminal
# Data context
@property
def extension( self ):
return self._extension or ''
def context( self, path=None, extension=None ):
"""Yields:
Returns the state's data context, optionally joined with the specified path and
extension "<path>[.<context>]<extension>", eg:
>>> s.context()
>>> "boo"
>>> s.context( path="a.b", extension='_' )
>>> "a.b.boo_"
Any path and context are joined with '.', and an extension is just added
to the base path plus any context."""
pre = path or ''
add = self._context or ''
dot = '.' if ( pre and add ) else ''
ext = extension if extension is not None else self.extension
return pre + dot + add + ext
#
# [x] = <state> -- Store an outgoing "edge" (input symbol 'x' and target <state>)
# [x] -- Find an outgoing transition for symbol 'x', or raise KeyError
# get(x,<default>) -- Find an outgoing transition for symbol 'x', or return default
# encode(inp) -- Transform the input symbol according to the supplied encoder
#
def encode( self, inp ):
"""All input symbols are encoded using the supplied encoder; the resultant encoded symbol or
tuple of symbols are used to establish and find transitions."""
if inp is None or self.encoder is None:
return inp
enc = tuple( self.encoder( inp ))
return enc if len( enc ) > 1 else enc[0]
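#
# An illustrative sketch of encode (not part of the original module): with an encoder supplied,
# a str symbol establishes/finds transitions via its encoded form (eg. its int byte value on
# Python3; on Python2, type_str_encoder is None and symbols pass through unchanged):
#
#     s = state( "enc", encoder=type_str_encoder )
#     assert s.encode( 'a' ) in ( 'a', 97 ) # 'a' on Python2, 97 on Python3
#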
def __setitem__( self, inp, target ):
"""After ensuring that target is a state or a callable (that should return a state), remember a
recognizer predicate, or the simple input-->state mapping."""
if not isinstance( target, state ) and target is not None and not hasattr( target, '__call__' ):
raise KeyError( "Target must be a state, None, or a state decision function" )
if hasattr( inp, '__call__' ):
#log.debug( "%s [%-10.10r] == %-10s (via %r)", self.name_centered(),
# "*", target, inp )
self.recognizers.append( (inp,target) )
else:
# We can allow zero or more "decide" callables followed by zero or one state.
enc = self.encode( inp )
present = super( state, self ).setdefault( enc, target )
if present is not target:
# Something there already; ensure it's a list, and add new target
if type( present ) is not list:
present = [ present ]
present.append( target )
super( state, self ).__setitem__( enc, present )
if type( present ) is list:
# and make sure we only allow: <decide>, ... <decide>, [<state>]
assert all( not isinstance( v, state ) for v in present[:-1] )
#log.debug( "%s [%-10.10r] == %-10s%s", self.name_centered(),
# inp, present, ( "" if enc is inp else (" (via encoding, on %s)" % repr( enc ))))
def __getitem__( self, inp ):
"""Default is a dictionary lookup of the target state, for the encoded input from most specific
to least; the exact input, followed by predicates matching an input, followed by default
True input transition, and including finally None, for transitions on no input available or
where no more specific input test passes. If no transition found, raise KeyError. None is
a valid target.
This usually returns a <state>, but may return a list: <decide>, ..., <decide>[, <state>]
"""
enc = self.encode( inp )
try:
return super( state, self ).__getitem__( enc )
except KeyError:
pass
if enc is not None:
for pred,target in self.recognizers:
if pred( enc ):
return target
try:
return super( state, self ).__getitem__( True )
except KeyError:
pass
return super( state, self ).__getitem__( None )
def get( self, inp, default=None ):
"""The base dict get() doesn't use __getitem__, so we must implement it."""
try:
return self[inp]
except KeyError:
return default
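#
# An illustrative sketch of the transition lookup order (not part of the original module):
# the exact (encoded) symbol is tried first, then recognizer predicates, then the wildcard
# True entry, and finally the None (no-input) entry:
#
#     s = state( "lookup" )
#     digits = state( "digits" )
#     alpha = state( "alpha" )
#     other = state( "other" )
#     s['1'] = digits                       # exact symbol
#     s[lambda sym: sym.isalpha()] = alpha  # recognizer predicate
#     s[True] = other                       # default for any other symbol
#     assert s['1'] is digits
#     assert s['z'] is alpha
#     assert s['!'] is other
#     assert s.get( None ) is None          # no None (no-input) transition configured
#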
# Input symbol validation and processing
def validate( self, inp ):
"""Test input for validity to process. The base implementation support Null (no-input) by
accepting None. Otherwise, the symbol must be consistent with the supplied alphabet (if not
None); a type, a set/list/tuple of symbols, or a predicate. This test should be a
formality; the state machine should only transition into the state on known valid symbols."""
result = False
if inp is None or self.alphabet is None:
result = True
elif type( self.alphabet ) is type:
result = isinstance( inp, self.alphabet )
elif hasattr( self.alphabet, '__contains__' ):
result = inp in self.alphabet
elif hasattr( self.alphabet, '__call__' ):
result = self.alphabet( inp )
else:
raise TypeError("Unknown alphabet: %r" % ( self.alphabet ))
#log.debug( "%s [%-10.10r]=%s=%r", self.name_centered(),
# inp, ( "~" if result else "!" ), self.alphabet )
return result
def accepts( self, source, machine=None, path=None, data=None ):
"""If input valid returning True, or False to be re-invoked (later) when appropriate input is
available; default implematation logs."""
inp = source.peek()
valid = self.validate( inp )
#log.debug( "%s %-10.10r(#%5d): %s", ( machine or self ).name_centered(),
# inp, source.sent, "accepted" if valid else "rejected" )
return valid
def process( self, source, machine=None, path=None, data=None ):
"""Process the validated input. The base implementation is a NULL (no input consumed) state,
which ignores the input. This is itself useful for selecting one of N sub-machines based on
an input, without consuming it. It is still a DFA, because there is one edge for each
input."""
pass
# State transition machinery
def run( self, source, machine=None, path=None, data=None, ending=None ):
"""A generator which will attempt to process input in the present state; if not acceptable
(self.accepts/self.validate returns False), yields a non-transition event, and then tries
again to process an acceptable input.
Once processed successfully, computes and yields the outgoing state, or None if no matching
input available (or matching input is designated unacceptable by explicit transition to
None), but the current state is not terminal. If greedy, will continue as long as transitions
are available; otherwise, the machine will stop as soon as a terminal state reached.
Loops yielding non-transition (machine,None) until we've found an input that passes
accepts/validate to process. This will allow higher levels of the state machine to process
pending inputs unrecognized by this machine, allowing it to proceed.
Then, loops yielding non-transitions until we find an outgoing state for the pending input
symbol (or no transition, but the state is terminal); the state is yielded, and the
generator completes w/ StopIteration.
This may seem strange, but we want to keep "punting" (giving higher level state machinery a
crack at the current unrecognized input symbol) 'til we're left with one we can handle.
This allows lower level machines to process some input and "push" a new symbol
(unrecognizable to this level, but usable by a higher level), finally proceeding when that
symbol is processed by the higher level machine.
We deal with preventing an infinite loop of non-transition (machine,None) yields, either due to not
finding an acceptable input symbol, or not being able to find an acceptable output
transition, by seeing if we are about to yield the same state, with the exact same input
symbol and number of symbols sent (a no-progress loop). In either case, if we're not a
terminal state, we raise a NonTerminal exception. This is the standard failure mode of a
language grammar parser when presented with a non-matching sentence of input: input
not recognized, no next state determined, no input consumed.
Yields:
machine,state -- if a target state can be found for the given input, or a no-input [None]
transition is specified. The caller will probably want to continue processing the input
in the new state.
machine,None -- if no acceptable input is available, and no NULL (None/no-input)
transition is available. Indicates that more input is required, or that a higher-level
state machine may accept and process the next input (and then this state machine's
generator may proceed to process the remaining input).
Raises:
StopIteration -- if no more state transitions are available, but we're in a terminal state.
TypeError -- if a non-iterable has been provided to the source chainable iterator, to force
termination of the state machinery.
NonTerminal -- if sub-machine terminates in a non-terminal state
"""
self.safe()
# Convert the source into something that all delegated generators can consume from and push
back onto; this will only convert a standard iterable to a peekable once.
source = peekable( source )
# We have entered this state; yield 0 or more (machine,None) non-transition events 'til we
# find an acceptable input. Higher level machines may choose to consume inputs we cannot,
# and then either continue accepting output yielded from this generator, or discard it.
# However, if we get multiple yields with no change in the next symbol or number of symbols
# sent, we must fail; it is unacceptable to transition into a state and then not process
# input (use other means to force stoppage before entry, such as input limits or a None
# transition)
seen = set()
while not self.accepts( source=source, machine=machine, path=path, data=data ):
crumb = (None,source.peek(),source.sent)
assert crumb not in seen, \
"%s detected no progress before finding acceptable symbol" % ( self )
seen.add( crumb )
yield machine,None
self.process( source=source, machine=machine, path=path, data=data )
limit = limit_src = self.limit
# We have processed input; after this point, we are guaranteed to invoke terminate, and
# we'll only try to transition if there are no exceptions.
exception = None
try:
# If instance establishes input source symbol constraints, prepare to enforce them. Any
# self.limit supplied may serve to establish or reduce an incoming ending symbol sent
# amount (never increase it). The limit is harvested once before running the
# sub-machine, so it is OK for the referenced limit to disappear during processing
# (eg. the length erased by new values harvested by the sub-machine).
if limit is not None:
if isinstance( limit, type_str_base ):
limit_src = self.context( path, limit_src )
limit = data.get( limit_src, 0 )
elif hasattr( limit, '__call__' ):
limit_src = misc.function_name( limit )
limit = limit( # provide our machine's context in predicate's path
source=source, machine=machine, path=self.context( path ), data=data )
assert isinstance( limit, int ), \
"Supplied limit=%r (== %r) must be (or reference) an int, not a %s" % (
limit_src, limit, type( limit ))
log.info( "%s -- limit=%r == %r; ending at symbol %r vs. %r", self.name_centered(),
limit_src, limit, source.sent + limit, ending )
if ending is None or source.sent + limit < ending:
ending = source.sent + limit
# Run the sub-machine; it is assumed that the sub-machine ensures that it is deterministic
# (doesn't enter a no-progress loop). About 33% of the runtime...
for which,state in self.delegate(
source=source, machine=machine, path=path, data=data, ending=ending ):
yield which,state
except GeneratorExit as exc:
# GeneratorExit is not derived from Exception, to avoid normal except handlers. If this
# occurs, the generator has been discarded before completion; we won't be performing our
# own state transition. This exception will be masked from self.terminate, if we are in
# a terminal state, to allow normal termination activities to complete just as if a
# StopIteration had occurred.
if not self.terminal:
log.debug( "%s -- early termination in non-terminal state", self.name_centered() )
exception = exc
else:
log.info( "%s -- early termination in terminal state; masking GeneratorExit",
self.name_centered() )
raise
except Exception as exc:
# Trap other random Exceptions. Any Exception will prevent us from processing our own
# transition; only a clean termination of the above loop (where exception is None) will
# allow us to proceed to yield our own transitions.
exception = exc
raise
except:
# Unknown exception type; probably bad news
typ, exc, tbk = sys.exc_info()
exception = exc
log.info( "%s -- failed with unknown exception %s\n%s", self.name_centered(),
repr( exception ), ''.join( traceback.format_exception( typ, exc, tbk )))
raise
finally:
self.terminate( exception, machine=machine, path=path, data=data )
# If a symbol limit was provided, ensure we haven't exceeded it, and don't transition if
# we've met it. We can't decide that here, because we actually want to keep taking None
# transitions 'til we find a terminal state, even if we've run out of input symbols. So,
# pass it down to transition.
seen = set()
for which,state in self.transition(
source=source, machine=machine, path=path, data=data, ending=ending ):
crumb = (state,source.peek(),source.sent)
if crumb in seen:
break
seen.add( crumb )
yield which,state
if ending is not None:
# And finally, make certain we haven't blown our symbol limit, a catastrophic failure.
assert source.sent <= ending, \
"%s exceeded limit on incoming symbols by %d" % (
self.name_centered(), source.sent - ending )
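#
# An illustrative sketch of driving run() directly (not part of the original module; normally a
# dfa drives this for you). Non-transition events (machine,None) indicate that more input, or
# higher-level handling, is required; here a single 'a' transitions the NULL start state:
#
#     s = state( "start" )
#     t = state( "done", terminal=True )
#     s['a'] = t
#     source = peekable( 'a' )
#     for machine,target in s.run( source=source, data={} ):
#         if target is not None:
#             assert target is t            # transitioned on 'a' (not consumed by the NULL state)
#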
def transition( self, source, machine=None, path=None, data=None, ending=None ):
"""We have processed input in a state; now, see if we can find a transition we should yield.
We may yield 1 or more (machine,None) non-transition events before an input is available to
decide on a transition. Remember; a state may have an "epsilon" (no-input) transition; this
will be taken immediately, even on an input symbol of None (no input available)! Also, it
may specify a transition to None (input symbol unacceptable) on any input; this is necessary
to be able to have sub-machines that reach a terminal state (say, accepting some phrase of
the grammar), but where the higher-level DFA knows that this phrase is invalid in the
grammar.
If not greedy, will cease producing transitions at a terminal state. If greedy (the
default), we'll continue to produce transitions until we reach a terminal state and cannot
transition on the next input symbol.
If a state isn't terminal and cannot transition, it may yield non-transition events forever.
This may seem a bit like OCD, but remember -- this is a base class, and derived classes may
define self.terminal in complex ways; we may "become" terminal later... Normally, you'd wrap
a machine made out of 1 or more raw state objects in a dfa (which detects no-progress
loops), instead of using it directly.
If a state is greedy and cannot transition, but transition might be possible given the right
symbol, it must yield non-transitions 'til there is an input symbol available. Otherwise,
we cannot know for certain whether or not the state could have transitioned, had there been
input available. If we do not wait, the state machine will have different behaviour,
depending on whether the next input symbol was immediately available, or if we had to yield
a non-transition to give the caller a chance to acquire input.
If we've become symbol limited, we must only follow None transitions; we don't want to
consider the next input symbol at all, because it could lead us down a "choice" path,
instead of allowing us to successfully stop at a terminal state.
TODO: Consider whether we should look for and quit at the first available terminal state,
once we've become symbol limited (basically, stop being greedy as soon as we're limited).
This would prevent us from going into "idle" states (ones we want to enter whenever we're
stopped awaiting input.) Is this valuable? We might be able to distinguish between
"awaiting input" and "limited" in the state machine...
"""
limited = ending is not None and source.sent >= ending
while not self.terminal or self.greedy:
inp = None if limited else source.peek() # raise TypeError to force stoppage
try:
choice = self.__getitem__( inp )
except KeyError:
# No transition available for current symbol (or None, if no symbol available). If
# there is *any* possibility that a transition might be possible if input *were*
# available, we need to yield a non-transition.
if limited:
log.debug( "%s -- stopped due to reaching symbol limit %d", self.name_centered(), ending )
elif inp is None and not limited and (
self.recognizers or not all( k is None for k in self.keys() )):
#log.info( "%s <non trans>", self.name_centered() )
yield machine,None # 0+ non-transitions...
continue
break # No other transition possible; done
# Found the transition or choice list (could be a state, a decide or decide, ...,
# decide[, state]). Evaluate each target state/decide instance, 'til we find a
state/None. Even decides could end up yielding None, if all decides evaluate to None,
and there is no non-None default state.
for potential in ( choice if type( choice ) is list else [ choice ] ):
if potential is None or isinstance( potential, state ):
target = potential
break
try:
#log.debug( "%s <selfdecide> on %s", self.name_centered(), choice )
target = potential( # this decision is made in our parent machine's context
source=source, machine=machine, path=path, data=data )
except Exception as exc:
log.warning( "%s <selfdecide> on %s failed: %r", self.name_centered(),
potential, exc )
raise
#log.debug( "%s <selfdecide> on %s == %s", self.name_centered(),
# potential, target )
if target:
break
#log.debug( "%s <self trans> into %s", self.name_centered(), target )
yield machine,target # 0+ non-transitions, followed by a 1 transition
break # and done!
# StopIteration after yielding a transition, or if self is a terminal state
def delegate( self, source, machine=None, path=None, data=None, ending=None ):
"""Base state class delegate generator does nothing."""
return # a bare 'return' (rather than raise StopIteration) keeps this a valid empty generator under PEP 479
yield None
def initialize( self, machine=None, path=None, data=None ):
"""Done once at state entry."""
log.debug( "%s -- initialized", self.name_centered() )
def terminate( self, exception, machine=None, path=None, data=None ):
"""Invoked on termination (after yielding our final state transition). Exception could be:
None Controlled termination after completing final transition. The terminal
StopIteration: condition should be True
GeneratorExit: The DFA state generator has been discarded.
Exception, *: Unknown failure of state machinery.
"""
if log.isEnabledFor( logging.DEBUG ):
log.debug( "%s -- terminated %s, w/ data: %r", self.name_centered(),
"normally" if exception is None else repr( exception ), data )
# Traversal of state machine graph
def nodes( self, seen=None ):
"""Generate all states not yet seen."""
if seen is None:
seen = set()
if id( self ) not in seen:
seen.add( id( self ))
yield self
for _,target in self.recognizers:
if target:
for output in target.nodes( seen=seen ):
yield output
for value in self.values():
if type( value ) is not list:
value = [ value ]
for target in value:
# None is a valid state/decide target; skip
if target is None:
continue
if not isinstance( target, state ): # Only state and *.state types allowed
target = target.state
if target is None:
continue
for output in target.nodes( seen=seen ):
yield output
def edges( self ):
"""Generate (input,state|decide) tuples for all outgoing edges."""
for pred,target in self.recognizers:
yield (pred,target)
for inp,target in sorted( self.items(), key=lambda tup: misc.natural( tup[0] )):
if type( target ) is list:
for t in target:
yield (inp,t)
else:
yield (inp,target)
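#
# An illustrative sketch of graph traversal (not part of the original module): nodes() yields
# each reachable state once, and edges() yields (input,target) pairs for the outgoing edges:
#
#     a = state( "A" )
#     b = state( "B", terminal=True )
#     a['x'] = b
#     assert sorted( n.name for n in a.nodes() ) == [ "A", "B" ]
#     assert list( a.edges() ) == [ ('x', b) ]
#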
# Support for producing state machinery from a regular expression specification
@classmethod
def from_regex( cls, machine, encoder=None, **kwds ):
"""Create a graph of instances of 'cls' (a state class), as specified by the given textual
regex or greenery.fsm/lego machine. All supplied keyword args are passed to the 'cls'
constructor (eg. context). The initial state is however always a simple no-input 'state'
instance, as we do not want to process the first symbol until it has been accepted by the
regex.
The greenery.fsm alphabet is usually native Python str symbols; convert to symbols in the
target state machine's alphabet when making the transitions. For example, if we want to
deal in a stream of bytes, then we need to convert the greenery.fsm transition symbols from
str to str/int (on Python 2/3). If 'encoder' is supplied, then we can use this for the
conversion; it must be a generator that produces 1 or more encoded symbols for each input
symbol.
A greenery.fsm is also designed to be greedy on failure; it will accept and consume any
unaccepted characters in a final non-terminal state. Recognize these dead states and
drop them; we want to produce a state machine that fails on invalid inputs.
Returns the resultant regular expression string and lego representation, the fsm, and the
initial state of the resultant state machine:
('regex', <greenery.lego.lego>, <greenery.fsm.fsm>, <state>)
WARNING: This class method is used in constructors, which may be invoked on module loads;
do not use logging, as it may not have been set up yet.
"""
# Accept any of regex/lego/fsm, and build the missing ones.
regexstr, regex = None, None
if isinstance( machine, type_str_base ):
log.debug( "Converting Regex to greenery.lego: %r", machine )
regexstr = machine
machine = greenery.lego.parse( regexstr )
if isinstance( machine, greenery.lego.lego ):
log.debug( "Converting greenery.lego to fsm: %r", machine )
regex = machine
machine = regex.fsm()
if not isinstance( machine, greenery.fsm.fsm ):
raise TypeError("Provide a regular expression, or a greenery.lego/fsm, not: %s %r" % (
type( machine ), machine ))
if regex is None:
regex = machine.lego()
if regexstr is None:
regexstr = str( regex )
# Create a state machine identical to the greenery.fsm 'machine'. There are no "no-input"
# (NULL) transitions in a greenery.fsm; the None (./anychar) transition is equivalent to the
# default "True" transition. Detect "dead" states; non-terminal states where all outgoing
# edges loop back onto itself. This is used by the greenery state machine to absorb
# sequences of input that are impossible in the regular expression's grammar, and remain in
# a non-terminal state. We want our machine to fail (yield a non-transition) on that input,
# instead. So, below, we'll explicitly store a transition to None (a non-transition) for
# any transition into a dead state.
log.debug( "greenery.fsm:\n%s", machine )
states = {}
for pre,tab in machine.map.items():
terminal = pre in machine.finals
initial = pre == machine.initial
loopback = all( dst == pre for dst in tab.values() )
dead = loopback and not terminal and not initial
node = cls( str( pre ), terminal=terminal, **kwds )
#log.debug( "%s --> %r %-10s, %-10s, %-10s", node.name_centered(), tab.values(),
# "initial" if initial else "", "terminal" if terminal else "", "dead" if dead else "" )
if not dead:
states[pre] = node # must check for dead states in mapping below...
# Now, apply the supplied encoder to convert the state machine's symbols (eg. utf-8) into
# some other input symbols (eg. bytes); if encoder is None, the input symbols are in the
same alphabet as the state machine's transition symbols. If a state machine's symbols
# encode into multiple input symbols, extra (non-terminal) states will be added for each
# additional symbol. Can only do this for states/symbols with either no other outgoing
# transitions, or one "None" (anychar) transition. We ensure we process the None transition
first, so it's there in states[pre][True] before processing the encoder.
for pre,tab in machine.map.items():
if pre not in states:
# These are transitions out of a dead (non-terminal, loopback) state. Skip them; any
# symbol after this point is NOT in the grammar defined by the regular expression;
# yield a non-transition.
#log.debug( "dead state %s; ignoring", pre )
continue
for sym in sorted( tab, key=lambda k: [] if k is None else [k] ):
nxt = tab[sym]
if sym is None:
sym = True
elif encoder:
# Add intervening states for multi-symbol encodings; fall thru and link up the last newly added
# state to the 'nxt'. No new states added or linked if only one symbol results.
# We need to find an unused state number (the map index may not be simple
# increasing integers)
xformed = list( enumerate( encoder( sym )))
assert len( xformed ) > 0
#log.debug( "%s <- %-10.10r: Encoded to %r", states[pre].name_centered(), sym, xformed )
if len( xformed ) > 1:
assert ( 1 <= len( machine.map[pre] ) <= 2 ), \
"Can only expand 1 (symbol) or 2 (symbol/anychar) transitions: %r" % (
machine.map[pre] )
if len( machine.map[pre] ) == 2:
assert ( None in machine.map[pre] ), \
"If 2 transitions, one must be '.' (anychar): %r" % (
machine.map[pre] )
# Add and link up additional required states; lst will index last added one (if
# any; otherwise it will be pre)
lst = pre
for num,enc in xformed[:-1]:
add = len( states )
while add in machine.map:
add += 1
states[add] \
= cls( name=str( pre ) + '_' + str( num ), terminal=False, **kwds )
states[lst][enc] \
= states[add]
#log.debug( "%s <- %-10.10r --> %s (extra state)", states[lst].name_centered(),
# enc, states[add] )
if True in states[pre]:
states[add][True] \
= states[pre][True]
#log.debug( "%s <- %-10.10r --> %s (dup wild)", states[add].name_centered(),
# True, states[pre][True] )
lst = add
# If we added extra states, fall thru and link the last added one (as 'pre') up
# to 'nxt'
num,enc = xformed[-1]
if len( xformed ):
pre = lst
sym = enc
# If this is a transition into a "dead" state, we'll make it an explicit transition
# to None (a non-transition), forcing the regular expression dfa to cease, rejecting
# the rest of the symbols. The dfa will be terminal, iff A) it was marked terminal
# itself, and B) if the final sub-state was a terminal state. If there is already a
# wildcard ('True') transition to None, then we can skip
dst = states.get( nxt ) # will be None if 'nxt' is a "dead" state
redundant = dst is None and states[pre].get( True, True ) is None
#log.debug( "%s <- %-10.10r --> %s %s", states[pre].name_centered(), sym, dst,
# "redundant; skipping" if redundant else "" )
if redundant:
# This symbol targets a dead state (results in a non-Transition), and there is
# already a wild-card (True) to a non-transition (None). Skip it.
continue
states[pre][sym]= dst
# We create a non-input state copying the initial state's transitions, so we don't consume
# the first symbol of input before it is accepted by the regex.
return (regexstr, regex, machine, state( states[machine.initial] ))
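#
# An illustrative sketch of from_regex (not part of the original module): build a state graph
# from a textual regular expression. The returned tuple carries the regex string, its
# greenery.lego and greenery.fsm forms, and the initial (no-input) state of the generated graph:
#
#     regexstr, lego, machine, initial = state.from_regex( 'a[bc]' )
#     assert isinstance( initial, state )
#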
class state_input( state ):
"""A state that consumes and saves its input symbol by appending it to the specified
<path>.<context><extension> index/attribute in the supplied data artifact. Creates an
array.array of the specified typecode, if no such path exists.
The input alphabet type, and the corresponding array typecode capable of containing individual
elements of the alphabet must be specified; default is str/'c' or str/'u' as appropriate for
Python2/3 (the alternative for a binary datastream might be bytes/'c' or bytes/'B')."""
def __init__( self, name, **kwds ):
# overrides with default if keyword unset OR None
if kwds.get( "alphabet" ) is None:
kwds["alphabet"] = type_str_iter
if kwds.get( "extension" ) is None: