From 9c74c94a0bee93386c24d6eefb7bcd3155777321 Mon Sep 17 00:00:00 2001 From: "Flavio S. Glock" Date: Tue, 9 Jul 2024 14:14:04 +0200 Subject: [PATCH] Perlito5 - misc/Perl5_parser string parse fix --- misc/perl5_parser/perl5_parser.pl | 133 +++++++++++++----------------- 1 file changed, 58 insertions(+), 75 deletions(-) diff --git a/misc/perl5_parser/perl5_parser.pl b/misc/perl5_parser/perl5_parser.pl index 5d6d09f00..bb2e21fd3 100644 --- a/misc/perl5_parser/perl5_parser.pl +++ b/misc/perl5_parser/perl5_parser.pl @@ -4,6 +4,17 @@ use utf8; use Data::Dumper; +# +# AST format +# +# { +# 'index' => 520, # first token id (in the @$tokens array) +# 'next' => 521, # next token id +# 'type' => 'INTEGER', # AST object type (INTEGER, STRING, ...) +# 'value' => '3' # AST contents can be STRING, ARRAY or HASH +# } +# + ## # uncomment to debug autovivification ## package TieArrayNoAutovivification { ## use Tie::Array; @@ -122,7 +133,6 @@ # # known problems: # 10E10 tokenizes to 10,E10 -# q=>= tokenizes to q,=>,= (but Perl also has problems with this) # q!=!=="=" tokenizes to q,!=,!=, ... # sub tokenize { @@ -637,10 +647,24 @@ sub parse_number { '<' => '>', ); +sub parse_string_delimiter_fixup { + my ( $tokens, $index ) = @_; + my $delim = $tokens->[$index][1]; + if ( length($delim) > 1 ) { # delimiter looks like: != + $delim = substr( $delim, 0, 1 ); + $tokens->[$index][1] = substr( $delim, 1 ); + $index--; + } + if ( $quote_pair{$delim} ) { $delim = $quote_pair{$delim} } # q< ... > + return ( $delim, $index + 1 ); +} + sub parse_single_quote_string { - my ( $tokens, $index, $pos, $quote ) = @_; + my ( $tokens, $index, $pos ) = @_; # 'abc' + my $quote; + ( $quote, $pos ) = parse_string_delimiter_fixup( $tokens, $pos ); my $value; while ( $tokens->[$pos][0] != END_TOKEN() ) { if ( $quote eq $tokens->[$pos][1] ) { @@ -657,9 +681,11 @@ sub parse_single_quote_string { } sub parse_double_quote_string { - my ( $tokens, $index, $pos, $quote ) = @_; + my ( $tokens, $index, $pos ) = @_; # "abc" + my $quote; + ( $quote, $pos ) = parse_string_delimiter_fixup( $tokens, $pos ); my @ops; my $value = ''; while ( $tokens->[$pos][0] != END_TOKEN() ) { @@ -702,9 +728,11 @@ sub parse_double_quote_string { } sub parse_regex_string { - my ( $tokens, $index, $pos, $quote ) = @_; + my ( $tokens, $index, $pos ) = @_; # /abc/ + my $quote; + ( $quote, $pos ) = parse_string_delimiter_fixup( $tokens, $pos ); my @ops; my $value = ''; while ( $tokens->[$pos][0] != END_TOKEN() ) { @@ -746,25 +774,6 @@ sub parse_regex_string { } } -sub parse_string { - my ( $tokens, $index ) = @_; - my $pos = $index; - if ( $tokens->[$pos][0] == STRING_DELIM() ) { - my $quote = $tokens->[$pos][1]; - $pos++; # " - if ( $quote eq "'" ) { - return parse_single_quote_string( $tokens, $index, $pos, $quote ); - } - elsif ( $quote eq '"' ) { - return parse_double_quote_string( $tokens, $index, $pos, $quote ); - } - elsif ( $quote eq '`' ) { - return parse_fail( $tokens, $index ); - } - } - return parse_fail( $tokens, $index ); -} - sub parse_delim_expression { my ( $tokens, $index, $start, $stop ) = @_; my $pos = $index; @@ -849,67 +858,40 @@ sub parse_term { return $ast; } elsif ( $stmt eq 'q' ) { # q!...! - my $delim = $tokens->[$pos][1]; - if ( length($delim) > 1 ) { - - # tokenization fail; delimiter is ambiguous - } - else { - if ( $quote_pair{$delim} ) { $delim = $quote_pair{$delim} } # q< ... > - $pos++; # 'abc' - return parse_single_quote_string( $tokens, $index, $pos, $delim ); - } + return parse_single_quote_string( $tokens, $index, $pos ); } - elsif ( $stmt eq 'qq' ) { # qq!...! - my $delim = $tokens->[$pos][1]; - if ( length($delim) > 1 ) { - - # tokenization fail; delimiter is ambiguous - } - else { - if ( $quote_pair{$delim} ) { $delim = $quote_pair{$delim} } # q< ... > - $pos++; # "abc" - return parse_double_quote_string( $tokens, $index, $pos, $delim ); - } + elsif ( $stmt eq 'qq' ) { # qq!...! + return parse_double_quote_string( $tokens, $index, $pos ); } - elsif ( $stmt eq 'm' ) { # /.../ - my $delim = $tokens->[$pos][1]; - if ( length($delim) > 1 ) { - - # tokenization fail; delimiter is ambiguous + elsif ( $stmt eq 'm' ) { # /.../ + $ast = parse_regex_string( $tokens, $index, $pos ); + if ( $ast->{FAIL} ) { + return parse_fail( $tokens, $index ); } - else { - if ( $quote_pair{$delim} ) { $delim = $quote_pair{$delim} } # m< ... > - $pos++; # m/abc/ - $ast = parse_regex_string( $tokens, $index, $pos, $delim ); - if ( $ast->{FAIL} ) { - return parse_fail( $tokens, $index ); - } - # TODO parse regex modifiers - return { type => 'REGEX', index => $index, value => { args => $ast }, next => $ast->{next} }; - } + # TODO parse regex modifiers + return { type => 'REGEX', index => $index, value => { args => $ast }, next => $ast->{next} }; } - elsif ( $stmt eq 'qw' ) { # qw/.../ - my $delim = $tokens->[$pos][1]; - if ( length($delim) > 1 ) { - - # tokenization fail; delimiter is ambiguous - } - else { - if ( $quote_pair{$delim} ) { $delim = $quote_pair{$delim} } # qw< ... > - $pos++; - $ast = parse_single_quote_string( $tokens, $index, $pos, $delim ); - if ( $ast->{FAIL} ) { - return parse_fail( $tokens, $index ); - } - return { type => 'SPLIT', index => $index, value => [ ' ', $ast ], next => $ast->{next} }; + elsif ( $stmt eq 'qw' ) { # qw/.../ + $ast = parse_single_quote_string( $tokens, $index, $pos ); + if ( $ast->{FAIL} ) { + return parse_fail( $tokens, $index ); } + return { type => 'SPLIT', index => $index, value => [ ' ', $ast ], next => $ast->{next} }; } $ast = { type => 'BAREWORD', value => $tokens->[$index][1], next => $index + 1 }; } elsif ( $type == STRING_DELIM() ) { - $ast = parse_string( $tokens, $index ); + my $quote = $tokens->[$index][1]; + if ( $quote eq "'" ) { + return parse_single_quote_string( $tokens, $index, $index ); + } + elsif ( $quote eq '"' ) { + return parse_double_quote_string( $tokens, $index, $index ); + } + elsif ( $quote eq '`' ) { # TODO not implemented + } + return parse_fail( $tokens, $index ); } elsif ( $type == PAREN_OPEN() ) { $ast = parse_delim_expression( $tokens, $index, PAREN_OPEN(), PAREN_CLOSE() ); @@ -923,7 +905,7 @@ sub parse_term { elsif ( $type == SLASH() ) { # /.../ - $ast = parse_regex_string( $tokens, $index, $index + 1, $tokens->[$index][1] ); + $ast = parse_regex_string( $tokens, $index, $index ); if ( $ast->{FAIL} ) { return parse_fail( $tokens, $index ); } @@ -1013,6 +995,7 @@ sub token_as_string { } sub main { + $Data::Dumper::Sortkeys = 1; binmode( STDOUT, ":utf8" ); my $perl_code = join( '', ); my $tokens = tokenize($perl_code);