From eb71e084ba7675f78d6e53549a7db8b93cd4f995 Mon Sep 17 00:00:00 2001 From: Apocalypse Date: Thu, 13 Nov 2014 18:33:51 -0800 Subject: [PATCH 1/4] speedup POE::Filter::Line by reducing usage of backtracking regex --- lib/POE/Filter/Line.pm | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lib/POE/Filter/Line.pm b/lib/POE/Filter/Line.pm index 025d840e3..c4e5524a3 100644 --- a/lib/POE/Filter/Line.pm +++ b/lib/POE/Filter/Line.pm @@ -151,8 +151,12 @@ sub get_one { # Autodetect is done, or it never started. Parse some buffer! unless ($self->[AUTODETECT_STATE]) { DEBUG and warn unpack 'H*', $self->[INPUT_REGEXP]; + + # Avoid running the super-backtracking regex which is slow! + last LINE unless $self->[FRAMING_BUFFER] =~ $self->[INPUT_REGEXP]; last LINE unless $self->[FRAMING_BUFFER] =~ s/^(.*?)$self->[INPUT_REGEXP]//s; + DEBUG and warn "got line: <<", unpack('H*', $1), ">>\n"; my $line = $1; die "Next line exceeds maximum line length" From 820207a3f87a831690d49dbc38228bdf1a4efff3 Mon Sep 17 00:00:00 2001 From: Apocalypse Date: Fri, 14 Nov 2014 10:00:31 -0800 Subject: [PATCH 2/4] use substr instead of backtracking regex, thanks rcaputo --- lib/POE/Filter/Line.pm | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/lib/POE/Filter/Line.pm b/lib/POE/Filter/Line.pm index c4e5524a3..02fb91dd2 100644 --- a/lib/POE/Filter/Line.pm +++ b/lib/POE/Filter/Line.pm @@ -152,17 +152,16 @@ sub get_one { unless ($self->[AUTODETECT_STATE]) { DEBUG and warn unpack 'H*', $self->[INPUT_REGEXP]; - # Avoid running the super-backtracking regex which is slow! 
- last LINE unless $self->[FRAMING_BUFFER] =~ $self->[INPUT_REGEXP]; - last LINE - unless $self->[FRAMING_BUFFER] =~ s/^(.*?)$self->[INPUT_REGEXP]//s; - - DEBUG and warn "got line: <<", unpack('H*', $1), ">>\n"; - my $line = $1; - die "Next line exceeds maximum line length" + if ($self->[FRAMING_BUFFER] =~ $self->[INPUT_REGEXP]) { + my $line = substr($self->[FRAMING_BUFFER], 0, $-[0]); + $self->[FRAMING_BUFFER] = substr($self->[FRAMING_BUFFER], $+[0]); + DEBUG and warn "got line: <<", unpack('H*', $line), ">>\n"; + die "Next line exceeds maximum line length" if length( $line ) > $self->[MAX_LENGTH]; - - return [ $line ]; + return [ $line ]; + } else { + last LINE; + } } # Waiting for the first line ending. Look for a generic newline. From ccecd864949e3aa940564f91c07772030f446db7 Mon Sep 17 00:00:00 2001 From: Apocalypse Date: Fri, 14 Nov 2014 10:08:36 -0800 Subject: [PATCH 3/4] apply the same avoidance of the backtracking regex to the autodetect logic --- lib/POE/Filter/Line.pm | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lib/POE/Filter/Line.pm b/lib/POE/Filter/Line.pm index 02fb91dd2..9b3ee1163 100644 --- a/lib/POE/Filter/Line.pm +++ b/lib/POE/Filter/Line.pm @@ -153,11 +153,10 @@ sub get_one { DEBUG and warn unpack 'H*', $self->[INPUT_REGEXP]; if ($self->[FRAMING_BUFFER] =~ $self->[INPUT_REGEXP]) { + die "Next line exceeds maximum line length" if $-[0] > $self->[MAX_LENGTH]; my $line = substr($self->[FRAMING_BUFFER], 0, $-[0]); $self->[FRAMING_BUFFER] = substr($self->[FRAMING_BUFFER], $+[0]); DEBUG and warn "got line: <<", unpack('H*', $line), ">>\n"; - die "Next line exceeds maximum line length" - if length( $line ) > $self->[MAX_LENGTH]; return [ $line ]; } else { last LINE; @@ -166,6 +165,11 @@ sub get_one { # Waiting for the first line ending. Look for a generic newline. 
if ($self->[AUTODETECT_STATE] & AUTO_STATE_FIRST) { + # avoid the backtracking regex below, APOCAL didn't change it to the + # magic substr tricks employed above because of the interactions between + # the line and autodetection of the INPUT_REGEXP so this is the lazy + # solution as autodetection only happens once at "startup" :) + last LINE unless $self->[FRAMING_BUFFER] =~ /(?:\x0D\x0A?|\x0A\x0D?)/; last LINE unless $self->[FRAMING_BUFFER] =~ s/^(.*?)(\x0D\x0A?|\x0A\x0D?)//; From 85b3cab354bf6c7ad2934f1b735341d8dde4dfdb Mon Sep 17 00:00:00 2001 From: Apocalypse Date: Fri, 14 Nov 2014 10:41:51 -0800 Subject: [PATCH 4/4] make sure our regexp are qr// and document how it works --- lib/POE/Filter/Line.pm | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/lib/POE/Filter/Line.pm b/lib/POE/Filter/Line.pm index 9b3ee1163..1e26c2556 100644 --- a/lib/POE/Filter/Line.pm +++ b/lib/POE/Filter/Line.pm @@ -84,7 +84,7 @@ sub new { if defined $params{InputLiteral}; } else { - $input_regexp = "(\\x0D\\x0A?|\\x0A\\x0D?)"; + $input_regexp = "(?:\\x0D\\x0A?|\\x0A\\x0D?)"; } if (defined $params{OutputLiteral}) { @@ -105,12 +105,12 @@ sub new { if scalar keys %params; my $self = bless [ - '', # FRAMING_BUFFER - $input_regexp, # INPUT_REGEXP - $output_literal, # OUTPUT_LITERAL - $autodetect, # AUTODETECT_STATE - $max_length, # MAX_LENGTH - $max_buffer # MAX_BUFFER + '', # FRAMING_BUFFER + qr/$input_regexp/, # INPUT_REGEXP + $output_literal, # OUTPUT_LITERAL + $autodetect, # AUTODETECT_STATE + $max_length, # MAX_LENGTH + $max_buffer # MAX_BUFFER ], $type; DEBUG and warn join ':', @$self; @@ -172,7 +172,8 @@ sub get_one { last LINE unless $self->[FRAMING_BUFFER] =~ /(?:\x0D\x0A?|\x0A\x0D?)/; last LINE unless $self->[FRAMING_BUFFER] =~ s/^(.*?)(\x0D\x0A?|\x0A\x0D?)//; - + die "Next line exceeds maximum line length" + if length( $1 ) > $self->[MAX_LENGTH]; my $line = $1; # The newline can be complete under two conditions. 
First: If @@ -182,7 +183,7 @@ sub get_one { (length $self->[FRAMING_BUFFER]) ) { DEBUG and warn "detected complete newline after line: <<$1>>\n"; - $self->[INPUT_REGEXP] = $2; + $self->[INPUT_REGEXP] = qr/$2/; $self->[AUTODETECT_STATE] = AUTO_STATE_DONE; } @@ -194,8 +195,6 @@ sub get_one { $self->[INPUT_REGEXP] = $2; $self->[AUTODETECT_STATE] = AUTO_STATE_SECOND; } - die "Next line exceeds maximum line length" - if length( $line ) > $self->[MAX_LENGTH]; return [ $line ]; } @@ -226,7 +225,7 @@ sub get_one { # Regardless, whatever is in INPUT_REGEXP is now a complete # newline. End autodetection, post-process the found newline, # and loop to see if there are other lines in the buffer. - $self->[INPUT_REGEXP] = $self->[INPUT_REGEXP]; + $self->[INPUT_REGEXP] = qr/$self->[INPUT_REGEXP]/; $self->[AUTODETECT_STATE] = AUTO_STATE_DONE; next LINE; } @@ -351,12 +350,14 @@ undef: ); C<InputRegexp> may be used in place of C<InputLiteral> to recognize -line terminators based on a regular expression. In this example, -input is terminated by two or more consecutive newlines. On output, -the paragraph separator is "---" on a line by itself. +line terminators based on a regular expression. Please make sure the regexp +is a sane one and doesn't capture anything in order to boost performance! + +In this example, input is terminated by two or more consecutive newlines. +On output, the paragraph separator is "---" on a line by itself. my $paragraph_filter = POE::Filter::Line->new( - InputRegexp => "([\x0D\x0A]{2,})", + InputRegexp => "[\x0D\x0A]{2,}", OutputLiteral => "\n---\n", );