From 0007350d36514936a5d98d90b3c98ef99028232d Mon Sep 17 00:00:00 2001 From: Andreas Abel <andreas.abel@ifi.lmu.de> Date: Sun, 23 Jan 2022 21:03:30 +0100 Subject: [PATCH] Fix #197 by only lexing numeric literals in multiplicity expressions. In issue #141, multiplicity annotations in regexes were extended to the general, multi-digit case {nnn,mmm}. However, lexing numeric literals broke parsing of regexes like: 32|64 [01-89] The solution here is to only lex numeric literals in a special lexer state called `multiplicity` which is entered by the parser when parsing multiplicity braces {nnn,mmm}. This restores alex' handling of digits as characters in the non-multiplicity situations. --- alex.cabal | 1 + src/Parser.y | 22 +++++++++++++--------- src/Scan.x | 12 +++++++++--- tests/Makefile | 1 + tests/issue_197.x | 43 +++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 67 insertions(+), 12 deletions(-) create mode 100644 tests/issue_197.x diff --git a/alex.cabal b/alex.cabal index 063b0da..c0c97e1 100644 --- a/alex.cabal +++ b/alex.cabal @@ -97,6 +97,7 @@ extra-source-files: tests/issue_71.x tests/issue_119.x tests/issue_141.x + tests/issue_197.x source-repository head type: git diff --git a/src/Parser.y b/src/Parser.y index bf627fa..8f580fc 100644 --- a/src/Parser.y +++ b/src/Parser.y @@ -174,15 +174,19 @@ rep :: { RExp -> RExp } : '*' { Star } | '+' { Plus } | '?' { Ques } - -- Single digits are CHAR, not NUM. - -- TODO: these don't check for digits - -- properly. - | '{' CHAR '}' { repeat_rng (digit $2) Nothing } - | '{' CHAR ',' '}' { repeat_rng (digit $2) (Just Nothing) } - | '{' CHAR ',' CHAR '}' { repeat_rng (digit $2) (Just (Just (digit $4))) } - | '{' NUM '}' { repeat_rng $2 Nothing } - | '{' NUM ',' '}' { repeat_rng $2 (Just Nothing) } - | '{' NUM ',' NUM '}' { repeat_rng $2 (Just (Just $4)) } + | begin_mult '{' mult '}' { $3 } +-- A bit counterintuitively, we need @begin_mult@ already before the left brace, +-- not just before @mult@. 
This might be due to the lookahead in the parser. + +-- Enter the "multiplicity" lexer mode to scan number literals +begin_mult :: { () } + : {- empty -} {% setStartCode multiplicity } + +-- Parse a numeric multiplicity. +mult :: { RExp -> RExp } + : NUM { repeat_rng $1 Nothing } + | NUM ',' { repeat_rng $1 (Just Nothing) } + | NUM ',' NUM { repeat_rng $1 (Just (Just $3)) } rexp0 :: { RExp } : '(' ')' { Eps } diff --git a/src/Scan.x b/src/Scan.x index 76be8a4..870f2f2 100644 --- a/src/Scan.x +++ b/src/Scan.x @@ -11,7 +11,7 @@ ------------------------------------------------------------------------------- { -module Scan (lexer, AlexPosn(..), Token(..), Tkn(..), tokPosn) where +module Scan (lexer, AlexPosn(..), Token(..), Tkn(..), tokPosn, multiplicity) where import Data.Char import ParseMonad @@ -56,8 +56,7 @@ alex :- <0> \\ x $hexdig+ { hexch } <0> \\ o $octal+ { octch } <0> \\ $printable { escape } -<0> $nonspecial # [\<] { char } -- includes 1 digit numbers -<0> $digit+ { num } -- should be after char +<0> $nonspecial # [\<] { char } <0> @smac { smac } <0> @rmac { rmac } @@ -75,6 +74,13 @@ alex :- -- so don't try to interpret the opening { as a code block. <afterstartcodes> \{ (\n | [^$digit ]) { special `andBegin` 0 } <afterstartcodes> () { skip `andBegin` 0 } -- note: empty pattern + +-- Numeric literals are only lexed in multiplicity braces e.g. {nnn,mmm}. +-- Switching to the @multiplicity@ lexer state happens in the parser. 
+<multiplicity> $digit+ { num } +<multiplicity> \, { special } +<multiplicity> \} { special `andBegin` 0 } + { -- ----------------------------------------------------------------------------- diff --git a/tests/Makefile b/tests/Makefile index 4f9b737..b0a90c3 100644 --- a/tests/Makefile +++ b/tests/Makefile @@ -55,6 +55,7 @@ TESTS = \ issue_71.x \ issue_119.x \ issue_141.x \ + issue_197.x \ monad_typeclass.x \ monad_typeclass_bytestring.x \ monadUserState_typeclass.x \ diff --git a/tests/issue_197.x b/tests/issue_197.x new file mode 100644 index 0000000..0b7ac56 --- /dev/null +++ b/tests/issue_197.x @@ -0,0 +1,43 @@ +{ +-- Issue #197 +-- reported 2022-01-21 by https://github.com/Commelina +-- fixed 2022-01-23 by Andreas Abel & John Ericson +-- +-- Problem was: +-- Surface syntax regressed and could no longer handle character strings +-- that looked like numbers. + +module Main (main) where + +import System.Exit +} + +%wrapper "posn" +%token "Token" + +@iec60559suffix = (32|64|128)[x]? +@any = [01-89]+[x]? + +:- + +$white+ ; +@iec60559suffix { \ _ -> Good } +@any { \ _ -> Bad } + +{ +data Token = Good String | Bad String + deriving (Eq, Show) + +input = "32 32x 99 99x 128x" +expected_result = [Good "32", Good "32x", Bad "99", Bad "99x", Good "128x"] + +main :: IO () +main + | result == expected_result = do + exitWith ExitSuccess + | otherwise = do + print result + exitFailure + where + result = alexScanTokens input +}