From 0007350d36514936a5d98d90b3c98ef99028232d Mon Sep 17 00:00:00 2001
From: Andreas Abel <andreas.abel@ifi.lmu.de>
Date: Sun, 23 Jan 2022 21:03:30 +0100
Subject: [PATCH] Fix #197 by only lexing numeric literals in multiplicity
 expressions.

In issue #141, multiplicity annotations in regexes were extended to
the general, multi-digit case {nnn,mmm}.  However, lexing numeric
literals broke parsing of regexes like:

   32|64
   [01-89]

The solution here is to only lex numeric literals in a special lexer
state called `multiplicity` which is entered by the parser when
parsing multiplicity braces {nnn,mmm}.

This restores alex' handling of digits as characters in the
non-multiplicity situations.
---
 alex.cabal        |  1 +
 src/Parser.y      | 22 +++++++++++++---------
 src/Scan.x        | 12 +++++++++---
 tests/Makefile    |  1 +
 tests/issue_197.x | 43 +++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 67 insertions(+), 12 deletions(-)
 create mode 100644 tests/issue_197.x

diff --git a/alex.cabal b/alex.cabal
index 063b0da..c0c97e1 100644
--- a/alex.cabal
+++ b/alex.cabal
@@ -97,6 +97,7 @@ extra-source-files:
         tests/issue_71.x
         tests/issue_119.x
         tests/issue_141.x
+        tests/issue_197.x
 
 source-repository head
     type:     git
diff --git a/src/Parser.y b/src/Parser.y
index bf627fa..8f580fc 100644
--- a/src/Parser.y
+++ b/src/Parser.y
@@ -174,15 +174,19 @@ rep	:: { RExp -> RExp }
 	: '*' 				{ Star }
 	| '+' 				{ Plus }
 	| '?' 				{ Ques }
-					-- Single digits are CHAR, not NUM.
-					-- TODO: these don't check for digits
-					-- properly.
-	| '{' CHAR '}'			{ repeat_rng (digit $2) Nothing }
-	| '{' CHAR ',' '}'		{ repeat_rng (digit $2) (Just Nothing) }
-	| '{' CHAR ',' CHAR '}' 	{ repeat_rng (digit $2) (Just (Just (digit $4))) }
-	| '{' NUM '}'			{ repeat_rng $2 Nothing }
-	| '{' NUM ',' '}'		{ repeat_rng $2 (Just Nothing) }
-	| '{' NUM ',' NUM '}'           { repeat_rng $2 (Just (Just $4)) }
+	| begin_mult '{' mult '}'	{ $3 }
+-- A bit counterintuitively, we need @begin_mult@ already before the left brace,
+-- not just before @mult@.  This might be due to the lookahead in the parser.
+
+-- Empty rule: its action puts the lexer in @multiplicity@ mode, where number literals are scanned.
+begin_mult :: { () }
+	: {- empty -}			{% setStartCode multiplicity }
+
+-- Parse the contents of multiplicity braces: @n@, @n,@ or @n,m@ (each number a NUM literal).
+mult	:: { RExp -> RExp }
+	: NUM				{ repeat_rng $1 Nothing }
+	| NUM ','			{ repeat_rng $1 (Just Nothing) }
+	| NUM ',' NUM			{ repeat_rng $1 (Just (Just $3)) }
 
 rexp0	:: { RExp }
 	: '(' ')'  			{ Eps }
diff --git a/src/Scan.x b/src/Scan.x
index 76be8a4..870f2f2 100644
--- a/src/Scan.x
+++ b/src/Scan.x
@@ -11,7 +11,7 @@
 -------------------------------------------------------------------------------
 
 {
-module Scan (lexer, AlexPosn(..), Token(..), Tkn(..), tokPosn) where
+module Scan (lexer, AlexPosn(..), Token(..), Tkn(..), tokPosn, multiplicity) where
 
 import Data.Char
 import ParseMonad
@@ -56,8 +56,7 @@ alex :-
 <0> \\ x $hexdig+               { hexch }
 <0> \\ o $octal+                { octch }
 <0> \\ $printable               { escape }
-<0> $nonspecial # [\<]          { char } -- includes 1 digit numbers
-<0> $digit+                     { num  } -- should be after char
+<0> $nonspecial # [\<]          { char }
 <0> @smac                       { smac }
 <0> @rmac                       { rmac }
 
@@ -75,6 +74,13 @@ alex :-
 -- so don't try to interpret the opening { as a code block.
 <afterstartcodes> \{ (\n | [^$digit ])  { special `andBegin` 0 }
 <afterstartcodes> ()            { skip `andBegin` 0 }  -- note: empty pattern
+
+-- Numeric literals are only lexed inside multiplicity braces, e.g. {nnn,mmm}.
+-- The parser enters the @multiplicity@ state; the closing brace returns to state 0.
+<multiplicity> $digit+          { num }
+<multiplicity> \,               { special }
+<multiplicity> \}               { special `andBegin` 0 }
+
 {
 
 -- -----------------------------------------------------------------------------
diff --git a/tests/Makefile b/tests/Makefile
index 4f9b737..b0a90c3 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -55,6 +55,7 @@ TESTS = \
         issue_71.x \
         issue_119.x \
         issue_141.x \
+        issue_197.x \
         monad_typeclass.x \
         monad_typeclass_bytestring.x \
         monadUserState_typeclass.x \
diff --git a/tests/issue_197.x b/tests/issue_197.x
new file mode 100644
index 0000000..0b7ac56
--- /dev/null
+++ b/tests/issue_197.x
@@ -0,0 +1,43 @@
+{
+-- Issue #197
+-- reported 2022-01-21 by https://github.com/Commelina
+-- fixed 2022-01-23 by Andreas Abel & John Ericson
+--
+-- Problem was:
+-- The regex surface syntax regressed: digit sequences such as 32|64 or
+-- [01-89] were lexed as numeric literals and could no longer be parsed.
+
+module Main (main) where
+
+import System.Exit
+}
+
+%wrapper "posn"
+%token   "Token"
+
+@iec60559suffix = (32|64|128)[x]?
+@any            = [01-89]+[x]?
+
+:-
+
+$white+         ;
+@iec60559suffix { \ _ -> Good }
+@any            { \ _ -> Bad }
+
+{
+data Token = Good String | Bad String  -- Good: from the @iec60559suffix rule; Bad: from the @any rule
+  deriving (Eq, Show)
+
+input           = "32 32x 99 99x 128x"
+expected_result = [Good "32", Good "32x", Bad "99", Bad "99x", Good "128x"]
+
+main :: IO ()  -- succeeds iff scanning 'input' yields 'expected_result'
+main
+  | result == expected_result = do
+      exitWith ExitSuccess
+  | otherwise = do
+      print result  -- show the actual token list before failing
+      exitFailure
+  where
+  result = alexScanTokens input
+}