Skip to content

Commit

Permalink
Fix #197 by only lexing numeric literals in multiplicity expressions. (
Browse files Browse the repository at this point in the history
…#202)

In issue #141, multiplicity annotations in regexes were extended to
the general, multi-digit case {nnn,mmm}.  However, lexing numeric
literals broke parsing of regexes like:

   32|64
   [01-89]

The solution here is to only lex numeric literals in a special lexer
state called `multiplicity` which is entered by the parser when
parsing multiplicity braces {nnn,mmm}.

This restores alex' handling of digits as characters in the
non-multiplicity situations.
  • Loading branch information
andreasabel authored Jan 23, 2022
1 parent e4843f2 commit e907ecb
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 12 deletions.
1 change: 1 addition & 0 deletions alex.cabal
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ extra-source-files:
tests/issue_71.x
tests/issue_119.x
tests/issue_141.x
tests/issue_197.x

source-repository head
type: git
Expand Down
22 changes: 13 additions & 9 deletions src/Parser.y
Original file line number Diff line number Diff line change
Expand Up @@ -174,15 +174,19 @@ rep :: { RExp -> RExp }
: '*' { Star }
| '+' { Plus }
| '?' { Ques }
-- Single digits are CHAR, not NUM.
-- TODO: these don't check for digits
-- properly.
| '{' CHAR '}' { repeat_rng (digit $2) Nothing }
| '{' CHAR ',' '}' { repeat_rng (digit $2) (Just Nothing) }
| '{' CHAR ',' CHAR '}' { repeat_rng (digit $2) (Just (Just (digit $4))) }
| '{' NUM '}' { repeat_rng $2 Nothing }
| '{' NUM ',' '}' { repeat_rng $2 (Just Nothing) }
| '{' NUM ',' NUM '}' { repeat_rng $2 (Just (Just $4)) }
| begin_mult '{' mult '}' { $3 }
-- A bit counterintuitively, we need @begin_mult@ already before the left brace,
-- not just before @mult@. This might be due to the lookahead in the parser.
-- Enter the "multiplicity" lexer mode to scan number literals.
-- This production matches the empty input; its only purpose is the monadic
-- action that sets the lexer start code to @multiplicity@.
begin_mult :: { () }
: {- empty -} {% setStartCode multiplicity }
-- Parse a numeric multiplicity, i.e. what stands between the braces in
-- {n}, {n,} and {n,m}.  The braces themselves are matched by the @rep@ rule.
mult :: { RExp -> RExp }
-- Form {n}: a single number.
: NUM { repeat_rng $1 Nothing }
-- Form {n,}: a number followed by a comma, no second number.
| NUM ',' { repeat_rng $1 (Just Nothing) }
-- Form {n,m}: two numbers separated by a comma.
| NUM ',' NUM { repeat_rng $1 (Just (Just $3)) }
rexp0 :: { RExp }
: '(' ')' { Eps }
Expand Down
12 changes: 9 additions & 3 deletions src/Scan.x
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
-------------------------------------------------------------------------------

{
module Scan (lexer, AlexPosn(..), Token(..), Tkn(..), tokPosn) where
module Scan (lexer, AlexPosn(..), Token(..), Tkn(..), tokPosn, multiplicity) where

import Data.Char
import ParseMonad
Expand Down Expand Up @@ -56,8 +56,7 @@ alex :-
<0> \\ x $hexdig+ { hexch }
<0> \\ o $octal+ { octch }
<0> \\ $printable { escape }
<0> $nonspecial # [\<] { char } -- includes 1 digit numbers
<0> $digit+ { num } -- should be after char
<0> $nonspecial # [\<] { char }
<0> @smac { smac }
<0> @rmac { rmac }
Expand All @@ -75,6 +74,13 @@ alex :-
-- so don't try to interpret the opening { as a code block.
<afterstartcodes> \{ (\n | [^$digit ]) { special `andBegin` 0 }
<afterstartcodes> () { skip `andBegin` 0 } -- note: empty pattern

-- Numeric literals are only lexed in multiplicity braces e.g. {nnn,mmm}.
-- Switching to the @multiplicity@ lexer state happens in the parser.
-- The closing brace switches back to start code 0, where there is no
-- rule producing @num@ tokens.
<multiplicity> $digit+ { num }
<multiplicity> \, { special }
<multiplicity> \} { special `andBegin` 0 }

{

-- -----------------------------------------------------------------------------
Expand Down
1 change: 1 addition & 0 deletions tests/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ TESTS = \
issue_71.x \
issue_119.x \
issue_141.x \
issue_197.x \
monad_typeclass.x \
monad_typeclass_bytestring.x \
monadUserState_typeclass.x \
Expand Down
43 changes: 43 additions & 0 deletions tests/issue_197.x
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
{
-- Issue #197
-- reported 2022-01-21 by https://github.com/Commelina
-- fixed 2022-01-23 by Andreas Abel & John Ericson
--
-- Problem was:
-- Surface syntax regressed and could no longer handle character strings
-- that looked like numbers.

module Main (main) where

import System.Exit
}

%wrapper "posn"
%token "Token"

-- Both macros contain digit sequences that must be read as plain characters,
-- not as numeric literals (the regression reported in issue #197).
-- @iec60559suffix: one of the strings 32, 64 or 128, optionally followed by x.
@iec60559suffix = (32|64|128)[x]?
-- @any: one or more characters from the set 0, 1-8, 9, optionally followed by x.
@any = [01-89]+[x]?

:-

$white+ ;
@iec60559suffix { \ _ -> Good }
@any { \ _ -> Bad }

{
-- | Result of lexing one lexeme: 'Good' is produced by the @iec60559suffix
-- rule, 'Bad' by the @any rule.  The 'String' is the matched text.
data Token = Good String | Bad String
deriving (Eq, Show)

-- | Test input: whitespace-separated lexemes.
input = "32 32x 99 99x 128x"
-- | Tokens the lexer is expected to produce for 'input'.
expected_result = [Good "32", Good "32x", Bad "99", Bad "99x", Good "128x"]

-- | Lex 'input' and compare the tokens with 'expected_result'.
-- Exits successfully when they agree; otherwise prints the tokens that
-- were actually produced and exits with a failure code.
main :: IO ()
main =
  if actual == expected_result
    then exitWith ExitSuccess
    else do
      print actual
      exitFailure
  where
    actual = alexScanTokens input
}

0 comments on commit e907ecb

Please sign in to comment.