From d04ca8b8d738953636ae1f45fb74cab86f2466b3 Mon Sep 17 00:00:00 2001 From: Andreas Abel Date: Fri, 31 Jan 2020 11:11:51 +0100 Subject: [PATCH 1/2] [ whitespace ] removed trailing whitespace --- src/ParseMonad.hs | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/src/ParseMonad.hs b/src/ParseMonad.hs index 5f85cc6b..f429d8f4 100644 --- a/src/ParseMonad.hs +++ b/src/ParseMonad.hs @@ -1,5 +1,5 @@ -- ----------------------------------------------------------------------------- --- +-- -- ParseMonad.hs, part of Alex -- -- (c) Simon Marlow 2003 @@ -49,7 +49,7 @@ alexGetChar (_, _ ,_ : _, _) = undefined -- hide compiler warning alexGetByte :: AlexInput -> Maybe (Byte,AlexInput) alexGetByte (p,c,(b:bs),s) = Just (b,(p,c,bs,s)) alexGetByte (_,_,[],[]) = Nothing -alexGetByte (p,_,[],(c:s)) = let p' = alexMove p c +alexGetByte (p,_,[],(c:s)) = let p' = alexMove p c (b:bs) = UTF8.encode c in p' `seq` Just (b, (p', c, bs, s)) @@ -121,24 +121,24 @@ failP str = P $ \PState{ input = (p,_,_,_) } -> Left (Just p,str) lookupSMac :: (AlexPosn,String) -> P CharSet lookupSMac (posn,smac) - = P $ \s@PState{ smac_env = senv } -> + = P $ \s@PState{ smac_env = senv } -> case Map.lookup smac senv of Just ok -> Right (s,ok) Nothing -> Left (Just posn, "unknown set macro: $" ++ smac) lookupRMac :: String -> P RExp -lookupRMac rmac - = P $ \s@PState{ rmac_env = renv } -> +lookupRMac rmac + = P $ \s@PState{ rmac_env = renv } -> case Map.lookup rmac renv of Just ok -> Right (s,ok) Nothing -> Left (Nothing, "unknown regex macro: %" ++ rmac) newSMac :: String -> CharSet -> P () -newSMac smac set +newSMac smac set = P $ \s -> Right (s{smac_env = Map.insert smac set (smac_env s)}, ()) newRMac :: String -> RExp -> P () -newRMac rmac rexp +newRMac rmac rexp = P $ \s -> Right (s{rmac_env = Map.insert rmac rexp (rmac_env s)}, ()) setStartCode :: StartCode -> P () From 564f22a57ae6f34ab46884d1ae2e8284fdd09efa Mon Sep 17 00:00:00 2001 From: Andreas Abel Date: Fri, 31 Jan 2020 10:57:43 +0100 Subject: [PATCH 2/2] [ #71 ] Emit warnings for nullable tokens. The parser for .x files will now generate warnings for rules whose regular expression is nullable, i.e., matches the empty input. Exceptions are: 1. The user wrote regular expression (). In this case they probably meant to match the empty input. 2. Startcodes are involved. Then, matching the empty input might still change the lexer state, thus, the lexer might not continue to match the empty input. Warnings are kept as a stack in the state of the parse monad, and printed upon successful completion of the parser in Main. Example (excerpt of `issue_71.x`): ``` $whitespace = [\ \n\t] @whitespaces = $whitespace* :- @whitespaces { \ _ _ -> Whitespaces } ^ Warning here! Warning: issue_71.x:24:14: Regular expression [..]* matches the empty string. ``` Since the parser does not generate abstract syntax with position information, it is hard to give the exact warning location, i.e., the location of the offending regular expression. To keep the changes minimal, we record the location of the token _after_ the regular expression, i.e., the location where the code part begins. This approximate location should be good enough to jump to the regex in question. Another problem is that the exact text of the regular expression is not printed in the warning, only what `Show RExp` gives us. This could be fixed if we had the exact location information of the regular expression; we could then cut the regular expression out of the input string. Alternatively, the parser could be modified to return a _concrete_ syntax for the regular expression from which its original text could be recovered. The abstract reg.ex. would then be built from the concrete one in a second step. At this point, I abstain from such invasive changes to Alex for the sake of improving this rare warning. --- src/AbsSyn.hs | 29 +++++++++++++------- src/Main.hs | 13 +++++++-- src/ParseMonad.hs | 69 ++++++++++++++++++++++++++++++++++++----------- src/Parser.y | 27 ++++++++++++------- 4 files changed, 101 insertions(+), 37 deletions(-) diff --git a/src/AbsSyn.hs b/src/AbsSyn.hs index 3eb8f6b3..7aca5849 100644 --- a/src/AbsSyn.hs +++ b/src/AbsSyn.hs @@ -14,7 +14,7 @@ module AbsSyn ( wrapperName, Scanner(..), RECtx(..), - RExp(..), + RExp(..), nullable, DFA(..), State(..), SNum, StartCode, Accept(..), RightContext(..), showRCtx, strtype, encodeStartCodes, extractActions, @@ -188,7 +188,7 @@ usesPreds dfa -- Regular expressions -- `RExp' provides an abstract syntax for regular expressions. `Eps' will --- match empty strings; `Ch p' matches strings containinng a single character +-- match empty strings; `Ch p' matches strings containing a single character -- `c' if `p c' is true; `re1 :%% re2' matches a string if `re1' matches one of -- its prefixes and `re2' matches the rest; `re1 :| re2' matches a string if -- `re1' or `re2' matches it; `Star re', `Plus re' and `Ques re' can be @@ -196,13 +196,13 @@ usesPreds dfa -- for a formal definition of the semantics of these operators. data RExp - = Eps - | Ch CharSet - | RExp :%% RExp - | RExp :| RExp - | Star RExp - | Plus RExp - | Ques RExp + = Eps -- ^ Empty. + | Ch CharSet -- ^ Singleton. + | RExp :%% RExp -- ^ Sequence. + | RExp :| RExp -- ^ Alternative. + | Star RExp -- ^ Zero or more repetitions. + | Plus RExp -- ^ One or more repetitions. + | Ques RExp -- ^ Zero or one repetitions. instance Show RExp where showsPrec _ Eps = showString "()" @@ -213,6 +213,17 @@ instance Show RExp where showsPrec _ (Plus r) = shows r . ('+':) showsPrec _ (Ques r) = shows r . ('?':) +-- | A regular expression is nullable if it matches the empty string. +nullable :: RExp -> Bool +nullable Eps = True +nullable Ch{} = False +nullable (l :%% r) = nullable l && nullable r +nullable (l :| r) = nullable l || nullable r +nullable Star{} = True +nullable (Plus r) = nullable r +nullable Ques{} = True + + {------------------------------------------------------------------------------ Abstract Regular Expression ------------------------------------------------------------------------------} diff --git a/src/Main.hs b/src/Main.hs index 1f578733..e35c9992 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -18,7 +18,7 @@ import Info import Map ( Map ) import qualified Map hiding ( Map ) import Output -import ParseMonad ( runP ) +import ParseMonad ( runP, Warning(..) ) import Parser import Scan import Util ( hline ) @@ -114,11 +114,20 @@ parseScript file prg = Left (Nothing, err) -> die (file ++ ": " ++ err ++ "\n") - Right script@(_, _, scanner, _) -> do + Right (warnings, script@(_, _, scanner, _)) -> do -- issue 46: give proper error when lexer definition is empty when (null $ scannerTokens scanner) $ dieAlex $ file ++ " contains no lexer rules\n" + -- issue 71: warn about nullable regular expressions + mapM_ printWarning warnings return script + where + printWarning (WarnNullableRExp (AlexPn _ line col) msg) = + hPutStrLn stderr $ concat + [ "Warning: " + , file , ":", show line , ":" , show col , ": " + , msg + ] alex :: [CLIFlags] -> FilePath diff --git a/src/ParseMonad.hs b/src/ParseMonad.hs index f429d8f4..7272226d 100644 --- a/src/ParseMonad.hs +++ b/src/ParseMonad.hs @@ -9,7 +9,7 @@ module ParseMonad ( AlexInput, alexInputPrevChar, alexGetChar, alexGetByte, AlexPosn(..), alexStartPos, - + Warning(..), warnIfNullable, P, runP, StartCode, failP, lookupSMac, lookupRMac, newSMac, newRMac, setStartCode, getStartCode, getInput, setInput, ) where @@ -23,7 +23,7 @@ import UTF8 #if __GLASGOW_HASKELL__ < 710 import Control.Applicative ( Applicative(..) ) #endif -import Control.Monad ( liftM, ap ) +import Control.Monad ( liftM, ap, when ) import Data.Word (Word8) -- ----------------------------------------------------------------------------- -- The input type @@ -57,7 +57,7 @@ alexGetByte (p,_,[],(c:s)) = let p' = alexMove p c -- Token positions -- `Posn' records the location of a token in the input text. It has three --- fields: the address (number of chacaters preceding the token), line number +-- fields: the address (number of charaters preceding the token), line number -- and column of a token within the file. `start_pos' gives the position of the -- start of the file and `eof_pos' a standard encoding for the end of file. -- `move_pos' calculates the new position after traversing a given character, @@ -77,15 +77,22 @@ alexMove (AlexPn a l c) _ = AlexPn (a+1) l (c+1) -- ----------------------------------------------------------------------------- -- Alex lexing/parsing monad +data Warning + = WarnNullableRExp + { _warnPos :: AlexPosn -- ^ The position of the code following the regex. + , _warnText :: String -- ^ Warning text. + } + type ParseError = (Maybe AlexPosn, String) type StartCode = Int -data PState = PState { - smac_env :: Map String CharSet, - rmac_env :: Map String RExp, - startcode :: Int, - input :: AlexInput - } +data PState = PState + { warnings :: [Warning] -- ^ Stack of warnings, top = last warning. + , smac_env :: Map String CharSet + , rmac_env :: Map String RExp + , startcode :: Int + , input :: AlexInput + } newtype P a = P { unP :: PState -> Either ParseError (PState,a) } @@ -102,15 +109,27 @@ instance Monad P where Right (env',ok) -> unP (k ok) env' return = pure -runP :: String -> (Map String CharSet, Map String RExp) - -> P a -> Either ParseError a -runP str (senv,renv) (P p) +-- | Run the parser on given input. +runP :: String + -- ^ Input string. + -> (Map String CharSet, Map String RExp) + -- ^ Character set and regex definitions. + -> P a + -- ^ Parsing computation. + -> Either ParseError ([Warning], a) + -- ^ List of warnings in first-to-last order, result. +runP str (senv,renv) (P p) = case p initial_state of Left err -> Left err - Right (_,a) -> Right a - where initial_state = - PState{ smac_env=senv, rmac_env=renv, - startcode = 0, input=(alexStartPos,'\n',[],str) } + Right (s, a) -> Right (reverse (warnings s), a) + where + initial_state = PState + { warnings = [] + , smac_env = senv + , rmac_env = renv + , startcode = 0 + , input = (alexStartPos, '\n', [], str) + } failP :: String -> P a failP str = P $ \PState{ input = (p,_,_,_) } -> Left (Just p,str) @@ -152,3 +171,21 @@ getInput = P $ \s -> Right (s, input s) setInput :: AlexInput -> P () setInput inp = P $ \s -> Right (s{ input = inp }, ()) + +-- | Add a warning if given regular expression is nullable +-- unless the user wrote the regex 'Eps'. +warnIfNullable + :: RExp -- ^ Regular expression. + -> AlexPosn -- ^ Position associated to regular expression. + -> P () +-- If the user wrote @()@, they wanted to match the empty sequence! +-- Thus, skip the warning then. +warnIfNullable Eps _ = return () +warnIfNullable r pos = when (nullable r) $ P $ \ s -> + Right (s{ warnings = WarnNullableRExp pos w : warnings s}, ()) + where + w = unwords + [ "Regular expression" + , show r + , "matches the empty string." + ] diff --git a/src/Parser.y b/src/Parser.y index 0747cb1f..bf627fa8 100644 --- a/src/Parser.y +++ b/src/Parser.y @@ -109,16 +109,23 @@ tokendefs :: { [RECtx] } | {- empty -} { [] } tokendef :: { [RECtx] } - : startcodes rule { [ replaceCodes $1 $2 ] } + : startcodes rule { [ replaceCodes $1 (snd $2) ] } | startcodes '{' rules '}' { map (replaceCodes $1) $3 } - | rule { [ $1 ] } - -rule :: { RECtx } - : context rhs { let (l,e,r) = $1 in - RECtx [] l e r $2 } + | rule {% do + let (pos, res@(RECtx _ _ e _ _)) = $1 + warnIfNullable e pos + return [ res ] + } + +rule :: { (AlexPosn, RECtx) } + : context rhs { let + (l, e, r) = $1 + (pos, code) = $2 + in (pos, RECtx [] l e r code) + } rules :: { [RECtx] } - : rule rules { $1 : $2 } + : rule rules { snd $1 : $2 } | {- empty -} { [] } startcodes :: { [(String,StartCode)] } @@ -132,9 +139,9 @@ startcode :: { String } : ZERO { "0" } | ID { $1 } -rhs :: { Maybe Code } - : CODE { case $1 of T _ (CodeT code) -> Just code } - | ';' { Nothing } +rhs :: { (AlexPosn, Maybe Code) } + : CODE { case $1 of T pos (CodeT code) -> (pos, Just code) } + | ';' { (tokPosn $1, Nothing) } context :: { Maybe CharSet, RExp, RightContext RExp } : left_ctx rexp right_ctx { (Just $1,$2,$3) }