From 4ac4d5dc55c5930784bff884c370f01e91b3048e Mon Sep 17 00:00:00 2001 From: John Ericson Date: Fri, 20 Sep 2024 11:44:23 -0400 Subject: [PATCH] Improve the abstract syntax for tokens Instead of deferring the handling of `$$` to the backends, properly parse the `$$` syntax up front, and store the result in the AST. Note that the GLR backend was improperly substituting the `$$` twice. Now that we have better types, this was surfaced as type error, and then removed. --- .../src/Happy/Backend/GLR/ProduceCode.lhs | 23 ++++---- .../src/Happy/Backend/LALR/ProduceCode.lhs | 14 ++--- lib/frontend/boot-src/Parser.ly | 6 +- lib/frontend/src/Happy/Frontend/AbsSyn.lhs | 38 +++++++++++-- lib/frontend/src/Happy/Frontend/Parser.hs | 10 ++-- lib/grammar/src/Happy/Grammar.lhs | 55 +++++++++---------- lib/tabular/src/Happy/Tabular/Info.lhs | 5 +- 7 files changed, 89 insertions(+), 62 deletions(-) diff --git a/lib/backend-glr/src/Happy/Backend/GLR/ProduceCode.lhs b/lib/backend-glr/src/Happy/Backend/GLR/ProduceCode.lhs index e319d47e..cfb34ef1 100644 --- a/lib/backend-glr/src/Happy/Backend/GLR/ProduceCode.lhs +++ b/lib/backend-glr/src/Happy/Backend/GLR/ProduceCode.lhs @@ -273,9 +273,9 @@ that will be used for them in the GLR parser. > | (i,tok) <- token_specs g ] -- Tokens (terminals) > ++ [(eof_term g,"HappyEOF")] -- EOF symbol (internal terminal) > where -> mkMatch tok = case mapDollarDollar tok of -> Nothing -> tok -> Just fn -> fn "_" +> mkMatch tok = case tok of +> TokenFixed t -> t +> TokenWithValue e -> substExpressionWithHole e "_" > toGSym :: [(Int, String)] -> Int -> String > toGSym gsMap i @@ -314,10 +314,7 @@ It also shares identical reduction values as CAFs > where > startLine > = unwords [ name , show_st exts state, "(" , getTok , ") =" ] -> getTok = let tok = toGSym gsMap symInt -> in case mapDollarDollar tok of -> Nothing -> tok -> Just f -> f "_" +> getTok = toGSym gsMap symInt > mkAct act > = case act of > LR'Shift newSt _ -> "Shift " ++ show newSt ++ " []" @@ -421,7 +418,7 @@ Creating a type for storing semantic rules use in later stages. > type SemInfo -> = [(String, String, [Int], [((Int,Int), ([(Int,String)],String), [Int])])] +> = [(String, String, [Int], [((Int, Int), ([(Int, TokenSpec)], String), [Int])])] > mkGSemType :: Options -> Grammar String -> Pragmas -> (ShowS, SemInfo) > mkGSemType (TreeDecode,_,_) g pragmas @@ -566,17 +563,17 @@ Creates the appropriate semantic values. > nodes UseFiltering = "(" ++ foldr (\l -> mkHappyVar (l+1) . showChar ':') "[])" mask > ] -> mk_lambda :: [(Int, String)] -> Int -> String -> String +> mk_lambda :: [(Int, TokenSpec)] -> Int -> String -> String > mk_lambda pats v > = (\s -> "\\" ++ s ++ " -> ") . mk_binder id pats v -> mk_binder :: (String -> String) -> [(Int, String)] -> Int -> String -> String +> mk_binder :: (String -> String) -> [(Int, TokenSpec)] -> Int -> String -> String > mk_binder wrap pats v > = case lookup v pats of > Nothing -> mkHappyVar v -> Just p -> case mapDollarDollar p of -> Nothing -> wrap . mkHappyVar v . showChar '@' . brack p -> Just fn -> wrap . brack' (fn . mkHappyVar v) +> Just p -> case p of +> TokenFixed p' -> wrap . mkHappyVar v . showChar '@' . brack p' +> TokenWithValue e -> wrap . brack' (substExpressionWithHole e . mkHappyVar v) --- diff --git a/lib/backend-lalr/src/Happy/Backend/LALR/ProduceCode.lhs b/lib/backend-lalr/src/Happy/Backend/LALR/ProduceCode.lhs index 74abc0a6..b4343e93 100644 --- a/lib/backend-lalr/src/Happy/Backend/LALR/ProduceCode.lhs +++ b/lib/backend-lalr/src/Happy/Backend/LALR/ProduceCode.lhs @@ -417,19 +417,17 @@ The token conversion function. Use a variable rather than '_' to replace '$$', so we can use it on the left hand side of '@'. -> removeDollarDollar xs = case mapDollarDollar xs of -> Nothing -> xs -> Just fn -> fn "happy_dollar_dollar" +> removeDollarDollar tok = case tok of +> TokenFixed t -> t +> TokenWithValue e -> substExpressionWithHole e "happy_dollar_dollar" > mkHappyTerminalVar :: Int -> Int -> String -> String > mkHappyTerminalVar i t = -> case tok_str_fn of +> case lookup t token_rep of > Nothing -> pat -> Just fn -> brack (fn (pat [])) +> Just (TokenFixed _) -> pat +> Just (TokenWithValue e) -> brack $ substExpressionWithHole e $ pat [] > where -> tok_str_fn = case lookup t token_rep of -> Nothing -> Nothing -> Just str' -> mapDollarDollar str' > pat = mkHappyVar i > tokIndex i = i - n_nonterminals - n_starts - 2 diff --git a/lib/frontend/boot-src/Parser.ly b/lib/frontend/boot-src/Parser.ly index 72019e67..b193ae52 100644 --- a/lib/frontend/boot-src/Parser.ly +++ b/lib/frontend/boot-src/Parser.ly @@ -132,12 +132,12 @@ The parser. > : id { Just $1 } > | {- nothing -} { Nothing } -> tokenSpecs :: { [(String,String)] } +> tokenSpecs :: { [(String, TokenSpec)] } > : tokenSpec tokenSpecs { $1:$2 } > | tokenSpec { [$1] } -> tokenSpec :: { (String,String) } -> : id code { ($1,$2) } +> tokenSpec :: { (String, TokenSpec) } +> : id code { ($1, parseTokenSpec $2) } > ids :: { [String] } > : id ids { $1 : $2 } diff --git a/lib/frontend/src/Happy/Frontend/AbsSyn.lhs b/lib/frontend/src/Happy/Frontend/AbsSyn.lhs index 77e98e04..f521e3a9 100644 --- a/lib/frontend/src/Happy/Frontend/AbsSyn.lhs +++ b/lib/frontend/src/Happy/Frontend/AbsSyn.lhs @@ -13,10 +13,18 @@ Here is the abstract syntax of the language we parse. > getImportedIdentity, getMonad, getError, > getPrios, getPrioNames, getExpect, getErrorHandlerType, > getAttributes, getAttributetype, getAttributeGrammarExtras, -> Rule(..), Prod(..), Term(..), Prec(..) +> parseTokenSpec, +> Rule(..), Prod(..), Term(..), Prec(..), +> TokenSpec(..) -- reexport > ) where -> import Happy.Grammar (ErrorHandlerType(..), AttributeGrammarExtras(..)) +> import Data.Char (isAlphaNum) +> import Happy.Grammar +> ( ErrorHandlerType(..) +> , TokenSpec(..) +> , ExpressionWithHole(..) +> , AttributeGrammarExtras(..) +> ) > data BookendedAbsSyn > = BookendedAbsSyn @@ -63,7 +71,7 @@ generate some error messages. > > data Directive a > = TokenType String -- %tokentype -> | TokenSpec [(a,String)] -- %token +> | TokenSpec [(a, TokenSpec)] -- %token > | TokenName String (Maybe String) Bool -- %name/%partial (True <=> %partial) > | TokenLexer String String -- %lexer > | TokenErrorHandlerType String -- %errorhandlertype @@ -109,7 +117,7 @@ generate some error messages. > [] -> (False,"()","HappyIdentity","Prelude.>>=","Prelude.return") > _ -> error "multiple monad directives" -> getTokenSpec :: [Directive t] -> [(t, String)] +> getTokenSpec :: [Directive t] -> [(t, TokenSpec)] > getTokenSpec ds = concat [ t | (TokenSpec t) <- ds ] > getPrios :: [Directive t] -> [Directive t] @@ -170,3 +178,25 @@ generate some error messages. > attributetype = at > } > (_ : _, Nothing) -> error "attributes found without attribute type directive" + +> -- | Parse a token spec. +> -- +> -- The first occurence of '$$' indicates an expression in which the '$$' +> -- will be substituted for the actual lexed token. '$$' in string or char +> -- literals ('".."' and '\'.\'') however does not count. +> parseTokenSpec :: String -> TokenSpec +> parseTokenSpec code0 = go code0 "" +> where go code acc = +> case code of +> [] -> TokenFixed code0 +> +> '"' :r -> case reads code :: [(String,String)] of +> [] -> go r ('"':acc) +> (s,r'):_ -> go r' (reverse (show s) ++ acc) +> a:'\'' :r | isAlphaNum a -> go r ('\'':a:acc) +> '\'' :r -> case reads code :: [(Char,String)] of +> [] -> go r ('\'':acc) +> (c,r'):_ -> go r' (reverse (show c) ++ acc) +> '\\':'$':r -> go r ('$':acc) +> '$':'$':r -> TokenWithValue $ ExpressionWithHole (reverse acc) r +> c:r -> go r (c:acc) diff --git a/lib/frontend/src/Happy/Frontend/Parser.hs b/lib/frontend/src/Happy/Frontend/Parser.hs index 81fa5a16..6c19582d 100644 --- a/lib/frontend/src/Happy/Frontend/Parser.hs +++ b/lib/frontend/src/Happy/Frontend/Parser.hs @@ -135,15 +135,15 @@ happyIn19 x = Happy_GHC_Exts.unsafeCoerce# (HappyWrap19 x) happyOut19 :: (HappyAbsSyn ) -> HappyWrap19 happyOut19 x = Happy_GHC_Exts.unsafeCoerce# x {-# INLINE happyOut19 #-} -newtype HappyWrap20 = HappyWrap20 ([(String,String)]) -happyIn20 :: ([(String,String)]) -> (HappyAbsSyn ) +newtype HappyWrap20 = HappyWrap20 ([(String, TokenSpec)]) +happyIn20 :: ([(String, TokenSpec)]) -> (HappyAbsSyn ) happyIn20 x = Happy_GHC_Exts.unsafeCoerce# (HappyWrap20 x) {-# INLINE happyIn20 #-} happyOut20 :: (HappyAbsSyn ) -> HappyWrap20 happyOut20 x = Happy_GHC_Exts.unsafeCoerce# x {-# INLINE happyOut20 #-} -newtype HappyWrap21 = HappyWrap21 ((String,String)) -happyIn21 :: ((String,String)) -> (HappyAbsSyn ) +newtype HappyWrap21 = HappyWrap21 ((String, TokenSpec)) +happyIn21 :: ((String, TokenSpec)) -> (HappyAbsSyn ) happyIn21 x = Happy_GHC_Exts.unsafeCoerce# (HappyWrap21 x) {-# INLINE happyIn21 #-} happyOut21 :: (HappyAbsSyn ) -> HappyWrap21 @@ -768,7 +768,7 @@ happyReduction_51 happy_x_2 = case happyOutTok happy_x_1 of { (TokenInfo happy_var_1 TokId) -> case happyOutTok happy_x_2 of { (TokenInfo happy_var_2 TokCodeQuote) -> happyIn21 - ((happy_var_1,happy_var_2) + ((happy_var_1, parseTokenSpec happy_var_2) )}} happyReduce_52 :: () => Happy_GHC_Exts.Int# -> Token -> Happy_GHC_Exts.Int# -> Happy_IntList -> HappyStk (HappyAbsSyn ) -> P (HappyAbsSyn ) diff --git a/lib/grammar/src/Happy/Grammar.lhs b/lib/grammar/src/Happy/Grammar.lhs index b3d670cb..dd5a111d 100644 --- a/lib/grammar/src/Happy/Grammar.lhs +++ b/lib/grammar/src/Happy/Grammar.lhs @@ -8,32 +8,52 @@ The Grammar data type. > module Happy.Grammar ( > Name, > -> Production(..), Grammar(..), +> Production(..), +> TokenSpec(..), +> Grammar(..), > AttributeGrammarExtras(..), > Priority(..), > Assoc(..), > Pragmas(..), ErrorHandlerType(..), > +> ExpressionWithHole(..), +> substExpressionWithHole, > errorName, errorTok, startName, dummyName, firstStartTok, dummyTok, > eofName, epsilonTok, -> -> mapDollarDollar > ) where > import Data.Array -> import Data.Char (isAlphaNum) > type Name = Int > data Production eliminator > = Production Name [Name] (eliminator,[Int]) Priority > deriving Show +> data TokenSpec +> +> -- | The token is just a fixed value +> = TokenFixed String +> +> -- | The token is an expression involving the value of the lexed token. +> | TokenWithValue ExpressionWithHole +> +> deriving (Eq, Show) + +> -- | The overall expression is +> -- 'tokLeft ++ substitutedForHole ++ tokRight'. +> data ExpressionWithHole +> = ExpressionWithHole { +> exprLeft :: String, +> exprRight :: String +> } +> deriving (Eq, Show) + > data Grammar eliminator > = Grammar { > productions :: [Production eliminator], > lookupProdNo :: Int -> Production eliminator, > lookupProdsOfName :: Name -> [Int], -> token_specs :: [(Name,String)], +> token_specs :: [(Name, TokenSpec)], > terminals :: [Name], > non_terminals :: [Name], > starts :: [(String,Name,Name,Bool)], @@ -146,26 +166,5 @@ For array-based parsers, see the note in Tabular/LALR.lhs. > errorTok = 1 > epsilonTok = 0 ------------------------------------------------------------------------------ -Replace $$ with an arbitrary string, being careful to avoid ".." and '.'. - -> mapDollarDollar :: String -> Maybe (String -> String) -> mapDollarDollar = fmap (\(l, r) repr -> l ++ repr ++ r) . mapDollarDollar' -> - -> mapDollarDollar' :: String -> Maybe (String, String) -> mapDollarDollar' code0 = go code0 "" -> where go code acc = -> case code of -> [] -> Nothing -> -> '"' :r -> case reads code :: [(String,String)] of -> [] -> go r ('"':acc) -> (s,r'):_ -> go r' (reverse (show s) ++ acc) -> a:'\'' :r | isAlphaNum a -> go r ('\'':a:acc) -> '\'' :r -> case reads code :: [(Char,String)] of -> [] -> go r ('\'':acc) -> (c,r'):_ -> go r' (reverse (show c) ++ acc) -> '\\':'$':r -> go r ('$':acc) -> '$':'$':r -> Just (reverse acc, r) -> c:r -> go r (c:acc) +> substExpressionWithHole :: ExpressionWithHole -> String -> String +> substExpressionWithHole (ExpressionWithHole l r) = \repr -> l ++ repr ++ r diff --git a/lib/tabular/src/Happy/Tabular/Info.lhs b/lib/tabular/src/Happy/Tabular/Info.lhs index fedb9fbf..6e33a4d7 100644 --- a/lib/tabular/src/Happy/Tabular/Info.lhs +++ b/lib/tabular/src/Happy/Tabular/Info.lhs @@ -182,7 +182,10 @@ Produce a file of parser information, useful for debugging the parser. > showTerminal (t,s) > = str "\t" > . showJName 15 t -> . str "{ " . str s . str " }" +> . str "{ " . showToken s . str " }" + +> showToken (TokenFixed s) = str s +> showToken (TokenWithValue e) = str $ substExpressionWithHole e "$$" > showNonTerminals > = banner "Non-terminals"