BNFC · chaserhkj · Dec 10, 2023 · Dec 10, 2023 · Jan 10, 2024 · Jan 26, 2024
diff --git a/source/src/BNFC/Backend/TreeSitter.hs b/source/src/BNFC/Backend/TreeSitter.hs
@@ -1,7 +1,5 @@
 {-
     BNF Converter: TreeSitter Grammar Generator
-    Copyright (C) 2004  Author:  Markus Forsberg, Michael Pellauer,
-                                 Bjorn Bringert
 
     Description   : This module generates the grammar.js input file for
                     tree-sitter.
@@ -22,11 +20,13 @@ import BNFC.PrettyPrint
 -- | Entry point: create grammar.js file
 makeTreeSitter :: SharedOptions -> CF -> Backend
 makeTreeSitter opts cf = do
-  mkfile "grammar.js" comment (render $ cfToTreeSitter name cf)
+  -- Always remove zero width match for now, if needed, can be changed
+  -- to remove on flag in the future
+  mkfile "grammar.js" comment (render $ cfToTreeSitter name cf True)
   where
     name = lang opts
 
 comment :: String -> String
 comment = ("// " ++)
 
--- | TODO: Add Makefile generation for tree-sitter
+-- TODO: Add Makefile generation for tree-sitter
diff --git a/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs b/source/src/BNFC/Backend/TreeSitter/CFtoTreeSitter.hs
@@ -1,7 +1,5 @@
 {-
     BNF Converter: TreeSitter Grammar Generator
-    Copyright (C) 2004  Author:  Markus Forsberg, Michael Pellauer,
-                                 Bjorn Bringert
 
     Description   : This module converts BNFC grammar to the contents of a
                     tree-sitter grammar.js file
@@ -10,23 +8,32 @@
     Created       : 08 Nov, 2023
 
 -}
+{-# LANGUAGE MultiParamTypeClasses #-}
+{-# LANGUAGE FlexibleInstances #-}
 
 module BNFC.Backend.TreeSitter.CFtoTreeSitter where
 
 import BNFC.Abs (Reg (RSeq, RSeqs, RStar, RAny))
 import BNFC.Backend.TreeSitter.RegToJSReg
+    ( escapeCharFrom, printRegJSReg )
 import BNFC.CF
 import BNFC.Lexing (mkRegMultilineComment)
 import BNFC.PrettyPrint
+
 import Prelude hiding ((<>))
+import qualified Data.Map as Map
+import Data.Map ()
 
 -- | Indent one level of 2 spaces
 indent :: Doc -> Doc
 indent = nest 2
 
 -- | Create content of grammar.js file
-cfToTreeSitter :: String -> CF -> Doc
-cfToTreeSitter name cf =
+cfToTreeSitter :: String -- ^ Name of the language
+                -> CF    -- ^ Context-Free grammar of the language
+                -> Bool  -- ^ Flags to enable zero-width match elimination
+                -> Doc   -- ^ grammar.js file generated
+cfToTreeSitter name cf removeZero =
   -- Overall structure of grammar.js
   text "module.exports = grammar({"
     $+$ indent
@@ -42,7 +49,7 @@ cfToTreeSitter name cf =
     rulesSection =
       text "rules: {"
         $+$ indent
-          ( prRules cf
+          ( prRules cf removeZero
               $+$ prUsrTokenRules cf
               $+$ prBuiltinTokenRules cf
           )
@@ -96,7 +103,7 @@ prWord cf =
     identUsed = isUsedCat cf (TokenCat catIdent)
     usrTokens = tokenPragmas cf
     usrTokensFormatted =
-      map (text . refName . formatCatName False . TokenCat . fst) $ usrTokens
+      map (text . refName . formatCatName False . TokenCat . fst) usrTokens
 
 -- | Print builtin token rules according to their usage
 prBuiltinTokenRules :: CF -> Doc
@@ -120,48 +127,162 @@ stringRule =
 identRule =
   defineSymbol "token_Ident" <+> text "/[a-zA-Z][a-zA-Z\\d_']*/" <> ","
 
+-- Tree Sitter does not support empty strings well enough
+-- (Ref: https://github.com/tree-sitter/tree-sitter/issues/98), thus we need to
+-- handle empty strings differently using the optional keyword
+-- Crucially, any named symbols in Tree Sitter cannot have a zero width match,
+-- unless it is the start symbol
+-- Thus, we look for all categories that match zero width, tag them as "optional"
+-- And when that cat is referred to, we use `optional($.cat_name)` in place of
+-- `$.cat_name` to circumvent this issue
+
+-- | Cat with a tag indicating if it is optional
+data CatOpt' a = Always a | Optional a
+type CatOpt = CatOpt' Cat
+
+-- | type class for OCat or Cat
+class UnwrapCat a where
+  -- | unwrap to original Cat
+  unwrap:: a -> Cat
+
+instance UnwrapCat Cat where
+  unwrap = id
+
+instance UnwrapCat CatOpt where
+  unwrap (Always c) = c
+  unwrap (Optional c) = c
+
+-- | Rule with RHS tagged
+data RuleOpt = Rule' {srcRule::Rule, taggedRhs::SentFormOpt}
+type SentFormOpt = [Either CatOpt String]
+
+-- | type class for a rule data type that can be formatted
+class FormatRule a where
+  -- | get the original rule
+  getRule:: a -> Rule
+  -- | format the RHS of the rule
+  formatRuleRhs:: a -> [Doc]
+
+instance FormatRule Rule where
+  getRule = id
+
+  formatRuleRhs r = 
+    map (\case
+      Left c -> text $ refName $ formatCatName False c
+      Right term -> quoted term) $ rhsRule r
+
+
+instance FormatRule RuleOpt where
+  getRule = srcRule
+
+  formatRuleRhs r = 
+    map (\case
+      Left (Always c) -> text $ refName $ formatCatName False c
+      Left (Optional c) -> wrapFun "optional" False $
+        text (refName $ formatCatName False c)
+      Right term -> quoted term) $ taggedRhs r
+
+-- | Analyzes the grammar with the entrance symbol.
+-- This function finds all rules for the entrance symbol, and groups
+-- all remaining categories with their rules, including internal rules.
+analyzeCF :: CF -- ^ Context-free grammar of the language
+  -> Cat -- ^ Category object for the entrance symbol
+  -> ([(Cat, [Rule])], [Rule]) -- ^ (groups of remaining categories and rules, entrance rules)
+analyzeCF cf entryCat = ([(c, rulesForCat' cf c)| c <- allCats, c /= entryCat],
+  rulesForCat' cf entryCat)
+  where allCats = reallyAllCats cf
+
+-- | Analyzes the grammar with the entrance symbol.
+-- This version of analyze function performs zero-width match analysis on all symbols and
+-- returns with optional flags determined and tagged for all returning Cats and rules.
+-- Returns (list of remaining tagged categories and tagged rules, tagged entrance rules)
+analyzeCFOptional :: CF  -- ^ Context-free grammar of the language
+  -> Cat -- ^ Category object of the entrance symbol
+  -> ([(CatOpt, [RuleOpt])], [RuleOpt]) -- ^ (groups of tagged remaining categories and tagged rules, tagged entrance rules)
+analyzeCFOptional cf entryCat =
+  (
+    -- Empty rules are excluded from normal categories since they are handled by
+    -- "optional()" keywords in tree-sitter
+    [(wrapCat c, 
+      map wrapRule $ filter (not . ruleIsEmpty) $ rulesForCat' cf c)
+      | c <- allCats, c /= entryCat],
+    -- Tree-sitter should support zero-width matches with root (i.e. entrance) symbol
+    -- thus no need to filter them out
+    map wrapRule $ rulesForCat' cf entryCat
+  )
+  where
+    allCats = reallyAllCats cf
+    -- Stores mapping from Cat to optional flag
+    -- Currently we only recognize optional rules if any RHS of the rules is empty
+    -- list, and ignore more complex cases.
+    -- Complex optional cases may trigger tree-sitter to fail or bug out.
+    catOptMap = Map.fromList $
+      map (\c -> (c, any ruleIsEmpty (rulesForCat' cf c)))
+      -- Always format entrance symbols as non-optional since
+      -- tree-sitter should support zero-width matches on them 
+      $ filter (/= entryCat) allCats
+    -- Tags Cat to Cat' using catOptMap
+    wrapCat:: Cat -> CatOpt
+    wrapCat c = if Map.findWithDefault False c catOptMap
+      then Optional c
+      else Always c
+    wrapRule r = Rule' {srcRule = r, 
+      taggedRhs = map wrapSentFormItem $ rhsRule r
+    }
+    wrapSentFormItem :: Either Cat String -> Either CatOpt String
+    wrapSentFormItem (Left c) = Left $ wrapCat c
+    wrapSentFormItem (Right s) = Right s
+    ruleIsEmpty = null . rhsRule . getRule
+
 -- | First print the entrypoint rule, tree-sitter always use the
 --   first rule as entrypoint and does not support multi-entrypoint.
 --   Then print rest of the rules
-prRules :: CF -> Doc
-prRules cf =
+prRules :: CF -> Bool -> Doc
+prRules cf removeZero =
   if onlyOneEntry
     then
-      prOneCat entryRules entryCat
-        $+$ prOtherRules entryCat cf
+      -- entry rules are formatted without optional
+      -- tree-sitter should support zero-width (a.k.a empty) matches for top level symbols
+      if removeZero
+        then
+          prOneCat entryRulesOpt entryCat
+            $+$ prOtherRules otherCatRulesOpt
+        else
+          prOneCat entryRules entryCat
+            $+$ prOtherRules otherCatRules
     else error "Tree-sitter only supports one entrypoint"
   where
     --If entrypoint is defined, there must be only one entrypoint
     --If it is not defined, defaults to use the first rule as entrypoint
     onlyOneEntry = not (hasEntryPoint cf) || onlyOneEntryDefined
     onlyOneEntryDefined = length (allEntryPoints cf) == 1
     entryCat = firstEntry cf
-    entryRules = rulesForCat' cf entryCat
+    (otherCatRulesOpt, entryRulesOpt) = analyzeCFOptional cf entryCat
+    (otherCatRules, entryRules) = analyzeCF cf entryCat
 
 -- | Print all other rules except the entrypoint
-prOtherRules :: Cat -> CF -> Doc
-prOtherRules entryCat cf = vcat' $ map mkOne rules
+prOtherRules :: (UnwrapCat a, FormatRule b) => [(a, [b])] -> Doc
+prOtherRules otherRules = vcat' $ map mkOne otherRules
   where
-    rules = [(c, r) | (c, r) <- ruleGroupsInternals cf, c /= entryCat]
-    mkOne (cat, rules) = prOneCat rules cat
+    mkOne (cat, rules) = prOneCat rules $ unwrap cat
 
 prUsrTokenRules :: CF -> Doc
 prUsrTokenRules cf = vcat' $ map prOneToken tokens
   where
     tokens = tokenPragmas cf
 
 -- | Check if a set of rules contains internal rules
-hasInternal :: [Rule] -> Bool
-hasInternal = not . all isParsable
+hasInternal :: (FormatRule a) => [a] -> Bool
+hasInternal = not . all (isParsable . getRule)
 
 -- | Generates one or two tree-sitter rule(s) for one non-terminal from CF.
 -- Uses choice function from tree-sitter to combine rules for the non-terminal
 -- If the non-terminal has internal rules, an internal version of the non-terminal
 -- will be created (prefixed with "_" in tree-sitter), and all internal rules will
 -- be sectioned as such.
-prOneCat :: [Rule] -> NonTerminal -> Doc
+prOneCat :: FormatRule a => [a] -> NonTerminal -> Doc
 prOneCat rules nt =
-  defineSymbol (formatCatName False nt)
+  defineSymbol (formatCatName False $ nt)
     $+$ indent (appendComma parRhs)
     $+$ internalRules
   where
@@ -170,10 +291,10 @@ prOneCat rules nt =
       if int
         then defineSymbol (formatCatName True nt) $+$ indent (appendComma intRhs)
         else empty
-    parRhs = wrapChoice $ transChoice ++ genChoice (filter isParsable rules)
+    parRhs = wrapChoice $ transChoice ++ genChoice (filter (isParsable . getRule) rules)
     transChoice = [text $ refName $ formatCatName True nt | int]
-    intRhs = wrapChoice $ genChoice (filter (not . isParsable) rules)
-    genChoice = map (wrapSeq . formatRhs . rhsRule)
+    intRhs = wrapChoice $ genChoice (filter (not . isParsable. getRule) rules)
+    genChoice = map (wrapSeq . formatRuleRhs)
 
 -- | Generate one tree-sitter rule for one defined token
 prOneToken :: (TokenCat, Reg) -> Doc
@@ -220,15 +341,14 @@ wrapFun fun newline arg = joinOp [text fun <> text "(", indent arg, text ")"]
 refName :: String -> String
 refName = ("$." ++)
 
--- | Format right hand side into list of strings
-formatRhs :: SentForm -> [Doc]
-formatRhs =
-  map (\case
-    Left c -> text $ refName $ formatCatName False c
-    Right term -> quoted term)
+stringLiteralReserved:: String
+stringLiteralReserved = "\"\\"
+
+escapeStringLiteral:: String -> String
+escapeStringLiteral = concatMap $ escapeCharFrom stringLiteralReserved
 
 quoted :: String -> Doc
-quoted s = text "\"" <> text s <> text "\""
+quoted s = text "\"" <> text (escapeStringLiteral s) <> text "\""
 
 -- | Format string for cat name, prefix "_" if the name is for internal rules
 formatCatName :: Bool -> Cat -> String

diff --git a/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs b/source/src/BNFC/Backend/TreeSitter/RegToJSReg.hs
@@ -1,7 +1,5 @@
 {-
     BNF Converter: TreeSitter Grammar Generator
-    Copyright (C) 2004  Author:  Markus Forsberg, Michael Pellauer,
-                                 Bjorn Bringert
 
     Description   : This module converts BNFC Reg to Javascript regular
                     expressions that is used in
@@ -12,7 +10,7 @@
 -}
 {-# LANGUAGE LambdaCase #-}
 
-module BNFC.Backend.TreeSitter.RegToJSReg (printRegJSReg) where
+module BNFC.Backend.TreeSitter.RegToJSReg (printRegJSReg, escapeCharFrom) where
 
 import BNFC.Abs
 

diff --git a/testing/src/ParameterizedTests.hs b/testing/src/ParameterizedTests.hs
@@ -419,6 +419,7 @@ parameters = concat
     , javaParams { tpName = "Java (with jflex and line numbers)"
                  , tpBnfcOptions = ["--java", "--jflex", "-l"] }
     ]
+  , [ treeSitter ]
   ]
   where
     base = baseParameters
@@ -442,6 +443,14 @@ parameters = concat
         , tpBnfcOptions = ["--ocaml"]
         , tpRunTestProg = haskellRunTestProg
         }
+    treeSitter = TP
+        { tpName        = "tree-sitter"
+        , tpBuild       = do
+            cmd "tree-sitter" "generate" . (:[]) =<< findFile "grammar.js"
+        , tpBnfcOptions = ["--tree-sitter"]
+        , tpRunTestProg = \ _lang args -> do
+            cmd "tree-sitter" "parse" args
+        }
 
 -- | Helper function that runs bnfc with the context's options and an
 --   option to generate 'tpMakefile'.