From c387cdc8b02f25df9f42f471e6b65c1e6d297db8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Wed, 4 May 2022 09:54:34 +0800 Subject: [PATCH 01/14] Add stylish haskell config --- .stylish-haskell.yaml | 380 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 380 insertions(+) create mode 100644 .stylish-haskell.yaml diff --git a/.stylish-haskell.yaml b/.stylish-haskell.yaml new file mode 100644 index 0000000..87205d1 --- /dev/null +++ b/.stylish-haskell.yaml @@ -0,0 +1,380 @@ +# stylish-haskell configuration file +# ================================== + +# The stylish-haskell tool is mainly configured by specifying steps. These steps +# are a list, so they have an order, and one specific step may appear more than +# once (if needed). Each file is processed by these steps in the given order. +steps: + # Convert some ASCII sequences to their Unicode equivalents. This is disabled + # by default. + # - unicode_syntax: + # # In order to make this work, we also need to insert the UnicodeSyntax + # # language pragma. If this flag is set to true, we insert it when it's + # # not already present. You may want to disable it if you configure + # # language extensions using some other method than pragmas. Default: + # # true. + # add_language_pragma: true + + # Format module header + # + # Currently, this option is not configurable and will format all exports and + # module declarations to minimize diffs + # + # - module_header: + # # How many spaces use for indentation in the module header. + # indent: 4 + # + # # Should export lists be sorted? Sorting is only performed within the + # # export section, as delineated by Haddock comments. + # sort: true + # + # # See `separate_lists` for the `imports` step. + # separate_lists: true + # + # # When to break the "where". + # # Possible values: + # # - exports: only break when there is an explicit export list. + # # - single: only break when the export list counts more than one export. + # # - inline: only break when the export list is too long. This is + # # determined by the `columns` setting. Not applicable when the export + # # list contains comments as newlines will be required. + # # - always: always break before the "where". + # break_where: exports + # + # # Where to put open bracket + # # Possible values: + # # - same_line: put open bracket on the same line as the module name, before the + # # comment of the module + # # - next_line: put open bracket on the next line, after module comment + # open_bracket: next_line + + # Format record definitions. This is disabled by default. + # + # You can control the layout of record fields. The only rules that can't be configured + # are these: + # + # - "|" is always aligned with "=" + # - "," in fields is always aligned with "{" + # - "}" is likewise always aligned with "{" + # + # - records: + # # How to format equals sign between type constructor and data constructor. + # # Possible values: + # # - "same_line" -- leave "=" AND data constructor on the same line as the type constructor. + # # - "indent N" -- insert a new line and N spaces from the beginning of the next line. + # equals: "indent 2" + # + # # How to format first field of each record constructor. + # # Possible values: + # # - "same_line" -- "{" and first field goes on the same line as the data constructor. + # # - "indent N" -- insert a new line and N spaces from the beginning of the data constructor + # first_field: "indent 2" + # + # # How many spaces to insert between the column with "," and the beginning of the comment in the next line. + # field_comment: 2 + # + # # How many spaces to insert before "deriving" clause. Deriving clauses are always on separate lines. + # deriving: 2 + # + # # How many spaces to insert before "via" clause counted from indentation of deriving clause + # # Possible values: + # # - "same_line" -- "via" part goes on the same line as "deriving" keyword. + # # - "indent N" -- insert a new line and N spaces from the beginning of "deriving" keyword. + # via: "indent 2" + # + # # Sort typeclass names in the "deriving" list alphabetically. + # sort_deriving: true + # + # # Whether or not to break enums onto several lines + # # + # # Default: false + # break_enums: false + # + # # Whether or not to break single constructor data types before `=` sign + # # + # # Default: true + # break_single_constructors: true + # + # # Whether or not to curry constraints on function. + # # + # # E.g: @allValues :: Enum a => Bounded a => Proxy a -> [a]@ + # # + # # Instead of @allValues :: (Enum a, Bounded a) => Proxy a -> [a]@ + # # + # # Default: false + # curried_context: false + + # Align the right hand side of some elements. This is quite conservative + # and only applies to statements where each element occupies a single + # line. + # Possible values: + # - always - Always align statements. + # - adjacent - Align statements that are on adjacent lines in groups. + # - never - Never align statements. + # All default to always. + - simple_align: + cases: always + top_level_patterns: always + records: always + multi_way_if: always + + # Import cleanup + - imports: + # There are different ways we can align names and lists. + # + # - global: Align the import names and import list throughout the entire + # file. + # + # - file: Like global, but don't add padding when there are no qualified + # imports in the file. + # + # - group: Only align the imports per group (a group is formed by adjacent + # import lines). + # + # - none: Do not perform any alignment. + # + # Default: global. + align: none + + # The following options affect only import list alignment. + # + # List align has following options: + # + # - after_alias: Import list is aligned with end of import including + # 'as' and 'hiding' keywords. + # + # > import qualified Data.List as List (concat, foldl, foldr, head, + # > init, last, length) + # + # - with_alias: Import list is aligned with start of alias or hiding. + # + # > import qualified Data.List as List (concat, foldl, foldr, head, + # > init, last, length) + # + # - with_module_name: Import list is aligned `list_padding` spaces after + # the module name. + # + # > import qualified Data.List as List (concat, foldl, foldr, head, + # init, last, length) + # + # This is mainly intended for use with `pad_module_names: false`. + # + # > import qualified Data.List as List (concat, foldl, foldr, head, + # init, last, length, scanl, scanr, take, drop, + # sort, nub) + # + # - new_line: Import list starts always on new line. + # + # > import qualified Data.List as List + # > (concat, foldl, foldr, head, init, last, length) + # + # - repeat: Repeat the module name to align the import list. + # + # > import qualified Data.List as List (concat, foldl, foldr, head) + # > import qualified Data.List as List (init, last, length) + # + # Default: after_alias + list_align: after_alias + + # Right-pad the module names to align imports in a group: + # + # - true: a little more readable + # + # > import qualified Data.List as List (concat, foldl, foldr, + # > init, last, length) + # > import qualified Data.List.Extra as List (concat, foldl, foldr, + # > init, last, length) + # + # - false: diff-safe + # + # > import qualified Data.List as List (concat, foldl, foldr, init, + # > last, length) + # > import qualified Data.List.Extra as List (concat, foldl, foldr, + # > init, last, length) + # + # Default: true + pad_module_names: true + + # Long list align style takes effect when import is too long. This is + # determined by 'columns' setting. + # + # - inline: This option will put as much specs on same line as possible. + # + # - new_line: Import list will start on new line. + # + # - new_line_multiline: Import list will start on new line when it's + # short enough to fit to single line. Otherwise it'll be multiline. + # + # - multiline: One line per import list entry. + # Type with constructor list acts like single import. + # + # > import qualified Data.Map as M + # > ( empty + # > , singleton + # > , ... + # > , delete + # > ) + # + # Default: inline + long_list_align: inline + + # Align empty list (importing instances) + # + # Empty list align has following options + # + # - inherit: inherit list_align setting + # + # - right_after: () is right after the module name: + # + # > import Vector.Instances () + # + # Default: inherit + empty_list_align: inherit + + # List padding determines indentation of import list on lines after import. + # This option affects 'long_list_align'. + # + # - : constant value + # + # - module_name: align under start of module name. + # Useful for 'file' and 'group' align settings. + # + # Default: 4 + list_padding: 4 + + # Separate lists option affects formatting of import list for type + # or class. The only difference is single space between type and list + # of constructors, selectors and class functions. + # + # - true: There is single space between Foldable type and list of it's + # functions. + # + # > import Data.Foldable (Foldable (fold, foldl, foldMap)) + # + # - false: There is no space between Foldable type and list of it's + # functions. + # + # > import Data.Foldable (Foldable(fold, foldl, foldMap)) + # + # Default: true + separate_lists: true + + # Space surround option affects formatting of import lists on a single + # line. The only difference is single space after the initial + # parenthesis and a single space before the terminal parenthesis. + # + # - true: There is single space associated with the enclosing + # parenthesis. + # + # > import Data.Foo ( foo ) + # + # - false: There is no space associated with the enclosing parenthesis + # + # > import Data.Foo (foo) + # + # Default: false + space_surround: false + + # Post qualify option moves any qualifies found in import declarations + # to the end of the declaration. This also adjust padding for any + # unqualified import declarations. + # + # - true: Qualified as is moved to the end of the + # declaration. + # + # > import Data.Bar + # > import Data.Foo qualified as F + # + # - false: Qualified remains in the default location and unqualified + # imports are padded to align with qualified imports. + # + # > import Data.Bar + # > import qualified Data.Foo as F + # + # Default: false + post_qualify: false + + + # Language pragmas + - language_pragmas: + # We can generate different styles of language pragma lists. + # + # - vertical: Vertical-spaced language pragmas, one per line. + # + # - compact: A more compact style. + # + # - compact_line: Similar to compact, but wrap each line with + # `{-# LANGUAGE #-}'. + # + # - vertical_compact: Similar to vertical, but use only one language pragma. + # + # Default: vertical. + style: vertical + + # Align affects alignment of closing pragma brackets. + # + # - true: Brackets are aligned in same column. + # + # - false: Brackets are not aligned together. There is only one space + # between actual import and closing bracket. + # + # Default: true + align: true + + # stylish-haskell can detect redundancy of some language pragmas. If this + # is set to true, it will remove those redundant pragmas. Default: true. + remove_redundant: true + + # Language prefix to be used for pragma declaration, this allows you to + # use other options non case-sensitive like "language" or "Language". + # If a non correct String is provided, it will default to: LANGUAGE. + language_prefix: LANGUAGE + + # Replace tabs by spaces. This is disabled by default. + # - tabs: + # # Number of spaces to use for each tab. Default: 8, as specified by the + # # Haskell report. + # spaces: 8 + + # Remove trailing whitespace + - trailing_whitespace: {} + + # Squash multiple spaces between the left and right hand sides of some + # elements into single spaces. Basically, this undoes the effect of + # simple_align but is a bit less conservative. + # - squash: {} + +# A common setting is the number of columns (parts of) code will be wrapped +# to. Different steps take this into account. +# +# Set this to null to disable all line wrapping. +# +# Default: 80. +columns: 80 + +# By default, line endings are converted according to the OS. You can override +# preferred format here. +# +# - native: Native newline format. CRLF on Windows, LF on other OSes. +# +# - lf: Convert to LF ("\n"). +# +# - crlf: Convert to CRLF ("\r\n"). +# +# Default: native. +newline: native + +# Sometimes, language extensions are specified in a cabal file or from the +# command line instead of using language pragmas in the file. stylish-haskell +# needs to be aware of these, so it can parse the file correctly. +# +# No language extensions are enabled by default. +# language_extensions: + # - TemplateHaskell + # - QuasiQuotes + +# Attempt to find the cabal file in ancestors of the current directory, and +# parse options (currently only language extensions) from that. +# +# Default: true +cabal: true From 006e030c1f1d2f094612a8887cfb9121bcaf6e98 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Wed, 4 May 2022 09:07:57 +0800 Subject: [PATCH 02/14] Format code with stylish-haskell --- UD2GF.hs | 46 +++++++++++++++++++++++----------------------- tests/TestSuite.hs | 10 +++++----- 2 files changed, 28 insertions(+), 28 deletions(-) diff --git a/UD2GF.hs b/UD2GF.hs index eb600f1..0c71bab 100644 --- a/UD2GF.hs +++ b/UD2GF.hs @@ -1,24 +1,24 @@ module UD2GF where +import Backend +import GFConcepts import RTree -import UDConcepts import UDAnnotations -import GFConcepts +import UDConcepts import UDOptions import UDVisualization -import Backend import PGF hiding (CncLabels) -import qualified Data.Map as M -import Data.List import Data.Char +import Data.List +import qualified Data.Map as M import Data.Maybe -import Text.PrettyPrint (render, cat) +import Text.PrettyPrint (cat, render) -import Debug.Trace (trace) import Data.Function (on) import Data.Ord (comparing) +import Debug.Trace (trace) --------- -- to debug @@ -66,7 +66,7 @@ showUD2GF opts env sentence = do ifOpt opts "ud" $ prt sentence case errors sentence of - [] -> return () + [] -> return () errs -> ifOpt opts "err" $ unlines errs let udtree = udSentence2tree sentence @@ -132,11 +132,11 @@ showUD2GF opts env sentence = do data UD2GFStat = UD2GFStat { - totalWords :: Int, - interpretedWords :: Int, - unknownWords :: Int, - totalSentences :: Int, - completeSentences :: Int, + totalWords :: Int, + interpretedWords :: Int, + unknownWords :: Int, + totalSentences :: Int, + completeSentences :: Int, typecorrectSentences :: Int } deriving Show @@ -172,7 +172,7 @@ data CheckResult = CheckResult { prCheckResult cr = unlines $ case resultUnknowns cr of - [] -> [] + [] -> [] uks -> [unwords $ "unknown words:" : map showCId uks] ++ [resultMessage cr] @@ -187,7 +187,7 @@ checkAbsTreeResult env t = CheckResult { where pgf = pgfGrammar env (mt,msg) = case inferExpr pgf (abstree2expr t) of - Left tce -> (Nothing, render (ppTcError tce)) + Left tce -> (Nothing, render (ppTcError tce)) Right (exp,typ) -> (Just exp, "type checking OK") @@ -209,12 +209,12 @@ data DevNode = DevNode { deriving Show mapDevAbsTree :: (AbsTreeInfo -> AbsTreeInfo) -> DevNode -> DevNode -mapDevAbsTree f dn = dn { devAbsTrees = map f (devAbsTrees dn) } +mapDevAbsTree f dn = dn { devAbsTrees = map f (devAbsTrees dn) } data AbsTreeInfo = AbsTreeInfo { atiAbsTree :: AbsTree - , atiCat :: Cat - , atiUDIds :: [UDId] + , atiCat :: Cat + , atiUDIds :: [UDId] } deriving (Show, Eq) @@ -287,7 +287,7 @@ addBackups0 tr@(RTree dn trs) = case map collectBackup (tr:trs) of -- backups f theAbsTreeInfo :: DevTree -> AbsTreeInfo theAbsTreeInfo dt = case devAbsTrees (root dt) of [t] -> t - _ -> error $ "no unique abstree in " ++ prDevNode 2 (root dt) + _ -> error $ "no unique abstree in " ++ prDevNode 2 (root dt) -- split trees showing just one GF tree in each DevTree splitDevTree :: DevTree -> [DevTree] @@ -298,7 +298,7 @@ splitDevTree tr@(RTree dn trs) = case elem (devIndex d) usage of True -> case sortOn ((1000-) . sizeRTree . atiAbsTree) [dt | dt@AbsTreeInfo { atiAbsTree = t} <- devAbsTrees d, isSubRTree t ast] of t:_ -> RTree (d{devAbsTrees = [t]}) (map (chase t) ts) - _ -> error $ "wrong indexing in\n" ++ prLinesRTree (prDevNode 1) tr + _ -> error $ "wrong indexing in\n" ++ prLinesRTree (prDevNode 1) tr False -> head $ splitDevTree $ RTree (d{devNeedBackup = True}) ts ---- head prtStatus udids = "[" ++ concat (intersperse "," (map prt udids)) ++ "]" @@ -485,7 +485,7 @@ analyseWords env = mapRTree lemma2fun all (\f -> M.notMember f (disabledFunctions (cncLabels env))) (allNodesRTree at)] Right c -> case elem (w,c) auxWords of True -> [(newWordTree w c, c)] - _ -> [] + _ -> [] auxWords = [(lemma,cat) | ((fun_,lemma),(cat,labels_)) <- M.assocs (lemmaLabels (cncLabels env))] @@ -542,5 +542,5 @@ udtree2devtree = markClosest . initialize -- >>> select [1,2,3] -- [(1,[2,3]),(2,[1,3]),(3,[1,2])] select :: [a] -> [(a,[a])] -select [] = [] -select (a : as) = (a,as) : [ (b,a:bs) | (b,bs) <-select as ] \ No newline at end of file +select [] = [] +select (a : as) = (a,as) : [ (b,a:bs) | (b,bs) <-select as ] diff --git a/tests/TestSuite.hs b/tests/TestSuite.hs index 8232791..50e95c8 100644 --- a/tests/TestSuite.hs +++ b/tests/TestSuite.hs @@ -1,10 +1,10 @@ module Main where +import PGF import Test.Hspec import UD2GF -import PGF import UDAnnotations -import UDConcepts (UDData(UDData)) +import UDConcepts (UDData (UDData)) myUDEnv :: IO UDEnv @@ -40,7 +40,7 @@ bestTrees env conll = map exprStr exprs exprs = getExprs [] env conll exprStr expr = case expr of (x : _xs) -> showExpr [] x - _ -> "bestTree: ud2gf failed" + _ -> "bestTree: ud2gf failed" bestTree :: UDEnv -> String -> String @@ -49,10 +49,10 @@ bestTree env conll = exprStr exprs = getExprs [] env conll exprStr = case exprs of (x : _xs) : _xss -> showExpr [] x - _ -> "bestTree: ud2gf failed" + _ -> "bestTree: ud2gf failed" theCatSleepsAlready :: String -theCatSleepsAlready = unlines +theCatSleepsAlready = unlines [ "1\tthe\tthe\tDET\tQuant\tFORM=0\t2\tdet\t_\tFUN=DefArt" , "2\tcat\tcat\tNOUN\tNN\tNumber=Sing\t3\tnsubj\t_\tFUN=cat_N" , "3\tsleeps\tsleep\tVERB\tVBZ\tMood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin\t0\troot\t_\tFUN=sleepVBZ" From 5dc0d8231fc9c443c5bdf37cf3d0da6b93cbf85f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Mon, 22 Nov 2021 13:40:29 +0800 Subject: [PATCH 03/14] ud2gf: Fall back to morphoanalysis if gf parsing fails --- UD2GF.hs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/UD2GF.hs b/UD2GF.hs index 0c71bab..75d75e0 100644 --- a/UD2GF.hs +++ b/UD2GF.hs @@ -458,6 +458,7 @@ combineTrees env = analyseWords :: UDEnv -> DevTree -> DevTree analyseWords env = mapRTree lemma2fun where + morpho = buildMorpho (pgfGrammar env) (actLanguage env) lemma2fun dn = dn { devAbsTrees = [AbsTreeInfo { atiAbsTree = t, atiCat = c, atiUDIds = [devIndex dn]} | (t,c) <- justWords], devStatus = [devIndex dn], @@ -480,6 +481,7 @@ analyseWords env = mapRTree lemma2fun --- this can fail if c is discontinuous, or return false positives if w is a form of another word parseWord w ec = case ec of Left c -> case parse (pgfGrammar env) (actLanguage env) (mkType [] c []) w of + [] -> [(RTree name [], c) | (name, _) <- lookupMorpho morpho w] ts -> [(at,c) | t <- ts, let at = expr2abstree t, all (\f -> M.notMember f (disabledFunctions (cncLabels env))) (allNodesRTree at)] From 0cce2d79eed909c7a0e66a1592960013734dd9f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Mon, 22 Nov 2021 14:25:30 +0800 Subject: [PATCH 04/14] Use correct category for morpho parses --- UD2GF.hs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/UD2GF.hs b/UD2GF.hs index 75d75e0..4d3b145 100644 --- a/UD2GF.hs +++ b/UD2GF.hs @@ -471,7 +471,7 @@ analyseWords env = mapRTree lemma2fun -- find all functions that are possible parses of the word in any appropriate category --- it is still possible that some other category is meant - getWordTrees w cs = case concatMap (parseWord w) cs of + getWordTrees w cs = case morphoFallback w $ concatMap (parseWord w) cs of [] -> case cs of [] -> (True,[(newWordTree w unknownCat, unknownCat)]) _ -> (True,[(newWordTree w ec, ec) | c <- cs, let ec = either id id c]) @@ -481,7 +481,6 @@ analyseWords env = mapRTree lemma2fun --- this can fail if c is discontinuous, or return false positives if w is a form of another word parseWord w ec = case ec of Left c -> case parse (pgfGrammar env) (actLanguage env) (mkType [] c []) w of - [] -> [(RTree name [], c) | (name, _) <- lookupMorpho morpho w] ts -> [(at,c) | t <- ts, let at = expr2abstree t, all (\f -> M.notMember f (disabledFunctions (cncLabels env))) (allNodesRTree at)] @@ -491,6 +490,18 @@ analyseWords env = mapRTree lemma2fun auxWords = [(lemma,cat) | ((fun_,lemma),(cat,labels_)) <- M.assocs (lemmaLabels (cncLabels env))] + -- Fall back to morphoanalysis if gf parse fails + -- TODO: We might want to use the morphoanalysis for all words, not just when parse fails + morphoFallback :: String -> [(RTree Lemma, CId)] -> [(RTree CId, CId)] + morphoFallback w [] = + [(RTree name [], c) + | (name, _) <- lookupMorpho morpho w + , let expr = mkApp name [] + , Right (e,tp) <- pure $ inferExpr (pgfGrammar env) expr + , ([], c, []) <- pure $ unType tp + ] + morphoFallback _ xs = xs + -- auxiliaries newWordTree w c = RTree (mkCId (w ++ "_" ++ showCId c)) [] --- isNewWordFun f = isInfixOf "__x__" (showCId f) From f1dc8aaa1f0570c7f51a406b87bd4f0c5302a2a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Mon, 22 Nov 2021 14:26:05 +0800 Subject: [PATCH 05/14] Use functionType instead of inferType --- UD2GF.hs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/UD2GF.hs b/UD2GF.hs index 4d3b145..a656836 100644 --- a/UD2GF.hs +++ b/UD2GF.hs @@ -496,8 +496,7 @@ analyseWords env = mapRTree lemma2fun morphoFallback w [] = [(RTree name [], c) | (name, _) <- lookupMorpho morpho w - , let expr = mkApp name [] - , Right (e,tp) <- pure $ inferExpr (pgfGrammar env) expr + , Just tp <- pure $ functionType (pgfGrammar env) name , ([], c, []) <- pure $ unType tp ] morphoFallback _ xs = xs From 1bb37e695b872062ad2535ffa79b29614ce78adf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Tue, 23 Nov 2021 10:49:11 +0800 Subject: [PATCH 06/14] Use GF string literals for unknown words --- GFConcepts.hs | 1 + UD2GF.hs | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/GFConcepts.hs b/GFConcepts.hs index 898362e..7446a9e 100644 --- a/GFConcepts.hs +++ b/GFConcepts.hs @@ -42,6 +42,7 @@ expr2abstree e = case unApp e of _ -> error ("ERROR: no constructor tree from " ++ showExpr [] e) abstree2expr :: AbsTree -> PGF.Expr +abstree2expr tr@(RTree f []) | [(str,"")] <- reads (showCId f) = mkStr str abstree2expr tr@(RTree f ts) = mkApp f (map abstree2expr ts) postOrderRTree :: RTree a -> RTree (a,Int) diff --git a/UD2GF.hs b/UD2GF.hs index a656836..1bac6f2 100644 --- a/UD2GF.hs +++ b/UD2GF.hs @@ -502,7 +502,8 @@ analyseWords env = mapRTree lemma2fun morphoFallback _ xs = xs -- auxiliaries -newWordTree w c = RTree (mkCId (w ++ "_" ++ showCId c)) [] --- +-- newWordTree w c = RTree (mkCId (w ++ "_" ++ showCId c)) [] --- +newWordTree w c = RTree (mkCId ("Str" ++ showCId c)) [RTree (mkCId (show w)) []] --- isNewWordFun f = isInfixOf "__x__" (showCId f) unknownCat = mkCId "Adv" --- treat unknown words as adverbs ---- TODO: from config quote s = "\"" ++ s ++ "\"" From 522b61129ea36005ebac2583bc3a6b84600ec1aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Tue, 23 Nov 2021 11:04:37 +0800 Subject: [PATCH 07/14] Don't wrap string literals in double quotes This reduces the problem with showCId escaping everything --- GFConcepts.hs | 4 +++- UD2GF.hs | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/GFConcepts.hs b/GFConcepts.hs index 7446a9e..d75594e 100644 --- a/GFConcepts.hs +++ b/GFConcepts.hs @@ -39,10 +39,12 @@ pgf2functions pgf = [(fun,(val,[arg | (_,_,ty) <- hs, let (_,arg,_) = unType ty] expr2abstree :: PGF.Expr -> AbsTree expr2abstree e = case unApp e of Just (f,es) -> RTree f (map expr2abstree es) + -- _ | Just q <- unStr e -> RTree (mkCId "StrLit") [RTree (mkCId (show q)) []] _ -> error ("ERROR: no constructor tree from " ++ showExpr [] e) abstree2expr :: AbsTree -> PGF.Expr -abstree2expr tr@(RTree f []) | [(str,"")] <- reads (showCId f) = mkStr str +-- TODO: showCId escapes things more than I would like. +abstree2expr tr@(RTree f []) | Just str <- stripPrefix "__strlit__" (showCId f) = mkStr str abstree2expr tr@(RTree f ts) = mkApp f (map abstree2expr ts) postOrderRTree :: RTree a -> RTree (a,Int) diff --git a/UD2GF.hs b/UD2GF.hs index 1bac6f2..7649595 100644 --- a/UD2GF.hs +++ b/UD2GF.hs @@ -503,7 +503,7 @@ analyseWords env = mapRTree lemma2fun -- auxiliaries -- newWordTree w c = RTree (mkCId (w ++ "_" ++ showCId c)) [] --- -newWordTree w c = RTree (mkCId ("Str" ++ showCId c)) [RTree (mkCId (show w)) []] --- +newWordTree w c = RTree (mkCId ("Str" ++ showCId c)) [RTree (mkCId ("__strlit__" ++ w)) []] --- isNewWordFun f = isInfixOf "__x__" (showCId f) unknownCat = mkCId "Adv" --- treat unknown words as adverbs ---- TODO: from config quote s = "\"" ++ s ++ "\"" From 97b02f807ee8c84ee610b70a64a22438e9b56a9c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Tue, 23 Nov 2021 11:22:00 +0800 Subject: [PATCH 08/14] Don't treat strlits as unknown words --- UD2GF.hs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/UD2GF.hs b/UD2GF.hs index 7649595..c60d6aa 100644 --- a/UD2GF.hs +++ b/UD2GF.hs @@ -181,7 +181,7 @@ prCheckResult cr = unlines $ checkAbsTreeResult :: UDEnv -> AbsTree -> CheckResult checkAbsTreeResult env t = CheckResult { resultTree = mt, - resultUnknowns = [f | f <- allNodesRTree t, Nothing <- [functionType pgf f]], + resultUnknowns = [f | f <- allNodesRTree t, "__strlit__" `isPrefixOf` showCId f, Nothing <- [functionType pgf f]], resultMessage = msg } where From ae5a40e784eab5ddc470b17125a6317c6bd81123 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Tue, 23 Nov 2021 11:30:42 +0800 Subject: [PATCH 09/14] Refactor and fix bug --- GFConcepts.hs | 9 +++++++-- UD2GF.hs | 4 ++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/GFConcepts.hs b/GFConcepts.hs index d75594e..c13a448 100644 --- a/GFConcepts.hs +++ b/GFConcepts.hs @@ -44,9 +44,14 @@ expr2abstree e = case unApp e of abstree2expr :: AbsTree -> PGF.Expr -- TODO: showCId escapes things more than I would like. -abstree2expr tr@(RTree f []) | Just str <- stripPrefix "__strlit__" (showCId f) = mkStr str +abstree2expr tr@(RTree f []) | Just str <- asStringLiteral f = mkStr str abstree2expr tr@(RTree f ts) = mkApp f (map abstree2expr ts) +-- | Check if a CId is actually a string literal in disguise +asStringLiteral :: CId -> Maybe String +asStringLiteral f = stripPrefix stringLiteralPrefix (showCId f) +stringLiteralPrefix = "__strlit__" + postOrderRTree :: RTree a -> RTree (a,Int) postOrderRTree = post 0 where @@ -54,7 +59,7 @@ postOrderRTree = post 0 where post n t = case t of RTree a ts -> case posts n ts of (nts,nn) -> RTree (a,nn) nts - + posts :: Int -> [RTree a] -> ([RTree (a,Int)],Int) posts n ts = case ts of [] -> ([],n) diff --git a/UD2GF.hs b/UD2GF.hs index c60d6aa..9bb8ba3 100644 --- a/UD2GF.hs +++ b/UD2GF.hs @@ -181,7 +181,7 @@ prCheckResult cr = unlines $ checkAbsTreeResult :: UDEnv -> AbsTree -> CheckResult checkAbsTreeResult env t = CheckResult { resultTree = mt, - resultUnknowns = [f | f <- allNodesRTree t, "__strlit__" `isPrefixOf` showCId f, Nothing <- [functionType pgf f]], + resultUnknowns = [f | f <- allNodesRTree t, Nothing <- [asStringLiteral f], Nothing <- [functionType pgf f]], resultMessage = msg } where @@ -503,7 +503,7 @@ analyseWords env = mapRTree lemma2fun -- auxiliaries -- newWordTree w c = RTree (mkCId (w ++ "_" ++ showCId c)) [] --- -newWordTree w c = RTree (mkCId ("Str" ++ showCId c)) [RTree (mkCId ("__strlit__" ++ w)) []] --- +newWordTree w c = RTree (mkCId ("Str" ++ showCId c)) [RTree (mkCId (stringLiteralPrefix ++ w)) []] --- isNewWordFun f = isInfixOf "__x__" (showCId f) unknownCat = mkCId "Adv" --- treat unknown words as adverbs ---- TODO: from config quote s = "\"" ++ s ++ "\"" From 06392217ea3170f7ad127b560ca177d77407ec60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Wed, 1 Dec 2021 12:03:49 +0800 Subject: [PATCH 10/14] Unescape string literals that were put inside CIds --- GFConcepts.hs | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/GFConcepts.hs b/GFConcepts.hs index c13a448..137a822 100644 --- a/GFConcepts.hs +++ b/GFConcepts.hs @@ -43,13 +43,24 @@ expr2abstree e = case unApp e of _ -> error ("ERROR: no constructor tree from " ++ showExpr [] e) abstree2expr :: AbsTree -> PGF.Expr --- TODO: showCId escapes things more than I would like. abstree2expr tr@(RTree f []) | Just str <- asStringLiteral f = mkStr str abstree2expr tr@(RTree f ts) = mkApp f (map abstree2expr ts) -- | Check if a CId is actually a string literal in disguise asStringLiteral :: CId -> Maybe String -asStringLiteral f = stripPrefix stringLiteralPrefix (showCId f) +asStringLiteral f = stripPrefix stringLiteralPrefix $ unescape (showCId f) + +-- | showCId will escape non-valid literals, so we need to unescape them +unescape :: String -> String +unescape ('\'':str) = unescapeBackslashes str +unescape str = str + +unescapeBackslashes :: String -> String +unescapeBackslashes ('\\':x:xs) = x : unescapeBackslashes xs +unescapeBackslashes ['\''] = "" +unescapeBackslashes (x:xs) = x : unescapeBackslashes xs +unescapeBackslashes "" = error "missing final endquote" + stringLiteralPrefix = "__strlit__" postOrderRTree :: RTree a -> RTree (a,Int) From 871014000a8b6f32e8181e3e9aaef5f45d4a76e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Wed, 1 Dec 2021 15:17:46 +0800 Subject: [PATCH 11/14] Use the word instead of lemma for morphoanalyze --- UD2GF.hs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/UD2GF.hs b/UD2GF.hs index 9bb8ba3..4003616 100644 --- a/UD2GF.hs +++ b/UD2GF.hs @@ -465,13 +465,13 @@ analyseWords env = mapRTree lemma2fun devIsUnknown = isUnknown } where - (isUnknown,justWords) = getWordTrees (devLemma dn) (cats (devPOS dn)) + (isUnknown,justWords) = getWordTrees (devWord dn) (devLemma dn) (cats (devPOS dn)) cats pos = maybe [] (map (either (Left. fst) Right)) $ M.lookup pos (catsForPOS env) -- find all functions that are possible parses of the word in any appropriate category --- it is still possible that some other category is meant - getWordTrees w cs = case morphoFallback w $ concatMap (parseWord w) cs of + getWordTrees wf w cs = case morphoFallback wf $ concatMap (parseWord w) cs of [] -> case cs of [] -> (True,[(newWordTree w unknownCat, unknownCat)]) _ -> (True,[(newWordTree w ec, ec) | c <- cs, let ec = either id id c]) From 4a820c618bcdb283028c06c91d0840369d2bd082 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Wed, 1 Dec 2021 16:17:53 +0800 Subject: [PATCH 12/14] Only generate Str fallbacks if the functions exist in the grammar --- UD2GF.hs | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/UD2GF.hs b/UD2GF.hs index 4003616..ea50fc1 100644 --- a/UD2GF.hs +++ b/UD2GF.hs @@ -474,10 +474,14 @@ analyseWords env = mapRTree lemma2fun getWordTrees wf w cs = case morphoFallback wf $ concatMap (parseWord w) cs of [] -> case cs of [] -> (True,[(newWordTree w unknownCat, unknownCat)]) - _ -> (True,[(newWordTree w ec, ec) | c <- cs, let ec = either id id c]) + _ -> (True,[(newWordTree w ec, ec) | c <- cs, let ec = either id id c, strFunExists ec]) fs -> (False,fs) + strFunExists c | Just typ <- functionType (pgfGrammar env) f = True + | otherwise = False + where f = mkCId ("Str" ++ showCId c) + --- this can fail if c is discontinuous, or return false positives if w is a form of another word parseWord w ec = case ec of Left c -> case parse (pgfGrammar env) (actLanguage env) (mkType [] c []) w of From 74185a1ec4199849b772c61d96e597432e557080 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Thu, 2 Dec 2021 10:57:03 +0800 Subject: [PATCH 13/14] If no str functions exists, give all of them This prevents a Prelude.head error form occurring --- UD2GF.hs | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/UD2GF.hs b/UD2GF.hs index ea50fc1..6838af3 100644 --- a/UD2GF.hs +++ b/UD2GF.hs @@ -474,10 +474,17 @@ analyseWords env = mapRTree lemma2fun getWordTrees wf w cs = case morphoFallback wf $ concatMap (parseWord w) cs of [] -> case cs of [] -> (True,[(newWordTree w unknownCat, unknownCat)]) - _ -> (True,[(newWordTree w ec, ec) | c <- cs, let ec = either id id c, strFunExists ec]) + _ -> (True,[(newWordTree w ec, ec) | c <- cs, let ec = either id id c, strFunExists ec] + `ifEmpty` [(newWordTree w ec, ec) | c <- cs, let ec = either id id c]) fs -> (False,fs) + -- | Return the first non-empty list + ifEmpty [] xs = xs + ifEmpty xs _ = xs + infixl 3 `ifEmpty` + + -- Verify that a StrSomeCat function exists in grammar strFunExists c | Just typ <- functionType (pgfGrammar env) f = True | otherwise = False where f = mkCId ("Str" ++ showCId c) From d94a5fc26b1783198e0252302e1975c763c0497d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20K=C3=A4llberg?= Date: Thu, 2 Dec 2021 11:00:26 +0800 Subject: [PATCH 14/14] If parse fails, try parsing the lowercase version --- UD2GF.hs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/UD2GF.hs b/UD2GF.hs index 6838af3..47f7398 100644 --- a/UD2GF.hs +++ b/UD2GF.hs @@ -471,7 +471,7 @@ analyseWords env = mapRTree lemma2fun -- find all functions that are possible parses of the word in any appropriate category --- it is still possible that some other category is meant - getWordTrees wf w cs = case morphoFallback wf $ concatMap (parseWord w) cs of + getWordTrees wf w cs = case concatMap (parseWord w) cs `ifEmpty` concatMap (parseWord (map toLower w)) cs `ifEmpty` morphoFallback wf of [] -> case cs of [] -> (True,[(newWordTree w unknownCat, unknownCat)]) _ -> (True,[(newWordTree w ec, ec) | c <- cs, let ec = either id id c, strFunExists ec] @@ -482,7 +482,7 @@ analyseWords env = mapRTree lemma2fun -- | Return the first non-empty list ifEmpty [] xs = xs ifEmpty xs _ = xs - infixl 3 `ifEmpty` + infixr 3 `ifEmpty` -- Verify that a StrSomeCat function exists in grammar strFunExists c | Just typ <- functionType (pgfGrammar env) f = True @@ -503,14 +503,13 @@ analyseWords env = mapRTree lemma2fun -- Fall back to morphoanalysis if gf parse fails -- TODO: We might want to use the morphoanalysis for all words, not just when parse fails - morphoFallback :: String -> [(RTree Lemma, CId)] -> [(RTree CId, CId)] - morphoFallback w [] = + morphoFallback :: String -> [(RTree CId, CId)] + morphoFallback w = [(RTree name [], c) | (name, _) <- lookupMorpho morpho w , Just tp <- pure $ functionType (pgfGrammar env) name , ([], c, []) <- pure $ unType tp ] - morphoFallback _ xs = xs -- auxiliaries -- newWordTree w c = RTree (mkCId (w ++ "_" ++ showCId c)) [] ---