From 7f8d8ecadc917621686ae60d7665f25102f0a99e Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 7 Dec 2024 19:21:40 -0500 Subject: [PATCH 1/3] Add list of box operators and operator-to-string These can be used in the scanner and parser --- mathics_scanner/data/named-characters.yml | 1 - mathics_scanner/data/operators-additional.yml | 121 ------------------ mathics_scanner/data/operators.yml | 104 +++++++-------- .../generate/build_operator_tables.py | 14 ++ 4 files changed, 67 insertions(+), 173 deletions(-) delete mode 100644 mathics_scanner/data/operators-additional.yml diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml index 73654fd..59fec86 100644 --- a/mathics_scanner/data/named-characters.yml +++ b/mathics_scanner/data/named-characters.yml @@ -14,7 +14,6 @@ # 5. Unicode symbols cannot be overloaded, i.e. should not be used for more than one underlying function. # For example, ≫ (U+226B, "Much Greater-Than") is already used for GreaterGreater and therefore should not be an alias for >> for Put. # Likewise, ≪ (U+226A, "Much Less-Than") for Get, ∷ (U+2237, "Proportion") for MessageName, etc. 
- # # Field definitions # ================= diff --git a/mathics_scanner/data/operators-additional.yml b/mathics_scanner/data/operators-additional.yml deleted file mode 100644 index 00db270..0000000 --- a/mathics_scanner/data/operators-additional.yml +++ /dev/null @@ -1,121 +0,0 @@ -# -# Additional information not in CSV or -# Note: we keep the misspelling of "meaningfull" -# and the uncoverted types like None and True - -ApplyTo: - actual-precedence: 75 - Precedence: 75 # CSV has 604 which is wrong - Precedence-corrected: 75 - Precedence-Function: 75 - WolframLanguageData: 76 - WolframLanguageData-corrected: 75 - UnicodeCharacters.tr: - UnicodeCharacters-corrected.tr: 75 - arity: Binary - affix: Infix - associativity: left - meaningfull: "true" - # comments: - -Derivative: - actual-precedence: 770 - Precedence: 670 # CSV has 604 which is wrong - Precedence-corrected: 670 - Precedence-Function: 670 - WolframLanguageData: 19 - WolframLanguageData-corrected: 19 - UnicodeCharacters.tr: - UnicodeCharacters-corrected.tr: 680 - # N-tokens: {} - # L-tokens: {"''"} - # O-tokens: {} - # usage: {{"expr", "''"}} - # parse: {"Derivative", "[", "n", "]", "[", "expr", "]"} - # FullForm: Derivative[n][expr] - arity: Unary - affix: Postfix - associativity: left - meaningfull: "true" - # comments: - -Information: - actual-precedence: 670 - Precedence: 670 - Precedence-corrected: 670 - WolframLanguageData: null - WolframLanguageData-corrected: - UnicodeCharacters.tr: - UnicodeCharacters-corrected.tr: - # N-tokens: - # L-tokens: - # O-tokens: - # usage: "?? 
AddTo" - # parse: {"Information", "[", "AddTo", "]"} - # FullForm: Information[AddTo] - arity: Binary - affix: Infix - associativity: None - meaningfull: "true" - # comments: - -# This operator is a little sketchy -InterpretedBox: - actual-precedence: 670 - Precedence: 670 - Precedence-corrected: 670 - WolframLanguageData: null - WolframLanguageData-corrected: - UnicodeCharacters.tr: - UnicodeCharacters-corrected.tr: - # N-tokens: - # L-tokens: - # O-tokens: - # usage: "\! \(2+2\)" - # parse: {"FullForm", "[", "expr1", "]"} - # FullForm: FullForm[expr] - arity: Binary - affix: Infix - associativity: None - meaningfull: "true" - # comments: - -Postfix: - actual-precedence: 640 - Precedence: 640 - Precedence-corrected: 640 - WolframLanguageData: null - WolframLanguageData-corrected: - UnicodeCharacters.tr: - UnicodeCharacters-corrected.tr: - # N-tokens: - # L-tokens: - # O-tokens: - usage: "expr // FormName" - # parse: {"FullForm", "[", "expr1", "]"} - # FullForm: FullForm[expr] - arity: Binary - affix: Infix - associativity: None - meaningfull: "true" - # comments: - -Prefix: - actual-precedence: 640 - Precedence: 640 - Precedence-corrected: 640 - WolframLanguageData: null - WolframLanguageData-corrected: - UnicodeCharacters.tr: - UnicodeCharacters-corrected.tr: - # N-tokens: - # L-tokens: - # O-tokens: - usage: "expr1 @ expr2" - # parse: {"expr1", "[", "expr2", "]"} - # FullForm: expr1[expr2] - arity: Binary - affix: Infix - associativity: None - meaningfull: "true" - # comments: diff --git a/mathics_scanner/data/operators.yml b/mathics_scanner/data/operators.yml index 7211a22..3b53c54 100644 --- a/mathics_scanner/data/operators.yml +++ b/mathics_scanner/data/operators.yml @@ -111,6 +111,19 @@ # "non-associative-operators", "unknown" under the key "miscellaneous-operators", # and None as "flat_binary_operators. # +# box-operator +# ------------ +# +# This field exists and is set true if the operator is used in boxing. 
Boxing +# operators are enclosed in \( \) pairs. +# +# operator +# -------- +# +# This field exists for box-operators. In the future, we may expand to other operators. +# (Non box operators are listed in fields of named-characters.) +# It is the string value for the operator. + # meaningful # --------- # This field "true" if WMA defines a meaning for the operator and "false" if not. @@ -1911,13 +1924,12 @@ FormBox: WolframLanguageData-corrected: 78 UnicodeCharacters.tr: UnicodeCharacters-corrected.tr: 160 - # N-tokens: {} - # L-tokens: {"\\`"} - # O-tokens: {} - usage: "expr1 \\ expr2" + operator: "\\`" + usage: ["\\(`input\\)", "\\(form\\`input\\)"] arity: Binary affix: Infix associativity: "unknown" + box-operator: true meaningful: true # comments: @@ -1928,13 +1940,12 @@ FractionBox: WolframLanguageData-corrected: 31 UnicodeCharacters.tr: UnicodeCharacters-corrected.tr: 590 - # N-tokens: {} - # L-tokens: {"\/"} - # O-tokens: {} - usage: "\\( expr1 \/ expr2 \\)" + operator: "\/" + usage: "\\(x\/y\\)" arity: Binary affix: Infix associativity: "unknown" + box-operator: true meaningful: true # comments: @@ -2445,14 +2456,13 @@ InterpretedBox: WolframLanguageData-corrected: None UnicodeCharacters.tr: None UnicodeCharacters-corrected.tr: None - # N-tokens: None - # L-tokens: None - # O-tokens: None - # usage: "None" + operator: "\\!" 
+ # usage: "\\!\(...\)" FullForm: None - arity: Binary - affix: Infix + arity: Unary + affix: Prefix associativity: null + box-operator: true meaningful: true # comments: None @@ -4511,13 +4521,12 @@ OverscriptBox: WolframLanguageData-corrected: 7 UnicodeCharacters.tr: UnicodeCharacters-corrected.tr: 790 - # N-tokens: {} - # L-tokens: {"\&"} - # O-tokens: {} - # usage: "expr1 \& expr2" + operator: "\\&" + usage: "\\(x\\&y\\)" arity: Binary affix: Infix associativity: "unknown" + box-operator: true meaningful: true # comments: @@ -4528,12 +4537,11 @@ OverunderscriptBox: WolframLanguageData-corrected: 7.5 UnicodeCharacters.tr: UnicodeCharacters-corrected.tr: 785 - # N-tokens: {} - # L-tokens: {"\&"} - # O-tokens: {"\%"} - # usage: "expr1", "\&", "expr2 \% expr3" + operator: ["\\+", "\\%"] + usage: "\\(x\\+y\\%z\\)" arity: Ternary affix: Infix + box-operator: true associativity: "unknown" meaningful: true # comments: @@ -5079,13 +5087,12 @@ RadicalBox: WolframLanguageData-corrected: 22 UnicodeCharacters.tr: UnicodeCharacters-corrected.tr: 640 - # N-tokens: {"\@"} - # L-tokens: {} - # O-tokens: {"\%"} - # usage: "\@", "expr1 \% expr2" + operator: ["\\@", "\\%"] + usage: "\\(\\@x\\%n\\)" FullForm: - arity: Binary + arity: Ternary affix: Prefix + box-operator: true associativity: right meaningful: true # comments: @@ -5868,14 +5875,13 @@ SqrtBox: WolframLanguageData-corrected: 22 UnicodeCharacters.tr: 650 UnicodeCharacters-corrected.tr: 640 - # N-tokens: {"\@"} - # L-tokens: {} - # O-tokens: {} - # usage: "{{"\@", "expr"}}" + operator: "\\@" + usage: "\\(\\@x\\)" FullForm: arity: Unary affix: Prefix associativity: "unknown" + box-operator: true meaningful: true # comments: @@ -6274,20 +6280,19 @@ SuperDagger: meaningful: true # comments: -SuperscriptBox: +SubSuperscriptBox: precedence: 590 WolframLanguageData: 21 WolframLanguageData-corrected: 21 UnicodeCharacters.tr: UnicodeCharacters-corrected.tr: 660 - # N-tokens: {} - # L-tokens: {"\^"} - # O-tokens: {} - # usage: 
"expr1 \^ expr2" + operator: ["\_", "\\%"] + usage: "\\(x\_y\\%z\\)" FullForm: - arity: Binary + arity: Ternary affix: Infix associativity: "unknown" + box-operator: true meaningful: true # comments: @@ -6325,21 +6330,20 @@ SupersetEqual: meaningful: false # comments: -SupersubscriptBox: +SupercriptBox: Precedence-Function: 690 precedence: 590 WolframLanguageData: 21 WolframLanguageData-corrected: 21 UnicodeCharacters.tr: UnicodeCharacters-corrected.tr: 660 - # N-tokens: {} - # L-tokens: {"\^"} - # O-tokens: {"\%"} - # usage: "expr1", "\^", "expr2 \% expr3" + operator: "\\^" + usage: "\\(x\\^y\\)" FullForm: - arity: Ternary + arity: Binary affix: Infix associativity: right + box-operator: true meaningful: true # comments: @@ -6666,14 +6670,13 @@ UnderoverscriptBox: WolframLanguageData-corrected: 7.5 UnicodeCharacters.tr: UnicodeCharacters-corrected.tr: 785 - # N-tokens: {} - # L-tokens: {"\+"} - # O-tokens: {"\%"} - # usage: "expr1", "\+", "expr2 \% expr3" + operator: ["\\+", "\\%"] + usage: "\\(x\\+y\\%z\\)" FullForm: arity: Ternary affix: Infix associativity: "unknown" + box-operator: true meaningful: true # comments: @@ -6684,13 +6687,12 @@ UnderscriptBox: WolframLanguageData-corrected: 7 UnicodeCharacters.tr: UnicodeCharacters-corrected.tr: 790 - # N-tokens: {} - # L-tokens: {"\+"} - # O-tokens: {} - usage: "expr1 \\+ expr2" + operator: "\\+" + usage: "\\(x\\+y\\)" FullForm: arity: Binary affix: Infix + box-operator: true associativity: "unknown" meaningful: true # comments: diff --git a/mathics_scanner/generate/build_operator_tables.py b/mathics_scanner/generate/build_operator_tables.py index fedafa4..94e979e 100755 --- a/mathics_scanner/generate/build_operator_tables.py +++ b/mathics_scanner/generate/build_operator_tables.py @@ -5,6 +5,7 @@ import json import os.path as osp import sys +from collections import defaultdict from pathlib import Path from typing import Dict @@ -53,6 +54,7 @@ def compile_tables( for k, v in operator_data.items(): 
operator_precedence[k] = v["precedence"] + box_operators = {} flat_binary_operators = {} left_binary_operators = {} miscellaneous_operators = {} @@ -60,6 +62,7 @@ def compile_tables( no_meaning_postfix_operators = {} no_meaning_prefix_operators = {} nonassoc_binary_operators = {} + operator2string = defaultdict(list) postfix_operators = {} prefix_operators = {} right_binary_operators = {} @@ -96,6 +99,9 @@ def compile_tables( elif affix == "Postfix": operator_dict = postfix_operators + if operator_info.get("box-operator", False): + box_operators[operator_name] = operator_info["operator"] + # operator_dict tables are tied into the Mathics3 # parser. Extend this table, for example to # include the operator unicode, requires @@ -108,6 +114,12 @@ def compile_tables( continue unicode_char = character_info.get("unicode-equivalent", "no-unicode") + ascii_chars = character_info.get("ascii", "no-ascii") + + if unicode_char != "no-unicode": + operator2string[operator_name].append(unicode_char) + if ascii_chars != "no-ascii": + operator2string[operator_name].append(ascii_chars) if operator_info.get("meaningful", True) is False and ( character_data.get(operator_name) @@ -128,6 +140,7 @@ def compile_tables( print(f"FIXME: affix {affix} of {operator_name} not handled") return { + "box-operators": box_operators, "flat-binary-operators": flat_binary_operators, "left-binary-operators": left_binary_operators, "miscellaneous-operators": miscellaneous_operators, @@ -135,6 +148,7 @@ def compile_tables( "no-meaning-postfix-operators": no_meaning_postfix_operators, "no-meaning-prefix-operators": no_meaning_prefix_operators, "non-associative-binary-operators": nonassoc_binary_operators, + "operator-to_string": operator2string, "operator-precedence": operator_precedence, "postfix-operators": postfix_operators, "prefix-operators": prefix_operators, From 8d4aa62af64e5c7fbfc1e65555344a46d9bff65a Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 7 Dec 2024 20:05:03 -0500 Subject: [PATCH 2/3] Get 
box operator for scanner from JSON --- mathics_scanner/data/operators.yml | 40 ++++++++++++++---------------- mathics_scanner/tokeniser.py | 32 ++++++++---------------- 2 files changed, 29 insertions(+), 43 deletions(-) diff --git a/mathics_scanner/data/operators.yml b/mathics_scanner/data/operators.yml index 3b53c54..0b7a1f3 100644 --- a/mathics_scanner/data/operators.yml +++ b/mathics_scanner/data/operators.yml @@ -6062,14 +6062,13 @@ SubscriptBox: WolframLanguageData-corrected: 8 UnicodeCharacters.tr: UnicodeCharacters-corrected.tr: 775 - # N-tokens: {} - # L-tokens: {"\_"} - # O-tokens: {} - # usage: "expr1 \_ expr2" + operator: "\\_" + usage: "\\(x\\\_y\\)" FullForm: arity: Binary affix: Infix associativity: "unknown" + box-operator: true meaningful: true # comments: @@ -6313,6 +6312,22 @@ Superset: meaningful: false # comments: +SuperscriptBox: + precedence: 590 + WolframLanguageData: 21 + WolframLanguageData-corrected: 21 + UnicodeCharacters.tr: + UnicodeCharacters-corrected.tr: 660 + operator: "\\^" + usage: "\\(x\\^_y\\)" + FullForm: + arity: Binary + affix: Infix + associativity: "unknown" + box-operator: true + meaningful: true + # comments: + SupersetEqual: precedence: 250 WolframLanguageData: @@ -6330,23 +6345,6 @@ SupersetEqual: meaningful: false # comments: -SupercriptBox: - Precedence-Function: 690 - precedence: 590 - WolframLanguageData: 21 - WolframLanguageData-corrected: 21 - UnicodeCharacters.tr: - UnicodeCharacters-corrected.tr: 660 - operator: "\\^" - usage: "\\(x\\^y\\)" - FullForm: - arity: Binary - affix: Infix - associativity: right - box-operator: true - meaningful: true - # comments: - TagSet: Precedence-Function: 670 precedence: 40 diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index 442b434..cdaac50 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -109,18 +109,6 @@ def init_module(): # ("LeftRowBox", r" \\\( "), ("RightRowBox", r" \\\) "), - # Box Operators which are valid only 
inside Box delimiters - ("InterpretedBox", r" \\\! "), - ("SuperscriptBox", r" \\\^ "), - ("SubscriptBox", r" \\\_ "), - ("OverscriptBox", r" \\\& "), - ("UnderscriptBox", r" \\\+ "), - ("OtherscriptBox", r" \\\% "), - ("FractionBox", r" \\\/ "), - ("SqrtBox", r" \\\@ "), - ("RadicalBox", r" \\\@ "), - ("FormBox", r" \\\` "), - # # End Box Operators # ("Information", r"\?\?"), @@ -222,11 +210,11 @@ def init_module(): ("VerticalSeparator", r" \uF432 "), ] - for table in ("no-meaning-infix-operators",): + for table in ("box-operators", "no-meaning-infix-operators"): table_info = OPERATOR_DATA[table] for operator_name, unicode in table_info.items(): - # if any([tup[0] == operator_name for tup in tokens]): - # print(f"Please remove {operator_name}") + if any([tup[0] == operator_name for tup in tokens]): + print(f"Please remove {operator_name}") tokens.append((operator_name, f" {unicode} ")) literal_tokens = { @@ -273,17 +261,17 @@ def init_module(): "\\": [ "LeftRowBox", "RightRowBox", + "FormBox", + "FractionBox", "InterpretedBox", - "SuperscriptBox", - "SubscriptBox", + "OverunderscriptBox", "OverscriptBox", - "UnderscriptBox", - "OtherscriptBox", - "FractionBox", - "SqrtBox", "RadicalBox", - "FormBox", "RawBackslash", + "SqrtBox", + "SubscriptBox", + "SuperscriptBox", + "UnderscriptBox", ], "]": ["RawRightBracket"], "^": ["UpSetDelayed", "UpSet", "Power"], From dee31211f62393d18255a3419e975d979ce6461d Mon Sep 17 00:00:00 2001 From: rocky Date: Sat, 7 Dec 2024 20:41:11 -0500 Subject: [PATCH 3/3] Handle boxing ternary operators They list two operators. We just need the first of these in "tokens". 
--- mathics_scanner/tokeniser.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py index cdaac50..798729a 100644 --- a/mathics_scanner/tokeniser.py +++ b/mathics_scanner/tokeniser.py @@ -210,12 +210,18 @@ def init_module(): ("VerticalSeparator", r" \uF432 "), ] - for table in ("box-operators", "no-meaning-infix-operators"): - table_info = OPERATOR_DATA[table] + for table_name in ("box-operators", "no-meaning-infix-operators"): + table_info = OPERATOR_DATA[table_name] for operator_name, unicode in table_info.items(): - if any([tup[0] == operator_name for tup in tokens]): - print(f"Please remove {operator_name}") - tokens.append((operator_name, f" {unicode} ")) + # if any([tup[0] == operator_name for tup in tokens]): + # print(f"Please remove {operator_name}") + + # Ternary operators have two character symbols + # in a list. For tokens, we just want the first + # of the pair + if isinstance(unicode, list): + unicode = unicode[0] + tokens.append((operator_name, rf" {unicode} ")) literal_tokens = { "!": ["Unequal", "Factorial2", "Factorial"], @@ -264,7 +270,6 @@ def init_module(): "FormBox", "FractionBox", "InterpretedBox", - "OverunderscriptBox", "OverscriptBox", "RadicalBox", "RawBackslash", @@ -272,6 +277,7 @@ def init_module(): "SubscriptBox", "SuperscriptBox", "UnderscriptBox", + "UnderoverscriptBox", ], "]": ["RawRightBracket"], "^": ["UpSetDelayed", "UpSet", "Power"],