From 7f8d8ecadc917621686ae60d7665f25102f0a99e Mon Sep 17 00:00:00 2001
From: rocky <rb@dustyfeet.com>
Date: Sat, 7 Dec 2024 19:21:40 -0500
Subject: [PATCH 1/3] Add list of box operators and operator-to-string

These can be used in the scanner and parser
---
 mathics_scanner/data/named-characters.yml     |   1 -
 mathics_scanner/data/operators-additional.yml | 121 ------------------
 mathics_scanner/data/operators.yml            | 104 +++++++--------
 .../generate/build_operator_tables.py         |  14 ++
 4 files changed, 67 insertions(+), 173 deletions(-)
 delete mode 100644 mathics_scanner/data/operators-additional.yml

diff --git a/mathics_scanner/data/named-characters.yml b/mathics_scanner/data/named-characters.yml
index 73654fd..59fec86 100644
--- a/mathics_scanner/data/named-characters.yml
+++ b/mathics_scanner/data/named-characters.yml
@@ -14,7 +14,6 @@
 # 5. Unicode symbols cannot be overloaded, i.e. should not be used for more than one underlying function.
 #    For example, ≫ (U+226B, "Much Greater-Than") is already used for GreaterGreater and therefore should not be an alias for >> for Put.
 #    Likewise, ≪ (U+226A, "Much Less-Than") for Get, ∷ (U+2237, "Proportion") for MessageName, etc.
-
 #
 # Field definitions
 # =================
diff --git a/mathics_scanner/data/operators-additional.yml b/mathics_scanner/data/operators-additional.yml
deleted file mode 100644
index 00db270..0000000
--- a/mathics_scanner/data/operators-additional.yml
+++ /dev/null
@@ -1,121 +0,0 @@
-#
-# Additional information not in CSV or
-# Note: we keep the misspelling of "meaningfull"
-# and the uncoverted types like None and True
-
-ApplyTo:
-  actual-precedence: 75
-  Precedence: 75  # CSV has 604 which is wrong
-  Precedence-corrected: 75
-  Precedence-Function: 75
-  WolframLanguageData: 76
-  WolframLanguageData-corrected: 75
-  UnicodeCharacters.tr:
-  UnicodeCharacters-corrected.tr: 75
-  arity: Binary
-  affix: Infix
-  associativity: left
-  meaningfull: "true"
-  # comments:
-
-Derivative:
-  actual-precedence: 770
-  Precedence: 670  # CSV has 604 which is wrong
-  Precedence-corrected: 670
-  Precedence-Function: 670
-  WolframLanguageData: 19
-  WolframLanguageData-corrected: 19
-  UnicodeCharacters.tr:
-  UnicodeCharacters-corrected.tr: 680
-  # N-tokens: {}
-  # L-tokens: {"''"}
-  # O-tokens: {}
-  # usage: {{"expr", "''"}}
-  # parse: {"Derivative", "[", "n", "]", "[", "expr", "]"}
-  # FullForm: Derivative[n][expr]
-  arity: Unary
-  affix: Postfix
-  associativity: left
-  meaningfull: "true"
-  # comments:
-
-Information:
-  actual-precedence: 670
-  Precedence: 670
-  Precedence-corrected: 670
-  WolframLanguageData: null
-  WolframLanguageData-corrected:
-  UnicodeCharacters.tr:
-  UnicodeCharacters-corrected.tr:
-  # N-tokens:
-  # L-tokens:
-  # O-tokens:
-  # usage: "?? AddTo"
-  # parse: {"Information", "[", "AddTo",  "]"}
-  # FullForm: Information[AddTo]
-  arity: Binary
-  affix: Infix
-  associativity: None
-  meaningfull: "true"
-  # comments:
-
-# This operator is a little sketchy
-InterpretedBox:
-  actual-precedence: 670
-  Precedence: 670
-  Precedence-corrected: 670
-  WolframLanguageData: null
-  WolframLanguageData-corrected:
-  UnicodeCharacters.tr:
-  UnicodeCharacters-corrected.tr:
-  # N-tokens:
-  # L-tokens:
-  # O-tokens:
-  # usage: "\! \(2+2\)"
-  # parse: {"FullForm", "[", "expr1",  "]"}
-  # FullForm: FullForm[expr]
-  arity: Binary
-  affix: Infix
-  associativity: None
-  meaningfull: "true"
-  # comments:
-
-Postfix:
-  actual-precedence: 640
-  Precedence: 640
-  Precedence-corrected: 640
-  WolframLanguageData: null
-  WolframLanguageData-corrected:
-  UnicodeCharacters.tr:
-  UnicodeCharacters-corrected.tr:
-  # N-tokens:
-  # L-tokens:
-  # O-tokens:
-  usage: "expr // FormName"
-  # parse: {"FullForm", "[", "expr1",  "]"}
-  # FullForm: FullForm[expr]
-  arity: Binary
-  affix: Infix
-  associativity: None
-  meaningfull: "true"
-  # comments:
-
-Prefix:
-  actual-precedence: 640
-  Precedence: 640
-  Precedence-corrected: 640
-  WolframLanguageData: null
-  WolframLanguageData-corrected:
-  UnicodeCharacters.tr:
-  UnicodeCharacters-corrected.tr:
-  # N-tokens:
-  # L-tokens:
-  # O-tokens:
-  usage: "expr1 @ expr2"
-  # parse: {"expr1", "[", "expr2",  "]"}
-  # FullForm: expr1[expr2]
-  arity: Binary
-  affix: Infix
-  associativity: None
-  meaningfull: "true"
-  # comments:
diff --git a/mathics_scanner/data/operators.yml b/mathics_scanner/data/operators.yml
index 7211a22..3b53c54 100644
--- a/mathics_scanner/data/operators.yml
+++ b/mathics_scanner/data/operators.yml
@@ -111,6 +111,19 @@
 #       "non-associative-operators", "unknown" under the key "miscellaneous-operators",
 #        and None as "flat_binary_operators.
 #
+# box-operator
+# ------------
+#
+# This field exists and is set to true if the operator is used in boxing. Boxing
+# operators are enclosed in \(  \) pairs.
+#
+# operator
+# --------
+#
+# This field exists for box-operators. In the future, we may expand to other operators.
+# (Non-box operators are listed in fields of named-characters.)
+# It is the string value for the operator.
+
 # meaningful
 # ---------
 #  This field "true" if WMA defines a meaning for the operator and "false" if not.
@@ -1911,13 +1924,12 @@ FormBox:
   WolframLanguageData-corrected: 78
   UnicodeCharacters.tr:
   UnicodeCharacters-corrected.tr: 160
-  # N-tokens: {}
-  # L-tokens: {"\\`"}
-  # O-tokens: {}
-  usage: "expr1 \\ expr2"
+  operator: "\\`"
+  usage: ["\\(`input\\)", "\\(form\\`input\\)"]
   arity: Binary
   affix: Infix
   associativity: "unknown"
+  box-operator: true
   meaningful: true
   # comments:
 
@@ -1928,13 +1940,12 @@ FractionBox:
   WolframLanguageData-corrected: 31
   UnicodeCharacters.tr:
   UnicodeCharacters-corrected.tr: 590
-  # N-tokens: {}
-  # L-tokens: {"\/"}
-  # O-tokens: {}
-  usage: "\\( expr1 \/  expr2 \\)"
+  operator: "\/"
+  usage: "\\(x\/y\\)"
   arity: Binary
   affix: Infix
   associativity: "unknown"
+  box-operator: true
   meaningful: true
   # comments:
 
@@ -2445,14 +2456,13 @@ InterpretedBox:
   WolframLanguageData-corrected: None
   UnicodeCharacters.tr: None
   UnicodeCharacters-corrected.tr: None
-  # N-tokens: None
-  # L-tokens: None
-  # O-tokens: None
-  # usage: "None"
+  operator: "\\!"
+  # usage: "\\!\(...\)"
   FullForm: None
-  arity: Binary
-  affix: Infix
+  arity: Unary
+  affix: Prefix
   associativity: null
+  box-operator: true
   meaningful: true
   # comments: None
 
@@ -4511,13 +4521,12 @@ OverscriptBox:
   WolframLanguageData-corrected: 7
   UnicodeCharacters.tr:
   UnicodeCharacters-corrected.tr: 790
-  # N-tokens: {}
-  # L-tokens: {"\&"}
-  # O-tokens: {}
-  # usage: "expr1 \& expr2"
+  operator: "\\&"
+  usage: "\\(x\\&y\\)"
   arity: Binary
   affix: Infix
   associativity: "unknown"
+  box-operator: true
   meaningful: true
   # comments:
 
@@ -4528,12 +4537,11 @@ OverunderscriptBox:
   WolframLanguageData-corrected: 7.5
   UnicodeCharacters.tr:
   UnicodeCharacters-corrected.tr: 785
-  # N-tokens: {}
-  # L-tokens: {"\&"}
-  # O-tokens: {"\%"}
-  # usage: "expr1", "\&", "expr2 \% expr3"
+  operator: ["\\+", "\\%"]
+  usage: "\\(x\\+y\\%z\\)"
   arity: Ternary
   affix: Infix
+  box-operator: true
   associativity: "unknown"
   meaningful: true
   # comments:
@@ -5079,13 +5087,12 @@ RadicalBox:
   WolframLanguageData-corrected: 22
   UnicodeCharacters.tr:
   UnicodeCharacters-corrected.tr: 640
-  # N-tokens: {"\@"}
-  # L-tokens: {}
-  # O-tokens: {"\%"}
-  # usage: "\@", "expr1 \% expr2"
+  operator: ["\\@", "\\%"]
+  usage: "\\(\\@x\\%n\\)"
   FullForm:
-  arity: Binary
+  arity: Ternary
   affix: Prefix
+  box-operator: true
   associativity: right
   meaningful: true
   # comments:
@@ -5868,14 +5875,13 @@ SqrtBox:
   WolframLanguageData-corrected: 22
   UnicodeCharacters.tr: 650
   UnicodeCharacters-corrected.tr: 640
-  # N-tokens: {"\@"}
-  # L-tokens: {}
-  # O-tokens: {}
-  # usage: "{{"\@", "expr"}}"
+  operator: "\\@"
+  usage: "\\(\\@x\\)"
   FullForm:
   arity: Unary
   affix: Prefix
   associativity: "unknown"
+  box-operator: true
   meaningful: true
   # comments:
 
@@ -6274,20 +6280,19 @@ SuperDagger:
   meaningful: true
   # comments:
 
-SuperscriptBox:
+SubSuperscriptBox:
   precedence: 590
   WolframLanguageData: 21
   WolframLanguageData-corrected: 21
   UnicodeCharacters.tr:
   UnicodeCharacters-corrected.tr: 660
-  # N-tokens: {}
-  # L-tokens: {"\^"}
-  # O-tokens: {}
-  # usage: "expr1 \^ expr2"
+  operator: ["\_", "\\%"]
+  usage: "\\(x\_y\\%z\\)"
   FullForm:
-  arity: Binary
+  arity: Ternary
   affix: Infix
   associativity: "unknown"
+  box-operator: true
   meaningful: true
   # comments:
 
@@ -6325,21 +6330,20 @@ SupersetEqual:
   meaningful: false
   # comments:
 
-SupersubscriptBox:
+SupercriptBox:
   Precedence-Function: 690
   precedence: 590
   WolframLanguageData: 21
   WolframLanguageData-corrected: 21
   UnicodeCharacters.tr:
   UnicodeCharacters-corrected.tr: 660
-  # N-tokens: {}
-  # L-tokens: {"\^"}
-  # O-tokens: {"\%"}
-  # usage: "expr1", "\^", "expr2 \% expr3"
+  operator: "\\^"
+  usage: "\\(x\\^y\\)"
   FullForm:
-  arity: Ternary
+  arity: Binary
   affix: Infix
   associativity: right
+  box-operator: true
   meaningful: true
   # comments:
 
@@ -6666,14 +6670,13 @@ UnderoverscriptBox:
   WolframLanguageData-corrected: 7.5
   UnicodeCharacters.tr:
   UnicodeCharacters-corrected.tr: 785
-  # N-tokens: {}
-  # L-tokens: {"\+"}
-  # O-tokens: {"\%"}
-  # usage: "expr1", "\+", "expr2 \% expr3"
+  operator: ["\\+", "\\%"]
+  usage: "\\(x\\+y\\%z\\)"
   FullForm:
   arity: Ternary
   affix: Infix
   associativity: "unknown"
+  box-operator: true
   meaningful: true
   # comments:
 
@@ -6684,13 +6687,12 @@ UnderscriptBox:
   WolframLanguageData-corrected: 7
   UnicodeCharacters.tr:
   UnicodeCharacters-corrected.tr: 790
-  # N-tokens: {}
-  # L-tokens: {"\+"}
-  # O-tokens: {}
-  usage: "expr1 \\+ expr2"
+  operator: "\\+"
+  usage: "\\(x\\+y\\)"
   FullForm:
   arity: Binary
   affix: Infix
+  box-operator: true
   associativity: "unknown"
   meaningful: true
   # comments:
diff --git a/mathics_scanner/generate/build_operator_tables.py b/mathics_scanner/generate/build_operator_tables.py
index fedafa4..94e979e 100755
--- a/mathics_scanner/generate/build_operator_tables.py
+++ b/mathics_scanner/generate/build_operator_tables.py
@@ -5,6 +5,7 @@
 import json
 import os.path as osp
 import sys
+from collections import defaultdict
 from pathlib import Path
 from typing import Dict
 
@@ -53,6 +54,7 @@ def compile_tables(
     for k, v in operator_data.items():
         operator_precedence[k] = v["precedence"]
 
+    box_operators = {}
     flat_binary_operators = {}
     left_binary_operators = {}
     miscellaneous_operators = {}
@@ -60,6 +62,7 @@ def compile_tables(
     no_meaning_postfix_operators = {}
     no_meaning_prefix_operators = {}
     nonassoc_binary_operators = {}
+    operator2string = defaultdict(list)
     postfix_operators = {}
     prefix_operators = {}
     right_binary_operators = {}
@@ -96,6 +99,9 @@ def compile_tables(
         elif affix == "Postfix":
             operator_dict = postfix_operators
 
+        if operator_info.get("box-operator", False):
+            box_operators[operator_name] = operator_info["operator"]
+
         # operator_dict tables are tied into the Mathics3
         # parser. Extend this table, for example to
         # include the operator unicode, requires
@@ -108,6 +114,12 @@ def compile_tables(
             continue
 
         unicode_char = character_info.get("unicode-equivalent", "no-unicode")
+        ascii_chars = character_info.get("ascii", "no-ascii")
+
+        if unicode_char != "no-unicode":
+            operator2string[operator_name].append(unicode_char)
+        if ascii_chars != "no-ascii":
+            operator2string[operator_name].append(ascii_chars)
 
         if operator_info.get("meaningful", True) is False and (
             character_data.get(operator_name)
@@ -128,6 +140,7 @@ def compile_tables(
                 print(f"FIXME: affix {affix} of {operator_name} not handled")
 
     return {
+        "box-operators": box_operators,
         "flat-binary-operators": flat_binary_operators,
         "left-binary-operators": left_binary_operators,
         "miscellaneous-operators": miscellaneous_operators,
@@ -135,6 +148,7 @@ def compile_tables(
         "no-meaning-postfix-operators": no_meaning_postfix_operators,
         "no-meaning-prefix-operators": no_meaning_prefix_operators,
         "non-associative-binary-operators": nonassoc_binary_operators,
+        "operator-to_string": operator2string,
         "operator-precedence": operator_precedence,
         "postfix-operators": postfix_operators,
         "prefix-operators": prefix_operators,

From 8d4aa62af64e5c7fbfc1e65555344a46d9bff65a Mon Sep 17 00:00:00 2001
From: rocky <rb@dustyfeet.com>
Date: Sat, 7 Dec 2024 20:05:03 -0500
Subject: [PATCH 2/3] Get box operator for scanner from JSON

---
 mathics_scanner/data/operators.yml | 40 ++++++++++++++----------------
 mathics_scanner/tokeniser.py       | 32 ++++++++----------------
 2 files changed, 29 insertions(+), 43 deletions(-)

diff --git a/mathics_scanner/data/operators.yml b/mathics_scanner/data/operators.yml
index 3b53c54..0b7a1f3 100644
--- a/mathics_scanner/data/operators.yml
+++ b/mathics_scanner/data/operators.yml
@@ -6062,14 +6062,13 @@ SubscriptBox:
   WolframLanguageData-corrected: 8
   UnicodeCharacters.tr:
   UnicodeCharacters-corrected.tr: 775
-  # N-tokens: {}
-  # L-tokens: {"\_"}
-  # O-tokens: {}
-  # usage: "expr1 \_ expr2"
+  operator: "\\_"
+  usage: "\\(x\\\_y\\)"
   FullForm:
   arity: Binary
   affix: Infix
   associativity: "unknown"
+  box-operator: true
   meaningful: true
   # comments:
 
@@ -6313,6 +6312,22 @@ Superset:
   meaningful: false
   # comments:
 
+SuperscriptBox:
+  precedence: 590
+  WolframLanguageData: 21
+  WolframLanguageData-corrected: 21
+  UnicodeCharacters.tr:
+  UnicodeCharacters-corrected.tr: 660
+  operator: "\\^"
+  usage: "\\(x\\^_y\\)"
+  FullForm:
+  arity: Binary
+  affix: Infix
+  associativity: "unknown"
+  box-operator: true
+  meaningful: true
+  # comments:
+
 SupersetEqual:
   precedence: 250
   WolframLanguageData:
@@ -6330,23 +6345,6 @@ SupersetEqual:
   meaningful: false
   # comments:
 
-SupercriptBox:
-  Precedence-Function: 690
-  precedence: 590
-  WolframLanguageData: 21
-  WolframLanguageData-corrected: 21
-  UnicodeCharacters.tr:
-  UnicodeCharacters-corrected.tr: 660
-  operator: "\\^"
-  usage: "\\(x\\^y\\)"
-  FullForm:
-  arity: Binary
-  affix: Infix
-  associativity: right
-  box-operator: true
-  meaningful: true
-  # comments:
-
 TagSet:
   Precedence-Function: 670
   precedence: 40
diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index 442b434..cdaac50 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -109,18 +109,6 @@ def init_module():
         #
         ("LeftRowBox", r" \\\( "),
         ("RightRowBox", r" \\\) "),
-        # Box Operators which are valid only inside Box delimiters
-        ("InterpretedBox", r" \\\! "),
-        ("SuperscriptBox", r" \\\^ "),
-        ("SubscriptBox", r" \\\_ "),
-        ("OverscriptBox", r" \\\& "),
-        ("UnderscriptBox", r" \\\+ "),
-        ("OtherscriptBox", r" \\\% "),
-        ("FractionBox", r" \\\/ "),
-        ("SqrtBox", r" \\\@ "),
-        ("RadicalBox", r" \\\@ "),
-        ("FormBox", r" \\\` "),
-        #
         # End Box Operators
         #
         ("Information", r"\?\?"),
@@ -222,11 +210,11 @@ def init_module():
         ("VerticalSeparator", r" \uF432 "),
     ]
 
-    for table in ("no-meaning-infix-operators",):
+    for table in ("box-operators", "no-meaning-infix-operators"):
         table_info = OPERATOR_DATA[table]
         for operator_name, unicode in table_info.items():
-            # if any([tup[0] == operator_name for tup in tokens]):
-            #     print(f"Please remove {operator_name}")
+            if any([tup[0] == operator_name for tup in tokens]):
+                print(f"Please remove {operator_name}")
             tokens.append((operator_name, f" {unicode} "))
 
     literal_tokens = {
@@ -273,17 +261,17 @@ def init_module():
         "\\": [
             "LeftRowBox",
             "RightRowBox",
+            "FormBox",
+            "FractionBox",
             "InterpretedBox",
-            "SuperscriptBox",
-            "SubscriptBox",
+            "OverunderscriptBox",
             "OverscriptBox",
-            "UnderscriptBox",
-            "OtherscriptBox",
-            "FractionBox",
-            "SqrtBox",
             "RadicalBox",
-            "FormBox",
             "RawBackslash",
+            "SqrtBox",
+            "SubscriptBox",
+            "SuperscriptBox",
+            "UnderscriptBox",
         ],
         "]": ["RawRightBracket"],
         "^": ["UpSetDelayed", "UpSet", "Power"],

From dee31211f62393d18255a3419e975d979ce6461d Mon Sep 17 00:00:00 2001
From: rocky <rb@dustyfeet.com>
Date: Sat, 7 Dec 2024 20:41:11 -0500
Subject: [PATCH 3/3] Handle boxing ternary operators

They list two operators. We just need the first of these in "tokens".
---
 mathics_scanner/tokeniser.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
index cdaac50..798729a 100644
--- a/mathics_scanner/tokeniser.py
+++ b/mathics_scanner/tokeniser.py
@@ -210,12 +210,18 @@ def init_module():
         ("VerticalSeparator", r" \uF432 "),
     ]
 
-    for table in ("box-operators", "no-meaning-infix-operators"):
-        table_info = OPERATOR_DATA[table]
+    for table_name in ("box-operators", "no-meaning-infix-operators"):
+        table_info = OPERATOR_DATA[table_name]
         for operator_name, unicode in table_info.items():
-            if any([tup[0] == operator_name for tup in tokens]):
-                print(f"Please remove {operator_name}")
-            tokens.append((operator_name, f" {unicode} "))
+            # if any([tup[0] == operator_name for tup in tokens]):
+            #     print(f"Please remove {operator_name}")
+
+            # Ternary operators have two character symbols
+            # in a list. For tokens, we just want the first
+            # of the pair
+            if isinstance(unicode, list):
+                unicode = unicode[0]
+            tokens.append((operator_name, rf" {unicode} "))
 
     literal_tokens = {
         "!": ["Unequal", "Factorial2", "Factorial"],
@@ -264,7 +270,6 @@ def init_module():
             "FormBox",
             "FractionBox",
             "InterpretedBox",
-            "OverunderscriptBox",
             "OverscriptBox",
             "RadicalBox",
             "RawBackslash",
@@ -272,6 +277,7 @@ def init_module():
             "SubscriptBox",
             "SuperscriptBox",
             "UnderscriptBox",
+            "UnderoverscriptBox",
         ],
         "]": ["RawRightBracket"],
         "^": ["UpSetDelayed", "UpSet", "Power"],