Add more test and tests for new stuff

Mathics3 · Dec 11, 2024 · ab22393 · ab22393
1 parent 4eb5499
commit ab22393
Show file tree

Hide file tree

Showing 3 changed files with 39 additions and 7 deletions.
diff --git a/mathics_scanner/prescanner.py b/mathics_scanner/prescanner.py
@@ -98,6 +98,7 @@ def try_parse_base(start_shift: int, end_shift: int, base: int) -> None:
                     self.feeder.message("Syntax", "sntoct2")
                 elif last == 3:
                     self.feeder.message("Syntax", "sntoct1")
+                    raise ScanError()
                 elif last == 4:
                     self.feeder.message("Syntax", "snthex")
                 else:
@@ -151,6 +152,16 @@ def try_parse_named_character(start_shift: int):
             # Stay in same line fragment, but advance the cursor position.
             self.pos = i + 1
 
+        # FIXME:
+        #  The following code is boneheadedly wrong because
+        #  the surrounding lexical context determines whether
+        #  an escape sequence should be valid or not.
+        #  For example, inside a comment, there is no such thing
+        #  as an invalid escape sequence. And this causes \050, which is
+        #  a valid escape sequence for "(", to get treated like
+        #  a grouping symbol inside of a string.
+        # ...
+        #
         # In the following loop, we look for and replace escape
         # sequences. The current character under consideration is at
         # self.code[self.pos].  When an escape sequence is found at

diff --git a/mathics_scanner/tokeniser.py b/mathics_scanner/tokeniser.py
@@ -574,6 +574,13 @@ def t_String(self, match: re.Match) -> Token:
                     # We have a \ at the end of a line.
                     self.incomplete()
                     skipped_chars.append(self.pos)
+
+                # Code below is in pre-scanner. We might decide
+                # later to move that code here.
+                # elif self.code[self.pos + 1] in "01234567":
+                #     # See if we have an octal number.
+                #     try_parse_base(1, 4, 8)
+
                 else:
                     # newlines (\n), tabs (\t) and double backslash
                     # "\\" have the backslash preserved. But for other
@@ -591,7 +598,7 @@ def t_String(self, match: re.Match) -> Token:
                         self.feeder.message(
                             "Syntax", "stresc", self.code[self.pos : self.pos + 2]
                         )
-                        return Token("String", "", start)
+                        raise ScanError()
 
                     self.pos += 2
             else:

diff --git a/test/test_string_tokens.py b/test/test_string_tokens.py
@@ -5,7 +5,7 @@
 
 import pytest
 
-from mathics_scanner.errors import IncompleteSyntaxError
+from mathics_scanner.errors import IncompleteSyntaxError, ScanError
 from mathics_scanner.feed import SingleLineFeeder
 from mathics_scanner.tokeniser import Token, Tokeniser
 
@@ -17,10 +17,19 @@ def check_string(source_text, expected_text: str):
     assert token.text == expected_text
 
 
-def incomplete_error(s: str):
-    with pytest.raises(IncompleteSyntaxError):
+def incomplete_error(s: str, failure_msg: str):
+    with pytest.raises(IncompleteSyntaxError) as excinfo:
         get_tokens(s)
 
+    assert excinfo, failure_msg
+
+
+def scan_error(s: str, failure_msg: str):
+    with pytest.raises(ScanError) as excinfo:
+        get_tokens(s)
+
+    assert excinfo, failure_msg
+
 
 def single_token(source_text) -> Token:
     tokens = get_tokens(source_text)
@@ -45,9 +54,14 @@ def test_string():
     for ctrl_char in ("\b", "\f", "\n", "\r", "\t"):
         check_string(f'"a{ctrl_char}"', f'"a{ctrl_char}"')
 
-    incomplete_error(r'"a\X"')
+    # Broken:
+    # "a\050", "a\051", "a\052"
+    # Prescanning eagerly replaces the escape sequences with
+    # symbols "(", ")", or "*" respectively and this messes up parsing
+    # somehow.
     check_string(r'"abc"', r'"abc"')
-    incomplete_error(r'"abc')
     check_string(r'"abc(*def*)"', r'"abc(*def*)"')
     check_string(r'"a\"b\\c"', r'"a\"b\\c"')
-    incomplete_error(r'"\"')
+    incomplete_error(r'"abc', "String does not have terminating quote")
+    incomplete_error(r'"\"', "Unterminated escape sequence")
+    incomplete_error(r'"a\X"', '"X" is not a valid escape character')