Fix Read{,List}[] with TokenWord and improve ...
* Read{,List}[] with TokenWords needs to return two tokens sometimes (see the sketch after the change summary below).
* Adjust error messages so that the right caller name, Read or ReadList, is used.
* Simplify logic in eval/files_io/read.py.
* Add a data_dir variable that can be used in testing.
rocky committed Sep 29, 2024
1 parent a551242 commit 77d5f60
Showing 8 changed files with 148 additions and 51 deletions.
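
A minimal sketch of the TokenWords behavior this commit targets, using the MathicsSession helper from the test suite; the printed result is an expectation under that assumption, not a verified output:

from mathics.session import MathicsSession

session = MathicsSession()
result = session.evaluate(
    'ReadList[StringToStream["a+b-c"], Word, TokenWords -> {"+", "-"}]'
)
# A word followed by a token word now yields two entries, spliced flat into the list:
print(result)  # expected: {a, +, b, -, c}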
76 changes: 62 additions & 14 deletions mathics/builtin/files_io/files.py
@@ -45,6 +45,7 @@
MathicsOpen,
channel_to_stream,
close_stream,
parse_read_options,
read_name_and_stream,
)
from mathics.eval.makeboxes import do_format, format_element
@@ -703,7 +704,9 @@ def eval_default(self, exprs, filename, evaluation):
def validate_read_type(name: str, typ, evaluation: Evaluation):
"""
Validate a Read option type, and give a message if
the type is invalid. For Expession[Hold]
the type is invalid. For Expression[Hold], we convert it to
SymbolHoldExpression; String names like "Byte" are
converted to Symbols in the return value.
"""
if hasattr(typ, "head") and typ.head == SymbolHold:
if not hasattr(typ, "elements"):
@@ -715,6 +718,26 @@ def validate_read_type(name: str, typ, evaluation: Evaluation):
return None

return SymbolHoldExpression

if isinstance(typ, String):
typ = Symbol(typ.value)
elif not isinstance(typ, Symbol):
evaluation.message(name, "readf", typ)
return None

if typ.short_name not in (
"Byte",
"Character",
"Expression",
"Number",
"Real",
"Record",
"String",
"Word",
):
evaluation.message(name, "readf", typ)
return None

return typ
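
A quick sketch of how the new validation is meant to behave from the top level, using the MathicsSession helper from the test suite (outputs are expectations under that assumption, not verified results): a string type specification such as "Word" is normalized to the corresponding symbol, and anything outside the accepted set triggers the readf message.

from mathics.session import MathicsSession

session = MathicsSession()
session.evaluate('stream = StringToStream["Hello World"]')
print(session.evaluate('Read[stream, "Word"]'))  # expected: Hello  (String normalized to a Symbol)
session.evaluate('Close[stream]')

# An unrecognized string should now produce ReadList's own readf message,
# "Invalid is not a valid format specification.", and leave the call unevaluated.
session.evaluate('ReadList[StringToStream["abc"], "Invalid"]')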


@@ -791,11 +814,11 @@ class Read(Builtin):
= 5
#> Close[stream];
Reading a comment however will return the empty list:
Reading a comment, which is not an expression, will return 'Hold[Null]':
>> stream = StringToStream["(* ::Package:: *)"];
>> Read[stream, Hold[Expression]]
= {}
= Hold[Null]
#> Close[stream];
@@ -868,14 +891,26 @@ def eval(self, stream, types, evaluation: Evaluation, options: dict):
if new_type is None:
return
checked_types.append(new_type)
check_types = tuple(checked_types)
checked_types = tuple(checked_types)
else:
new_type = validate_read_type("Read", types, evaluation)
if new_type is None:
return
checked_types = (new_type,)

return eval_Read(name, n, checked_types, stream, evaluation, options)
result = eval_Read("Read", n, checked_types, stream, evaluation, options)
if isinstance(result, list):
if isinstance(types, ListExpression):
assert len(result) == len(
types.elements
), "internal error: eval_Read() should have a return for each type"
else:
assert (
len(result) == 1
), f"internal error: eval_Read() should return at most 1 element; got {result}"
return result[0]

return from_python(result)
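
The unwrapping above means a single type specification gives back a bare value while a list of types still gives a list; a usage sketch with the MathicsSession helper (expected outputs, assuming the stream contents below):

from mathics.session import MathicsSession

session = MathicsSession()
session.evaluate('stream = StringToStream["a 1 b 2"]')
print(session.evaluate('Read[stream, {Word, Number}]'))  # expected: {a, 1}
print(session.evaluate('Read[stream, Word]'))            # expected: b  (a bare value, not {b})
session.evaluate('Close[stream]')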


class ReadList(Read):
@@ -982,7 +1017,10 @@ class ReadList(Read):
>> InputForm[%]
= {123, abc}
"""
messages = {"opstl": "Value of option `1` should be a string or a list of strings."}
messages = {
"opstl": "Value of option `1` should be a string or a list of strings.",
"readf": "`1` is not a valid format specification.",
}
options = {
"NullRecords": "False",
"NullWords": "False",
@@ -998,17 +1036,16 @@ class ReadList(Read):
def eval(self, file, types, evaluation: Evaluation, options: dict):
"ReadList[file_, types_, OptionsPattern[ReadList]]"

py_options = parse_read_options(options)
# Options
# TODO: Implement extra options
# py_options = parse_read_options(options)
# null_records = py_options['NullRecords']
# null_words = py_options['NullWords']
# record_separators = py_options['RecordSeparators']
# token_words = py_options['TokenWords']
# word_separators = py_options['WordSeparators']

result = []
name, n, stream = read_name_and_stream(file, evaluation)

# FIXME: DRY better with Read[].
# Validate types parameter and store the
@@ -1020,30 +1057,41 @@ def eval(self, file, types, evaluation: Evaluation, options: dict):
if new_type is None:
return
checked_types.append(new_type)
check_types = tuple(checked_types)
checked_types = tuple(checked_types)
else:
new_type = validate_read_type("ReadList", types, evaluation)
if new_type is None:
return
checked_types = (new_type,)

name, n, stream = read_name_and_stream(file, evaluation)

if name is None:
return
elif name == SymbolFailed:
return SymbolFailed

while True:
tmp = eval_Read(name, n, checked_types, stream, evaluation, options)
next_elt = eval_Read(
"ReadList", n, checked_types, stream, evaluation, options
)

if tmp is None:
if next_elt is None:
return

if tmp is SymbolFailed:
if next_elt is SymbolFailed:
return

if tmp is SymbolEndOfFile:
if next_elt is SymbolEndOfFile:
break
result.append(tmp)

if isinstance(next_elt, list) and py_options["TokenWords"]:
# FIXME: This might not be correct in all cases.
# we probably need a more positive way to indicate whether next_elt
# was returned from TokenWord parsing or not.
result += next_elt
else:
result.append(next_elt)
return from_python(result)

def eval_n(self, file, types, n: Integer, evaluation: Evaluation, options: dict):
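The branch on py_options["TokenWords"] above splices a two-token return into result rather than nesting it; in plain Python the distinction is just the following (illustrative only):

result = []
result.append(["ab", "+"])   # nesting:  [['ab', '+']]

result = []
result += ["ab", "+"]        # splicing: ['ab', '+']
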
1 change: 1 addition & 0 deletions mathics/core/systemsymbols.py
@@ -273,6 +273,7 @@
SymbolVariance = Symbol("System`Variance")
SymbolWhitespace = Symbol("System`Whitespace")
SymbolWhitespaceCharacter = Symbol("System`WhitespaceCharacter")
SymbolWord = Symbol("System`Word")
SymbolWordBoundary = Symbol("System`WordBoundary")
SymbolWordCharacter = Symbol("System`WordCharacter")
SymbolXor = Symbol("System`Xor")
42 changes: 32 additions & 10 deletions mathics/eval/files_io/files.py
@@ -26,6 +26,7 @@
SymbolHoldExpression,
SymbolPath,
SymbolReal,
SymbolWord,
)
from mathics.core.util import canonic_filename
from mathics.eval.files_io.read import (
@@ -128,13 +129,14 @@ def eval_Read(
name: str, n: int, types: tuple, stream, evaluation: Evaluation, options: dict
):
"""
Evaluation method for Read[] and ReadList[]
Evaluation method for Read[] and ReadList[]. `name` will be either "Read" or
"ReadList" and is used in error messages
"""
types = to_mathics_list(*types)

for typ in types.elements:
if typ not in READ_TYPES:
evaluation.message("Read", "readf", typ)
evaluation.message(name, "readf", typ)
return SymbolFailed

separators = read_get_separators(options, evaluation)
@@ -199,7 +201,7 @@ def eval_Read(

if expr is SymbolEndOfFile:
evaluation.message(
"Read", "readt", tmp, to_expression("InputSteam", name, n)
name, "readt", tmp, to_expression("InputSteam", name, n)
)
return SymbolFailed
elif isinstance(expr, BaseElement):
@@ -219,7 +221,7 @@ def eval_Read(
tmp = float(tmp)
except ValueError:
evaluation.message(
"Read", "readn", to_expression("InputSteam", name, n)
name, "readn", to_expression("InputSteam", name, n)
)
return SymbolFailed
result.append(tmp)
Expand All @@ -231,7 +233,7 @@ def eval_Read(
tmp = float(tmp)
except ValueError:
evaluation.message(
"Read", "readn", to_expression("InputSteam", name, n)
name, "readn", to_expression("InputSteam", name, n)
)
return SymbolFailed
result.append(tmp)
@@ -242,17 +244,37 @@
if len(tmp) == 0:
raise EOFError
result.append(tmp.rstrip("\n"))
elif typ is Symbol("Word"):
result.append(next(read_word))
elif typ is SymbolWord:
# next() for word tokens can return one or two words:
# the next word in the list and a following TokenWord
# match. Therefore, test for this and do list-like
# appending here.

# THINK ABOUT: We might need to reconsider/refactor
# other cases to allow for multiple words as well. And
# for uniformity, we may want to redo the generators to
# always return *lists* instead of either a
# word or a list (which is always at most two words?)
words = next(read_word)
if not isinstance(words, list):
words = [words]
result += words

except EOFError:
return SymbolEndOfFile
except UnicodeDecodeError:
evaluation.message("General", "ucdec")
evaluation.message(name, "ucdec")

if isinstance(result, Symbol):
return result
if len(result) == 1:
return from_python(*result)
if isinstance(result, list):
result_len = len(result)
if result_len == 0:
if SymbolHoldExpression in types:
return Expression(SymbolHold, SymbolNull)
elif result_len == 2 and SymbolWord in types:
return [from_python(part) for part in result]
elif result_len == 1:
result = result[0]

return from_python(result)
31 changes: 10 additions & 21 deletions mathics/eval/files_io/read.py
@@ -159,9 +159,9 @@ def parse_read_options(options) -> dict:
string_quotes=False
)
assert isinstance(record_separators, list)
assert all(
isinstance(s, str) and s[0] == s[-1] == '"' for s in record_separators
)
# assert all(
# isinstance(s, str) and s[0] == s[-1] == '"' for s in record_separators
# )
record_separators = [s[1:-1] for s in record_separators]
result["RecordSeparators"] = record_separators

Expand All @@ -171,8 +171,6 @@ def parse_read_options(options) -> dict:
string_quotes=False
)
assert isinstance(word_separators, list)
assert all(isinstance(s, str) and s[0] == s[-1] == '"' for s in word_separators)
word_separators = [s[1:-1] for s in word_separators]
result["WordSeparators"] = word_separators

# NullRecords
Expand All @@ -190,7 +188,6 @@ def parse_read_options(options) -> dict:
# TokenWords
if "System`TokenWords" in keys:
token_words = options["System`TokenWords"].to_python(string_quotes=False)
assert token_words == []
result["TokenWords"] = token_words

return result
@@ -385,9 +382,7 @@ def read_from_stream(
else:
yield word
continue
last_word = word
word = ""
yield last_word
yield word
break

if tmp in word_separators:
Expand All @@ -396,30 +391,24 @@ def read_from_stream(
if stream.io.seekable():
stream.io.seek(stream.io.tell() - 1)
word += some_token_word_prefix
last_word = word
word = ""
some_token_word_prefix = ""
yield last_word
yield word
break

if accepted is not None and tmp not in accepted:
word += some_token_word_prefix
last_word = word
word = ""
some_token_word_prefix = ""
yield last_word
yield word
break

some_token_word_prefix += tmp
for token_word in token_words:
if token_word == some_token_word_prefix:
if word:
# Start here
last_word = word
word = ""
some_token_word_prefix = ""
yield last_word
yield token_word
yield [word, token_word]
else:
yield token_word
some_token_word_prefix = ""
break
else:
word += some_token_word_prefix
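The yield [word, token_word] change is easier to follow in isolation. Below is a self-contained simplification of the idea; it is not the real read_from_stream, which reads from a stream and also handles word separators, accepted characters, and EOF:

def split_with_token_words(text, token_words):
    """Yield words from text; when a token word terminates a pending word,
    yield the pair [word, token_word] so the caller can splice both in."""
    word = ""
    i = 0
    while i < len(text):
        for token_word in token_words:
            if text.startswith(token_word, i):
                if word:
                    # Two tokens have to come back at once, as in the change above.
                    yield [word, token_word]
                    word = ""
                else:
                    yield token_word
                i += len(token_word)
                break
        else:
            word += text[i]
            i += 1
    if word:
        yield word

print(list(split_with_token_words("a+b-c", ["+", "-"])))
# [['a', '+'], ['b', '-'], 'c']
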
6 changes: 0 additions & 6 deletions test/builtin/files_io/test_files.py
@@ -278,12 +278,6 @@ def test_close():
# "{Read, InputStream, String, {Real}}",
# "",
# ),
(
r'stream = StringToStream["\"abc123\""];ReadList[stream, "Invalid"]//{#1[[0]],#1[[2]]}&',
("Invalid is not a valid format specification.",),
"{ReadList, Invalid}",
"",
),
("Close[stream];", None, "Null", ""),
(
'ReadList[StringToStream["a 1 b 2"], {Word, Number}, 1]',