frictionlessdata#1622: Fix more cases (including CI)
# Summary

The CI tests identified some issues that don't show up on a normal test run.  This commit fixes those issues.

It also highlighted that numerous areas lacked sufficient test coverage
for the case where the caller had already opened the resource.

The indexer has some notable changes, but the biggest area affected is
the parsers when writing from an already opened source. This commit adds
unit tests covering this case for the indexer and for every parser
format, and fixes the code to work without nested contexts.
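
The fix follows a single pattern throughout: instead of re-entering the
caller's resource with `with resource:`, each writer now opens a copy via
`resource.to_copy()`. A minimal sketch of the pattern (the CSV path is
illustrative; `TableResource`, `to_copy`, and `row_stream` are the existing
frictionless APIs used in the diffs below):

```python
from frictionless.resources import TableResource

source = TableResource(path="data/table.csv")  # illustrative path
with source:
    # The caller has already opened `source`. Entering the same instance
    # again would conflict with the open context and consume its row
    # stream as a side effect. Opening a copy leaves the caller's
    # instance untouched:
    with source.to_copy() as copy:
        for row in copy.row_stream:
            ...  # e.g. write the row out in the target format
```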

# Tests

- Set up the required databases for CI by copying the commands from the GitHub Actions workflows
- Run `hatch run +py=3.11 ci:test` and ensure all tests pass and coverage remains sufficient
- Run `hatch run test` in case it behaves differently, and ensure all tests pass and coverage remains sufficient

This also means that all linting etc. has been run.
richardt-engineb committed Jan 25, 2024
1 parent bdccca0 commit 998e6d3
Showing 31 changed files with 405 additions and 49 deletions.
3 changes: 2 additions & 1 deletion frictionless/analyzer/analyzer.py
@@ -34,7 +34,8 @@ def analyze_table_resource(
# Iterate rows
columns_data: Dict[str, List[Any]] = {}
numeric = ["integer", "numeric", "number"]
with resource:
# Use a copy of the resource to avoid side effects (see #1622)
with resource.to_copy() as resource:
for row in resource.row_stream:
null_columns = 0
for field_name in row:
3 changes: 2 additions & 1 deletion frictionless/formats/gsheets/parser.py
@@ -53,7 +53,8 @@ def write_row_stream(self, source: TableResource):
sh = gc.open_by_key(key)
wks = sh.worksheet_by_id(gid) if gid else sh[0] # type: ignore
data: List[Any] = []
with source:
# Use a copy of the source to avoid side effects (see #1622)
with source.to_copy() as source:
data.append(source.schema.field_names)
for row in source.row_stream:
data.append(row.to_list())
3 changes: 2 additions & 1 deletion frictionless/formats/html/parser.py
@@ -57,7 +57,8 @@ def read_cell_stream_create(self) -> types.ICellStream:
# It will give us an ability to support HtmlDialect
def write_row_stream(self, source: TableResource):
html = "<html><body><table>\n"
with source:
# Use a copy of the source to avoid side effects (see #1622)
with source.to_copy() as source:
html += "<tr>"
for name in source.schema.field_names:
html += f"<td>{name}</td>"
3 changes: 2 additions & 1 deletion frictionless/formats/pandas/parser.py
@@ -128,7 +128,8 @@ def write_row_stream(self, source: TableResource):
data_rows: List[Tuple[Any]] = []
index_rows: List[Tuple[Any]] = []
fixed_types = {}
with source:
# Use a copy of the source to avoid side effects (see #1622)
with source.to_copy() as source:
for row in source.row_stream:
data_values: List[Any] = []
index_values: List[Any] = []
3 changes: 2 additions & 1 deletion frictionless/formats/qsv/adapter.py
@@ -27,7 +27,8 @@ def read_schema(self, resource: Resource) -> Schema:
command = [self.qsv_path, "stats", "--infer-dates", "--dates-whitelist", "all"]
process = sp.Popen(command, stdout=sp.PIPE, stdin=sp.PIPE)
# TODO: Use FileResource here (or future resource.stream_bytes())
with resource:
# Use a copy of the resource to avoid side effects (see #1622)
with resource.to_copy() as resource:
while True:
chunk = resource.read_bytes(size=BLOCK_SIZE)
if not chunk:
6 changes: 4 additions & 2 deletions frictionless/formats/spss/parser.py
@@ -99,7 +99,8 @@ def write_row_stream(self, source: TableResource):

# Write rows
with sav.SavWriter(self.resource.normpath, ioUtf8=True, **spss_schema) as writer: # type: ignore
with source:
# Use a copy of the source to avoid side effects (see #1622)
with source.to_copy() as source:
for row in source.row_stream: # type: ignore
cells: List[Any] = []
for field in source.schema.fields: # type: ignore
@@ -130,7 +131,8 @@ def __write_convert_schema(self, source: TableResource):
"varTypes": {},
"formats": {},
}
with source:
# Use a copy of the source to avoid side effects (see #1622)
with source.to_copy() as source:
# Add fields
sizes: Dict[str, int] = {}
mapping = self.__write_convert_type()
3 changes: 2 additions & 1 deletion frictionless/formats/sql/adapter.py
@@ -109,7 +109,8 @@ def write_package(self, package: Package):
for table in self.metadata.sorted_tables:
if package.has_table_resource(table.name):
resource = package.get_table_resource(table.name)
with resource:
# Use a copy of the resource to avoid side effects (see #1622)
with resource.to_copy() as resource:
self.write_row_stream(resource.row_stream, table_name=table.name)
return models.PublishResult(
url=self.engine.url.render_as_string(hide_password=True),
3 changes: 2 additions & 1 deletion frictionless/formats/sql/parser.py
@@ -51,6 +51,7 @@ def write_row_stream(self, source: TableResource):
adapter = SqlAdapter(engine, control=control)
if not adapter:
raise FrictionlessException(f"Not supported source: {self.resource.normpath}")
with source:
# Write from a copy to prevent side effects (see #1622)
with source.to_copy() as source:
adapter.write_schema(source.schema, table_name=control.table)
adapter.write_row_stream(source.row_stream, table_name=control.table)
60 changes: 34 additions & 26 deletions frictionless/indexer/indexer.py
@@ -45,20 +45,24 @@ def __attrs_post_init__(self):

def index(self) -> Optional[Report]:
self.prepare_resource()
with self.resource:
# Index is a resource-based operation not supporting FKs
if self.resource.schema.foreign_keys:
self.resource.schema.foreign_keys = []
self.create_table()
while True:
try:
return self.populate_table()
except Exception:
if self.fast and self.use_fallback:
self.fast = False
continue
self.delete_table()
raise

# Infer resource if needed
if self.resource.closed:
self.resource.infer()

# Index is a resource-based operation not supporting FKs
if self.resource.schema.foreign_keys:
self.resource.schema.foreign_keys = []
self.create_table()
while True:
try:
return self.populate_table()
except Exception:
if self.fast and self.use_fallback:
self.fast = False
continue
self.delete_table()
raise

def prepare_resource(self):
if self.qsv_path:
@@ -108,25 +112,29 @@ def populate_table_fast_sqlite(self):
sql_command = f".import '|cat -' \"{self.table_name}\""
command = ["sqlite3", "-csv", self.adapter.engine.url.database, sql_command]
process = subprocess.Popen(command, stdin=PIPE, stdout=PIPE)
for line_number, line in enumerate(self.resource.byte_stream, start=1):
if line_number > 1:
process.stdin.write(line) # type: ignore
self.report_progress(f"{self.resource.stats.bytes} bytes")
# Iterate over a copy of the resource to avoid side effects (see #1622)
with self.resource.to_copy() as resource:
for line_number, line in enumerate(resource.byte_stream, start=1):
if line_number > 1:
process.stdin.write(line) # type: ignore
self.report_progress(f"{self.resource.stats.bytes} bytes")
process.stdin.close() # type: ignore
process.wait()

def populate_table_fast_postgresql(self):
database_url = self.adapter.engine.url.render_as_string(hide_password=False)
with platform.psycopg.connect(database_url) as connection:
with connection.cursor() as cursor:
query = 'COPY "%s" FROM STDIN CSV HEADER' % self.table_name
with cursor.copy(query) as copy: # type: ignore
while True:
chunk = self.resource.read_bytes(size=settings.BLOCK_SIZE)
if not chunk:
break
copy.write(chunk)
self.report_progress(f"{self.resource.stats.bytes} bytes")
# Iterate over a copy of the resource to avoid side effects (see #1622)
with self.resource.to_copy() as resource:
query = 'COPY "%s" FROM STDIN CSV HEADER' % self.table_name
with cursor.copy(query) as copy: # type: ignore
while True:
chunk = resource.read_bytes(size=settings.BLOCK_SIZE)
if not chunk:
break
copy.write(chunk)
self.report_progress(f"{self.resource.stats.bytes} bytes")

def delete_table(self):
self.adapter.delete_resource(self.table_name)
5 changes: 3 additions & 2 deletions frictionless/steps/table/table_debug.py
@@ -33,8 +33,9 @@ def transform_resource(self, resource: Resource):

# Data
def data(): # type: ignore
with current:
for row in current.row_stream: # type: ignore
# Use a copy of the source to avoid side effects (see #1622)
with current.to_copy() as current_copy:
for row in current_copy.row_stream: # type: ignore
self.function(row) # type: ignore
yield row

13 changes: 8 additions & 5 deletions frictionless/steps/table/table_validate.py
@@ -29,11 +29,14 @@ def transform_resource(self, resource: Resource):

# Data
def data(): # type: ignore
with current:
if not current.header.valid: # type: ignore
raise FrictionlessException(error=current.header.errors[0]) # type: ignore
yield current.header # type: ignore
for row in current.row_stream: # type: ignore
# Use a copy of the source to avoid side effects (see #1622)
with current.to_copy() as current_copy: # type: ignore
if not current_copy.header.valid: # type: ignore
raise FrictionlessException(
error=current_copy.header.errors[0] # type: ignore
) # type: ignore
yield current_copy.header # type: ignore
for row in current_copy.row_stream: # type: ignore
if not row.valid: # type: ignore
raise FrictionlessException(error=row.errors[0]) # type: ignore
yield row
26 changes: 26 additions & 0 deletions tests/analyzer/test_resource.py
@@ -241,3 +241,29 @@ def test_analyze_resource_detailed_with_invalid_data():
assert analysis["rowsWithNullValues"] == 3
assert analysis["notNullRows"] == 1
assert analysis["variableTypes"] == {"integer": 3, "string": 1}


def test_analyze_resource_is_independent_bug_1622():
# Test that we can analyze a resource without side effects
resource = TableResource(path="data/analysis-data.csv")
with resource:
analysis = resource.analyze()
assert list(analysis.keys()) == [
"variableTypes",
"notNullRows",
"rowsWithNullValues",
"fieldStats",
"averageRecordSizeInBytes",
"timeTaken",
"md5",
"sha256",
"bytes",
"fields",
"rows",
]
assert round(analysis["averageRecordSizeInBytes"]) == 85
assert analysis["fields"] == 11
assert analysis["rows"] == 9
assert analysis["rowsWithNullValues"] == 2
assert analysis["notNullRows"] == 7
assert analysis["variableTypes"] == {}
14 changes: 14 additions & 0 deletions tests/formats/csv/test_parser.py
@@ -344,3 +344,17 @@ def test_csv_parser_proper_quote_issue_493():
resource.infer()
assert resource.dialect.to_descriptor() == {}
assert len(resource.schema.fields) == 126


@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows")
def test_csv_parser_write_independent_issue_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = TableResource(path=str(tmpdir.join("table.csv")))
source.write(target)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
13 changes: 13 additions & 0 deletions tests/formats/excel/parsers/test_xls.py
@@ -169,3 +169,16 @@ def test_xls_parser_cast_int_to_string_1251():
{"A": "001", "B": "b", "C": "1", "D": "a", "E": 1},
{"A": "002", "B": "c", "C": "1", "D": "1", "E": 1},
]


def test_xls_parser_write_independent_bug_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = TableResource(path=str(tmpdir.join("table.xls")))
source.write(target)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
13 changes: 13 additions & 0 deletions tests/formats/excel/parsers/test_xlsx.py
@@ -307,3 +307,16 @@ def test_xlsx_parser_cannot_read_resource_from_remote_package_issue_1504():
resource = package.get_table_resource("excel")
table = resource.read_table()
assert len(table.rows) == 4


def test_xlsx_parser_write_independent_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = TableResource(path=str(tmpdir.join("table.xlsx")))
source.write(target)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
15 changes: 8 additions & 7 deletions tests/formats/gsheets/test_parser.py
@@ -52,10 +52,11 @@ def test_gsheets_parser_write(google_credentials_path):
path = "https://docs.google.com/spreadsheets/d/1F2OiYmaf8e3x7jSc95_uNgfUyBlSXrcRg-4K_MFNZQI/edit"
control = formats.GsheetsControl(credentials=google_credentials_path)
source = TableResource(path="data/table.csv")
target = source.write(path=path, control=control)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
with source:
target = source.write(path=path, control=control)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
14 changes: 14 additions & 0 deletions tests/formats/html/test_parser.py
@@ -62,3 +62,17 @@ def test_html_parser_newline_in_cell_construction_file_issue_865(tmpdir):
target = source.write(str(tmpdir.join("table.csv")))
target.infer(stats=True)
assert target.stats.rows == 226


@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows")
def test_html_parser_write_independent_bug_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = TableResource(path=str(tmpdir.join("table.html")))
source.write(target)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
12 changes: 12 additions & 0 deletions tests/formats/inline/test_parser.py
@@ -139,3 +139,15 @@ def test_inline_parser_write_skip_header():
with TableResource(path="data/table.csv") as resource:
resource.write(target)
assert target.data == [[1, "english"], [2, "中国人"]]


@pytest.mark.skip
def test_inline_parser_write_keyed_independent_bug_1622(tmpdir):
control = formats.InlineControl(keyed=True)
source = TableResource(path="data/table.csv")
with source:
target = source.write(format="inline", control=control)
assert target.data == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
17 changes: 17 additions & 0 deletions tests/formats/json/parsers/test_json.py
@@ -135,3 +135,20 @@ def test_json_parser_write_skip_header(tmpdir):
with TableResource(path="data/table.csv") as resource:
target = resource.write(target)
assert target.read_data() == [[1, "english"], [2, "中国人"]]


# Bugs


def test_json_parser_write_independent_bug_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = TableResource(path=str(tmpdir.join("table.json")))
target = source.write(target)
assert target.normpath
with open(target.normpath) as file:
assert json.load(file) == [
["id", "name"],
[1, "english"],
[2, "中国人"],
]
15 changes: 15 additions & 0 deletions tests/formats/json/parsers/test_jsonl.py
@@ -59,3 +59,18 @@ def test_jsonl_parser_write_skip_header(tmpdir):
{"field1": 1, "field2": "english"},
{"field1": 2, "field2": "中国人"},
]


# Bugs


def test_jsonl_parser_write_independent_bug_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = source.write(path=str(tmpdir.join("table.jsonl")))
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
16 changes: 16 additions & 0 deletions tests/formats/ods/test_parser.py
@@ -139,3 +139,19 @@ def test_ods_parser_write_skip_header(tmpdir):
resource.write_table(target)
table = target.read_table()
assert table.header == ["field1", "field2"]


# Bugs


def test_ods_parser_write_independent_bug_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = TableResource(path=str(tmpdir.join("table.ods")))
source.write(target)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]