frictionlessdata#1622: Fix more cases (including CI)
# Summary

The CI tests identified some issues that don't show up on a normal test run.  This commit fixes those issues.

It also highlighted that numerous areas lacked sufficient test coverage
for the case where the caller had already opened the resource.

The indexer has some notable changes, but the biggest area affected is
the parsers when writing from an already opened source. This commit adds
unit tests covering this case for the indexer and for every parser
format, and fixes the code to work without nested contexts.
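
The fix follows a single pattern throughout: instead of re-entering the
caller's resource with `with resource:`, each writer now opens a copy via
`resource.to_copy()`. A minimal sketch of the pattern (the CSV path is
illustrative; `TableResource`, `to_copy`, and `row_stream` are the existing
frictionless APIs used in the diffs below):

```python
from frictionless.resources import TableResource

source = TableResource(path="data/table.csv")  # illustrative path
with source:
    # The caller has already opened `source`. Entering the same instance
    # again would conflict with the open context and consume its row
    # stream as a side effect. Opening a copy leaves the caller's
    # instance untouched:
    with source.to_copy() as copy:
        for row in copy.row_stream:
            ...  # e.g. write the row out in the target format
```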

# Tests

- Set up the required databases for CI by copying the commands from the GitHub Actions workflows
- Run `hatch run +py=3.11 ci:test` and ensure all tests pass and coverage remains sufficient
- Run `hatch run test` in case it behaves differently, and ensure all tests pass and coverage remains sufficient

This also means that all linting etc. has been run.
richardt-engineb committed Jan 25, 2024
1 parent bdccca0 commit 998e6d3
Showing 31 changed files with 405 additions and 49 deletions.
3 changes: 2 additions & 1 deletion frictionless/analyzer/analyzer.py
@@ -34,7 +34,8 @@ def analyze_table_resource(
# Iterate rows
columns_data: Dict[str, List[Any]] = {}
numeric = ["integer", "numeric", "number"]
with resource:
# Use a copy of the resource to avoid side effects (see #1622)
with resource.to_copy() as resource:
for row in resource.row_stream:
null_columns = 0
for field_name in row:
3 changes: 2 additions & 1 deletion frictionless/formats/gsheets/parser.py
@@ -53,7 +53,8 @@ def write_row_stream(self, source: TableResource):
sh = gc.open_by_key(key)
wks = sh.worksheet_by_id(gid) if gid else sh[0] # type: ignore
data: List[Any] = []
with source:
# Use a copy of the source to avoid side effects (see #1622)
with source.to_copy() as source:
data.append(source.schema.field_names)
for row in source.row_stream:
data.append(row.to_list())
3 changes: 2 additions & 1 deletion frictionless/formats/html/parser.py
@@ -57,7 +57,8 @@ def read_cell_stream_create(self) -> types.ICellStream:
# It will give us an ability to support HtmlDialect
def write_row_stream(self, source: TableResource):
html = "<html><body><table>\n"
with source:
# Use a copy of the source to avoid side effects (see #1622)
with source.to_copy() as source:
html += "<tr>"
for name in source.schema.field_names:
html += f"<td>{name}</td>"
3 changes: 2 additions & 1 deletion frictionless/formats/pandas/parser.py
@@ -128,7 +128,8 @@ def write_row_stream(self, source: TableResource):
data_rows: List[Tuple[Any]] = []
index_rows: List[Tuple[Any]] = []
fixed_types = {}
with source:
# Use a copy of the source to avoid side effects (see #1622)
with source.to_copy() as source:
for row in source.row_stream:
data_values: List[Any] = []
index_values: List[Any] = []
3 changes: 2 additions & 1 deletion frictionless/formats/qsv/adapter.py
@@ -27,7 +27,8 @@ def read_schema(self, resource: Resource) -> Schema:
command = [self.qsv_path, "stats", "--infer-dates", "--dates-whitelist", "all"]
process = sp.Popen(command, stdout=sp.PIPE, stdin=sp.PIPE)
# TODO: Use FileResource here (or future resource.stream_bytes())
with resource:
# Use a copy of the resource to avoid side effects (see #1622)
with resource.to_copy() as resource:
while True:
chunk = resource.read_bytes(size=BLOCK_SIZE)
if not chunk:
6 changes: 4 additions & 2 deletions frictionless/formats/spss/parser.py
@@ -99,7 +99,8 @@ def write_row_stream(self, source: TableResource):

# Write rows
with sav.SavWriter(self.resource.normpath, ioUtf8=True, **spss_schema) as writer: # type: ignore
with source:
# Use a copy of the source to avoid side effects (see #1622)
with source.to_copy() as source:
for row in source.row_stream: # type: ignore
cells: List[Any] = []
for field in source.schema.fields: # type: ignore
@@ -130,7 +131,8 @@ def __write_convert_schema(self, source: TableResource):
"varTypes": {},
"formats": {},
}
with source:
# Use a copy of the source to avoid side effects (see #1622)
with source.to_copy() as source:
# Add fields
sizes: Dict[str, int] = {}
mapping = self.__write_convert_type()
3 changes: 2 additions & 1 deletion frictionless/formats/sql/adapter.py
@@ -109,7 +109,8 @@ def write_package(self, package: Package):
for table in self.metadata.sorted_tables:
if package.has_table_resource(table.name):
resource = package.get_table_resource(table.name)
with resource:
# Use a copy of the resource to avoid side effects (see #1622)
with resource.to_copy() as resource:
self.write_row_stream(resource.row_stream, table_name=table.name)
return models.PublishResult(
url=self.engine.url.render_as_string(hide_password=True),
3 changes: 2 additions & 1 deletion frictionless/formats/sql/parser.py
@@ -51,6 +51,7 @@ def write_row_stream(self, source: TableResource):
adapter = SqlAdapter(engine, control=control)
if not adapter:
raise FrictionlessException(f"Not supported source: {self.resource.normpath}")
with source:
# Write from a copy to prevent side effects (see #1622)
with source.to_copy() as source:
adapter.write_schema(source.schema, table_name=control.table)
adapter.write_row_stream(source.row_stream, table_name=control.table)
60 changes: 34 additions & 26 deletions frictionless/indexer/indexer.py
@@ -45,20 +45,24 @@ def __attrs_post_init__(self):

def index(self) -> Optional[Report]:
self.prepare_resource()
with self.resource:
# Index is a resource-based operation not supporting FKs
if self.resource.schema.foreign_keys:
self.resource.schema.foreign_keys = []
self.create_table()
while True:
try:
return self.populate_table()
except Exception:
if self.fast and self.use_fallback:
self.fast = False
continue
self.delete_table()
raise

# Infer resource if needed
if self.resource.closed:
self.resource.infer()

# Index is a resource-based operation not supporting FKs
if self.resource.schema.foreign_keys:
self.resource.schema.foreign_keys = []
self.create_table()
while True:
try:
return self.populate_table()
except Exception:
if self.fast and self.use_fallback:
self.fast = False
continue
self.delete_table()
raise

def prepare_resource(self):
if self.qsv_path:
@@ -108,25 +112,29 @@ def populate_table_fast_sqlite(self):
sql_command = f".import '|cat -' \"{self.table_name}\""
command = ["sqlite3", "-csv", self.adapter.engine.url.database, sql_command]
process = subprocess.Popen(command, stdin=PIPE, stdout=PIPE)
for line_number, line in enumerate(self.resource.byte_stream, start=1):
if line_number > 1:
process.stdin.write(line) # type: ignore
self.report_progress(f"{self.resource.stats.bytes} bytes")
# Iterate over a copy of the resource to avoid side effects (see #1622)
with self.resource.to_copy() as resource:
for line_number, line in enumerate(resource.byte_stream, start=1):
if line_number > 1:
process.stdin.write(line) # type: ignore
self.report_progress(f"{self.resource.stats.bytes} bytes")
process.stdin.close() # type: ignore
process.wait()

def populate_table_fast_postgresql(self):
database_url = self.adapter.engine.url.render_as_string(hide_password=False)
with platform.psycopg.connect(database_url) as connection:
with connection.cursor() as cursor:
query = 'COPY "%s" FROM STDIN CSV HEADER' % self.table_name
with cursor.copy(query) as copy: # type: ignore
while True:
chunk = self.resource.read_bytes(size=settings.BLOCK_SIZE)
if not chunk:
break
copy.write(chunk)
self.report_progress(f"{self.resource.stats.bytes} bytes")
# Iterate over a copy of the resource to avoid side effects (see #1622)
with self.resource.to_copy() as resource:
query = 'COPY "%s" FROM STDIN CSV HEADER' % self.table_name
with cursor.copy(query) as copy: # type: ignore
while True:
chunk = resource.read_bytes(size=settings.BLOCK_SIZE)
if not chunk:
break
copy.write(chunk)
self.report_progress(f"{self.resource.stats.bytes} bytes")

def delete_table(self):
self.adapter.delete_resource(self.table_name)
5 changes: 3 additions & 2 deletions frictionless/steps/table/table_debug.py
@@ -33,8 +33,9 @@ def transform_resource(self, resource: Resource):

# Data
def data(): # type: ignore
with current:
for row in current.row_stream: # type: ignore
# Use a copy of the source to avoid side effects (see #1622)
with current.to_copy() as current_copy:
for row in current_copy.row_stream: # type: ignore
self.function(row) # type: ignore
yield row

13 changes: 8 additions & 5 deletions frictionless/steps/table/table_validate.py
@@ -29,11 +29,14 @@ def transform_resource(self, resource: Resource):

# Data
def data(): # type: ignore
with current:
if not current.header.valid: # type: ignore
raise FrictionlessException(error=current.header.errors[0]) # type: ignore
yield current.header # type: ignore
for row in current.row_stream: # type: ignore
# Use a copy of the source to avoid side effects (see #1622)
with current.to_copy() as current_copy: # type: ignore
if not current_copy.header.valid: # type: ignore
raise FrictionlessException(
error=current_copy.header.errors[0] # type: ignore
) # type: ignore
yield current_copy.header # type: ignore
for row in current_copy.row_stream: # type: ignore
if not row.valid: # type: ignore
raise FrictionlessException(error=row.errors[0]) # type: ignore
yield row
26 changes: 26 additions & 0 deletions tests/analyzer/test_resource.py
@@ -241,3 +241,29 @@ def test_analyze_resource_detailed_with_invalid_data():
assert analysis["rowsWithNullValues"] == 3
assert analysis["notNullRows"] == 1
assert analysis["variableTypes"] == {"integer": 3, "string": 1}


def test_analyze_resource_is_independent_bug_1622():
# Test that we can analyze a resource without side effects
resource = TableResource(path="data/analysis-data.csv")
with resource:
analysis = resource.analyze()
assert list(analysis.keys()) == [
"variableTypes",
"notNullRows",
"rowsWithNullValues",
"fieldStats",
"averageRecordSizeInBytes",
"timeTaken",
"md5",
"sha256",
"bytes",
"fields",
"rows",
]
assert round(analysis["averageRecordSizeInBytes"]) == 85
assert analysis["fields"] == 11
assert analysis["rows"] == 9
assert analysis["rowsWithNullValues"] == 2
assert analysis["notNullRows"] == 7
assert analysis["variableTypes"] == {}
14 changes: 14 additions & 0 deletions tests/formats/csv/test_parser.py
@@ -344,3 +344,17 @@ def test_csv_parser_proper_quote_issue_493():
resource.infer()
assert resource.dialect.to_descriptor() == {}
assert len(resource.schema.fields) == 126


@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows")
def test_csv_parser_write_independent_issue_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = TableResource(path=str(tmpdir.join("table.csv")))
source.write(target)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
13 changes: 13 additions & 0 deletions tests/formats/excel/parsers/test_xls.py
@@ -169,3 +169,16 @@ def test_xls_parser_cast_int_to_string_1251():
{"A": "001", "B": "b", "C": "1", "D": "a", "E": 1},
{"A": "002", "B": "c", "C": "1", "D": "1", "E": 1},
]


def test_xls_parser_write_independent_bug_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = TableResource(path=str(tmpdir.join("table.xls")))
source.write(target)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
13 changes: 13 additions & 0 deletions tests/formats/excel/parsers/test_xlsx.py
@@ -307,3 +307,16 @@ def test_xlsx_parser_cannot_read_resource_from_remote_package_issue_1504():
resource = package.get_table_resource("excel")
table = resource.read_table()
assert len(table.rows) == 4


def test_xlsx_parser_write_independent_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = TableResource(path=str(tmpdir.join("table.xlsx")))
source.write(target)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
15 changes: 8 additions & 7 deletions tests/formats/gsheets/test_parser.py
@@ -52,10 +52,11 @@ def test_gsheets_parser_write(google_credentials_path):
path = "https://docs.google.com/spreadsheets/d/1F2OiYmaf8e3x7jSc95_uNgfUyBlSXrcRg-4K_MFNZQI/edit"
control = formats.GsheetsControl(credentials=google_credentials_path)
source = TableResource(path="data/table.csv")
target = source.write(path=path, control=control)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
with source:
target = source.write(path=path, control=control)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
14 changes: 14 additions & 0 deletions tests/formats/html/test_parser.py
@@ -62,3 +62,17 @@ def test_html_parser_newline_in_cell_construction_file_issue_865(tmpdir):
target = source.write(str(tmpdir.join("table.csv")))
target.infer(stats=True)
assert target.stats.rows == 226


@pytest.mark.skipif(platform.type == "windows", reason="Fix on Windows")
def test_html_parser_write_independent_bug_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = TableResource(path=str(tmpdir.join("table.html")))
source.write(target)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
12 changes: 12 additions & 0 deletions tests/formats/inline/test_parser.py
@@ -139,3 +139,15 @@ def test_inline_parser_write_skip_header():
with TableResource(path="data/table.csv") as resource:
resource.write(target)
assert target.data == [[1, "english"], [2, "中国人"]]


@pytest.mark.skip
def test_inline_parser_write_keyed_independent_bug_1622(tmpdir):
control = formats.InlineControl(keyed=True)
source = TableResource(path="data/table.csv")
with source:
target = source.write(format="inline", control=control)
assert target.data == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
17 changes: 17 additions & 0 deletions tests/formats/json/parsers/test_json.py
@@ -135,3 +135,20 @@ def test_json_parser_write_skip_header(tmpdir):
with TableResource(path="data/table.csv") as resource:
target = resource.write(target)
assert target.read_data() == [[1, "english"], [2, "中国人"]]


# Bugs


def test_json_parser_write_independent_bug_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = TableResource(path=str(tmpdir.join("table.json")))
target = source.write(target)
assert target.normpath
with open(target.normpath) as file:
assert json.load(file) == [
["id", "name"],
[1, "english"],
[2, "中国人"],
]
15 changes: 15 additions & 0 deletions tests/formats/json/parsers/test_jsonl.py
@@ -59,3 +59,18 @@ def test_jsonl_parser_write_skip_header(tmpdir):
{"field1": 1, "field2": "english"},
{"field1": 2, "field2": "中国人"},
]


# Bugs


def test_jsonl_parser_write_independent_bug_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = source.write(path=str(tmpdir.join("table.jsonl")))
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]
16 changes: 16 additions & 0 deletions tests/formats/ods/test_parser.py
@@ -139,3 +139,19 @@ def test_ods_parser_write_skip_header(tmpdir):
resource.write_table(target)
table = target.read_table()
assert table.header == ["field1", "field2"]


# Bugs


def test_ods_parser_write_independent_bug_1622(tmpdir):
source = TableResource(path="data/table.csv")
with source:
target = TableResource(path=str(tmpdir.join("table.ods")))
source.write(target)
with target:
assert target.header == ["id", "name"]
assert target.read_rows() == [
{"id": 1, "name": "english"},
{"id": 2, "name": "中国人"},
]