diff --git a/src/levanter/data/sharded_datasource.py b/src/levanter/data/sharded_datasource.py index 10eb42b1b..208116ca6 100644 --- a/src/levanter/data/sharded_datasource.py +++ b/src/levanter/data/sharded_datasource.py @@ -438,7 +438,7 @@ def shard_names(self) -> Sequence[str]: def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[dict]: url = self._shard_name_to_url_mapping[shard_name] - with fsspec.open(url, "r", compression="infer") as f: + with fsspec.open(url, "rb", compression="infer") as f: table = pq.read_table(f) sliced_table = table.slice(row) # zero-copy slicing for record in sliced_table.to_pylist(): diff --git a/tests/test_sharded_dataset.py b/tests/test_sharded_dataset.py index b732596e5..265a70867 100644 --- a/tests/test_sharded_dataset.py +++ b/tests/test_sharded_dataset.py @@ -61,10 +61,6 @@ def test_basic_parquet_datasource_read_row(): assert len(datasource.shard_names) == 1, "Expected only one shard" shard_name = datasource.shard_names[0] - print(f"Shard name: {shard_name}") - print("File name: ", f.name) - print("File path: ", os.path.abspath(f.name)) - # sanity check: Read data starting from row 1 row_data = list(datasource.open_shard_at_row(shard_name=shard_name, row=1))