Skip to content

Commit

Permalink
read as binary file
Browse files Browse the repository at this point in the history
  • Loading branch information
nikil-ravi committed Oct 13, 2024
1 parent 8d09cfd commit 50715e9
Show file tree
Hide file tree
Showing 2 changed files with 1 addition and 5 deletions.
2 changes: 1 addition & 1 deletion src/levanter/data/sharded_datasource.py
Original file line number Diff line number Diff line change
Expand Up @@ -438,7 +438,7 @@ def shard_names(self) -> Sequence[str]:

def open_shard_at_row(self, shard_name: str, row: int) -> Iterator[dict]:
url = self._shard_name_to_url_mapping[shard_name]
- with fsspec.open(url, "r", compression="infer") as f:
+ with fsspec.open(url, "rb", compression="infer") as f:
table = pq.read_table(f)
sliced_table = table.slice(row) # zero-copy slicing
for record in sliced_table.to_pylist():
Expand Down
4 changes: 0 additions & 4 deletions tests/test_sharded_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,10 +61,6 @@ def test_basic_parquet_datasource_read_row():
assert len(datasource.shard_names) == 1, "Expected only one shard"
shard_name = datasource.shard_names[0]

- print(f"Shard name: {shard_name}")
- print("File name: ", f.name)
- print("File path: ", os.path.abspath(f.name))

# sanity check: Read data starting from row 1
row_data = list(datasource.open_shard_at_row(shard_name=shard_name, row=1))

Expand Down

0 comments on commit 50715e9

Please sign in to comment.