Skip to content

Commit

Permalink
use prefix correctly (#7)
Browse files Browse the repository at this point in the history
  • Loading branch information
samos123 authored Dec 16, 2023
1 parent 45b8f05 commit 6f216a2
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 3 deletions.
10 changes: 9 additions & 1 deletion batchelor/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,14 @@ def parse_bucket(path: str) -> str:
return path.split("/")[2]


def parse_prefix(path: str) -> str:
    """
    Parse prefix from a GCS path. For example given the path
    gs://bucket-name/path/to/file, return path/to/file

    A path with no object component (gs://bucket-name or
    gs://bucket-name/) yields an empty prefix instead of raising
    IndexError.
    """
    # Split at most three times: scheme, the empty segment between the
    # double slash, the bucket, and then everything else as one piece so
    # slashes inside the object prefix are preserved.
    parts = path.split("/", 3)
    return parts[3] if len(parts) > 3 else ""


def filter_json_files(paths: list[str]) -> list[str]:
    """
    Keep only the paths that end in .json or .jsonl, preserving the
    input order.
    """
    # str.endswith accepts a tuple, so one call covers both suffixes.
    return [path for path in paths if path.endswith((".json", ".jsonl"))]

Expand All @@ -22,7 +30,7 @@ def convert_path_to_list(path: str) -> list[str]:
bucket_name = parse_bucket(path)
paths = []
client = storage.Client()
for blob in client.list_blobs(bucket_name, prefix=path):
for blob in client.list_blobs(bucket_name, prefix=parse_prefix(path)):
paths.append(f"gs://{bucket_name}/{blob.name}")
return filter_json_files(paths)
return [path]
Expand Down
14 changes: 12 additions & 2 deletions tests/test_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from google.cloud import storage
import pytest

from batchelor.reader import parse_bucket, convert_path_to_list
from batchelor.reader import parse_bucket, convert_path_to_list, parse_prefix


@pytest.fixture
Expand All @@ -18,11 +18,21 @@ def test_parse_bucket(mock_client):
expected = "bucket-name"
assert parse_bucket(input) == expected

input = "gcss://bucket-name/path/to/file"
input = "gcs://bucket-name/path/to/file"
expected = "bucket-name"
assert parse_bucket(input) == expected


def test_parse_prefix():
    """parse_prefix drops the scheme and bucket and keeps the object path."""
    # Named test_parse_prefix (not test_parse_bucket) so it does not rebind
    # the earlier parse_bucket test and hide it from pytest collection.
    path = "gs://bucket-name/path/to/file"
    expected = "path/to/file"
    assert parse_prefix(path) == expected

    # The scheme itself is irrelevant to the split.
    path = "gcs://bucket-name/path/to/file"
    expected = "path/to/file"
    assert parse_prefix(path) == expected


@dataclass
class Blob:
name: str
Expand Down

0 comments on commit 6f216a2

Please sign in to comment.