From 6f216a2e935d751e702a543bb27ca0a295af7298 Mon Sep 17 00:00:00 2001
From: Sam Stoelinga
Date: Fri, 15 Dec 2023 17:38:20 -0800
Subject: [PATCH] use prefix correctly (#7)

---
 batchelor/reader.py  | 10 +++++++++-
 tests/test_reader.py | 14 ++++++++++++--
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/batchelor/reader.py b/batchelor/reader.py
index a3a8127..5bcba85 100644
--- a/batchelor/reader.py
+++ b/batchelor/reader.py
@@ -13,6 +13,14 @@ def parse_bucket(path: str) -> str:
     return path.split("/")[2]
 
 
+def parse_prefix(path: str) -> str:
+    """
+    Parse prefix from a GCS path. For example given the path
+    gs://bucket-name/path/to/file, return path/to/file
+    """
+    return path.split("/", 3)[3]
+
+
 def filter_json_files(paths: list[str]) -> list[str]:
     return [path for path in paths if path.endswith(".json") or path.endswith(".jsonl")]
 
@@ -22,7 +30,7 @@ def convert_path_to_list(path: str) -> list[str]:
         bucket_name = parse_bucket(path)
         paths = []
         client = storage.Client()
-        for blob in client.list_blobs(bucket_name, prefix=path):
+        for blob in client.list_blobs(bucket_name, prefix=parse_prefix(path)):
             paths.append(f"gs://{bucket_name}/{blob.name}")
         return filter_json_files(paths)
     return [path]
diff --git a/tests/test_reader.py b/tests/test_reader.py
index 696f54a..a719146 100644
--- a/tests/test_reader.py
+++ b/tests/test_reader.py
@@ -5,7 +5,7 @@ from google.cloud import storage
 
 import pytest
 
-from batchelor.reader import parse_bucket, convert_path_to_list
+from batchelor.reader import parse_bucket, convert_path_to_list, parse_prefix
 
 
 @pytest.fixture
@@ -18,11 +18,21 @@ def test_parse_bucket(mock_client):
     expected = "bucket-name"
     assert parse_bucket(input) == expected
 
-    input = "gcss://bucket-name/path/to/file"
+    input = "gcs://bucket-name/path/to/file"
     expected = "bucket-name"
     assert parse_bucket(input) == expected
 
 
+def test_parse_prefix(mock_client):
+    input = "gs://bucket-name/path/to/file"
+    expected = "path/to/file"
+    assert parse_prefix(input) == expected
+
+    input = "gcs://bucket-name/path/to/file"
+    expected = "path/to/file"
+    assert parse_prefix(input) == expected
+
+
 @dataclass
 class Blob:
     name: str