Skip to content

Commit

Permalink
Handle multiprocessing with s3fs
Browse files Browse the repository at this point in the history
  • Loading branch information
chuckwondo committed May 10, 2024
1 parent 1ad38df commit 9a83118
Show file tree
Hide file tree
Showing 2 changed files with 23 additions and 15 deletions.
18 changes: 13 additions & 5 deletions src/gedi_subset/subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@
logger = logging.getLogger("gedi_subset")


@dataclass
@dataclass(frozen=True, kw_only=True)
class SubsetGranuleProps:
"""Properties for calling `subset_granule` with a single argument.
Expand All @@ -68,7 +68,7 @@ class SubsetGranuleProps:
single argument.
"""

fs: s3fs.S3FileSystem
fs: s3fs.S3FileSystem | None = None
granule: Granule
maap: MAAP
aoi_gdf: gpd.GeoDataFrame
Expand Down Expand Up @@ -162,10 +162,11 @@ def subset_granule(props: SubsetGranuleProps) -> IOResultE[Maybe[str]]:
return IOSuccess(Nothing)

logger.debug(f"Subsetting {inpath}")
fs = props.fs or s3fs.S3FileSystem()

try:
with (
props.fs.open(inpath, block_size=4 * 1024 * 1024, cache_type="all") as f,
fs.open(inpath, block_size=4 * 1024 * 1024, cache_type="all") as f,
h5py.File(f) as hdf5,
):
gdf = subset_hdf5(
Expand Down Expand Up @@ -249,10 +250,17 @@ def append_subset(src: str) -> IOResultE[str]:
logger.info(f"Found {len(found_granules)} in the CMR")
logger.info(f"Total downloadable granules: {len(downloadable_granules)}")

fs = s3fs.S3FileSystem()
payloads = (
SubsetGranuleProps(
fs, granule, maap, aoi_gdf, lat, lon, beams, columns, query, output_dir
granule=granule,
maap=maap,
aoi_gdf=aoi_gdf,
lat_col=lat,
lon_col=lon,
beams=beams,
columns=columns,
query=query,
output_dir=output_dir,
)
for granule in downloadable_granules
)
Expand Down
20 changes: 10 additions & 10 deletions tests/test_subset.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,16 +80,16 @@ def test_subset_granule(
expected_path = os.path.join(tmp_path, "temp.gpq")
io_result = subset_granule(
SubsetGranuleProps(
fs,
granule,
maap,
aoi_gdf,
"lat_lowestmode",
"lon_lowestmode",
"all",
["agbd"],
"l2_quality_flag == 1",
tmp_path,
fs=fs,
granule=granule,
maap=maap,
aoi_gdf=aoi_gdf,
lat_col="lat_lowestmode",
lon_col="lon_lowestmode",
beams="all",
columns=["agbd"],
query="l2_quality_flag == 1",
output_dir=tmp_path,
)
)

Expand Down

0 comments on commit 9a83118

Please sign in to comment.