Add a cheaper way to skip datasets
It can be really expensive to check all inner granule lists.
jeremyh committed Jun 9, 2022
1 parent 6868ac6 commit 10621d9
Showing 2 changed files with 51 additions and 22 deletions.
README.md (4 additions, 0 deletions)
@@ -242,6 +242,10 @@ Options:
for multi-granule files). Beware that multi-
granule datasets without a granule id in the
filename will overwrite each other
--thoroughly-check-existing / --cheaply-check-existing
Should we open every dataset to check if
*all* inner granules have been produced?
Default: false.
--provider [sinergise.com|esa.int]
Restrict scanning to only packages of the
given provider. (ESA assumes a zip file,
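For example, to opt in to the slower, exhaustive check (the invocation below is illustrative only; the entry-point name and paths are not part of this commit):

    sentinel-l1-prepare --thoroughly-check-existing --output-base /out /data/l1c/

With the default --cheaply-check-existing behaviour, a dataset is skipped as soon as any matching *.odc-metadata.yaml exists in its output folder, without opening the package to list its inner granules.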
eodatasets3/prepare/sentinel_l1_prepare.py (47 additions, 22 deletions)
@@ -553,6 +553,13 @@ def granule_ids(self) -> Optional[List[str]]:
help="Include the granule id in metadata filenames? (default: auto -- include only for multi-granule files). "
"Beware that multi-granule datasets without a granule id in the filename will overwrite each-other",
)
@click.option(
"--throughly-check-existing/--cheaply-check-existing",
"thoroughly_check_existing",
is_flag=True,
default=False,
help="Should we open every dataset to check if *all* inner granules have been produced? Default: false.",
)
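# (Note: the paired "--x/--y" slash form above already defines a boolean
# flag, so click infers is_flag; the explicit is_flag=True is redundant
# but harmless.)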
@click.option(
"--provider",
default=None,
@@ -630,6 +637,7 @@ def main(
overwrite_existing: bool,
verbose: bool,
workers: int,
thoroughly_check_existing: bool,
embed_location: Optional[bool],
only_regions_in_file: Optional[Path],
before_month: Optional[Tuple[int, int]],
@@ -774,6 +782,45 @@ def find_jobs() -> Iterable[Job]:
)
continue

# Put outputs in a different folder?
if output_base:
# What base folder should we choose for creating subfolders in the output?
if input_relative_to is None:
input_relative_to = _get_default_relative_folder_base(
found_dataset.base_folder
)

output_folder = output_base / found_dataset.base_folder.relative_to(
input_relative_to
)
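# e.g. output_base=/out, base_folder=/archive/s2/2022, and
# input_relative_to=/archive gives output_folder=/out/s2/2022
# (paths illustrative).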
# Default to true.
if embed_location is None:
embed_location = True
else:
output_folder = found_dataset.base_folder
# Default to false.
if embed_location is None:
embed_location = False

# It's very slow to read the list of inner granules.
#
# So, unless we're thoroughly checking for missing outputs, skip the
# dataset if any output already exists:
if (
(not thoroughly_check_existing)
# ... and any outputs exist at all
and list(
output_folder.glob(f"{found_dataset.name}*.odc-metadata.yaml")
)
# ... and we're not overwriting our outputs
and not overwrite_existing
):
# Skip it!
_LOG.debug(
"At least one output exists: skipping. %s", found_dataset.name
)
continue

# This has to read the files, so it can be slow. That's why we try to skip it above when possible.
granule_ids = found_dataset.granule_ids

# When granule_id is None, it means process all without filtering.
@@ -794,28 +841,6 @@ def find_jobs() -> Iterable[Job]:
else:
yaml_filename = f"{found_dataset.name}.odc-metadata.yaml"

# Put it in a different folder?
if output_base:

# What base folder should we choose for creating subfolders in the output?
if input_relative_to is None:
input_relative_to = _get_default_relative_folder_base(
found_dataset.base_folder
)

output_folder = (
output_base
/ found_dataset.base_folder.relative_to(input_relative_to)
)
# Default to true.
if embed_location is None:
embed_location = True
else:
output_folder = found_dataset.base_folder
# Default to false
if embed_location is None:
embed_location = False

output_yaml = output_folder / yaml_filename
if output_yaml.exists():
if not overwrite_existing:
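A minimal sketch of the two checking strategies this commit distinguishes, shown in isolation (the function and parameter names are illustrative, not from this module):

from pathlib import Path
from typing import List, Optional


def should_skip(
    name: str,
    output_folder: Path,
    granule_ids: Optional[List[str]],
    thorough: bool = False,
) -> bool:
    """Return True if this dataset's outputs appear to already exist."""
    existing = list(output_folder.glob(f"{name}*.odc-metadata.yaml"))
    if not thorough:
        # Cheap check: any matching output at all means we skip.
        # A partially-processed multi-granule dataset may be missed.
        return bool(existing)
    if not granule_ids:
        # Single-granule dataset: one output file is expected.
        return bool(existing)
    # Thorough check: require an output for every inner granule.
    # (In the real code, reading the granule list is the expensive
    # step, since it must open the package itself.)
    return all(
        any(gid in path.name for path in existing) for gid in granule_ids
    )

The cheap path never opens the package, so re-scanning a large archive costs only a directory glob per dataset; the thorough path remains the safe fallback when multi-granule datasets may have been only partially processed.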
