Add a cheaper way to skip datasets
It can be really expensive to check all inner granule lists.
jeremyh committed Jun 9, 2022
1 parent 6868ac6 commit 10621d9
Showing 2 changed files with 51 additions and 22 deletions.
README.md (4 additions, 0 deletions)
@@ -242,6 +242,10 @@ Options:
for multi-granule files). Beware that multi-
granule datasets without a granule id in the
filename will overwrite each other
--thoroughly-check-existing / --cheaply-check-existing
Should we open every dataset to check if
*all* inner granules have been produced?
Default: false.
--provider [sinergise.com|esa.int]
Restrict scanning to only packages of the
given provider. (ESA assumes a zip file,
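For example, to opt in to the slower, exhaustive check (the invocation below is illustrative only; the entry-point name and paths are not part of this commit):

    sentinel-l1-prepare --thoroughly-check-existing --output-base /out /data/l1c/

With the default --cheaply-check-existing behaviour, a dataset is skipped as soon as any matching *.odc-metadata.yaml exists in its output folder, without opening the package to list its inner granules.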
eodatasets3/prepare/sentinel_l1_prepare.py (47 additions, 22 deletions)
@@ -553,6 +553,13 @@ def granule_ids(self) -> Optional[List[str]]:
help="Include the granule id in metadata filenames? (default: auto -- include only for multi-granule files). "
"Beware that multi-granule datasets without a granule id in the filename will overwrite each-other",
)
@click.option(
"--throughly-check-existing/--cheaply-check-existing",
"thoroughly_check_existing",
is_flag=True,
default=False,
help="Should we open every dataset to check if *all* inner granules have been produced? Default: false.",
)
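# (Note: the paired "--x/--y" slash form above already defines a boolean
# flag, so click infers is_flag; the explicit is_flag=True is redundant
# but harmless.)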
@click.option(
"--provider",
default=None,
@@ -630,6 +637,7 @@ def main(
overwrite_existing: bool,
verbose: bool,
workers: int,
thoroughly_check_existing: bool,
embed_location: Optional[bool],
only_regions_in_file: Optional[Path],
before_month: Optional[Tuple[int, int]],
@@ -774,6 +782,45 @@ def find_jobs() -> Iterable[Job]:
)
continue

# Put outputs in a different folder?
if output_base:
# What base folder should we choose for creating subfolders in the output?
if input_relative_to is None:
input_relative_to = _get_default_relative_folder_base(
found_dataset.base_folder
)

output_folder = output_base / found_dataset.base_folder.relative_to(
input_relative_to
)
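# e.g. output_base=/out, base_folder=/archive/s2/2022, and
# input_relative_to=/archive gives output_folder=/out/s2/2022
# (paths illustrative).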
# Default to true.
if embed_location is None:
embed_location = True
else:
output_folder = found_dataset.base_folder
# Default to false.
if embed_location is None:
embed_location = False

# It's very slow to read the list of inner granules.
#
# So, unless we're thoroughly checking for missing outputs, skip the
# dataset if any output already exists:
if (
(not thoroughly_check_existing)
# ... and any outputs exist at all
and list(
output_folder.glob(f"{found_dataset.name}*.odc-metadata.yaml")
)
# ... and we're not overwriting our outputs
and not overwrite_existing
):
# Skip it!
_LOG.debug(
"At least one output exists: skipping. %s", found_dataset.name
)
continue

# This has to read the files, so it can be slow. That's why we try to skip it above when possible.
granule_ids = found_dataset.granule_ids

# When granule_id is None, it means process all without filtering.
@@ -794,28 +841,6 @@ def find_jobs() -> Iterable[Job]:
else:
yaml_filename = f"{found_dataset.name}.odc-metadata.yaml"

# Put it in a different folder?
if output_base:

# What base folder should we choose for creating subfolders in the output?
if input_relative_to is None:
input_relative_to = _get_default_relative_folder_base(
found_dataset.base_folder
)

output_folder = (
output_base
/ found_dataset.base_folder.relative_to(input_relative_to)
)
# Default to true.
if embed_location is None:
embed_location = True
else:
output_folder = found_dataset.base_folder
# Default to false
if embed_location is None:
embed_location = False

output_yaml = output_folder / yaml_filename
if output_yaml.exists():
if not overwrite_existing:
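A minimal sketch of the two checking strategies this commit distinguishes, shown in isolation (the function and parameter names are illustrative, not from this module):

from pathlib import Path
from typing import List, Optional


def should_skip(
    name: str,
    output_folder: Path,
    granule_ids: Optional[List[str]],
    thorough: bool = False,
) -> bool:
    """Return True if this dataset's outputs appear to already exist."""
    existing = list(output_folder.glob(f"{name}*.odc-metadata.yaml"))
    if not thorough:
        # Cheap check: any matching output at all means we skip.
        # A partially-processed multi-granule dataset may be missed.
        return bool(existing)
    if not granule_ids:
        # Single-granule dataset: one output file is expected.
        return bool(existing)
    # Thorough check: require an output for every inner granule.
    # (In the real code, reading the granule list is the expensive
    # step, since it must open the package itself.)
    return all(
        any(gid in path.name for path in existing) for gid in granule_ids
    )

The cheap path never opens the package, so re-scanning a large archive costs only a directory glob per dataset; the thorough path remains the safe fallback when multi-granule datasets may have been only partially processed.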
