From 55c2f9ea45cccf4594aba0b63aa4b187ad04d68f Mon Sep 17 00:00:00 2001 From: Mario Winkler Date: Fri, 11 Oct 2024 17:04:23 +0200 Subject: [PATCH] Move lib to registration library repo --- README.md | 4 +- lib/README.md | 0 lib/docs/src/lib.rst | 0 lib/lib/__init__.py | 0 lib/lib/base/__init__.py | 0 lib/lib/base/download.py | 122 ----- lib/lib/base/file.py | 262 ---------- lib/lib/base/geometry.py | 40 -- lib/lib/base/geoparquet.py | 16 - lib/lib/base/inventory.py | 443 ---------------- lib/lib/base/order.py | 303 ----------- lib/lib/datasets/__init__.py | 0 lib/lib/datasets/landsat.py | 317 ------------ lib/lib/datasets/modis.preview.py | 191 ------- lib/lib/datasets/modis.py | 234 --------- lib/lib/datasets/sentinel.py | 514 ------------------- lib/lib/datasets/viirs.py | 246 --------- lib/lib/providers/__init__.py | 0 lib/lib/providers/esa_cdse.py | 378 -------------- lib/lib/providers/nasa_cmr.py | 221 -------- lib/lib/providers/nasa_daac.py | 650 ------------------------ lib/lib/providers/usgs_m2m.py | 722 --------------------------- lib/lib/resources/__init__.py | 0 lib/lib/resources/pgstac.py | 15 - lib/lib/resources/stac.py | 204 -------- lib/lib/resources/stac_geoparquet.py | 268 ---------- lib/pyproject.toml | 57 --- lib/test/__init__.py | 0 lib/test/base/__init__.py | 0 lib/test/base/test_download.py | 114 ----- lib/test/base/test_file.py | 425 ---------------- 31 files changed, 2 insertions(+), 5744 deletions(-) delete mode 100644 lib/README.md delete mode 100644 lib/docs/src/lib.rst delete mode 100644 lib/lib/__init__.py delete mode 100644 lib/lib/base/__init__.py delete mode 100644 lib/lib/base/download.py delete mode 100644 lib/lib/base/file.py delete mode 100644 lib/lib/base/geometry.py delete mode 100644 lib/lib/base/geoparquet.py delete mode 100644 lib/lib/base/inventory.py delete mode 100644 lib/lib/base/order.py delete mode 100644 lib/lib/datasets/__init__.py delete mode 100644 lib/lib/datasets/landsat.py delete mode 100644 lib/lib/datasets/modis.preview.py delete mode 100644 lib/lib/datasets/modis.py delete mode 100644 lib/lib/datasets/sentinel.py delete mode 100644 lib/lib/datasets/viirs.py delete mode 100644 lib/lib/providers/__init__.py delete mode 100644 lib/lib/providers/esa_cdse.py delete mode 100644 lib/lib/providers/nasa_cmr.py delete mode 100644 lib/lib/providers/nasa_daac.py delete mode 100644 lib/lib/providers/usgs_m2m.py delete mode 100644 lib/lib/resources/__init__.py delete mode 100644 lib/lib/resources/pgstac.py delete mode 100644 lib/lib/resources/stac.py delete mode 100644 lib/lib/resources/stac_geoparquet.py delete mode 100644 lib/pyproject.toml delete mode 100644 lib/test/__init__.py delete mode 100644 lib/test/base/__init__.py delete mode 100644 lib/test/base/test_download.py delete mode 100644 lib/test/base/test_file.py diff --git a/README.md b/README.md index 5a9d844..1977109 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ The EOEPCA+ Resource Registration building block consists of the following compo - Registration API - [https://github.com/EOEPCA/registration-api](https://github.com/EOEPCA/registration-api) - Registration Harvester - [https://github.com/EOEPCA/registration-harvester](https://github.com/EOEPCA/registration-harvester) -- Common Registration Library - [https://github.com/EOEPCA/registration-library](https://github.com/EOEPCA/registration-library) +- Common Registration Library - https://github.com/EOEPCA/registration-library The building block is built with and based on the following tools and standards. 
@@ -46,7 +46,7 @@ The building block is built with and based on the following tools and standards. - [OGC API Processes](https://ogcapi.ogc.org/processes/) - [Flowable](https://www.flowable.com/open-source) - [FastAPI](https://fastapi.tiangolo.com/) -- [OMG Business Process Model and Notation (BPMN)](https://www.bpmn.org) +- [BPMN - Business Process Model and Notation](https://www.bpmn.org) ## Getting Started diff --git a/lib/README.md b/lib/README.md deleted file mode 100644 index e69de29..0000000 diff --git a/lib/docs/src/lib.rst b/lib/docs/src/lib.rst deleted file mode 100644 index e69de29..0000000 diff --git a/lib/lib/__init__.py b/lib/lib/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lib/lib/base/__init__.py b/lib/lib/base/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lib/lib/base/download.py b/lib/lib/base/download.py deleted file mode 100644 index 8179f05..0000000 --- a/lib/lib/base/download.py +++ /dev/null @@ -1,122 +0,0 @@ -import os -import time -import netrc -import requests -from datetime import datetime - - -def access_token(): - """ - Description... - - Returns: - (str): ... - """ - if "token_expire_time" in os.environ and time.time() <= (float(os.environ["token_expire_time"]) - 5): - return os.environ["s3_access_key"] - - print("Need to get a new access token") - # todo: added try block & Exception raise for testing - try: - auth = netrc.netrc().authenticators("dataspace.copernicus.eu") - username = auth[0] - password = auth[2] - except Exception as e: - raise Exception("Failed to get credentials from netrc: %s" % e) - auth_server_url = "https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token" - data = { - "client_id": "cdse-public", - "grant_type": "password", - "username": username, - "password": password, - } - - # response = requests.post(auth_server_url, data=data, verify=True, allow_redirects=False).json() - # todo: added try block for testing - try: - response = requests.post(auth_server_url, data=data, verify=True, allow_redirects=False) - response.raise_for_status() - response_json = response.json() - except requests.exceptions.RequestException as e: - raise Exception("Failed to get access token: %s" % e) - - token_time = time.time() - os.environ["token_expire_time"] = str(token_time + response_json["expires_in"]) - print( - "New expiration tme for access token: %s" - % datetime.fromtimestamp(float(os.environ["token_expire_time"])).strftime("%m/%d/%Y, %H:%M:%S") - ) - os.environ["s3_access_key"] = response_json["access_token"] - # () gelöscht - return os.environ["s3_access_key"] - - -def download_data( - url, output_dir, file_name=None, chunk_size=1024 * 1000, timeout=300, auth=None, check_size=True, overwrite=False -): - """ - Download single file from USGS M2M by download url - - Parameters: - url: x - output_dir: x - file_name: x - chunk_size: x - timeout: x - auth: x - check_size: x - overwrite: x - - Returns: - (str|bool): filepath or False - """ - - try: - print("Waiting for server response...") - if auth: - r = requests.get(url, stream=True, allow_redirects=True, timeout=timeout, auth=auth) - else: - r = requests.get(url, stream=True, allow_redirects=True, timeout=timeout) - expected_file_size = int(r.headers.get("content-length", -1)) - if file_name is None: - try: - file_name = r.headers["Content-Disposition"].split('"')[1] - except Exception: - file_name = os.path.basename(url) - # raise Exception("Can not automatically identify file_name.") - - print(f"Filename: 
{file_name}") - file_path = os.path.join(output_dir, file_name) - # TODO: Check for existing files and whether they have the correct file size - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - if os.path.exists(file_path) and overwrite is False: - return file_path - elif os.path.exists(file_path) and overwrite is True: - print("Removing old file") - os.remove(file_path) - - with open(file_path, "wb") as f: - start = time.perf_counter() - print(f"Download of {file_name} in progress...") - for chunk in r.iter_content(chunk_size=chunk_size): - f.write(chunk) - duration = time.perf_counter() - start - - file_size = os.stat(file_path).st_size - speed = round((file_size / duration) / (1000 * 1000), 2) - - if check_size: - if expected_file_size != file_size: - os.remove(file_path) - print(f"Failed to download from {url}") - return False - - print(f"Download of {file_name} successful. Average download speed: {speed} MB/s") - return file_path - - except Exception as e: - print(e) - print(f"Failed to download from {url}.") - return False diff --git a/lib/lib/base/file.py b/lib/lib/base/file.py deleted file mode 100644 index d17aef1..0000000 --- a/lib/lib/base/file.py +++ /dev/null @@ -1,262 +0,0 @@ -import os -import zipfile -import tarfile -import hashlib - -# todo: module-attributes in docs? -checksum_funcs = dict() -checksum_funcs["SHA3-256"] = hashlib.sha3_256 -checksum_funcs["MD5"] = hashlib.md5 - -try: - import blake3 - - checksum_funcs["BLAKE3"] = blake3.blake3 -except Exception: - pass - - -def zip_directory(source_path, destination_path=None): - """ - Compress the contents of a directory into a zip file. - - Args: - source_path (str): The path to the source directory to be compressed. - destination_path (str, optional): The path where the zip file will be created. - If not provided, the zip file will be created in the same directory as the source directory - with the name of the source directory plus .zip extension. Defaults to None. - - Returns: - (str): The path to the generated zip file. - - Raises: - ValueError: If the source_path is not a directory or if the destination_path already exists. 
- """ - if not os.path.isdir(source_path): - raise ValueError("Source path is not a directory.") - - if destination_path is None: - destination_path = os.path.join(os.path.dirname(source_path), os.path.basename(source_path) + ".zip") - - if os.path.exists(destination_path): - raise ValueError("Destination path already exists.") - - with zipfile.ZipFile(destination_path, "w", zipfile.ZIP_DEFLATED) as zipf: - for root, dirs, files in os.walk(source_path): - for file in files: - file_path = os.path.join(root, file) - relative_path = os.path.relpath(file_path, source_path) - zipf.write(file_path, relative_path) - - return destination_path - - -def unzip_file(zip_file, remove_zip=True, extract_dir=None): - """ - Unzips a Sentinel scene and lists failed files - - Arguments: - zip_file: zip file to unzip - remove_zip: Whether zip file is being removed or not (default: True) - extract_dir: x - - Returns: - (dict): scene_path with folder of unzipped file and boolean zip_file_removed - """ - if not os.path.exists(zip_file): - raise Exception("File does not exist: %s" % zip_file) - - if extract_dir is None: - extract_dir = os.path.dirname(zip_file) - - failed_files = dict() - failed_logs = "" - - with zipfile.ZipFile(zip_file, "r") as zip_ref: # with block added bc os.remove raised WinErr32 - for name in zip_ref.namelist(): - try: - zip_ref.extract(name, extract_dir) - except Exception as e: - failed_files[name] = str(e) - failed_logs += name + ": " + str(e) + "\n" - - if len(failed_files) > 0: - raise Exception("Exceptions during unzipping: %s\n\n%s" % (zip_file, failed_logs)) - else: # removed try block bc sub-folder error wasn't caught - zip_folder_list = zip_ref.namelist()[0].split("/") - if len(zip_folder_list) < 2: - raise Exception("Could not find sub-folder in zip file %s" % zip_file) - zip_folder = zip_folder_list[0] - - response_dict = {"scene_path": os.path.join(extract_dir, zip_folder)} - - if remove_zip: - try: - os.remove(zip_file) - response_dict["zip_file_removed"] = True - except Exception: - response_dict["zip_file_removed"] = False - else: - response_dict["zip_file_removed"] = False - return response_dict - - -def untar_file(tar_file, remove_tar=True, create_folder=False, base_folder=None): - """ - Untars a scene and lists failed files - - Arguments: - tar_file: tar file to untar - remove_tar: Whether tar file is being removed or not (default: True) - create_folder: x - base_folder: x - - Returns: - (dict): scene_path with folder of untared file - """ - if not os.path.exists(tar_file): - raise Exception("File does not exist: %s" % tar_file) - - tar_ref = tarfile.open(tar_file, "r:") - if not base_folder: - base_folder = os.path.dirname(tar_file) - if create_folder: - scene_name = os.path.splitext(os.path.basename(tar_file))[0] - extract_dir = os.path.join(base_folder, scene_name) - else: - extract_dir = base_folder - - failed_files = dict() - failed_logs = "" - - for name in tar_ref.getnames(): - try: - tar_ref.extract(name, extract_dir) - except Exception as e: - failed_files[name] = str(e) - failed_logs += name + ": " + str(e) + "\n" - - tar_ref.close() - - if len(failed_files) > 0: - raise Exception("Exceptions during untaring: %s\n\n%s" % (tar_file, failed_logs)) - else: - response_dict = {"scene_path": extract_dir} - if remove_tar: - try: - os.remove(tar_file) - response_dict["zip_file_removed"] = True - print("Tar-File successfully removed: %s" % tar_file) - except Exception as e: - response_dict["zip_file_removed"] = False - print(e) - print("Tar-File could not be removed: 
%s" % tar_file) - return response_dict - - -def check_file_size(expected_file_size, file_path): - """ - Description... - - Parameters: - expected_file_size: x - file_path: x - - Returns: - (bool): ... - - Raises: - Exception: File not found. - """ - if os.path.isfile(file_path): - actual_file_size = os.path.getsize(file_path) - if expected_file_size == actual_file_size: - return True - else: - print(f"Different file sizes - {expected_file_size} expected - {actual_file_size} found") - return False - else: - raise Exception("File not found: {file_path}") - - -def get_file_size(file_path): - """ - Description... - - Parameters: - file_path: x - - Returns: - (...): ... - - Raises: - Exception: File does not exist. - """ - if not os.path.exists(file_path): - raise Exception("File %s does not exist!" % file_path) - stat = os.stat(file_path) - return stat.st_size - - -def get_folder_size(folder_path): - """ - Description... - - Parameters: - folder_path: x - - Returns: - (...): ... - - Raises: - Exception: Folder does not exist. - """ - if not os.path.exists(folder_path): - raise Exception("Folder %s does not exist!" % folder_path) - size = 0 - for path, dirs, files in os.walk(folder_path): - for f in files: - fp = os.path.join(path, f) - stat = os.stat(fp) - size += stat.st_size - return size - - -def calculate_checksum(algorithm, check_file): - """ - Description... - - Parameters: - algorithm: x - check_file: x - - Returns: - (...): ... - - Raises: - Exception: Checksum algorithm not available. - """ - if algorithm not in checksum_funcs: - raise Exception("Checksum algorithm not available") - checksum = checksum_funcs[algorithm](open(check_file, "rb").read()).hexdigest().lower() - return checksum - - -def delete_file(file: str): - """ - Description... - - Parameters: - algorithm: x - check_file: x - - Returns: - (...): ... - - Raises: - OSError: ... - """ - try: - os.remove(file) - except OSError as e: - raise Exception("Error: %s - %s." % (e.filename, e.strerror)) diff --git a/lib/lib/base/geometry.py b/lib/lib/base/geometry.py deleted file mode 100644 index 8cbccbe..0000000 --- a/lib/lib/base/geometry.py +++ /dev/null @@ -1,40 +0,0 @@ -import shapely - - -def wkt_to_geom(wkt): - """ - Description... - - Parameters: - wkt: x - - Returns: - (...): ... - """ - return shapely.wkt.loads(wkt) - - -def geom_to_wkt(geom): - """ - Description... - - Parameters: - geom: x - - Returns: - (...): ... - """ - return shapely.geometry.shape(geom).wkt - - -def calculate_bbox(geom): - """ - Description... - - Parameters: - geom: x - - Returns: - (...): ... - """ - return shapely.geometry.shape(geom).bounds diff --git a/lib/lib/base/geoparquet.py b/lib/lib/base/geoparquet.py deleted file mode 100644 index 2179512..0000000 --- a/lib/lib/base/geoparquet.py +++ /dev/null @@ -1,16 +0,0 @@ -import duckdb - - -def get_max_datetime_from_geoparquet(geoparquet, column): - """ - Description... - - Parameters: - geoparquet: x - column: x - - Returns: - (...): ... - """ - res = duckdb.query(f"set TimeZone = 'UTC'; SELECT max(\"{column}\") as max_datetime FROM '{geoparquet}'") - return res.df().values[0][0].isoformat() diff --git a/lib/lib/base/inventory.py b/lib/lib/base/inventory.py deleted file mode 100644 index 64bb8fc..0000000 --- a/lib/lib/base/inventory.py +++ /dev/null @@ -1,443 +0,0 @@ -import os -import duckdb -import requests -import psycopg2 -from datetime import datetime - -from .order import insert_into_database - - -def update_inventory(scene_id, collection, inventory_dsn): - """ - Description... 
- - Parameters: - scene_id: x - collection: x - inventory_dsn: x - - Returns: - (...): ... - - Raises: - Exception: 0 affected rows. - """ - # Update inventory database - print("Updating inventory for %s" % (scene_id)) - conn = psycopg2.connect(inventory_dsn) - cur = conn.cursor() - status = "succeeded" - query = ( - # todo: define uniform format string rules (for entire project) - # "UPDATE items SET content = jsonb_set(content, '{properties,order:status}', '\"%s\"'::jsonb) - # WHERE id = '%s' and collection in ('%s');" - "UPDATE items " - "SET content = jsonb_set(content, '{properties,order:status}', '\"%s\"'::jsonb) " - "WHERE id = '%s' " - "AND collection in ('%s');" % (status, scene_id, collection) - ) - - print(query) - cur.execute(query) - print("[%s] affected rows: %s" % (scene_id, cur.rowcount)) - - if cur.rowcount == 0: - conn.close() - raise Exception("0 affected rows") - else: - conn.commit() - conn.close() - return True - - -def get_scene_id_from_inventory_db(conn, collection, max_datetime=None): - """ - Description... - - Parameters: - conn: x - collection: x - max_datetime: x - - Returns: - (list): ... - """ - if max_datetime: - query = ( - f"SELECT id " - f"FROM items " - f"WHERE collection='{collection}' " - f"AND content->'properties'->>'order:status' != 'removed' " - f"AND datetime < '{max_datetime}'" - ) - else: - query = ( - f"SELECT id " - f"FROM items " - f"WHERE collection='{collection}' " - f"AND content->'properties'->>'order:status' != 'removed'" - ) - print(query) - cur = conn.cursor() - cur.execute(query) - return [i[0] for i in cur.fetchall()] - - -def get_scenes_from_inventory_file(db_file, date_column="ContentDate:Start", max_datetime=None): - """ - Description... - - Parameters: - db_file: x - date_column: x - max_datetime: x - - Returns: - (...): ... - """ - print(f"Query {db_file}") - # if max_datetime: - # results = duckdb.query( - # 'set TimeZone=\'UTC\'; SELECT * FROM \'%s\' WHERE "%s" < \'%s\'' % (db_file, date_column, max_datetime)) - # else: - # results = duckdb.query('set TimeZone=\'UTC\'; SELECT * FROM \'%s\'' % (db_file)) - # Todo: Check for quotes - if max_datetime: - results = duckdb.query( - "set TimeZone='UTC'; SELECT * FROM '%s' WHERE '%s' < '%s'" % (db_file, date_column, max_datetime) - ) - else: - results = duckdb.query("set TimeZone='UTC'; SELECT * FROM '%s'" % (db_file)) - return results.df() - - -def get_scenes_diff(scenes_inventory, scenes_db, id_column): - """ - Description... - - Parameters: - scenes_inventory: x - scenes_db: x - id_column: x - - Returns: - (tuple(...)): ... - """ - # Inventory API does not include the file extension as part of the scene id column in comparison to the inventory - # files from CDSE data provider. Thus, we need to remove the file extension from the inventory list of scene ids. 
- file_ext = os.path.splitext(scenes_inventory[id_column][0])[1] - scenes_inventory_names = list(scenes_inventory[id_column].str.replace(file_ext, "")) - print(f"Scenes Inventory: {len(scenes_inventory_names)}") - - # Substract the scene ids of the terrabyte Inventory from the inventory of the data provider to get missing scenes - new_scenes = list(set(scenes_inventory_names).difference(scenes_db)) - print(f"New items: {len(new_scenes)}") - - # Substract the inventory of the data provider from the terrabyte Inventory - # to get scenes that should be removed (not anymore available) - to_be_removed = list(set(scenes_db).difference(scenes_inventory_names)) - print(f"Items to be removed: {len(to_be_removed)}") - return new_scenes, to_be_removed - - -def get_item_from_id(scene_id, collection, api_url="https://stac.terrabyte.lrz.de/inventory/api"): - """ - Description... - - Parameters: - scene_id: x - collection: x - api_url: x - - Returns: - (...|bool): ... - """ - url = f"{api_url}/collections/{collection}/items/{scene_id}" - resp = requests.get(url) - if resp.status_code == 200: - return resp.json() - else: - return False - - -def query_geoparquet( - inventory, - collection, - geoparquet, - max_datetime=None, - date_column="datetime", - inventory_column="geoparquet", - id_column="id", -): - """ - Description... - - Parameters: - inventory: x - collection: x - geoparquet: x - max_datetime: x - date_column: x - inventory_column: x - id_column: x - - Returns: - (...): ... - """ - if max_datetime: - query = ( - f"set TimeZone = 'UTC'; " - f'SELECT DATE_TRUNC(\'year\', "{date_column}") AS year, count("{id_column}") AS count ' - f"FROM '{geoparquet}' " - f"WHERE \"{date_column}\" < '{max_datetime}' " - f"GROUP by year" - ) - else: - query = ( - f"set TimeZone = 'UTC'; " - f'SELECT DATE_TRUNC(\'year\', "{date_column}") AS year, count("{id_column}") AS count ' - f"FROM '{geoparquet}' " - f"GROUP by year" - ) - - print(f"Geoparquet {collection}: {geoparquet}") - print(f"Query: {query}") - - res = duckdb.query(query) - df = res.df() - - # Convert year to integer and index - df["year"] = df["year"].dt.year - df.set_index("year", inplace=True) - - # Select count column - data = df.to_dict()["count"] - - if collection not in inventory: - inventory[collection] = dict() - - for year in data: - if str(year) not in inventory[collection]: - inventory[collection][str(year)] = dict( - inventory=0, removed=0, online=0, pending=0, deprecated=0, stac_api=0, geoparquet=0, datasource=0 - ) - - inventory[collection][str(year)][inventory_column] = data[year] - - return inventory - - -def query_stac_db(cur, inventory, collection, max_datetime=None): - """ - Description... - - Parameters: - cur: x - inventory: x - collection: x - max_datetime: x - - Returns: - (...): ... 
- """ - where_condition = "" - if max_datetime: - where_condition = f"AND datetime < '{max_datetime}'" - query = ( - f"SELECT DATE_TRUNC('year', datetime) AS year, count(id) " - f"FROM items " - f"WHERE collection='{collection}' {where_condition} " - f"GROUP BY year;" - ) - print(f"STAC API {collection}: {query}") - - cur.execute(query) - api_stats = cur.fetchall() - if collection not in inventory: - inventory[collection] = dict() - - for i in api_stats: - date, count = i - year = str(date.year) - - if year not in inventory[collection]: - inventory[collection][year] = dict( - inventory=0, removed=0, online=0, pending=0, deprecated=0, stac_api=0, geoparquet=0, datasource=0 - ) - - inventory[collection][year]["stac_api"] += count - - return inventory - - -def query_inventory_db(cur, inventory, collection, max_datetime=None): - """ - Description... - - Parameters: - cur: x - inventory: x - collection: x - max_datetime: x - - Returns: - (...): ... - """ - where_condition = "" - if max_datetime: - where_condition = f"AND datetime < '{max_datetime}'" - query = ( - f"SELECT DATE_TRUNC('year', datetime) AS year, " - f"content->'properties'->>'order:status' AS status, " - f"content->'properties'->>'deprecated' AS deprecated, count(id) " - f"FROM items " - f"WHERE collection='{collection}' {where_condition} " - f"GROUP BY year, status, deprecated;" - ) - print(f"Inventory {collection}: {query}") - - cur.execute(query) - api_stats = cur.fetchall() - - if collection not in inventory: - inventory[collection] = dict() - - for i in api_stats: - date, status, deprecated, count = i - year = str(date.year) - - if year not in inventory[collection]: - inventory[collection][year] = dict( - inventory=0, removed=0, online=0, pending=0, deprecated=0, stac_api=0, geoparquet=0, datasource=0 - ) - - if status == "succeeded" and deprecated == "false": - inventory[collection][year]["online"] += count - elif status == "removed": - inventory[collection][year]["removed"] += count - elif status != "succeeded" and deprecated == "false": - inventory[collection][year]["pending"] += count - elif deprecated == "true": - inventory[collection][year]["deprecated"] += count - - if status != "removed": - inventory[collection][year]["inventory"] += count - - return inventory - - -def calculate_differences(collection, inventory_geoparquet, conn, id_column, date_column, max_datetime=None): - """ - Description... - - Parameters: - collection: x - inventory_geoparquet: x - conn: x - id_column: x - date_column: x - max_datetime: x - - Returns: - (tuple(...)): ... - """ - scenes_inventory = get_scenes_from_inventory_file( - inventory_geoparquet, date_column=date_column, max_datetime=max_datetime - ) - scenes_db_names = get_scene_id_from_inventory_db(conn, collection, max_datetime=max_datetime) - new_scenes, to_be_removed = get_scenes_diff(scenes_inventory, scenes_db_names, id_column) - if len(new_scenes) > 0: - scenes_inventory_by_name = scenes_inventory.set_index(id_column) - file_ext = os.path.splitext(scenes_inventory[id_column][0])[1] - scenes = [] - for scene_id in new_scenes: - scene = scenes_inventory_by_name.loc[scene_id + file_ext].to_dict() - scene[id_column] = scene_id + file_ext - scenes.append(scene) - return scenes, to_be_removed - else: - return new_scenes, to_be_removed - - -def generate_stac_new_scenes(scenes, collection, inventory_fct): - """ - Description... - - Parameters: - scenes: x - collection: x - inventory_fct: x - - Returns: - (...): ... - - Raises: - Exception: Error while creating metadata for a scene. 
- """ - stac_items = [] - for scene in scenes: - try: - stac_items.append(inventory_fct(scene, collection).to_dict()) - except Exception as e: - print(f"Error while creating metadata for {scene}: {e}") - return stac_items - - -def import_new_scenes(scenes, collection, inventory_fct, dsn): - """ - Description... - - Parameters: - scenes: x - collection: x - inventory_fct: x - dsn: x - - Returns: - (...): ... - """ - stac_items = generate_stac_new_scenes(scenes, collection, inventory_fct) - return insert_into_database(dsn, stac_items) - - -def delete_removed_scenes(collection, to_be_removed, reasons, api_url, api_user, api_pw): - """ - Description... - - Parameters: - collection: x - to_be_removed: x - reasons: x - api_url: x - api_user: x - api_pw: x - """ - for scene_id in to_be_removed: - stac_item = get_item_from_id(scene_id, collection) - if stac_item: - order_status = stac_item["properties"]["order:status"] - print(scene_id, "Order Status: " + order_status) - if order_status == "succeeded": - # remove from STAC API - r = requests.delete( - "%s/collections/%s/items/%s" % (api_url, collection, stac_item["id"]), - auth=(api_user, api_pw), - ) - print("%s: Delete from STAC API: %s" % (scene_id, r.status_code)) - - stac_item["properties"]["order:status"] = "removed" - if scene_id in reasons: - reason = reasons[scene_id] - stac_item["properties"]["deletion:date"] = reason["DeletionDate"] - stac_item["properties"]["deletion:cause"] = reason["DeletionCause"] - stac_item["properties"]["deprecated"] = True - stac_item["properties"]["updated"] = datetime.utcnow().isoformat() + "Z" - r = requests.put( - "%s/collections/%s/items/%s" - % ("https://stac.terrabyte.lrz.de/inventory/api", collection, stac_item["id"]), - json=stac_item, - # auth=(api_user, api_pw) - ) - print("%s: Update Inventory API: %s" % (scene_id, r.status_code)) - else: - print("Not found in inventory: %s" % scene_id) diff --git a/lib/lib/base/order.py b/lib/lib/base/order.py deleted file mode 100644 index 2851273..0000000 --- a/lib/lib/base/order.py +++ /dev/null @@ -1,303 +0,0 @@ -import json -import psycopg2 -import psycopg2.extras -from datetime import datetime -from pypgstac.pypgstac import PgstacCLI - - -def get_items_from_query(dsn, order_id, collections, where_query): - """ - Description... - - Parameters: - dsn: x - order_id: x - collections: x - where_query: x - - Returns: - (...): ... - """ - conn = psycopg2.connect(dsn) - cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) - - if len(collections) > 0: - where_query += " and collection in (%s)" % (json.dumps(collections).replace('"', "'")[1:-1]) - - _ = update_database(cur, conn, order_id, where_query, order_status="pending") - scenes = get_items_from_order_id(order_id, collections, dsn) - return scenes - - -def get_last_items_from_collection(dsn, order_id, collection, max_items=1000): - """ - Description... - - Parameters: - dsn: x - order_id: x - collection: x - max_items: x - - Returns: - (...): ... - """ - conn = psycopg2.connect(dsn) - cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) - where_query = ( - "collection = '%s' and content->'properties'->>'order:status'='orderable' ORDER BY datetime DESC LIMIT %s" - % (collection, max_items) - ) - - _ = update_database(cur, conn, order_id, where_query, order_status="pending") - scenes = get_items_from_order_id(order_id, [collection], dsn) - return scenes - - -def generate_batches_from_inventory(order_id, dsn, collections, where_query, batch_size=1000): - """ - Description... 
- - Parameters: - order_id: x - dsn: x - collections: x - where_query: x - batch_size: x - - Returns: - (...): ... - """ - conn = psycopg2.connect(dsn) - cur = conn.cursor() - - batches = [] - if len(collections) > 0: - where_query += " and collection in (%s)" % (json.dumps(collections).replace('"', "'")[1:-1]) - - print("Where query: " + where_query) - order_no = 1 - batch_id = "%s_%s" % (order_id, order_no) - - affected_rows = update_database(cur, order_id, batch_id, where_query, batch_size) - conn.commit() - if affected_rows == 0: - print("No scenes found") - return [] - batches.append(batch_id) - while affected_rows == batch_size: - order_no += 1 - batch_id = "%s_%s" % (order_id, order_no) - affected_rows = update_database(cur, order_id, batch_id, where_query, batch_size) - conn.commit() - batches.append(batch_id) - - conn.close() - - return batches - - -def update_database(cur, conn, order_id, where_query, order_status="ordered"): - """ - Description... - - Parameters: - cur: x - conn: x - order_id: x - where_query: x - order_status: x - - Returns: - (...): ... - """ - # Update all items with order id, order date, batch id, and ordered status - order_update = {"order:status": order_status, "order:id": order_id, "order:date": datetime.now().isoformat()} - query = ( - "UPDATE items " - "SET content = jsonb_set(content, '{properties}', content->'properties' || '%s'::jsonb) " - "WHERE id in (SELECT id FROM items WHERE %s);" % (json.dumps(order_update), where_query) - ) - print(query) - cur.execute(query) - conn.commit() - print("affected rows: %s" % cur.rowcount) - return cur.rowcount - - -def update_database_batch(cur, conn, order_id, batch_id, where_query, batch_size): - """ - Description... - - Parameters: - cur: x - conn: x - order_id: x - batch_id: x - where_query: x - batch_size: x - - Returns: - (...): ... - """ - # Update all items with order id, order date, batch id, and ordered status - order_update = { - "order:status": "ordered", - "order:id": order_id, - "order:date": datetime.now().isoformat(), - "order:batch_id": batch_id, - } - query = ( - "UPDATE items " - "SET content = jsonb_set(content, '{properties}', content->'properties' || '%s'::jsonb) " - "WHERE id in (SELECT id FROM items WHERE %s LIMIT %s);" % (json.dumps(order_update), where_query, batch_size) - ) - print(query) - cur.execute(query) - conn.commit() - print("affected rows: %s" % cur.rowcount) - return cur.rowcount - - -def get_order_from_id(scene_id, dsn): - """ - Description... - - Parameters: - scene_id: x - dsn: x - - Returns: - (...): ... - """ - conn = psycopg2.connect(dsn) - cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) - query = "select content->'properties'->'terrabyte:order' from items where id='%s';" % scene_id - print(query) - cur.execute(query) - scene = cur.fetchone() - if len(scene) > 0: - return scene[0] - else: - return scene - - -def get_items_from_order_id(order_id, collections, dsn): - """ - Description... - - Parameters: - order_id: x - collections: x - dsn: x - - Returns: - (...): ... 
- """ - conn = psycopg2.connect(dsn) - cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) - if len(collections) > 0: - where_query_add = "and collection in (%s)" % (str(collections).replace("[", "").replace("]", "")) - query = ( - "select content->'properties'->'terrabyte:order' from items where content->'properties'->>'order:id'='%s' %s;" - % (order_id, where_query_add) - ) - print(query) - cur.execute(query) - result = cur.fetchall() - scenes = [r[0] for r in result] - return scenes - - -def insert_into_database(dsn, stac, method="insert_ignore"): - """ - Description... - - Parameters: - dsn: x - stac: x - method: x - - Returns: - (...): ... - - Raises: - Exception: ... - """ - try: - cli = PgstacCLI(dsn=dsn, debug=True) - cli.load(table="items", file=stac, method=method) - return True - except Exception as e: - print(str(e)) - return False - - -def get_scenes_from_batch(batch_id, collections, dsn): - """ - Description... - - Parameters: - batch_id: x - collections: x - dsn: x - - Returns: - (...): ... - """ - # Query scenes from a specific batch id, additional use filter through collections for performance reasons - collections = json.dumps(collections).replace('"', "'")[1:-1] - conn = psycopg2.connect(dsn) - cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) - query = "SELECT * FROM items WHERE content->'properties'->'order:batch_id' = '\"%s\"' and collection in (%s)" % ( - batch_id, - collections, - ) - print(query) - cur.execute(query) - scenes = cur.fetchall() - - # For all scenes set order status = "pending" - status = "pending" - query = ( - "UPDATE items " - "SET content = jsonb_set(content, '{properties,order:status}', '\"%s\"'::jsonb) " - "WHERE content->'properties'->'order:batch_id' = '\"%s\"' and collection in (%s);" - % (status, batch_id, collections) - ) - print(query) - cur.execute(query) - conn.commit() - - conn.close() - return scenes - - -def update_items_inventory_status(property, id, collection, dsn, status="pending"): - """ - Description... 
- - Parameters: - property: x - id: x - collection: x - dsn: x - status: x - - """ - # property = 'order:order_id' or 'order:batch_id' - - collections = json.dumps(collection).replace('"', "'")[1:-1] - conn = psycopg2.connect(dsn) - cur = conn.cursor(cursor_factory=psycopg2.extras.DictCursor) - # For all scenes set order status = "pending" - query = ( - "UPDATE items " - "SET content = jsonb_set(content, '{properties,order:status}', '\"%s\"'::jsonb) " - "WHERE content->'properties'->'%s' = '\"%s\"' and collection in (%s);" % (property, status, id, collections) - ) - - print(query) - cur.execute(query) - conn.commit() - conn.close() diff --git a/lib/lib/datasets/__init__.py b/lib/lib/datasets/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lib/lib/datasets/landsat.py b/lib/lib/datasets/landsat.py deleted file mode 100644 index 1c833ec..0000000 --- a/lib/lib/datasets/landsat.py +++ /dev/null @@ -1,317 +0,0 @@ -import os -import re -import json -import glob -import copy -import logging -import pystac -import requests -from datetime import datetime - -from ..resources.stac import ( - extract_stactools, - add_asset_filesize, -) - -# https://www.usgs.gov/faqs/what-naming-convention-landsat-collections-level-1-scenes -scene_id_pattern = ( - r"^L" - r"(?PC|O|T|E|T|M)" - r"(?P[0-9]{2})_" - r"(?P[0-9A-Z]{4})_" - r"(?P[0-9]{3})" - r"(?P[0-9]{3})_" - r"(?P[0-9]{8})_" - r"(?P[0-9]{8})_" - r"(?P[0-9]{2})_" - r"(?P[A-Z0-9]{2})$" -) - -sensor_name = { - "C": "oli-tirs", - "O": "oli", - # "T": "tirs", - "E": "etm", - "T": "tm", - "M": "mss", -} - -asset_changes = { - "LT": { # for Landsat 4 and 5 - "blue": ["B01", "Blue Band (B01)"], - "green": ["B02", "Green Band (B02)"], - "red": ["B03", "Red Band (B03)"], - "nir08": ["B04", "Near Infrared Band 0.8 (B04)"], - "swir16": ["B05", "Short-wave Infrared Band 1.6 (B05)"], - "lwir": ["B06", "Surface Temperature Band (B06)"], - "swir22": ["B07", "Short-wave Infrared Band 2.2 (B07)"], - "atmos_opacity": ["Atmos_Opacity", ""], - "atran": ["ATRAN", ""], - "cdist": ["CDIST", ""], - "drad": ["DRAD", ""], - "urad": ["URAD", ""], - "trad": ["TRAD", ""], - "emis": ["EMIS", ""], - "emsd": ["EMSD", ""], - "qa_pixel": ["QA_Pixel", ""], - "qa_radsat": ["QA_Radsat", ""], - "qa": ["QA_Temp", ""], - "cloud_qa": ["QA_Cloud", ""], - }, - "LE": { # for Landsat 7 - "blue": ["B01", "Blue Band (B01)"], - "green": ["B02", "Green Band (B02)"], - "red": ["B03", "Red Band (B03)"], - "nir08": ["B04", "Near Infrared Band 0.8 (B04)"], - "swir16": ["B05", "Short-wave Infrared Band 1.6 (B05)"], - "lwir": ["B06", "Surface Temperature Band (B06)"], - "swir22": ["B07", "Short-wave Infrared Band 2.2 (B07)"], - "atmos_opacity": ["Atmos_Opacity", ""], - "atran": ["ATRAN", ""], - "cdist": ["CDIST", ""], - "drad": ["DRAD", ""], - "urad": ["URAD", ""], - "trad": ["TRAD", ""], - "emis": ["EMIS", ""], - "emsd": ["EMSD", ""], - "qa_pixel": ["QA_Pixel", ""], - "qa_radsat": ["QA_Radsat", ""], - "qa": ["QA_Temp", ""], - "cloud_qa": ["QA_Cloud", ""], - }, - "LC": { # for Landsat 8 and 9 - "coastal": ["B01", "Coastal/Aerosol Band (B01)"], - "blue": ["B02", "Blue Band (B02)"], - "green": ["B03", "Green Band (B03)"], - "red": ["B04", "Red Band (B04)"], - "nir08": ["B05", "Near Infrared Band 0.8 (B05)"], - "swir16": ["B06", "Short-wave Infrared Band 1.6 (B06)"], - "lwir11": ["B10", "Surface Temperature Band (B10)"], - "swir22": ["B07", "Short-wave Infrared Band 2.2 (B07)"], - "atran": ["ATRAN", ""], - "cdist": ["CDIST", ""], - "drad": ["DRAD", ""], - "urad": ["URAD", ""], - "trad": 
["TRAD", ""], - "emis": ["EMIS", ""], - "emsd": ["EMSD", ""], - "qa_pixel": ["QA_Pixel", ""], - "qa_radsat": ["QA_Radsat", ""], - "qa": ["QA_Temp", ""], - "qa_aerosol": ["QA_Aerosol", ""], - }, -} - -folder_structure = "level-{processingLevelNo}/standard/{sensor}/{year}/{wrsPath}/{wrsRow}" - - -def get_scene_id_info(scene_id): - """ - Description... - - Parameters: - scene_id: x - - - Returns: - (...): ... - """ - match = re.match(re.compile(scene_id_pattern), scene_id) - return match.groupdict() - - -def get_scene_id_folder(scene_id, folder_format=None): - """ - Description... - - Parameters: - scene_id: x - folder_format: x - - Returns: - (...): ... - """ - variables = get_scene_id_info(scene_id) - if "start" in variables: - date = datetime.strptime(variables["start"], "%Y%m%d") - variables["year"] = date.strftime("%Y") - variables["month"] = date.strftime("%m") - variables["day"] = date.strftime("%d") - if "sensor" in variables: - variables["sensor"] = sensor_name[variables["sensor"]] - if "processingLevel" in variables: - variables["processingLevelNo"] = variables["processingLevel"][1] - - if folder_format is None: - folder_format = folder_structure - - return folder_format.format(**variables) - - -def landsat_metadata(scene_path, scene_id, return_pystac=False, add_file_size=False): - """ - Description... - - Parameters: - scene_path: x - scene_id: x - return_pystac: x - add_file_size: x - - Returns: - (...): ... - - Raises: - Exception: Metadata_error: Folder does not exist. - Exception: Metadata_error: No *_MTL.xml file available in folder. - Exception: Metadata_error: Error during creating metadata. - """ - if scene_path[-1] == "/": - scene_path = scene_path[:-1] - print("executing landsat_metadata for %s" % scene_path) - if not os.path.exists(scene_path): - raise Exception("metadata_error: Folder does not exist %s" % (scene_path)) - stac_function = "stactools.landsat.stac.create_item" - stac_function_options = {"use_usgs_geometry": False} - landsat_mtl_xml = glob.glob(os.path.join(scene_path, "*_MTL.xml")) - if len(landsat_mtl_xml) == 0: - metadata_error = "No *_MTL.xml file available in folder %s" % (scene_path,) - raise Exception("metadata_error: %s" % metadata_error) - landsat_mtl_xml = landsat_mtl_xml[0] - print("MTL file: %s" % landsat_mtl_xml) - - try: - stac_file = os.path.join(scene_path, scene_id + ".STAC.json") - stac_item = extract_stactools(landsat_mtl_xml, stac_function, stac_function_options) - stac_item.id = scene_id - stac_item = modify_landsat_stac(stac_item) - if add_file_size: - stac_item = add_asset_filesize(stac_item) - if return_pystac: - return stac_item - else: - with open(stac_file, "w") as f: - f.write(json.dumps(stac_item.to_dict())) - return stac_file - - except Exception as e: - metadata_error = "Error during creating metadata for %s: %s" % ( - scene_path, - str(e), - ) - raise Exception("metadata_error: %s" % metadata_error) - - -def adapt_stac_metadata(scene_path): - """ - Changes hrefs in existing Landsat STAC-Metadata - - Parameters: - scene_path: x - - Returns: - (...): ... - - Raises: - Exception: Failed to adapt STAC-metadata. 
- """ - - print(f"Adapt STAC-metadata of {scene_path}.") - # for all STAC-metadata files do - stac_jsons = [] - for file in os.listdir(scene_path): - if file.endswith("stac.json"): - stac_jsons.append(file) - - stac_files = [] - if len(stac_jsons) > 0: - try: - for file in stac_jsons: - # read in stac.json - with open(os.path.join(scene_path, file), "r") as stac_file: - data = json.load(stac_file) - if "created" not in data["properties"]: - data["properties"]["created"] = str(datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%fZ")) - - # adapt all assets - for asset in data["assets"]: - href_old = data["assets"][asset]["href"] - file_name = os.path.basename(href_old) - # href_new = os.path.join(scene_path,file_name) # keep relative paths - href_new = file_name - data["assets"][asset]["href"] = href_new - if "alternate" in data["assets"][asset]: - del data["assets"][asset]["alternate"] - - # remove index asset - if "index" in data["assets"]: - del data["assets"]["index"] - - # remove all links - data["links"] = [] - - # write .COG .json file - with open(os.path.join(scene_path, file), "w") as jsonFile: - json.dump(data, jsonFile, indent=4) - stac_files.append(os.path.join(scene_path, file)) - print(f"STAC-metadata of {scene_path} successfully adapted") - - except Exception as e: - print(e) - print(f"Failed to adapt STAC-metadata of {scene_path}") - else: - print(f"{scene_path} does not contain STAC-metadata to adapt.") - - return stac_files - - -__log = logging.getLogger("Log Info") - - -def modify_landsat_stac(stac_item: pystac.item.Item): - """ - Modify the Asset-Keys and eo:bands:name for a Landsat L2 STAC-Item. - - Args: - stac_item: The STAC item file to modify. Must be a STACObject. - - Returns: - (...): A pystac.item.Item object with the desired changes. - - Raises: - Exception: Could not find entry in asset_changes configuration. - """ - - stac_item_dict = copy.deepcopy(stac_item.to_dict(include_self_link=False)) - - if stac_item_dict["geometry"]["type"] == "MultiPolygon": - try: - link = stac_item_dict["links"][1]["href"] - stac_item_usgs = requests.get(link).json() - stac_item_dict["geometry"] = stac_item_usgs["geometry"] - except Exception: - pass - - mission = stac_item.id[0:2] # Get first two characters of Item id (e.g., LC for LC09_L2SR_....) 
- if mission not in asset_changes: - raise Exception("Could not find entry for %s in asset_changes configuration" % mission) - input_dict = asset_changes[mission] - - for i, (current_key, target_key) in enumerate(input_dict.items()): - __log.info(f"Replacing the current Asset-Key {current_key} with the new Asset-Key {target_key[0]}.") - try: - stac_item_dict["assets"][target_key[0]] = copy.deepcopy(stac_item_dict["assets"].pop(current_key)) - if "eo:bands" in stac_item_dict["assets"][target_key[0]]: - stac_item_dict["assets"][target_key[0]]["eo:bands"][0]["name"] = target_key[0] - stac_item_dict["assets"][target_key[0]]["title"] = target_key[1] - except Exception: - __log.info(f"{current_key} is not a Asset in this STAC-Item.") - - if "proj:centroid" in stac_item_dict["properties"]: - for key in stac_item_dict["properties"]["proj:centroid"]: - stac_item_dict["properties"]["proj:centroid"][key] = float( - stac_item_dict["properties"]["proj:centroid"][key] - ) - - stac_item_object_final = pystac.Item.from_dict(stac_item_dict) - return stac_item_object_final diff --git a/lib/lib/datasets/modis.preview.py b/lib/lib/datasets/modis.preview.py deleted file mode 100644 index b7f3a96..0000000 --- a/lib/lib/datasets/modis.preview.py +++ /dev/null @@ -1,191 +0,0 @@ -import os -from osgeo import gdal - -# todo: modis.preview wird von docgenerator nicht gefunden. Punkt im Namen müsste gegen anderes Zeichen ersetzt werden - -modis_previews = { - "mod09ga": { - "sample": ( - "https://modis-samples.fra1.cdn.digitaloceanspaces.com/" - "MOD09GA.061/MOD09GA.A2023177.h20v05.061.2023179044546.hdf" - ), - "description": "MODIS/Terra Surface Reflectance Daily L2G Global 1km and 500m SIN Grid", - "bands": ["sur_refl_b01_1", "sur_refl_b04_1", "sur_refl_b03_1"], - "cmap": None, - }, - "myd09ga": { - "sample": ( - "https://modis-samples.fra1.cdn.digitaloceanspaces.com/" - "MYD09GA.061/MYD09GA.A2023168.h29v05.061.2023170035115.hdf" - ), - "description": "MODIS/Aqua Surface Reflectance Daily L2G Global 1km and 500m SIN Grid", - "bands": ["sur_refl_b01_1", "sur_refl_b04_1", "sur_refl_b03_1"], - "cmap": None, - }, - "mod09gq": { - "sample": ( - "https://modis-samples.fra1.cdn.digitaloceanspaces.com/" - "MOD09GQ.061/MOD09GQ.A2023177.h20v05.061.2023179044546.hdf" - ), - "description": "MODIS/Terra Surface Reflectance Daily L2G Global 250m SIN Grid", - "bands": ["sur_refl_b01_1", "sur_refl_b01_1", "sur_refl_b02_1"], - "cmap": None, - }, - "myd09gq": { - "sample": ( - "https://modis-samples.fra1.cdn.digitaloceanspaces.com/" - "MYD09GQ.061/MYD09GQ.A2023126.h29v06.061.2023128033751.hdf" - ), - "description": "MODIS/Aqua Surface Reflectance Daily L2G Global 250m SIN Grid", - "bands": ["sur_refl_b01_1", "sur_refl_b01_1", "sur_refl_b02_1"], - "cmap": None, - }, - "mod10a1": { - "sample": ( - "https://modis-samples.fra1.cdn.digitaloceanspaces.com/" - "MOD10A1.061/MOD10A1.A2023113.h23v04.061.2023115044355.hdf" - ), - "description": "MODIS/Terra Snow Cover Daily L3 Global 500m SIN Grid", - "bands": ["NDSI_Snow_Cover"], - "cmap": [ - (0, (255, 255, 255), 100, (255, 255, 0)), - (101, (56, 56, 56), 199, (56, 56, 56)), - (200, (30, 144, 255), 250, (0, 0, 205)), - ], - }, - "myd10a1": { - "sample": ( - "https://modis-samples.fra1.cdn.digitaloceanspaces.com/" - "MYD10A1.061/MYD10A1.A2023175.h24v05.061.2023177034319.hdf" - ), - "description": "MODIS/Aqua Snow Cover Daily L3 Global 500m SIN Grid", - "bands": ["NDSI_Snow_Cover"], - "cmap": [ - (0, (255, 255, 255), 100, (255, 255, 0)), - (101, (56, 56, 56), 199, (56, 56, 56)), - (200, (30, 
144, 255), 250, (0, 0, 205)), - ], - }, - "mod13a2": { - "sample": ( - "https://modis-samples.fra1.cdn.digitaloceanspaces.com/" - "MOD13A2.061/MOD13A2.A2023145.h20v05.061.2023164003951.hdf" - ), - "description": "MODIS/Terra Vegetation Indices 16-Day L3 Global 1km SIN Grid", - "bands": ['"1 km 16 days MIR reflectance"', '"1 km 16 days NIR reflectance"', '"1 km 16 days red reflectance"'], - "cmap": None, - }, - "myd13a2": { - "sample": ( - "https://modis-samples.fra1.cdn.digitaloceanspaces.com/" - "MYD13A2.061/MYD13A2.A2023153.h29v12.061.2023170141456.hdf" - ), - "description": "MODIS/Aqua Vegetation Indices 16-Day L3 Global 1km SIN Grid", - "bands": ['"1 km 16 days MIR reflectance"', '"1 km 16 days NIR reflectance"', '"1 km 16 days red reflectance"'], - "cmap": None, - }, - "mod13a3": { - "sample": ( - "https://modis-samples.fra1.cdn.digitaloceanspaces.com/" - "MOD13A3.061/MOD13A3.A2023121.h20v05.061.2023164011854.hdf" - ), - "description": "MODIS/Terra Vegetation Indices Monthly L3 Global 1km SIN Grid", - "bands": ['"1 km monthly MIR reflectance"', '"1 km monthly NIR reflectance"', '"1 km monthly red reflectance"'], - "cmap": None, - }, - "myd13a3": { - "sample": ( - "https://modis-samples.fra1.cdn.digitaloceanspaces.com/" - "MYD13A3.061/MYD13A3.A2023032.h29v12.061.2023074111018.hdf" - ), - "description": "MODIS/Aqua Vegetation Indices Monthly L3 Global 1km SIN Grid", - "bands": ['"1 km monthly MIR reflectance"', '"1 km monthly NIR reflectance"', '"1 km monthly red reflectance"'], - "cmap": None, - }, - "mod13q1": { - "sample": ( - "https://modis-samples.fra1.cdn.digitaloceanspaces.com/" - "MOD13Q1.061/MOD13Q1.A2023161.h20v05.061.2023177232655.hdf" - ), - "description": "MODIS/Terra Vegetation Indices 16-Day L3 Global 250m SIN Grid", - "bands": ['"250m 16 days MIR reflectance"', '"250m 16 days NIR reflectance"', '"250m 16 days red reflectance"'], - "cmap": None, - }, - "myd13q1": { - "sample": ( - "https://modis-samples.fra1.cdn.digitaloceanspaces.com/" - "MYD13Q1.061/MYD13Q1.A2023137.h29v12.061.2023154011314.hdf" - ), - "description": "MODIS/Aqua Vegetation Indices 16-Day L3 Global 250m SIN Grid", - "bands": ['"250m 16 days MIR reflectance"', '"250m 16 days NIR reflectance"', '"250m 16 days red reflectance"'], - "cmap": None, - }, -} - - -def create_preview(infile: str, outdir: str, outres: int) -> str: - """ - Creates an 8-Bit RGB COG preview of the MODIS hdf infile - - Args: - infile (str): Path to input HDF - outdir (str): Output directory - outres (int): Resolution in meters - - Returns: - preview (str): Path to preview COG - """ - basename = os.path.splitext(os.path.basename(infile))[0] - preview = os.path.join(outdir, f"{basename}_preview.tif") - product_id = basename.split(".")[0].lower() - # identify product based on MODIS naming convention - product = modis_previews[product_id] - # collect RGB band hrefs - band_hrefs = [] - mds = gdal.Open(infile) - sds = mds.GetSubDatasets() - for b in product["bands"]: - for sd, _ in sds: - if b == sd.split(":")[-1]: - band_hrefs.append(sd) - mds = None - # extract bands incl. 
statistics into RGB in-memory stack - bands = [] - vmins = [] - vmaxs = [] - for band_href in band_hrefs: - band = gdal.Open(band_href) - vmin, vmax, _, _ = band.GetRasterBand(1).GetStatistics(True, True) - vmins.append(vmin) - vmaxs.append(vmax) - bands.append(band) - scale_params = [min(vmins), max(vmaxs), 0, 255] - vrt = os.path.join("/vsimem", f"{basename}_preview.vrt") - vrt_options = gdal.BuildVRTOptions(separate=True, hideNodata=True, resolution="user", xRes=outres, yRes=outres) - gdal.BuildVRT(vrt, bands, options=vrt_options) - # reset scale value, add colormap if appropriate - rgb_stack = gdal.Open(vrt) - for i in range(rgb_stack.RasterCount): - rgb_stack.GetRasterBand(i + 1).SetScale(1.0) - if product["cmap"]: - b1 = rgb_stack.GetRasterBand(1) - colors = gdal.ColorTable() - for ramp in product["cmap"]: - colors.CreateColorRamp(*ramp) - b1.SetRasterColorTable(colors) - b1.SetRasterColorInterpretation(gdal.GCI_PaletteIndex) - rgb_stack = None - # scale into 8-Bit COG - translate_options = gdal.TranslateOptions(format="COG", outputType=gdal.GDT_Byte, scaleParams=[scale_params]) - gdal.Translate(preview, vrt, options=translate_options) - gdal.Unlink(vrt) - for band in bands: - band = None - return preview - - -# hdfs = [modis_previews[product]['sample'] for product in modis_previews] -# for hdf in hdfs: -# create_preview(hdf, r'D:\_temp', 2000) -# print(f'{os.path.split(hdf)[1]} done') -# print('done') diff --git a/lib/lib/datasets/modis.py b/lib/lib/datasets/modis.py deleted file mode 100644 index 750f4b7..0000000 --- a/lib/lib/datasets/modis.py +++ /dev/null @@ -1,234 +0,0 @@ -import os -import json -import re -from datetime import datetime -import pystac -import rio_stac - -from ..resources.stac import ( - extract_stactools, - add_asset_filesize, -) - -scene_id_pattern = ( - r"^" - r"(?P[0-9A-Z]{7,8})." - r"A" - r"(?P[0-9]{7})." - r"(?P[0-9a-z]{6})." - r"(?P[0-9]{3})." - r"(?P[0-9]{13})$" -) - -folder_structure = "{sensor}/{product}.{version}/{year}/{month}/{day}/{tile_id}" -usgs_path_structure = "{usgs_path}/{product}.{version}/{year}.{month}.{day}" - - -def get_scene_id_info(scene_id): - """ - Description... - - Parameters: - scene_id: x - - Returns: - (...): ... - """ - match = re.match(re.compile(scene_id_pattern), scene_id) - variables = match.groupdict() - date = datetime.strptime(variables["start"], "%Y%j") - variables["year"] = date.strftime("%Y") - variables["month"] = date.strftime("%m") - variables["day"] = date.strftime("%d") - - if "product" in variables: - if variables["product"].startswith("MOD"): - variables["satellite"] = "Terra" - variables["usgs_path"] = "https://e4ftl01.cr.usgs.gov/MOLT" - elif variables["product"].startswith("MYD"): - variables["satellite"] = "Aqua" - variables["usgs_path"] = "https://e4ftl01.cr.usgs.gov/MOLA" - else: - variables["satellite"] = "Terra+Aqua" - variables["usgs_path"] = "https://e4ftl01.cr.usgs.gov/MOTA" - if variables["product"].startswith("VNP"): - variables["sensor"] = "VIIRS" - variables["usgs_path"] = None - else: - variables["sensor"] = "MODIS" - return variables - - -def get_scene_id_folder(scene_id, folder_format=None): - """ - Description... - - Parameters: - scene_id: x - folder_format: x - - - Returns: - (...): ... 
- """ - variables = get_scene_id_info(scene_id) - # date = datetime.strptime(variables["start"], "%Y%j") - # variables["year"] = date.strftime("%Y") - # variables["month"] = date.strftime("%m") - # variables["day"] = date.strftime("%d") - - if folder_format is None: - folder_format = folder_structure - return folder_format.format(**variables) - - -def get_usgs_path(scene_id): - """ - Description... - - Parameters: - scene_id: x - - Returns: - (...): ... - """ - return get_scene_id_folder(scene_id, folder_format=usgs_path_structure) - - -def get_stac_proj(input_file): - """ - Description... - - Parameters: - input_file: x - - Returns: - (...): ... - """ - rio = rio_stac.create_stac_item(input_file, with_proj=True) - del rio.properties["proj:projjson"] - return rio.properties - - -def create_stac_item(scene_path, scene_id, return_pystac=False, add_file_size=False): - """ - Description... - - Parameters: - scene_path: x - scene_id: x - return_pystac: x - add_file_size: x - - Returns: - (...): ... - - Raises: - Exception: Metadata_error: Folder does not exist. - Exception: Metadata_error: Error during creating metadata. - """ - if scene_path[-1] == "/": - scene_path = scene_path[:-1] - - if not os.path.exists(scene_path): - raise Exception("metadata_error: Folder does not exist %s" % (scene_path)) - - stac_function = "stactools.modis.stac.create_item" - try: - stac_file = os.path.join(os.path.dirname(scene_path), scene_id + ".STAC.json") - stac_item = extract_stactools(scene_path, stac_function, {}) - stac_item = add_modis_adjustments(stac_item) - - stac_item.properties["terrabyte:uniq_id"] = ".".join(stac_item.id.split(".")[0:-1]) - stac_item.id = scene_id - - # Add file:// protocol for local file paths - for asset in stac_item.assets: - stac_item.assets[asset].href = "file://%s" % stac_item.assets[asset].href - - if add_file_size: - stac_item = add_asset_filesize(stac_item) - if return_pystac: - return stac_item - else: - with open(stac_file, "w") as f: - f.write(json.dumps(stac_item.to_dict())) - return stac_file - - except Exception as e: - metadata_error = "Error during creating metadata for %s: %s" % ( - scene_path, - str(e), - ) - raise Exception("metadata_error: %s" % metadata_error) - - return stac_file - - -def add_modis_adjustments(stac): - """ - Description... - - Parameters: - stac: x - - Returns: - (...): ... 
- """ - product = os.path.basename(stac.id)[3:7].lower() - asset_tmpl_file = "modis.%s.json" % product - asset_tmpl = json.load(open(os.path.join(os.path.dirname(__file__), "templates", asset_tmpl_file))) - data = stac.to_dict() - - data["properties"]["proj:wkt2"] = ( - 'PROJCRS["unnamed",' - 'BASEGEOGCRS["Unknown datum based upon the custom spheroid",' - 'DATUM["Not specified (based on custom spheroid)",' - 'ELLIPSOID["Custom spheroid",6371007.181,0,LENGTHUNIT["metre",1,ID["EPSG",9001]]]],' - 'PRIMEM["Greenwich",0,ANGLEUNIT["degree",0.0174532925199433,ID["EPSG",9122]]]],' - 'CONVERSION["Sinusoidal",METHOD["Sinusoidal"],PARAMETER["Longitude of natural origin",0,' - 'ANGLEUNIT["degree",0.0174532925199433],ID["EPSG",8802]],' - 'PARAMETER["False easting",0,LENGTHUNIT["metre",1],ID["EPSG",8806]],' - 'PARAMETER["False northing",0,LENGTHUNIT["metre",1],ID["EPSG",8807]]],' - 'CS[Cartesian,2],AXIS["(E)",east,ORDER[1],LENGTHUNIT["Meter",1]],' - 'AXIS["(N)",north,ORDER[2],LENGTHUNIT["Meter",1]]]' - ) - hdf = data["assets"]["hdf"]["href"] - data["assets"]["hdf"]["type"] = "application/hdf4" - if asset_tmpl_file == "modis.09ga.json": - infos = dict() - proj_1km = 'HDF4_EOS:EOS_GRID:"' + hdf + '":MODIS_Grid_1km_2D:num_observations_1km' - infos["1km"] = get_stac_proj(proj_1km) - proj_500m = 'HDF4_EOS:EOS_GRID:"' + hdf + '":MODIS_Grid_500m_2D:num_observations_500m' - try: - infos["500m"] = get_stac_proj(proj_500m) - except Exception as e: - print(data["id"], "FAILED", str(e)) - return False - for asset in asset_tmpl: - asset_tmpl[asset]["href"] = asset_tmpl[asset]["href"].replace("{{hdf_path}}", hdf) - info = infos["500m"] - if "1km" in asset_tmpl[asset]["href"]: - info = infos["1km"] - asset_tmpl[asset]["proj:transform"] = info["proj:transform"] - asset_tmpl[asset]["proj:shape"] = info["proj:shape"] - data["properties"]["proj:geometry"] = info["proj:geometry"] - data["properties"]["proj:bbox"] = info["proj:bbox"] - data["assets"].update(asset_tmpl) - else: - data["assets"].update(asset_tmpl) - first_band = asset_tmpl[list(asset_tmpl.keys())[0]]["href"].replace("{{hdf_path}}", hdf) - try: - info = get_stac_proj(first_band) - except Exception as e: - print(data["id"], "FAILED", str(e)) - return False - data["properties"].update(info) - - if "https://stac-extensions.github.io/projection/v1.1.0/schema.json" not in data["stac_extensions"]: - data["stac_extensions"].append("https://stac-extensions.github.io/projection/v1.1.0/schema.json") - - stac_string = json.dumps(data) - stac_string = stac_string.replace("{{hdf_path}}", hdf) - stac_item = json.loads(stac_string) - return pystac.Item.from_dict(stac_item) diff --git a/lib/lib/datasets/sentinel.py b/lib/lib/datasets/sentinel.py deleted file mode 100644 index 6ee8cc3..0000000 --- a/lib/lib/datasets/sentinel.py +++ /dev/null @@ -1,514 +0,0 @@ -import os -import re -import json -import copy -from datetime import datetime -import xml.etree.ElementTree as ET -import logging -import pystac - -from ..base.file import calculate_checksum -from ..resources.stac import ( - extract_stactools, - add_asset_filesize, -) - - -# scene_id_patterns = { -# "S1": -# r"^(?PS1[AB])_" -# r"(?PS1|S2|S3|S4|S5|S6|IW|EW|WV|EN|N1|N2|N3|N4|N5|N6|IM)_" -# -# r"(?PSLC|GRD|OCN)" -# r"(?PF|H|M|_)_" -# r"(?P1|2)" -# r"(?PS|A)" -# r"(?PSH|SV|DH|DV|VV|HH|HV|VH)_" -# r"(?P[0-9]{8}T[0-9]{6})_" -# r"(?P[0-9]{8}T[0-9]{6})_" -# r"(?P[0-9]{6})_" -# r"(?P[0-9A-F]{6})_" -# r"(?P[0-9A-F]{4})$", -# "S2": -# r"^(?PS2[AB])_" -# r"MSI" -# r"(?PL1C|L2A)_" -# r"(?P[0-9]{8}T[0-9]{6})_" -# r"(?PN[0-9]{4})_" -# 
r"R(?P[0-9]{3})_" -# r"T(?P[0-9]{2})" -# r"(?P[A-Z]{1})" -# r"(?P[A-Z]{2})_" -# r"(?P[0-9]{8}T[0-9]{6})$", -# "S3": -# r"^(?PS3[AB])_" -# r"(?POL|SL|SR|DO|MW|GN|SY|TM|AX)_" -# r"(?P0|1|2)_" -# r"(?P[A-Z0-9_]{6})_" -# r"(?P[0-9]{8}T[0-9]{6})_" -# r"(?P[0-9]{8}T[0-9]{6})_" -# r"(?P[0-9]{8}T[0-9]{6})_" -# r"(?P[A-Z0-9_]{17})_" -# r"(?P
[A-Z0-9_]{3})_" -# r"(?P[A-Z0-9_]{8})$", -# "S5": -# r"^(?PS5P)_" -# r"(?P[A-Z]{4})_" -# r"(?P[A-Z0-9_]{3})_" -# r"(?P[A-Z0-9_]{6})_" -# r"(?P[0-9]{8}T[0-9]{6})_" -# r"(?P[0-9]{8}T[0-9]{6})_" -# r"(?P[0-9]{5})_" -# r"(?P[0-9]{2})_" -# r"(?P[0-9]{6})_" -# r"(?P[0-9]{8}T[0-9]{6})$", -# "S5P_AUX": -# r"^(?PS5P)_" -# r"(?P[A-Z]{4})_" -# r"(?P[A-Z0-9_]{3})_" -# r"(?P[A-Z0-9_]{6})_" -# r"(?P[0-9]{8}T[0-9]{6})_" -# r"(?P[0-9]{8}T[0-9]{6})_" -# r"(?P[0-9]{8}T[0-9]{6})$" -# } -# Todo: Check Formatter pattern Changes (old commented out) -scene_id_patterns = { - "S1": r"^(?PS1[AB])_" - r"(?PS1|S2|S3|S4|S5|S6|IW|EW|WV|EN|N1|N2|N3|N4|N5|N6|IM)_" - r"(?PSLC|GRD|OCN)" - r"(?PF|H|M|_)_" - r"(?P1|2)" - r"(?PS|A)" - r"(?PSH|SV|DH|DV|VV|HH|HV|VH)_" - r"(?P[0-9]{8}T[0-9]{6})_" - r"(?P[0-9]{8}T[0-9]{6})_" - r"(?P[0-9]{6})_" - r"(?P[0-9A-F]{6})_" - r"(?P[0-9A-F]{4})$", - "S2": r"^(?PS2[AB])_" - r"MSI" - r"(?PL1C|L2A)_" - r"(?P[0-9]{8}T[0-9]{6})_" - r"(?PN[0-9]{4})_" - r"R(?P[0-9]{3})_" - r"T(?P[0-9]{2})" - r"(?P[A-Z]{1})" - r"(?P[A-Z]{2})_" - r"(?P[0-9]{8}T[0-9]{6})$", - "S3": r"^(?PS3[AB])_" - r"(?POL|SL|SR|DO|MW|GN|SY|TM|AX)_" - r"(?P0|1|2)_" - r"(?P[A-Z0-9_]{6})_" - r"(?P[0-9]{8}T[0-9]{6})_" - r"(?P[0-9]{8}T[0-9]{6})_" - r"(?P[0-9]{8}T[0-9]{6})_" - r"(?P[A-Z0-9_]{17})_" - r"(?P
[A-Z0-9_]{3})_" - r"(?P[A-Z0-9_]{8})$", - "S5": r"^(?PS5P)_" - r"(?P[A-Z]{4})_" - r"(?P[A-Z0-9_]{3})_" - r"(?P[A-Z0-9_]{6})_" - r"(?P[0-9]{8}T[0-9]{6})_" - r"(?P[0-9]{8}T[0-9]{6})_" - r"(?P[0-9]{5})_" - r"(?P[0-9]{2})_" - r"(?P[0-9]{6})_" - r"(?P[0-9]{8}T[0-9]{6})$", - "S5P_AUX": r"^(?PS5P)_" - r"(?P[A-Z]{4})_" - r"(?P[A-Z0-9_]{3})_" - r"(?P[A-Z0-9_]{6})_" - r"(?P[0-9]{8}T[0-9]{6})_" - r"(?P[0-9]{8}T[0-9]{6})_" - r"(?P[0-9]{8}T[0-9]{6})$", -} - -folder_structures = { - "S1": "{product}/{year}/{month}/{day}", - "S2": "{level}/tiles/{utm_zone}/{mgrs_lat}/{square}/{year}/{month}", - "S3": "{instrument}/{product}/{year}/{month}/{day}", - "S5": "{category}/{product}/{year}/{month}/{day}", -} - -checksum_settings = { - "S1": dict(file="manifest.safe", algorithm="MD5"), - "S2": dict(file="manifest.safe", algorithm="SHA3-256"), - "S3": dict(file="xfdumanifest.xml", algorithm="MD5"), -} - -asset_changes = { - "S2": { - "coastal": "B01", - "blue": "B02", - "green": "B03", - "red": "B04", - "rededge1": "B05", - "rededge2": "B06", - "rededge3": "B07", - "nir": "B08", - "nir08": "B8A", - "nir09": "B09", - "cirrus": "B10", - "swir16": "B11", - "swir22": "B12", - "visual": "TCI", - "aot_10m": "AOT", - "wvp_10m": "WVP", - "scl": "SCL", - } -} - -variable_mappings = {"S3": {"OL": "OLCI", "SL": "SLSTR", "SY": "SYNERGY"}} - - -def get_scene_id_info(scene_id): - """ - Description... - - Parameters: - scene_id: x - - Returns: - (...): ... - - Raises: - Exception: Satellite not supported. - """ - satellite = scene_id[0:2] - if satellite == "S5" and "_AUX_" in scene_id: - satellite = "S5P_AUX" - if satellite not in scene_id_patterns: - raise Exception("Satellite %s not supported" % satellite) - - match = re.match(re.compile(scene_id_patterns[satellite]), scene_id) - variables = match.groupdict() - if "category" in variables: - variables["category"] = variables["category"].rstrip("_") - if "product" in variables: - variables["product"] = variables["product"].rstrip("_").replace("___", "_").replace("__", "_") - return variables - - -def get_scene_id_folder(scene_id, folder_format=None): - """ - Description... - - Parameters: - scene_id: x - folder_format: x - - Returns: - (...): ... - - Raises: - Exception: Satellite not supported. - Exception: Not supported for integrity check. - Exception: manifest.safe not found. - Exception: File does not match expected file size. - Exception: File does not match expected checksum. - """ - variables = get_scene_id_info(scene_id) - if "start" in variables: - date = datetime.strptime(variables["start"], "%Y%m%dT%H%M%S") - variables["year"] = date.strftime("%Y") - variables["month"] = date.strftime("%m") - variables["day"] = date.strftime("%d") - - if scene_id.startswith("S3"): - if "instrument" in variables: - variables["instrument"] = variable_mappings["S3"][variables["instrument"]] - if "product" in variables: - variables["product"] = ( - variables["instrument"][0:2] + "_" + variables["processingLevel"] + "_" + variables["product"] + "___" - ) - - if folder_format is None: - satellite = scene_id[0:2] - if satellite not in folder_structures: - raise Exception("Satellite %s not supported" % satellite) - folder_format = folder_structures[satellite] - - return folder_format.format(**variables) - - -def validate_integrity(scene_path, scene_id): - """ - Description... - - Parameters: - scene_path: x - scene_id: x - - Returns: - (...): ... - - Raises: - Exception: Not supported for integrity check. - Exception: manifest.safe not found. 
- Exception: File does not match expected file size. - Exception: File does not match expected checksum. - """ - if scene_id[0:2] not in checksum_settings: - raise Exception("%s not supported for integrity check (%s)" % (scene_id[0:2], scene_id)) - settings = checksum_settings[scene_id[0:2]] - checksum_alg = settings["algorithm"] - index_file_name = settings["file"] - - manifest = os.path.join(scene_path, index_file_name) - if not os.path.exists(manifest): - raise Exception("manifest.safe not found: %s" % manifest) - - tree = ET.parse(manifest) - root = tree.getroot() - for f in root.findall(".//byteStream"): - href = f.find("fileLocation").attrib["href"] - check_file = os.path.join(scene_path, href) - size = os.stat(check_file).st_size - expected_file_size = int(f.attrib["size"]) - if size != expected_file_size: - raise Exception( - "File %s does not match expected file size: %s vs. %s" % (check_file, size, expected_file_size) - ) - - expected_checksum = f.find("checksum").text.lower() - # checksum_alg = "SHA3-256" - checksum = calculate_checksum(checksum_alg, check_file) - if checksum != expected_checksum: - raise Exception( - "File %s does not match expected checksum (%s): %s vs. %s" - % (check_file, checksum_alg, size, expected_checksum) - ) - - return True - - -def sentinel_metadata(scene_path, scene_id, return_pystac=False, add_file_size=False): - """ - Description... - - Parameters: - scene_path: x - scene_id: x - return_pystac: x - add_file_size: x - - Returns: - (...): ... - - Raises: - Exception: Metadata_error: Folder does not exist. - Exception: Metadata_error: No STAC function for this scene. - Exception: Error during creating metadata. - """ - if scene_path[-1] == "/": - scene_path = scene_path[:-1] - print("executing sentinel_metadata for %s" % scene_path) - - if not os.path.exists(scene_path): - raise Exception("metadata_error: Folder does not exist %s" % (scene_path)) - - stac_function = None - stac_function_args = {} - if scene_id.startswith("S1") and "_GRD" in scene_id: - stac_function = "stactools.sentinel1.stac.create_item" - elif scene_id.startswith("S1") and "_SLC" in scene_id: - stac_function = "stactools.sentinel1.stac.create_item" - elif scene_id.startswith("S2"): - stac_function = "stactools.sentinel2.stac.create_item" - elif scene_id.startswith("S3"): - stac_function = "stactools.sentinel3.stac.create_item" - stac_function_args = dict(skip_nc=True) - else: - raise Exception("metadata_error: No STAC function for %s" % scene_id) - - base_item = None - collection = get_collection_name(scene_id) - if collection == "sentinel-2-c1-l2a": - base_item = os.path.join(os.path.dirname(__file__), "collections", "sentinel", "sentinel-2-c1-l2a.json") - elif collection == "sentinel-2-c1-l1c": - base_item = os.path.join(os.path.dirname(__file__), "collections", "sentinel", "sentinel-2-c1-l1c.json") - elif collection == "sentinel-3-olci-l1-efr": - base_item = os.path.join(os.path.dirname(__file__), "collections", "sentinel", "sentinel-3-olci-l1-efr.json") - - try: - stac_file = os.path.join(scene_path, scene_id + ".STAC.json") - stac_item = extract_stactools(scene_path, stac_function, stac_function_args) - stac_item.properties["terrabyte:stactools_id"] = stac_item.id - stac_item.id = scene_id - - if base_item: - base_item = json.load(open(base_item)) - if "item_assets" in base_item: - base_item["assets"] = base_item["item_assets"] - - if scene_id.startswith("S2"): - stac_item = modify_s2_stac(stac_item, base_item=base_item) - elif scene_id.startswith("S3"): - stac_item = 
modify_s3_stac(stac_item, base_item=base_item) - - # Add file:// protocol for local file paths - for asset in stac_item.assets: - if stac_item.assets[asset].href.startswith("/"): - stac_item.assets[asset].href = "file://%s" % stac_item.assets[asset].href - - if add_file_size: - stac_item = add_asset_filesize(stac_item) - - if return_pystac: - return stac_item - else: - with open(stac_file, "w") as f: - f.write(json.dumps(stac_item.to_dict())) - return stac_file - except Exception as e: - metadata_error = "Error during creating metadata for %s: %s" % ( - scene_path, - str(e), - ) - raise Exception("metadata_error: %s" % metadata_error) - - -def get_collection_name(scene_id): - """ - Description... - - Parameters: - scene_id: x - - Returns: - (...): ... - - Raises: - Exception: No collection found. - """ - if scene_id.startswith("S1") and "_GRD" in scene_id: - return "sentinel-1-grd" - elif scene_id.startswith("S1") and "_SLC" in scene_id: - return "sentinel-1-slc" - elif scene_id.startswith("S2") and "_MSIL1C_" in scene_id: - return "sentinel-2-c1-l1c" - elif scene_id.startswith("S2") and "_MSIL2A_" in scene_id: - return "sentinel-2-c1-l2a" - elif scene_id.startswith("S3") and "_OL_1_EFR_" in scene_id: - return "sentinel-3-olci-l1-efr" - elif scene_id.startswith("S5") and "_L1B_" in scene_id: - return "sentinel-5p-l1b" - elif scene_id.startswith("S5") and "_L2_" in scene_id: - return "sentinel-5p-l2" - elif scene_id.startswith("S5") and "_AUX_" in scene_id: - return "sentinel-5p-aux" - else: - raise Exception("No collection found") - - -__log = logging.getLogger("Log Info") - - -def modify_s2_stac(stac_item: pystac.item.Item, base_item=None): - """ - Modify the Asset-Keys and eo:bands:name for a Sentinel-2 L2 STAC-Item. - - Args: - stac_item: The STAC item file/object to modify. Must be a STACObject. - base_item: x - - Returns: - (...): A pystac.item.Item object with the desired changes. - - Raises: - Exception: Could not find entry in asset_changes configuration. - """ - - stac_item_dict = copy.deepcopy(stac_item.to_dict(include_self_link=False)) - - mission = stac_item.id[0:2] # Get first two characters of Item id (e.g., S2 for S2A_MSIL2A_....) - if mission not in asset_changes: - raise Exception("Could not find entry for %s in asset_changes configuration" % mission) - input_dict = asset_changes[mission] - assets_new = dict() - - for i, (current_key, target_key) in enumerate(input_dict.items()): - - if current_key in stac_item_dict["assets"]: - __log.info(f"Replacing the current Asset-Key {current_key} with the new Asset-Key {target_key}.") - assets_new[target_key] = copy.deepcopy(stac_item_dict["assets"].pop(current_key)) - else: - __log.info( - ( - f"Replacing the current Asset-Key {current_key} with the new Asset-Key {target_key}: " - f"Current key not found in metadata!" - ) - ) - - if base_item: - try: - for key in base_item["assets"]: - if key not in assets_new: - assets_new[key] = copy.deepcopy(stac_item_dict["assets"][key]) - for property in base_item["assets"][key]: - assets_new[key][property] = copy.deepcopy(base_item["assets"][key][property]) - - except Exception as e: - __log.error("ERROR: %s" % e) - - stac_item_dict["assets"] = assets_new - stac_item_object_final = pystac.Item.from_dict(stac_item_dict) - return stac_item_object_final - - -def modify_s3_stac(stac_item: pystac.item.Item, base_item=None): - """ - Modify the Asset-Keys and eo:bands:name for a Sentinel-3 STAC-Item. - - Args: - stac_item: The STAC item file/object to modify. Must be a STACObject. 
- base_item: x - - Returns: - (...): A pystac.item.Item object with the desired changes.""" - - stac_item_dict = copy.deepcopy(stac_item.to_dict(include_self_link=False)) - - assets_new = dict() - for key in stac_item_dict["assets"]: - asset = copy.deepcopy(stac_item_dict["assets"][key]) - for property in stac_item_dict["assets"][key]: - if property.startswith("file:"): - del asset[property] - - assets_new[key] = asset - - if base_item: - try: - for key in base_item["assets"]: - if key not in assets_new: - assets_new[key] = copy.deepcopy(stac_item_dict["assets"][key]) - for property in base_item["assets"][key]: - assets_new[key][property] = copy.deepcopy(base_item["assets"][key][property]) - if "resolution" in assets_new[key]: - del assets_new[key]["resolution"] - - except Exception as e: - __log.error("ERROR: %s" % e) - - stac_item_dict["assets"] = assets_new - - info = get_scene_id_info(stac_item.id) - tby_item_id = ( - f"{info['sensor']}_{info['instrument']}_{info['processingLevel']}_{info['product']}_" - f"{info['start']}_{info['stop']}_{info['instance']}" - ) - stac_item_dict["properties"]["terrabyte:uniq_id"] = tby_item_id - - if "s3:productType" in stac_item_dict["properties"]: - stac_item_dict["properties"]["s3:product_type"] = stac_item_dict["properties"]["s3:productType"] - del stac_item_dict["properties"]["s3:productType"] - - info = stac_item_dict["id"].split("_") - timeliness = info[-2] - baseline_collection = info[-1] - stac_item_dict["properties"]["s3:processing_timeliness"] = timeliness - stac_item_dict["properties"]["s3:baseline_collection"] = baseline_collection - stac_item_object_final = pystac.Item.from_dict(stac_item_dict) - return stac_item_object_final diff --git a/lib/lib/datasets/viirs.py b/lib/lib/datasets/viirs.py deleted file mode 100644 index f41ed19..0000000 --- a/lib/lib/datasets/viirs.py +++ /dev/null @@ -1,246 +0,0 @@ -from dateutil.parser import parse -import pystac -import os -import re -from datetime import datetime - -from ..resources.stac import ( - extract_stactools, - add_asset_filesize, -) - -scene_id_pattern = ( - r"^" - r"(?P[0-9A-Z]{7})." - r"A" - r"(?P[0-9]{7})." - r"(?P[0-9a-z]{6})." - r"(?P[0-9]{3})." - r"(?P[0-9]{13})" -) - -scene_id_pattern2 = ( - r"^" - r"(?P[0-9A-Z]{8})." - r"A" - r"(?P[0-9]{7})." - r"(?P[0-9a-z]{6})." - r"(?P[0-9]{3})." - r"(?P[0-9]{13})" -) - -folder_structure = "{product}.{version}/{year}/{month}/{day}/{tile_id}" - - -def get_scene_id_info(scene_id): - """ - Description... - - Parameters: - scene_id: x - - Returns: - (...): ... - """ - used_pattern = scene_id_pattern - if len(scene_id.split(".")[0]) == 8: - used_pattern = scene_id_pattern2 - match = re.match(re.compile(used_pattern), scene_id) - variables = match.groupdict() - return variables - - -def get_scene_id_folder(scene_id, folder_format=None): - """ - Description... - - Parameters: - scene_id: x - folder_format: x - - Returns: - (...): ... - """ - variables = get_scene_id_info(scene_id) - date = datetime.strptime(variables["start"], "%Y%j") - variables["year"] = date.strftime("%Y") - variables["month"] = date.strftime("%m") - variables["day"] = date.strftime("%d") - - if folder_format is None: - folder_format = folder_structure - return folder_format.format(**variables) - - -def viirs_metadata(scene_path, scene_id, return_pystac=False, add_file_size=False): - """ - Description... - - Parameters: - scene_path: x - scene_id: x - return_pystac: x - add_file_size: x - - Returns: - (...): ... - - Raises: - Exception: Metadata_error: Folder does not exist. 
- Exception: Metadata_error: Error during creating metadata. - """ - if scene_path[-1] == "/": - scene_path = scene_path[:-1] - - if not os.path.exists(scene_path): - raise Exception("metadata_error: Folder does not exist %s" % (scene_path)) - - stac_function = "stactools.viirs.stac.create_item" - try: - stac_file = os.path.join(os.path.dirname(scene_path), scene_id + ".STAC.json") - stac_item = extract_stactools(scene_path, stac_function, {}) - # stac_item = add_modis_adjustments(stac_item) - - stac_item.properties["terrabyte:uniq_id"] = ".".join(stac_item.id.split(".")[0:-1]) - stac_item.id = scene_id - - # Add file:// protocol for local file paths - for asset in stac_item.assets: - stac_item.assets[asset].href = "file://%s" % stac_item.assets[asset].href - - if add_file_size: - stac_item = add_asset_filesize(stac_item) - if return_pystac: - return stac_item - else: - stac_item.save_object(dest_href=stac_file) - return stac_file - - except Exception as e: - metadata_error = "Error during creating metadata for %s: %s" % ( - scene_path, - str(e), - ) - raise Exception("metadata_error: %s" % metadata_error) - - return stac_file - - -def get_geometry(points): - """ - Description... - - Parameters: - points: x - - Returns: - (...): ... - """ - coordinates = [] - for p in points[::-1]: - coordinates.append([p["Longitude"], p["Latitude"]]) - return {"type": "Polygon", "coordinates": [coordinates]} - - -def get_bbox(geometry): - """ - Description... - - Parameters: - geometry: x - - Returns: - (...): ... - - Raises: - Exception: No collection found. - """ - coords = geometry["coordinates"] - lats = [c[1] for c in coords[0]] - lons = [c[0] for c in coords[0]] - return [min(lons), min(lats), max(lons), max(lats)] - - -def create_item_for_inventory(scene, collection, collection_public): - """ - Description... - - Parameters: - scene: x - collection: x - collection_public: x - - Returns: - (...): ... - - Raises: - Exception: Could not find identifier. 
- """ - item_id = None - for identifier in scene["umm"]["DataGranule"]["Identifiers"]: - if identifier["IdentifierType"] == "ProducerGranuleId": - item_id = identifier["Identifier"] - if item_id is None: - raise Exception("Could not find identifier") - item_id = os.path.splitext(item_id)[0] - # item_id = scene['meta']['native-id'] - item_parts = item_id.split(".") - - tby_item_id = ".".join(item_parts[0:4]) - - item_datetime_begin = parse(scene["umm"]["TemporalExtent"]["RangeDateTime"]["BeginningDateTime"]) - item_datetime_end = parse(scene["umm"]["TemporalExtent"]["RangeDateTime"]["EndingDateTime"]) - - item_geometry = get_geometry( - scene["umm"]["SpatialExtent"]["HorizontalSpatialDomain"]["Geometry"]["GPolygons"][0]["Boundary"]["Points"] - ) - item_bbox = get_bbox(item_geometry) - - item = pystac.Item( - id=item_id, - datetime=None, - start_datetime=item_datetime_begin, - end_datetime=item_datetime_end, - geometry=item_geometry, - bbox=item_bbox, - collection=collection, - properties={}, - ) - - item.properties["deprecated"] = False - item.properties["order:status"] = "orderable" - item.properties["version"] = item_parts[-1] - - if "revision-date" in scene["meta"]: - item.properties["viirs:revision-date"] = parse(scene["meta"]["revision-date"]).isoformat() - if "revision-id" in scene["meta"]: - item.properties["viirs:revision-id"] = scene["meta"]["revision-id"] - item.properties["viirs:provider-id"] = scene["meta"]["provider-id"] - item.properties["viirs:concept-id"] = scene["meta"]["concept-id"] - - item.properties["platform"] = scene["umm"]["Platforms"][0]["ShortName"] - - for attrib in scene["umm"]["AdditionalAttributes"]: - if attrib["Name"] == "VERTICALTILENUMBER": - item.properties["viirs:vertical-tile"] = int(attrib["Values"][0]) - elif attrib["Name"] == "HORIZONTALTILENUMBER": - item.properties["viirs:horizontal-tile"] = int(attrib["Values"][0]) - - item.properties["file:size"] = scene["umm"]["DataGranule"]["ArchiveAndDistributionInformation"][0]["Size"] - item.properties["file:unit"] = scene["umm"]["DataGranule"]["ArchiveAndDistributionInformation"][0]["SizeUnit"] - - item.properties["terrabyte:item_id"] = tby_item_id - item.properties["terrabyte:folder"] = os.path.join(get_scene_id_folder(item_id), item_id + ".h5") - item.properties["terrabyte:collection_id"] = collection_public - - item.properties["viirs:dates"] = dict() - for date in scene["umm"]["ProviderDates"]: - item.properties["viirs:dates"][date["Type"]] = parse(date["Date"]).isoformat() - - for url in scene["umm"]["RelatedUrls"]: - if url["Type"] == "GET DATA": - item.assets["hdf"] = pystac.Asset(href=url["URL"]) - elif ".xml" in url["URL"] and "https://" in url["URL"]: - item.assets["xml"] = pystac.Asset(href=url["URL"]) - - return item diff --git a/lib/lib/providers/__init__.py b/lib/lib/providers/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lib/lib/providers/esa_cdse.py b/lib/lib/providers/esa_cdse.py deleted file mode 100644 index 39575eb..0000000 --- a/lib/lib/providers/esa_cdse.py +++ /dev/null @@ -1,378 +0,0 @@ -import os -import requests -import pystac -import pandas -import shapely.wkt -import shapely.geometry -from dateutil.parser import parse - -from ..base import geometry as geom_fct -from ..base.download import download_data -from ..datasets.sentinel import get_scene_id_info, get_scene_id_folder, get_collection_name - - -def login(username, password): - """ - Description... - - Parameters: - username: x - password: x - - - Returns: - (...): ... 
- - Raises: - Exception: Keycloak token creation failed. - """ - data = { - "client_id": "cdse-public", - "username": username, - "password": password, - "grant_type": "password", - } - try: - r = requests.post( - "https://identity.dataspace.copernicus.eu/auth/realms/CDSE/protocol/openid-connect/token", - data=data, - ) - r.raise_for_status() - except Exception: - raise Exception(f"Keycloak token creation failed. Reponse from the server was: {r.json()}") - return r.json()["access_token"] - - -def search_data(api_url="https://datahub.creodias.eu/odata/v1", query_filter=None): - """ - Searches for data in ESA Copernicus Data Space Ecosystem based on a given query filter - - Arguments: - api_url: API URL - query_filter: Query string to be passed to the API URL - - Returns: - (Array): List of scenes - """ - - query_url = False - if query_filter: - query_url = api_url + query_filter - - scenes = [] - while query_url: - print(query_url) - try: - res = requests.get(query_url) - if res.status_code == 200: - data = res.json() - print("Found %s scenes" % len(data["value"])) - for feature in data["value"]: - scene = dict( - uid=feature["Id"], - scene_id=feature["Name"], - S3Path=feature["S3Path"], - GeoFootprint=feature["GeoFootprint"], - ContentLength=feature["ContentLength"], - PublicationDate=feature["PublicationDate"], - ModificationDate=feature["ModificationDate"], - ) - if "Attributes" in feature: - for attr in feature["Attributes"]: - scene[attr["Name"]] = attr["Value"] - scenes.append(scene) - else: - print("ERROR", res.status_code, res.content) - if "@odata.nextLink" in data: - query_url = data["@odata.nextLink"] - else: - query_url = False - except Exception as e: - print("ERROR", str(e)) - - return scenes - - -def search_scenes_ingestion(date_from, date_to, filters=None): - """ - Description... - - Parameters: - date_from: x - date_to: x - filters: x - - Returns: - (...): ... - - Raises: - Exception: Search failed. - """ - query_template = "/Products?$filter=%filter&$top=1000" - filter_base = "((PublicationDate ge %date_from and PublicationDate lt %date_to) and (Online eq true))".replace( - "%date_from", date_from - ).replace("%date_to", date_to) - - if filters is None: - filters = [ - ( - "(startswith(Name,'S1') and (contains(Name,'SLC') or contains(Name,'GRD')) " - "and not contains(Name,'_COG') and not contains(Name, 'CARD_BS'))&$expand=Attributes" - ), - "(startswith(Name,'S2') and (contains(Name,'L2A')) and not contains(Name,'_N9999'))", - # ("(startswith(Name,'S2') and (contains(Name,'L1C') or - # contains(Name,'L2A')) and not contains(Name,'_N9999'))"), - # "(startswith(Name,'S3A') or startswith(Name,'S3B'))", - # "(startswith(Name,'S5P') and not contains(Name,'NRTI_'))" - ] - - scenes = [] - for filter in filters: - filter_all = filter_base + " and " + filter - query_url = query_template.replace("%filter", filter_all) - try: - scenes_current = search_data(query_filter=query_url) - print("%s scenes found" % len(scenes_current)) - scenes.extend(scenes_current) - except Exception as e: - print("Search failed %s" % (str(e))) - - return scenes - - -def download_csv_inventory(output_dir, file_name, overwrite=False): - """ - Description... - - Parameters: - output_dir: x - file_name: x - overwrite: x - - Returns: - (...): ... 
- """ - zip_file = download_data( - "https://s3.waw3-1.cloudferro.com/swift/v1/CatalogueCSV/CopernicusCatalogueCSV.zip", - output_dir, - file_name=file_name, - overwrite=overwrite, - ) - return zip_file - - -def convert_inventory_csv_to_parquet(files, collections, output_folder, config): - """ - Description... - - Parameters: - files: x - collections: x - output_folder: x - config: x - - Returns: - (...): ... - """ - temp_df = dict() - files.sort() - files_count = len(files) - if not os.path.exists(output_folder): - os.makedirs(output_folder) - counter = 1 - for file in files: - print(f"{counter}/{files_count}") - counter += 1 - df = pandas.read_csv( - file, delimiter=";", parse_dates=["ContentDate:Start"] - ) # , usecols=['Name', 'ContentDate:Start'] - for col in collections: - if col not in temp_df: - temp_df[col] = [] - sub = df[df["Name"].str.match(config[col]["pattern"])] - temp_df[col].append(sub) - - output_files = [] - for col in collections: - out_file = os.path.join(output_folder, "%s.inventory.parquet" % col) - df = pandas.concat(temp_df[col]) - df.to_parquet(out_file, index=False) - output_files.append(out_file) - - temp_df = None - df = None - - return output_files - - -def csv_to_inventory(scene_csv, collection=None, order_id=None, order_status="orderable"): - """ - Description... - - Parameters: - scene_csv: x - collection: x - order_id: x - order_status: x - - Returns: - (...): ... - """ - geometry = shapely.wkt.loads(scene_csv["Bbox"]) - geometry = shapely.geometry.mapping(geometry) - scene = { - "uid": scene_csv["Id"], - "scene_id": scene_csv["Name"], - "PublicationDate": scene_csv["IngestionDate"], - "ModificationDate": scene_csv["ModificationDate"], - "GeoFootprint": geometry, - "S3Path": scene_csv["S3Path"], - } - return to_inventory(scene, collection=collection, order_id=order_id, order_status=order_status) - - -def to_inventory(scene, collection=None, order_id=None, order_status="orderable"): - """ - Description... - - Parameters: - scene: x - collection: x - order_id: x - order_status: x - - Returns: - (...): ... 
- """ - uid = scene["uid"] - scene_id = os.path.splitext(scene["scene_id"])[0] - info = get_scene_id_info(scene_id) - item_id = scene_id - id_parts = item_id.split("_") - - tile_id = id_parts[5][1:] - datetime = parse(info["start"]) - publication_date = parse(scene["PublicationDate"]).isoformat() - modification_date = parse(scene["ModificationDate"]).isoformat() - - item_geometry = None - item_bbox = None - if scene["GeoFootprint"]: - item_geometry = scene["GeoFootprint"] - try: - item_bbox = geom_fct.calculate_bbox(item_geometry) - except Exception as e: - print(str(e)) - - item = pystac.Item( - id=scene_id, - datetime=datetime, - geometry=item_geometry, - bbox=item_bbox, - properties={}, - ) - - item.properties["esa:uuid"] = uid - item.properties["esa:scene_id"] = item_id - if item_id.startswith("S2"): - item.properties["s2:tile"] = tile_id - item.properties["s2:baseline"] = id_parts[3] - - item.properties["cdse:publication_date"] = publication_date - item.properties["cdse:modification_date"] = modification_date - item.properties["version"] = modification_date - item.properties["deprecated"] = False - - if order_id is not None: - item.properties["order:id"] = order_id - if order_status is not None: - item.properties["order:status"] = order_status - - item.properties["terrabyte:folder"] = os.path.join(get_scene_id_folder(item_id), scene["scene_id"]) - if collection: - item.properties["terrabyte:collection_id"] = collection - else: - item.properties["terrabyte:collection_id"] = get_collection_name(item_id) - item.collection_id = item.properties["terrabyte:collection_id"] - - if item_id.startswith("S1"): - tby_parts = item_id.split("_") - item.properties["terrabyte:uniq_id"] = "_".join(tby_parts[0:-1]) - elif item_id.startswith("S2"): - tby_parts = item_id.split("_") - tby_parts.pop(3) - item.properties["terrabyte:uniq_id"] = "_".join(tby_parts) - elif item_id.startswith("S3"): - item.properties["terrabyte:uniq_id"] = ( - f"{info['sensor']}_{info['instrument']}_{info['processingLevel']}_" - f"{info['product']}_{info['start']}_{info['stop']}_{info['instance']}" - ) - elif item_id.startswith("S5"): - item.properties["terrabyte:collection_id"] = "" - if "_AUX_" not in item_id: - item.properties["terrabyte:uniq_id"] = ( - f"{info['sensor']}_{info['category']}_{info['product']}_{info['start']}" - f"_{info['stop']}_{info['orbitNumber']}" - ) - else: - item.properties["terrabyte:uniq_id"] = item_id - - item.properties["cdse:s3path"] = scene["S3Path"] - - item.properties["terrabyte:order"] = dict( - cdse_id=uid, - scene_id=item.properties["esa:scene_id"], - uniq_id=item.properties["terrabyte:uniq_id"], - inventory=item.collection_id, - collection=item.properties["terrabyte:collection_id"], - download_folder=get_scene_id_folder(item_id), - s3path=item.properties["cdse:s3path"], - ) - - return item - - -def query_deleted_scene_id(scene_id): - """ - Description... - - Parameters: - scene_id: x - - Returns: - (...): ... - """ - query = ( - f"https://catalogue.dataspace.copernicus.eu/odata/v1/DeletedProducts?$filter=contains(Name,%27{scene_id}%27)" - ) - try: - data = requests.get(query).json() - if "value" in data: - return data["value"][0] - except Exception as e: - print(str(e)) - return False - - -def query_deleted_scenes(to_be_removed): - """ - Description... - - Parameters: - to_be_removed: x - - Returns: - (...): ... - - Raises: - Exception: Error for Scene querying deleted endpoint. 
- """ - scenes = dict() - for scene_id in to_be_removed: - try: - reason = query_deleted_scene_id(scene_id) - except Exception as e: - print(f"Error for {scene_id} querying deleted endpoint: {e}") - reason = {} - scenes[scene_id] = reason - return scenes diff --git a/lib/lib/providers/nasa_cmr.py b/lib/lib/providers/nasa_cmr.py deleted file mode 100644 index db1477d..0000000 --- a/lib/lib/providers/nasa_cmr.py +++ /dev/null @@ -1,221 +0,0 @@ -import earthaccess -import pystac -import os -from dateutil.parser import parse -from ..datasets.modis import get_scene_id_folder -from ..base.geometry import calculate_bbox - - -def login(username=None, password=None): - """ - Description... - - Parameters: - username: x - password: x - - Returns: - (...): ... - """ - return earthaccess.login() - - -def search_data(short_name, version, count=-1, **kwargs): - """ - Description... - - Parameters: - short_name: x - version: x - count: x - **kwargs: x - - Returns: - (...): ... - """ - results = earthaccess.search_data( - short_name=short_name, - version=version, - # updated_since="2023-08-13T04:00:00.00Z", - count=count, - **kwargs, - ) - return results - - -def search_scenes_ingestion(products, date_from, date_to=None): - """ - Description... - - Parameters: - products: x - date_from: x - date_to: x - - Returns: - (...): ... - """ - scenes = [] - for product in products: - short_name, version = product.split(".") - if "MODD10" in product or "MYD10" in product: - version = version.replace("0", "") - - if date_to: - p_scenes = search_data(short_name, version, production_date=(date_from, date_to)) - else: - p_scenes = search_data(short_name, version, updated_since=date_from) - scenes.extend(p_scenes) - - return scenes - - -def get_inventory_collection(scene_id): - """ - Description... - - Parameters: - scene_id: x - - Returns: - (...): ... - """ - # scene_id = 'MOD09GA.A2023255.h08v08.061.2023257025446' - parts = scene_id.split(".") - return "modis-%s-%s" % (parts[0].lower(), parts[3]) - - -def get_collection_name(scene_id): - """ - Description... - - Parameters: - scene_id: x - - Returns: - (...): ... - """ - parts = scene_id.split(".") - product = parts[0].lower()[3:] - return "modis-%s-%s" % (product, parts[3]) - - -def get_geometry(points): - """ - Description... - - Parameters: - points: x - - Returns: - (...): ... - """ - coordinates = [] - for p in points[::-1]: - coordinates.append([p["Longitude"], p["Latitude"]]) - return {"type": "Polygon", "coordinates": [coordinates]} - - -def to_inventory(scene, order_status="orderable", order_id=None, batch_id=None): - """ - Description... - - Parameters: - scene: x - order_status: x - order_id: x - batch_id: x - - Returns: - (...): ... - - Raises: - Exception: Could not find identifier. 
- """ - item_id = scene["meta"]["native-id"] - if item_id.startswith("SC"): - for identifier in scene["umm"]["DataGranule"]["Identifiers"]: - if identifier["IdentifierType"] == "ProducerGranuleId": - item_id = identifier["Identifier"] - if item_id is None: - raise Exception("Could not find identifier") - item_id = os.path.splitext(item_id)[0] - item_parts = item_id.split(".") - - tby_parts = item_id.split(".") - tby_parts.pop(-1) - tby_item_id = ".".join(tby_parts) - - item_datetime_begin = parse(scene["umm"]["TemporalExtent"]["RangeDateTime"]["BeginningDateTime"]) - item_datetime_end = parse(scene["umm"]["TemporalExtent"]["RangeDateTime"]["EndingDateTime"]) - - item_geometry = get_geometry( - scene["umm"]["SpatialExtent"]["HorizontalSpatialDomain"]["Geometry"]["GPolygons"][0]["Boundary"]["Points"] - ) - item_bbox = calculate_bbox(item_geometry) - - item = pystac.Item( - id=item_id, - datetime=None, - start_datetime=item_datetime_begin, - end_datetime=item_datetime_end, - geometry=item_geometry, - bbox=item_bbox, - properties={}, - ) - - item.properties["modis:scene_id"] = item_id - - item.properties["deprecated"] = False - if order_status is not None: - item.properties["order:status"] = order_status - if order_id: - item.properties["order:id"] = order_id - if batch_id: - item.properties["order:batch_id"] = batch_id - item.properties["version"] = item_parts[-1] - - if "revision-date" in scene["meta"]: - item.properties["modis:revision-date"] = parse(scene["meta"]["revision-date"]).isoformat() - if "revision-id" in scene["meta"]: - item.properties["modis:revision-id"] = scene["meta"]["revision-id"] - item.properties["modis:provider-id"] = scene["meta"]["provider-id"] - item.properties["modis:concept-id"] = scene["meta"]["concept-id"] - - for attrib in scene["umm"]["AdditionalAttributes"]: - if attrib["Name"] == "VERTICALTILENUMBER": - item.properties["modis:vertical-tile"] = int(attrib["Values"][0]) - elif attrib["Name"] == "HORIZONTALTILENUMBER": - item.properties["modis:horizontal-tile"] = int(attrib["Values"][0]) - elif attrib["Name"] == "PROCESSVERSION": - item.properties["modis:processor-version"] = attrib["Values"][0] - - item.properties["file:size"] = scene["umm"]["DataGranule"]["ArchiveAndDistributionInformation"][0]["Size"] - item.properties["file:unit"] = scene["umm"]["DataGranule"]["ArchiveAndDistributionInformation"][0]["SizeUnit"] - - item.properties["terrabyte:item_id"] = tby_item_id - item.properties["terrabyte:folder"] = os.path.join(get_scene_id_folder(item_id), item_id + ".hdf") - item.properties["terrabyte:collection_id"] = get_collection_name(item_id) - - item.collection_id = get_inventory_collection(item_id) - - item.properties["modis:dates"] = dict() - for date in scene["umm"]["ProviderDates"]: - item.properties["modis:dates"][date["Type"]] = parse(date["Date"]).isoformat() - - for url in scene["umm"]["RelatedUrls"]: - if url["Type"] == "GET DATA": - item.assets["hdf"] = pystac.Asset(href=url["URL"]) - elif ".xml" in url["URL"] and "https://" in url["URL"]: - item.assets["xml"] = pystac.Asset(href=url["URL"]) - - item.properties["terrabyte:order"] = dict( - scene_id=item.id, - inventory=item.collection_id, - collection=item.properties["terrabyte:collection_id"], - download_folder=get_scene_id_folder(item_id), - url_hdf=item.assets["hdf"].href, - url_xml=item.assets["xml"].href, - ) - - return item diff --git a/lib/lib/providers/nasa_daac.py b/lib/lib/providers/nasa_daac.py deleted file mode 100644 index 2e67897..0000000 --- a/lib/lib/providers/nasa_daac.py +++ 
/dev/null @@ -1,650 +0,0 @@ -#!/usr/bin/env python -# ---------------------------------------------------------------------------- -# NSIDC Data Download Script -# -# Copyright (c) 2022 Regents of the University of Colorado -# Permission is hereby granted, free of charge, to any person obtaining -# a copy of this software and associated documentation files (the "Software"), -# to deal in the Software without restriction, including without limitation -# the rights to use, copy, modify, merge, publish, distribute, sublicense, -# and/or sell copies of the Software, and to permit persons to whom the -# Software is furnished to do so, subject to the following conditions: -# The above copyright notice and this permission notice shall be included -# in all copies or substantial portions of the Software. -# -# Tested in Python 2.7 and Python 3.4, 3.6, 3.7, 3.8, 3.9 -# -# To run the script at a Linux, macOS, or Cygwin command-line terminal: -# $ python nsidc-data-download.py -# -# On Windows, open Start menu -> Run and type cmd. Then type: -# python nsidc-data-download.py -# -# The script will first search Earthdata for all matching files. -# You will then be prompted for your Earthdata username/password -# and the script will download the matching files. -# -# If you wish, you may store your Earthdata username/password in a .netrc -# file in your $HOME directory and the script will automatically attempt to -# read this file. The .netrc file should have the following format: -# machine urs.earthdata.nasa.gov login MYUSERNAME password MYPASSWORD -# where 'MYUSERNAME' and 'MYPASSWORD' are your Earthdata credentials. -# -# Instead of a username/password, you may use an Earthdata bearer token. -# To construct a bearer token, log into Earthdata and choose "Generate Token". -# To use the token, when the script prompts for your username, -# just press Return (Enter). You will then be prompted for your token. -# You can store your bearer token in the .netrc file in the following format: -# machine urs.earthdata.nasa.gov login token password MYBEARERTOKEN -# where 'MYBEARERTOKEN' is your Earthdata bearer token. -# -from __future__ import print_function - -# import queue -import base64 -import getopt -import json - -# import logging -import math -import netrc -import os.path -import ssl -import sys - -# import threading -import time - -try: - from urllib.parse import urlparse - from urllib.request import urlopen, Request, build_opener, HTTPCookieProcessor - from urllib.error import HTTPError, URLError -except ImportError: - from urlparse import urlparse - from urllib2 import ( - urlopen, - Request, - HTTPError, - URLError, - build_opener, - HTTPCookieProcessor, - ) - -short_name = "MOD10A1" -version = "61" -time_start = "2000-02-24T00:00:00Z" -time_end = "2022-01-31T20:06:01Z" -bounding_box = "" -polygon = "" -filename_filter = "" -url_list = [] - -CMR_URL = "https://cmr.earthdata.nasa.gov" -URS_URL = "https://urs.earthdata.nasa.gov" -CMR_PAGE_SIZE = 2000 -CMR_FILE_URL = ( - "{0}/search/granules.json?" - "sort_key[]=start_date&sort_key[]=producer_granule_id" - "&scroll=true&page_size={1}".format(CMR_URL, CMR_PAGE_SIZE) -) - - -def get_login_credentials(): - """ - Get user credentials from .netrc or prompt for input. - - Returns: - (tuple): ... 
- """ - credentials = None - token = None - - try: - info = netrc.netrc() - username, account, password = info.authenticators(urlparse(URS_URL).hostname) - if username == "token": - token = password - else: - credentials = "{0}:{1}".format(username, password) - credentials = base64.b64encode(credentials.encode("ascii")).decode("ascii") - except Exception: - username = None - password = None - - return credentials, token - - -def build_version_query_params(version): - """ - Description... - - Parameters: - version: x - - Returns: - (...): ... - """ - desired_pad_length = 3 - if len(version) > desired_pad_length: - print('Version string too long: "{0}"'.format(version)) - quit() - - version = str(int(version)) # Strip off any leading zeros - query_params = "" - - while len(version) <= desired_pad_length: - padded_version = version.zfill(desired_pad_length) - query_params += "&version={0}".format(padded_version) - desired_pad_length -= 1 - return query_params - - -def filter_add_wildcards(filter): - """ - Description... - - Parameters: - filter: x - - Returns: - (...): ... - """ - if not filter.startswith("*"): - filter = "*" + filter - if not filter.endswith("*"): - filter = filter + "*" - return filter - - -def build_filename_filter(filename_filter): - """ - Description... - - Parameters: - filename_filter: x - - Returns: - (...): ... - """ - filters = filename_filter.split(",") - result = "&options[producer_granule_id][pattern]=true" - for filter in filters: - result += "&producer_granule_id[]=" + filter_add_wildcards(filter) - return result - - -def build_cmr_query_url( - provider, - short_name, - version, - time_start, - time_end, - bounding_box=None, - polygon=None, - filename_filter=None, -): - """ - Description... - - Parameters: - provider: x - short_name: x - version: x - time_start: x - time_end: x - bounding_box: x - polygon: x - filename_filter: x - - Returns: - (...): ... - """ - params = "&provider={0}".format(provider) - params += "&short_name={0}".format(short_name) - params += build_version_query_params(version) - params += "&temporal[]={0},{1}".format(time_start, time_end) - if polygon: - params += "&polygon={0}".format(polygon) - elif bounding_box: - params += "&bounding_box={0}".format(bounding_box) - if filename_filter: - params += build_filename_filter(filename_filter) - return CMR_FILE_URL + params - - -def get_speed(time_elapsed, chunk_size): - """ - Description... - - Parameters: - time_elapsed: x - chunk_size: x - - Returns: - (...): ... - """ - if time_elapsed <= 0: - return "" - speed = chunk_size / time_elapsed - if speed <= 0: - speed = 1 - size_name = ("", "k", "M", "G", "T", "P", "E", "Z", "Y") - i = int(math.floor(math.log(speed, 1000))) - p = math.pow(1000, i) - return "{0:.1f}{1}B/s".format(speed / p, size_name[i]) - - -def output_progress(count, total, status="", bar_len=60): - """ - Description... - - Parameters: - count: x - total: x - status: x - bar_len: x - - Returns: - (...): ... - """ - if total <= 0: - return - fraction = min(max(count / float(total), 0), 1) - filled_len = int(round(bar_len * fraction)) - percents = int(round(100.0 * fraction)) - bar = "=" * filled_len + " " * (bar_len - filled_len) - fmt = " [{0}] {1:3d}% {2} ".format(bar, percents, status) - print("\b" * (len(fmt) + 4), end="") # clears the line - sys.stdout.write(fmt) - sys.stdout.flush() - - -def cmr_read_in_chunks(file_object, chunk_size=1024 * 1024): - """ - Read a file in chunks using a generator. Default chunk size: 1Mb. 
- - Parameters: - file_object: x - chunk_size: x - - Returns: - (...): ... - """ - while True: - data = file_object.read(chunk_size) - if not data: - break - yield data - - -def get_login_response(url, credentials, token): - """ - Description... - - Parameters: - url: x - credentials: x - token: x - - Returns: - (...): ... - """ - opener = build_opener(HTTPCookieProcessor()) - - req = Request(url) - if token: - req.add_header("Authorization", "Bearer {0}".format(token)) - elif credentials: - try: - response = opener.open(req) - # We have a redirect URL - try again with authorization. - url = response.url - except HTTPError: - # No redirect - just try again with authorization. - pass - except Exception as e: - print("Error{0}: {1}".format(type(e), str(e))) - sys.exit(1) - - req = Request(url) - req.add_header("Authorization", "Basic {0}".format(credentials)) - - try: - response = opener.open(req) - except HTTPError as e: - err = "HTTP error {0}, {1}".format(e.code, e.reason) - if "Unauthorized" in e.reason: - if token: - err += ": Check your bearer token" - else: - err += ": Check your username and password" - print(err) - sys.exit(1) - except Exception as e: - print("Error{0}: {1}".format(type(e), str(e))) - sys.exit(1) - - return response - - -def cmr_download(urls, output_dir=".", force=False, quiet=False): - """ - Download files from list of urls. - - Parameters: - urls: x - output_dir: x - force: x - quiet: x - - Returns: - (...): ... - """ - if not urls: - return - - url_count = len(urls) - if not quiet: - print("Downloading {0} files...".format(url_count)) - credentials = None - token = None - - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - for index, url in enumerate(urls, start=1): - if not credentials and not token: - p = urlparse(url) - if p.scheme == "https": - credentials, token = get_login_credentials() - - filename = url.split("/")[-1] - filename = os.path.join(output_dir, filename) - if not quiet: - print("{0}/{1}: {2}".format(str(index).zfill(len(str(url_count))), url_count, filename)) - - try: - response = get_login_response(url, credentials, token) - length = int(response.headers["content-length"]) - try: - if not force and length == os.path.getsize(filename): - if not quiet: - print(" File exists, skipping") - continue - except OSError: - pass - count = 0 - chunk_size = min(max(length, 1), 1024 * 1024) - max_chunks = int(math.ceil(length / chunk_size)) - time_initial = time.time() - with open(filename, "wb") as out_file: - for data in cmr_read_in_chunks(response, chunk_size=chunk_size): - out_file.write(data) - if not quiet: - count = count + 1 - time_elapsed = time.time() - time_initial - download_speed = get_speed(time_elapsed, count * chunk_size) - output_progress(count, max_chunks, status=download_speed) - if not quiet: - print() - except HTTPError as e: - print("HTTP error {0}, {1}".format(e.code, e.reason)) - except URLError as e: - print("URL error: {0}".format(e.reason)) - except IOError: - raise - - -# todo: function causes linter calls -# def download_data_threads(scenes, base_dir="."): -# if not os.path.exists(output_dir): -# os.makedirs(output_dir) -# -# que = queue.Queue() -# threads = [] -# -# # num_theads = min(50, len(scenes)) -# _ = min(50, len(scenes)) -# -# start = time.perf_counter() -# -# for scene in scenes: -# urls = scene["urls"] -# file_dir = None -# output_dir = os.path.join(base_dir, file_dir) -# -# download_thread = threading.Thread( -# target=lambda q, urls, output_dir: q.put(cmr_download(urls, output_dir)), -# args=(que, 
urls, output_dir), -# ) -# download_thread.start() -# threads.append(download_thread) -# -# while not q.empty(): -# work = q.get() # fetch new work from the Queue -# try: -# data = urlopen(work[1]).read() -# logging.info("Requested..." + work[1]) -# result[work[0]] = data # Store data back at correct index -# except: -# logging.error("Error with URL check!") -# result[work[0]] = {} -# # signal to the queue that task has been processed -# q.task_done() -# -# for thread in threads: -# thread.join() -# -# download_time = time.perf_counter() - start -# -# datasets = [] -# mean_download_speed = 0 -# while not que.empty(): -# result = que.get() -# mean_download_speed += result["download_speed"] -# datasets.append(result) -# -# if len(datasets) > 0: -# mean_download_speed = mean_download_speed / len(datasets) -# -# return dict( -# datasets=datasets, -# total_time=download_time, -# mean_download_speed=mean_download_speed, -# ) - - -def cmr_filter(search_results): - """ - Select only the desired data files from CMR response. - - Parameters: - search_results: x - - Returns: - (...): ... - """ - if "feed" not in search_results or "entry" not in search_results["feed"]: - return [] - - scenes = [] - - for item in search_results["feed"]["entry"]: - record = dict( - id=item["id"], - scene_id=item["producer_granule_id"], - time_start=item["time_start"], - time_end=item["time_end"], - updated=item["updated"], - granule_size=item["granule_size"], - urls=[], - ) - - if "links" in item: - unique_filenames = set() - for link in item["links"]: - if "href" not in link: - # Exclude links with nothing to download - continue - if "inherited" in link and link["inherited"] is True: - # Why are we excluding these links? - continue - if "rel" in link and "data#" not in link["rel"]: - # Exclude links which are not classified by CMR as "data" or "metadata" - continue - - if "title" in link and "opendap" in link["title"].lower(): - # Exclude OPeNDAP links--they are responsible for many duplicates - # This is a hack; when the metadata is updated to properly identify - # non-datapool links, we should be able to do this in a non-hack way - continue - - filename = link["href"].split("/")[-1] - if filename in unique_filenames: - # Exclude links with duplicate filenames (they would overwrite) - continue - unique_filenames.add(filename) - - record["urls"].append(link["href"]) - - scenes.append(record) - - return scenes - - -def cmr_search( - provider, - short_name, - version, - time_start, - time_end, - bounding_box="", - polygon="", - filename_filter="", - quiet=False, -): - """ - Perform a scrolling CMR query for files matching input criteria. - - Parameters: - provider: x - short_name: x - version: x - time_start: x - time_end: x - bounding_box: x - polygon: x - filename_filter: x - quiet: x - - Returns: - (...): ... 
- """ - cmr_query_url = build_cmr_query_url( - provoder=provider, - short_name=short_name, - version=version, - time_start=time_start, - time_end=time_end, - bounding_box=bounding_box, - polygon=polygon, - filename_filter=filename_filter, - ) - if not quiet: - print("Querying for data:\n\t{0}\n".format(cmr_query_url)) - - cmr_scroll_id = None - ctx = ssl.create_default_context() - ctx.check_hostname = False - ctx.verify_mode = ssl.CERT_NONE - - scenes = [] - hits = 0 - while True: - req = Request(cmr_query_url) - if cmr_scroll_id: - req.add_header("cmr-scroll-id", cmr_scroll_id) - try: - response = urlopen(req, context=ctx) - except Exception as e: - print("Error: " + str(e)) - sys.exit(1) - if not cmr_scroll_id: - # Python 2 and 3 have different case for the http headers - headers = {k.lower(): v for k, v in dict(response.info()).items()} - cmr_scroll_id = headers["cmr-scroll-id"] - hits = int(headers["cmr-hits"]) - if not quiet: - if hits > 0: - print("Found {0} matches.".format(hits)) - else: - print("Found no matches.") - search_page = response.read() - search_page = json.loads(search_page.decode("utf-8")) - url_scroll_results = cmr_filter(search_page) - if not url_scroll_results: - break - if not quiet and hits > CMR_PAGE_SIZE: - print(".", end="") - sys.stdout.flush() - scenes += url_scroll_results - - if not quiet and hits > CMR_PAGE_SIZE: - print() - return scenes - - -def main(argv=None): - """ - Description... - - Parameters: - argv: x - - Returns: - (...): ... - """ - global short_name, version, time_start, time_end, bounding_box, polygon, filename_filter, url_list - - if argv is None: - argv = sys.argv[1:] - - force = False - quiet = False - usage = "usage: nsidc-download_***.py [--help, -h] [--force, -f] [--quiet, -q]" - - try: - opts, args = getopt.getopt(argv, "hfq", ["help", "force", "quiet"]) - for opt, _arg in opts: - if opt in ("-f", "--force"): - force = True - elif opt in ("-q", "--quiet"): - quiet = True - elif opt in ("-h", "--help"): - print(usage) - sys.exit(0) - except getopt.GetoptError as e: - print(e.args[0]) - print(usage) - sys.exit(1) - - try: - if not url_list: - url_list = cmr_search( - short_name, - version, - time_start, - time_end, - bounding_box=bounding_box, - polygon=polygon, - filename_filter=filename_filter, - quiet=quiet, - ) - - cmr_download(url_list, force=force, quiet=quiet) - except KeyboardInterrupt: - quit() - - -if __name__ == "__main__": - main() diff --git a/lib/lib/providers/usgs_m2m.py b/lib/lib/providers/usgs_m2m.py deleted file mode 100644 index 8e9ea1f..0000000 --- a/lib/lib/providers/usgs_m2m.py +++ /dev/null @@ -1,722 +0,0 @@ -import os -import time -import json -import requests -import pystac -import pandas -from pystac_client import Client -from dateutil.parser import parse - -from pystac.extensions.eo import EOExtension - -from ..base.file import check_file_size -from ..base.download import download_data as download_data_base -from ..datasets.landsat import get_scene_id_folder - - -authentication_errors = ["AUTH_INVALID", "AUTH_KEY_INVALID"] -rate_limits = ["RATE_LIMIT", "RATE_LIMIT_USER_DL"] - - -def sendJSONRequest(url, data, apiKey=None): - """ - Description... - - Parameters: - url: x - data: x - apiKey: x - - Returns: - (...): ... - - Raises: - Exception: Could not conduct request twice.. - Exception: Error occurred. - Exception: Error 404 occurred. - Exception: Error 401 occurred. - Exception: Error 400 occurred. 
- """ - json_data = json.dumps(data) - - headers = {} - if apiKey is not None: - headers["X-Auth-Token"] = apiKey - - try: - response = requests.post(url, json_data, headers=headers) - if response is None: - print("No output from service - try again") - response = requests.post(url, json_data, headers=headers) - if response is None: - raise Exception("Could not conduct request twice. URL: %s.") - return False - except Exception as e: - print("Undefined exception: %s" % e) - return False - - output = json.loads(response.text) - if output["errorCode"] is not None: - print(output["errorCode"], "- ", output["errorMessage"]) - if output["errorCode"] in ["RATE_LIMIT", "RATE_LIMIT_USER"]: - print("Try again because reuqest limit") - response = requests.post(url, json_data, headers=headers) - output = json.loads(response.text) - if output["errorCode"] is not None: - print(output["errorCode"], "- ", output["errorMessage"]) - raise Exception("The following error occurred (%s): %s" % (output["errorCode"], output["errorMessage"])) - if response.status_code == 404: - print("404 Not Found") - raise Exception("The following error 404 occurred (%s): %s" % (output["errorCode"], output["errorMessage"])) - elif response.status_code == 401: - print("401 Unauthorized") - raise Exception("The following error 401 occurred (%s): %s" % (output["errorCode"], output["errorMessage"])) - elif response.status_code == 400: - print("Error Code", response.status_code) - raise Exception("The following error 400 occurred (%s): %s" % (output["errorCode"], output["errorMessage"])) - - return output["data"] - - -def login(username: str, password: str, token=False, api_url="https://m2m.cr.usgs.gov/api/api/json/stable/"): - """ - Description... - - Parameters: - username: x - password: x - token: x - api_url: x - - Returns: - (...): ... - """ - if token: - endpoint = "login-token" - payload = {"username": username, "token": password} - print("Use login with token") - else: - endpoint = "login" - payload = {"username": username, "password": password} - print("Use login with password") - - while True: - api_key = sendJSONRequest(api_url + endpoint, payload) - if not api_key: - print("API request failed. Try again...\n") - else: - return api_key - - -def search_data( - query: dict, api_key: str, api_url="https://m2m.cr.usgs.gov/api/api/json/stable/", download_options=True -): - """ - Description... - - Parameters: - query: x - api_key: x - api_url: x - download_options: x - - Returns: - (...): ... - """ - scenes = sendJSONRequest(api_url + "scene-search", query, api_key) - if scenes["recordsReturned"] > 0: - print(str(scenes["recordsReturned"]) + " scenes found.\n") - - if download_options: - - sceneIds = [] - for result in scenes["results"]: - # Add this scene to the list to download - sceneIds.append(result["entityId"]) - - return get_download_options(query["datasetName"], sceneIds) - - else: - return scenes - else: - print("Search returned no results. Check query!\n") - return [] - - -def get_download_options(datasetName, sceneIds, api_key, api_url="https://m2m.cr.usgs.gov/api/api/json/stable/"): - """ - Description... - - Parameters: - datasetName: x - sceneIds: x - api_key: x - api_url: x - - Returns: - (...): ... - """ - # Find the download options for these scenes - # NOTE :: Remember the scene list cannot exceed 50,000 items! 
- payload = {"datasetName": datasetName, "entityIds": sceneIds} - - downloadOptions = sendJSONRequest(api_url + "download-options", payload, api_key) - - if downloadOptions is None: - print("No downloadable scenes found.\n") - return [] - else: - # Aggregate a list of available products - downloads = [] - # downloads_systems = dict() - downloads_uniq = [] - for product in downloadOptions: - # Make sure the product is available for this scene - if product["available"] is True and product["downloadSystem"] != "folder": - # We should only return a scene once (not duplicates from additional download systems) - # -> TODO: this is currently a LANDSAT specific use case - not valid for MODIS! - # if product["downloadSystem"] not in downloads_systems: - # downloads_systems[product["downloadSystem"]] = [] - - if product["entityId"] not in downloads_uniq: - item = { - "entityId": product["entityId"], - "displayId": product["displayId"], - "productId": product["id"], - "download_system": product["downloadSystem"], - } - downloads.append(item) - # downloads_systems[product["downloadSystem"]].append(item) - downloads_uniq.append(product["entityId"]) - - print(str(len(downloads)) + " downloadable data records found.") - return downloads - - -def get_download_urls( - downloads: list, - api_key, - label="", - api_url="https://m2m.cr.usgs.gov/api/api/json/stable/", -): - """ - Description... - - Parameters: - downloads: x - api_key: x - label: x - api_url: x - - Returns: - (...): ... - """ - if label == "": - label = str(int(time.time() * 1000)) - - print("Label: %s" % label) - - payload = {"downloads": downloads, "label": label} # , "returnAvailable": True, "configurationCode": "order" - - # Call the download to get the direct download url - results = sendJSONRequest(api_url + "download-request", payload, api_key) - # print(str(results)) - while not results: - print("API request failed. Try again...\n") - results = sendJSONRequest(api_url + "download-request", payload, api_key) - - # with open('%s_download_reqest.json' % label, 'w') as f: - # x = f.write(json.dumps(results, indent=4)) - - print("available: %s" % len(results["availableDownloads"])) - print("preparing: %s" % len(results["preparingDownloads"])) - print("duplicates: %s" % len(results["duplicateProducts"])) - print("failed: %s" % len(results["failed"])) - print("newRecords: %s" % len(results["newRecords"])) - print("numInvalidScenes: %s" % results["numInvalidScenes"]) - - download_urls = dict() - availableDownloads = results["availableDownloads"] - # if len(results["availableDownloads"]) > 0: - # for result in results["availableDownloads"]: - # print(f"Get download url: {result['url']}\n") - # download_urls[result['url']] = result - - payload = {"label": label} - results = sendJSONRequest(api_url + "download-retrieve", payload, api_key) - while not results: - print("API request failed. Try again...\n") - results = sendJSONRequest(api_url + "download-retrieve", payload, api_key) - - while results["queueSize"] > 0: - print("Queue Size: %s - try again in 15 seconds" % results["queueSize"]) - time.sleep(15) - results = sendJSONRequest(api_url + "download-retrieve", payload, api_key) - while not results: - print("API request failed. 
Try again...\n") - results = sendJSONRequest(api_url + "download-retrieve", payload, api_key) - - if results is not False: - for result in results["available"]: - print(f"Get download url: {result['url']}\n") - if result["url"] not in download_urls: - download_urls[result["url"]] = result - # download_urls.append(result) - - for result in results["requested"]: - print(f"Get download url: {result['url']}\n") - if result["url"] not in download_urls: - download_urls[result["url"]] = result - # download_urls.append(result) - - if len(availableDownloads) > 0: - for result in availableDownloads: - print(f"Get download url: {result['url']}\n") - if result["url"] not in download_urls: - download_urls[result["url"]] = result - - return [download_urls[url] for url in download_urls] - - -def add_download_urls(scenes, api_key): - """ - Description... - - Parameters: - scenes: x - api_key: x - - Returns: - (...): ... - - Raises: - Exception: No scenes found. - """ - search_query = dict() - scenes_all = dict() - for scene in scenes: - scenes_all[scene["scene_id"]] = scene - datasetName = scene["inventory"].replace("-", "_") - if datasetName not in search_query: - search_query[datasetName] = [] - search_query[datasetName].append(scene["landsat_id"]) - - results_all = [] - for datasetName in search_query: - print("Found %s scenes for %s collection" % (len(search_query[datasetName]), datasetName)) - if len(search_query[datasetName]) > 0: - results = get_download_options(datasetName, search_query[datasetName], api_key=api_key) - print("Find %s results for %s" % (len(results), datasetName)) - results_all.extend(results) - - print("Found %s results" % len(results_all)) - if len(results_all) == 0: - raise Exception("No scenes found") - - downloads = get_download_urls(downloads=results_all, api_key=api_key) - print("Found %s downloads" % len(downloads)) - - scenes = [] - scenes_added = [] - for item in downloads: - url = item["url"] - if "displayId" in item: - id = item["displayId"] - elif url.startswith("https://landsatlook.usgs.gov"): - id = url.replace("https://landsatlook.usgs.gov/gen-bundle?landsat_product_id=", "").split("&")[0] - else: - id = None - - if id in scenes_all: - scenes_all[id]["url"] = url - if id not in scenes_added: - scenes.append(scenes_all[id]) - scenes_added.append(id) - else: - scenes.append(dict(url=url)) - - return scenes - - -def download_data(url, output_dir, chunk_size=1024 * 1000, timeout=300): - """ - Download single file from USGS M2M by download url - - Parameters: - url: x - output_dir: x - chunk_size: x - timeout: x - - Returns: - (...): ... - - Raises: - Exception: Failed to download. - """ - - try: - print("Waiting for server response...") - r = requests.get(url, stream=True, allow_redirects=True, timeout=timeout) - expected_file_size = int(r.headers.get("content-length", 0)) - file_name = r.headers["Content-Disposition"].split('"')[1] - print(f"Filename: {file_name}") - file_path = os.path.join(output_dir, file_name) - # TODO: Check for existing files and whether they have the correct file size - if not os.path.exists(output_dir): - os.makedirs(output_dir) - - with open(file_path, "wb") as f: - start = time.perf_counter() - print(f"Download of {file_name} in progress...") - for chunk in r.iter_content(chunk_size=chunk_size): - f.write(chunk) - duration = time.perf_counter() - start - speed = round((expected_file_size / duration) / (1000 * 1000), 2) - - if check_file_size(expected_file_size, file_path): - print(f"Download of {file_name} successful. 
Average download speed: {speed} MB/s") - return file_path - else: - os.remove(file_path) - print(f"Failed to download from {url}") - raise Exception(f"Failed to download from {url}") - except Exception as e: - print(e) - print(f"Failed to download from {url}.") - raise Exception(f"Failed to download from {url}") - - -def download_aria(scene, basedir, aria2): - """ - Description... - - Parameters: - scene: x - basedir: x - aria2: x - - Returns: - (...): ... - """ - if "scene_id" in scene: - path = get_scene_id_folder(scene["scene_id"]) - directory = os.path.join(basedir, path) - filename = scene["scene_id"] + ".tar" - download = aria2.add_uris( - [scene["url"]], {"dir": directory, "out": filename, "continue": "true", "allow-overwrite": "true"} - ) - - return {"gid": download.gid, "file_path": os.path.join(directory, filename)} - else: - # In this case we do not have a filename before downloading - directory = os.path.join(basedir, ".download") - download = aria2.add_uris([scene["url"]], {"dir": directory, "continue": "true", "allow-overwrite": "true"}) - return {"gid": download.gid} - - -def search_data_stac(collections, query, max_items=10000, api_url="https://landsatlook.usgs.gov/stac-server"): - """ - Description... - - Parameters: - collections: x - query: x - max_items: x - api_url: x - - Returns: - (...): ... - """ - params = {"max_items": max_items, "collections": collections, "query": query} - - items = [] - try: - cat = Client.open(api_url) - search = cat.search(**params) - items = list(search.items_as_dicts()) - for i in items: - del i["assets"] - i["id"] = i["id"].replace("_SR", "") - print("%s scenes found" % len(items)) - except Exception as e: - print("FAILED", str(e)) - return items - - -def search_data_ingestion(date_from, date_to, collections="landsat-c2l2-sr"): - """ - Description... - - Parameters: - date_from: x - date_to: x - collections: x - - Returns: - (...): ... - """ - query = {"created": {"gte": date_from, "lt": date_to}} - return search_data_stac(collections=collections, query=query) - - -collections = { - "LC_C2_L1": "landsat-ot-c2-l1", - "LC_C2_L2": "landsat-ot-c2-l2", - "LE_C2_L1": "landsat-etm-c2-l1", - "LE_C2_L2": "landsat-etm-c2-l2", - "LT_C2_L1": "landsat-tm-c2-l1", - "LT_C2_L2": "landsat-tm-c2-l2", -} - - -def get_collection_name(scene_id): - """ - Description... - - Parameters: - scene_id: x - - Returns: - (...): ... - - Raises: - Exception: Could not find item in pre-defined collections for scene. - """ - parts = scene_id.split("_") - sensor = parts[0][0:2] - collection = "C" + parts[5][1] - level = parts[1][0:2] - name = f"{sensor}_{collection}_{level}" - if name in collections: - return collections[name] - else: - raise Exception(f"Could not find {name} in pre-defined collections for scene {scene_id}") - - -def to_inventory_from_stac(item, order_status="orderable", order_id=None, batch_id=None): - """ - Description... - - Parameters: - item: x - order_status: x - order_id: x - batch_id: x - - Returns: - (...): ... 
- """ - if "collection" in item: - del item["collection"] - - if "links" in item: - del item["links"] - - item = pystac.Item.from_dict(item) - item_id = item.id - item_parts = item_id.split("_") - item_parts.pop(4) - tby_item_id = "_".join(item_parts) - - item.properties["deprecated"] = False - if order_status is not None: - item.properties["order:status"] = order_status - if order_id: - item.properties["order:id"] = order_id - if batch_id: - item.properties["order:batch_id"] = batch_id - item.properties["version"] = item.properties["updated"] - - item.properties["landsat:scene_id"] = item.properties["landsat:scene_id"] - item.properties["landsat:wrs_row"] = int(item.properties["landsat:wrs_row"]) - item.properties["landsat:wrs_path"] = int(item.properties["landsat:wrs_path"]) - - item.properties["terrabyte:item_id"] = tby_item_id - item.properties["terrabyte:folder"] = os.path.join(get_scene_id_folder(item_id), item_id) - item.properties["terrabyte:collection_id"] = get_collection_name(item_id) - - item.properties["terrabyte:order"] = dict( - scene_id=item.id, - landsat_id=item.properties["landsat:scene_id"], - inventory=item.properties["terrabyte:collection_id"], - collection=item.properties["terrabyte:collection_id"], - download_folder=get_scene_id_folder(item_id), - ) - - item.collection_id = item.properties["terrabyte:collection_id"] - - return item - - -def convert_inventory_csv_to_parquet(file, collection, output_folder): - """ - Description... - - Parameters: - file: x - collection: x - output_folder: x - - Returns: - (...): ... - """ - df = pandas.read_csv(file, delimiter=",", parse_dates=["Date Acquired"]) - out_file = os.path.join(output_folder, "%s.inventory.parquet" % collection) - df.to_parquet(out_file, index=False) - return out_file - - -def get_datetime(scene): - """ - Description... - - Parameters: - scene: x - - Returns: - (...): ... - """ - start_time = parse(scene["Start Time"]) - stop_time = parse(scene["Stop Time"]) - mean_time = start_time + (stop_time - start_time) / 2.0 - return mean_time - - -def get_geometry(scene): - """ - Description... - - Parameters: - scene: x - - Returns: - (...): ... - """ - coordinates = [ - [ - [ - float(scene["Corner Upper Left Longitude"]), - float(scene["Corner Upper Left Latitude"]), - ], - [ - float(scene["Corner Upper Right Longitude"]), - float(scene["Corner Upper Right Latitude"]), - ], - [ - float(scene["Corner Lower Right Longitude"]), - float(scene["Corner Lower Right Latitude"]), - ], - [ - float(scene["Corner Lower Left Longitude"]), - float(scene["Corner Lower Left Latitude"]), - ], - [ - float(scene["Corner Upper Left Longitude"]), - float(scene["Corner Upper Left Latitude"]), - ], - ] - ] - return {"type": "Polygon", "coordinates": coordinates} - - -def get_bbox(geometry): - """ - Description... - - Parameters: - geometry: x - - Returns: - (...): ... - """ - coords = geometry["coordinates"] - lats = [c[1] for c in coords[0]] - lons = [c[0] for c in coords[0]] - return [min(lons), min(lats), max(lons), max(lats)] - - -def csv_to_inventory(scene, collection=None, order_id=None, order_status="orderable"): - """ - Description... - - Parameters: - scene: x - collection: x - order_id: x - order_status: x - - Returns: - (...): ... 
- """ - item_id = scene["Display ID"] - item_parts = item_id.split("_") - item_parts.pop(4) - tby_item_id = "_".join(item_parts) - - item_datetime = get_datetime(scene) - item_geometry = get_geometry(scene) - item_bbox = get_bbox(item_geometry) - - item = pystac.Item( - id=item_id, - datetime=item_datetime, - geometry=item_geometry, - bbox=item_bbox, - collection=collection, - properties={}, - ) - - if float(scene["Land Cloud Cover"]) >= 0: - eo_ext = EOExtension.ext(item, add_if_missing=True) - eo_ext.cloud_cover = float(scene["Land Cloud Cover"]) - - item.properties["deprecated"] = False - item.properties["order:status"] = order_status - item.properties["version"] = scene["Date Product Generated L2"] - - item.properties["landsat:scene_id"] = scene["Landsat Scene Identifier"] - item.properties["landsat:wrs_path"] = int(scene["WRS Path"]) - item.properties["landsat:wrs_row"] = int(scene["WRS Row"]) - item.properties["landsat:correction"] = item_id.split("_")[1] - item.properties["landsat:product_generated"] = parse(scene["Date Product Generated L2"]).isoformat() - - item.properties["terrabyte:item_id"] = tby_item_id - item.properties["terrabyte:folder"] = os.path.join(get_scene_id_folder(item_id), item_id) - item.properties["terrabyte:collection_id"] = collection - - item.properties["terrabyte:order"] = dict( - scene_id=item.id, - landsat_id=item.properties["landsat:scene_id"], - inventory=item.properties["terrabyte:collection_id"], - collection=item.properties["terrabyte:collection_id"], - download_folder=get_scene_id_folder(item_id), - ) - - return item - - -def download_csv_inventory(output_dir, overwrite=False): - """ - Description... - - Parameters: - output_dir: x - overwrite: x - - Returns: - (...): ... - """ - download_urls = [ - "https://landsat.usgs.gov/landsat/metadata_service/bulk_metadata_files/LANDSAT_OT_C2_L2.csv.gz", - "https://landsat.usgs.gov/landsat/metadata_service/bulk_metadata_files/LANDSAT_ETM_C2_L2.csv.gz", - "https://landsat.usgs.gov/landsat/metadata_service/bulk_metadata_files/LANDSAT_TM_C2_L2.csv.gz", - ] - - zip_files = [] - - for url in download_urls: - print(url) - downloaded_file = download_data_base(url, output_dir, overwrite=overwrite) - zip_files.append(downloaded_file) - return zip_files diff --git a/lib/lib/resources/__init__.py b/lib/lib/resources/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lib/lib/resources/pgstac.py b/lib/lib/resources/pgstac.py deleted file mode 100644 index 218647f..0000000 --- a/lib/lib/resources/pgstac.py +++ /dev/null @@ -1,15 +0,0 @@ -from pypgstac.pypgstac import PgstacCLI - - -def import_stac_items(input, dsn, debug=False, method="insert_ignore"): - """ - Description... 
- - Parameters: - input: x - dsn: x - debug: x - method: x - """ - cli = PgstacCLI(dsn=dsn, debug=debug) - cli.load(table="items", file=input, method=method) diff --git a/lib/lib/resources/stac.py b/lib/lib/resources/stac.py deleted file mode 100644 index 7c8a002..0000000 --- a/lib/lib/resources/stac.py +++ /dev/null @@ -1,204 +0,0 @@ -import os -import json -import importlib -import datetime -import pystac -import requests -from pystac.extensions.file import FileExtension -from ..base.file import get_file_size, get_folder_size - - -def extract_by_function_name(scene_path: str, function_name: str, stac_function_options: dict): - """ - Extract metadata from scene folder - - Arguments: - scene_path: Scene folder to extract metadata from - function_name: Function name for scene to be used for metadata extraction - (e.g., stactools.sentinel2.stac.create_item) - stac_function_options: x - - Returns: - (...): As defined in the function - """ - if scene_path[-1] == "/": - scene_path = scene_path[:-1] - - mod_name, func_name = function_name.rsplit(".", 1) - mod = importlib.import_module(mod_name) - metadata_function = getattr(mod, func_name) - - return metadata_function(scene_path, **stac_function_options) - - -def extract_stactools(scene_path: str, function_name: str, stac_function_options: dict): - """ - Description... - - Parameters: - scene_path: x - function_name: x - stac_function_options: x - - Returns: - (...): ... - """ - stac_item = extract_by_function_name(scene_path, function_name, stac_function_options) - if "created" not in stac_item.properties: - stac_item.properties["created"] = str(datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%fZ")) - return stac_item - - -def extract_and_save_stactools( - scene_path: str, function_name: str, stac_function_options: dict, output_file: str, make_asset_hrefs_relative=False -): - """ - Description... - - Parameters: - scene_path: x - function_name: x - stac_function_options: x - output_file: x - make_asset_hrefs_relative: x - - Returns: - (...): ... - - Raises: - Exception: Could not make asset hrefs relative. - """ - # stactools packages return a pystac.Item as result - stac_item = extract_by_function_name(scene_path, function_name, stac_function_options) - if "created" not in stac_item.properties: - stac_item.properties["created"] = str(datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%fZ")) - if make_asset_hrefs_relative: - try: - if stac_item.get_self_href() is None: - stac_item.set_self_href(output_file) - - stac_item = stac_item.make_asset_hrefs_relative() - stac_item.remove_links("self") - except Exception as e: - print("Could not make asset hrefs relative: %s" % str(e)) - with open(output_file, "w") as file: - file.write(json.dumps(stac_item.to_dict())) - return output_file - - -def add_asset_filesize(stac): - """ - Description... - - Parameters: - stac: x - - Returns: - (...): ... - """ - # if not os.path.exists(stac_file): - # raise Exception("File %s does not exist!" 
% stac_file) - # stac = pystac.Item.from_file(stac_file) - FileExtension.add_to(stac) - - # base_dir = os.path.dirname(stac_file) - - for asset_key in stac.assets: - asset = stac.assets[asset_key] - # if asset.href[0] == '/': - # base_dir = '' - # href = os.path.join(base_dir, asset.href) - if os.path.isfile(asset.href): - asset.extra_fields["file:size"] = get_file_size(asset.href) - elif os.path.isdir(asset.href): - asset.extra_fields["file:size"] = get_folder_size(asset.href) - - # stac.save_object(include_self_link=False) - return stac - - -def register_metadata( - stac_file, - scene_id, - inventory_id, - inventory_collection, - collection, - api_url, - api_user, - api_pw, - inventory_dsn, - file_deletion=False, -): - """ - Description... - - Parameters: - stac_file: x - scene_id: x - inventory_id: x - inventory_collection: x - collection: x - api_url: x - api_user: x - api_pw: x - inventory_dsn: x - file_deletion: x - - Returns: - (...): ... - - Raises: - Exception: Registration_error: STAC file does not exist. - Exception: Registration_error: STAC collection not found in configuration or file. - Exception: Registration_error: Request of product not successful. - """ - stac_files = stac_file.split(";") - for stac_file in stac_files: - if not os.path.exists(stac_file): - raise Exception( - "registration_error: STAC file does not exist %s" % (stac_file), - ) - stac = pystac.read_file(stac_file) - stac = stac.make_asset_hrefs_absolute() - stac.properties["terrabyte:scene_id"] = scene_id - - # Check STAC collection id - if collection: - stac.collection_id = collection # stac.set_collection - - if stac.collection_id is None: - raise Exception( - "registration_error: STAC collection not found in configuration or file", - ) - - # Conduct request to STAC API - api_action = "insert" - r = requests.post( - "%s/collections/%s/items" % (api_url, stac.collection_id), json=stac.to_dict(), auth=(api_user, api_pw) - ) - if r.status_code == 409: - # Product already exists -> update - stac.properties["updated"] = str(datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%fZ")) - api_action = "update" - r = requests.put( - "%s/collections/%s/items/%s" % (api_url, stac.collection_id, stac.id), - json=stac.to_dict(), - auth=(api_user, api_pw), - ) - - if r.status_code != 200: - raise Exception( - ( - "registration_error: %s request of product %s not successful. " - "Status code: %s. Reason: %s. Response content: %s" - ) - % (api_action, stac.id, r.status_code, r.reason, r.content), - ) - else: - print("%s request of product %s in collection %s successful." % (api_action, stac.id, stac.collection_id)) - - # Optionally, delete STAC file - if file_deletion: - os.remove(stac_file) - - return {} diff --git a/lib/lib/resources/stac_geoparquet.py b/lib/lib/resources/stac_geoparquet.py deleted file mode 100644 index 6819d34..0000000 --- a/lib/lib/resources/stac_geoparquet.py +++ /dev/null @@ -1,268 +0,0 @@ -import os -import itertools -from typing import Sequence -from concurrent.futures import ThreadPoolExecutor - -from dateutil.parser import parse -from datetime import datetime, timedelta -import calendar - -import pypgstac.db -import pypgstac.hydration -import pandas as pd -import shapely.wkb - -import stac_geoparquet.arrow -import pyarrow as pa - -BASE_QUERY = "SELECT * from pgstac.items WHERE collection='{}'" - - -def partition_from_db_items_pair(idx, db, collection, output, base_url, overwrite=True): - """ - Description... 
- - Parameters: - idx: x - db: x - collection: x - output: x - base_url: x - overwrite: x - - Returns: - (...): ... - """ - try: - start, end = idx - file_end = end - timedelta(days=1) - f_out = f"{output}/{start.strftime('%Y%m%d')}_{file_end.strftime('%Y%m%d')}.parquet" - if os.path.exists(f_out) and not overwrite: - print(start, end, "exists") - return True - print(start, end) - query = BASE_QUERY.format(collection) + f" AND datetime >= '{start}' AND datetime < '{end}'" - - base_item = db.query_one(f"select * from collection_base_item('{collection}');") - - results = db.query(query) - items = [prepare_item(result, base_item, base_url) for result in results] - print(start, end, "items", len(items)) - if len(items) == 0: - return None - items_arrow = stac_geoparquet.arrow.parse_stac_items_to_arrow(items) - table = pa.Table.from_batches(items_arrow) - stac_geoparquet.arrow.to_parquet(table, f_out) - print(start, end, f_out, ": OK") - return f_out - except Exception as e: - print(start, end, "FAILED", str(e)) - - -def pairwise(iterable: Sequence) -> list[tuple[datetime, datetime]]: - """ - Description... - - Parameters: - iterable: x - - Returns: - (list): ... - """ - a, b = itertools.tee(iterable) - next(b, None) - return zip(a, b) - - -def prepare_datetime_pairs(datetime_range, partition_frequency): - """ - Description... - - Parameters: - datetime_range: x - partition_frequency: x - - - Returns: - (list): ... - """ - start_datetime = datetime.fromisoformat(datetime_range.split("/")[0].split("T")[0]) - end_datetime = datetime.fromisoformat(datetime_range.split("/")[1].split("T")[0]) - idx = pd.date_range( - start_datetime - timedelta(weeks=5), end_datetime + timedelta(weeks=5), freq=partition_frequency - ) - dt_pairs = pairwise(idx) - return list(dt_pairs) - - -def prepare_item(record, base_item, base_url): - """ - Description... - - Parameters: - record: x - base_item: x - base_url: x - - Returns: - (...): ... - """ - columns = [ - "id", - "geometry", - "collection", - "datetime", - "end_datetime", - "content", - ] - - item = dict(zip(columns, record)) - item.pop("datetime") - item.pop("end_datetime") - - geom = shapely.wkb.loads(item["geometry"], hex=True) - - item["geometry"] = geom.__geo_interface__ - content = item.pop("content") - assert isinstance(content, dict) - if "bbox" in content: - item["bbox"] = content["bbox"] - else: - item["bbox"] = list(geom.bounds) - - item["assets"] = content["assets"] - if "stac_extensions" in content: - item["stac_extensions"] = content["stac_extensions"] - item["properties"] = content["properties"] - - pypgstac.hydration.hydrate(base_item, item) - - item["links"] = [ - { - "rel": "collection", - "type": "application/json", - "href": f"{base_url}/collections/{item['collection']}", - }, - { - "rel": "parent", - "type": "application/json", - "href": f"{base_url}/collections/{item['collection']}", - }, - { - "rel": "root", - "type": "application/json", - "href": f"{base_url}", - }, - { - "rel": "self", - "type": "application/geo+json", - "href": f"{base_url}/collections/{item['collection']}/items/{item['id']}", - }, - ] - - return item - - -def generate_date_ranges(start_date, end_date): - """ - Description... - - Parameters: - start_date: x - end_date: x - - - Returns: - (...): ... 
- """ - date_ranges = [] - - # Set the initial current_date to the start_date - current_date = start_date - - while current_date <= end_date: - year = current_date.year - month = current_date.month - - # Determine the first day of the current month - first_day_of_month = datetime(year, month, 1) - # Determine the last day of the current month - _, last_day_of_month = calendar.monthrange(year, month) - last_day_of_month_date = datetime(year, month, last_day_of_month) - - # If the current_date is before the first day of the month, set it to the first day - if current_date < first_day_of_month: - current_date = first_day_of_month - - while current_date.month == month and current_date <= end_date: - end_range_date = current_date + timedelta(days=7) - - # Ensure the end range date does not exceed the last day of the month or the end date - if end_range_date > last_day_of_month_date: - end_range_date = last_day_of_month_date - if end_range_date > end_date: - end_range_date = end_date - - date_ranges.append((current_date, end_range_date + timedelta(days=1))) - - # Move to the next range - current_date = end_range_date + timedelta(days=1) - - # Move to the first day of the next month - current_date = first_day_of_month + timedelta(days=32) - current_date = datetime(current_date.year, current_date.month, 1) - - return date_ranges - - -def handle_partition_db_arrow(dsn, collection, output, base_url, frequency, datetime_range=None, max_threads=8): - """ - Description... - - Parameters: - dsn: x - collection: x - output: x - base_url: x - frequency: x - datetime_range: x - max_threads: x - - Returns: - (...): ... - """ - db = pypgstac.db.PgstacDB(dsn) - with db: - db.connection.execute("set statement_timeout = 600000;") - - if not datetime_range: - dt_pairs = [] - collection_db = db.query_one(f"SELECT * FROM pgstac.collections WHERE id='{collection}'") - interval = collection_db[2]["extent"]["temporal"]["interval"][0] - start = parse(interval[0]) - end = interval[1] - if not end: - end = datetime.now(tz=start.tzinfo) - else: - end = parse(interval[1]) - datetime_range = f"{start.isoformat()}/{end.isoformat()}" - interval_pairs = prepare_datetime_pairs(datetime_range, frequency) - dt_pairs.extend(interval_pairs) - - # For weekly frequencies we calculate a 7 day rolling window starting on the first of each month - if frequency == "W": - start_date = datetime(start.year, start.month, 1) - end_date = datetime(end.year, end.month, end.day) - dt_pairs = generate_date_ranges(start_date, end_date) - - else: - dt_pairs = prepare_datetime_pairs(datetime_range, frequency) - - executor = ThreadPoolExecutor(max_workers=max_threads) - futures = [ - executor.submit(partition_from_db_items_pair, pair, db, collection, output, base_url) for pair in dt_pairs - ] - for f in futures: - if res := f.result(): - print("Exported", res) - print("End of processing") diff --git a/lib/pyproject.toml b/lib/pyproject.toml deleted file mode 100644 index c716ef5..0000000 --- a/lib/pyproject.toml +++ /dev/null @@ -1,57 +0,0 @@ -[build-system] -requires = ["hatchling"] -build-backend = "hatchling.build" - -[project] -name = "lib" -version = "0.0.1" -authors = [] -description = "Common Library of the Resource Registration Building Block to harvest and register resources" -readme = "README.md" -requires-python = ">=3.8" -classifiers = [ - "Programming Language :: Python :: 3", - "License :: OSI Approved :: Apache Software License", - "Operating System :: OS Independent", -] -dependencies = [ - "pystac", - "pystac_client", - 
"stactools_sentinel", - "rio-cogeo", - "requests", - "psycopg2-binary", - "pika" -] -license = {file = "LICENSE"} - -[project.optional-dependencies] -dev = [ - "black", - "flake8" -] -test = [ - "pytest" -] -docs = [ - "mkdocs" -] - -[project.urls] -Homepage = "https://github.com/EOEPCA/resource-registration/tree/main/lib/lib" -Issues = "https://github.com/EOEPCA/resource-registration/issues" - -[tool.hatch.build.targets.wheel] -packages = ["lib"] - -[tool.pytest.ini_options] -minversion = "6.0" -addopts = "-q --import-mode=importlib --no-header -rfE --disable-warnings --log-level=INFO" -testpaths = [ - "test", - "integration", -] - -[tool.black] -line-length = 120 -target-version = ["py38"] diff --git a/lib/test/__init__.py b/lib/test/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lib/test/base/__init__.py b/lib/test/base/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lib/test/base/test_download.py b/lib/test/base/test_download.py deleted file mode 100644 index 464af3a..0000000 --- a/lib/test/base/test_download.py +++ /dev/null @@ -1,114 +0,0 @@ -import os -import time -from unittest import mock -import pytest - -import requests -from lib.base.download import access_token - - -# Test if the function returns the existing token if it is not expired -def test_access_token_existing_valid_token(monkeypatch): - # Set up the environment with a valid token and expiry time - monkeypatch.setenv("token_expire_time", str(time.time() + 3600)) # Token valid for the next hour - monkeypatch.setenv("s3_access_key", "existing_valid_token") - - assert access_token() == "existing_valid_token" - - -# Test if the function requests a new token if the existing one is expired -def test_access_token_expired_token_with_mock(monkeypatch): - # Set up environment variables - monkeypatch.setenv("token_expire_time", str(time.time() - 3600)) # Token expired - monkeypatch.setenv("s3_access_key", "expired_token") - - # Verify that the current token is indeed the expired token - assert os.environ.get("s3_access_key") == "expired_token" - - # Mock the netrc file - with mock.patch("netrc.netrc") as mock_netrc: - mock_auth = ("username", None, "password") # Mock username and password - mock_netrc.return_value.authenticators.return_value = mock_auth - - # Mock the requests.post method - with mock.patch("requests.post") as mock_post: - # Simulate a successful token response - mock_response = mock.MagicMock() - mock_response.json.return_value = {"access_token": "new_token", "expires_in": 3600} - mock_post.return_value = mock_response - - new_token = access_token() - - assert new_token == "new_token" - assert os.environ["s3_access_key"] == "new_token" - assert float(os.environ["token_expire_time"]) > time.time() - - -# Test if the function correctly requests a new token when no token is set -def test_access_token_no_existing_token(monkeypatch): - # Mock the netrc file - with mock.patch("netrc.netrc") as mock_netrc: - mock_auth = ("username", None, "password") # Mock username and password - mock_netrc.return_value.authenticators.return_value = mock_auth - - # Mock the requests.post method - with mock.patch("requests.post") as mock_post: - # Simulate a successful token response - mock_response = mock.MagicMock() - mock_response.json.return_value = {"access_token": "new_token", "expires_in": 3600} - mock_post.return_value = mock_response - - # Remove environment variables - monkeypatch.delenv("token_expire_time", raising=False) - monkeypatch.delenv("s3_access_key", raising=False) - 
- new_token = access_token() - assert new_token == "new_token" - assert os.environ["s3_access_key"] == "new_token" - assert float(os.environ["token_expire_time"]) > time.time() - - -# Test if the function handles network errors gracefully -def test_access_token_network_error(monkeypatch): - # Mock the netrc file - with mock.patch("netrc.netrc") as mock_netrc: - mock_auth = ("username", None, "password") # Mock username and password - mock_netrc.return_value.authenticators.return_value = mock_auth - - monkeypatch.setenv("token_expire_time", str(time.time() - 3600)) # Token expired - - with mock.patch("requests.post") as mock_post: - mock_post.side_effect = requests.exceptions.RequestException("Network error") - - with pytest.raises(Exception, match="Failed to get access token"): - access_token() - - -# Test if the function handles missing credentials in the netrc file gracefully -def test_access_token_missing_credentials(monkeypatch): - with mock.patch("requests.post") as mock_post: - mock_post.return_value.json.return_value = {"access_token": "new_token", "expires_in": 3600} - - with mock.patch("netrc.netrc") as mock_netrc: - mock_netrc.return_value.authenticators.side_effect = Exception("Netrc error") - - monkeypatch.setenv("token_expire_time", str(time.time() - 3600)) # Token expired - - with pytest.raises(Exception, match="Failed to get credentials from netrc"): - access_token() - - -# Test if the function handles an invalid response from the token server gracefully -def test_access_token_invalid_response(): - # Mock the netrc file - with mock.patch("netrc.netrc") as mock_netrc: - mock_auth = ("username", None, "password") # Mock username and password - mock_netrc.return_value.authenticators.return_value = mock_auth - - with mock.patch("requests.post") as mock_post: - mock_post.return_value.status_code = 200 - mock_post.return_value.json.return_value = {"unexpected_key": "value"} - - with mock.patch.dict(os.environ, {"token_expire_time": str(time.time() - 3600)}): - with pytest.raises(KeyError): - access_token() diff --git a/lib/test/base/test_file.py b/lib/test/base/test_file.py deleted file mode 100644 index e4f629d..0000000 --- a/lib/test/base/test_file.py +++ /dev/null @@ -1,425 +0,0 @@ -import io -import os -import shutil -import tarfile -import tempfile -from unittest import mock - -import pytest -import zipfile -from lib.base.file import ( - zip_directory, - delete_file, - unzip_file, - calculate_checksum, - get_folder_size, - untar_file, - check_file_size, - get_file_size, -) - - -# @pytest.fixture(autouse=True) -# def temp_directory(): -# # Create a temporary directory for testing -# temp_dir = tempfile.mkdtemp() -# yield temp_dir -# -# # Clean up the temporary directory after the test -# for filename in os.listdir(temp_dir): -# file_path = os.path.join(temp_dir, filename) -# # Check if the path points to a file (not a directory) -# if os.path.isfile(file_path): -# # Delete the file -# os.remove(file_path) -# os.rmdir(temp_dir) - - -def cleanup_directory(directory): - shutil.rmtree(directory) # Löscht rekursiv das Verzeichnis und alle Inhalte - - -@pytest.fixture(autouse=True) -def temp_directory(): - temp_dir = tempfile.mkdtemp() - yield temp_dir - cleanup_directory(temp_dir) - - -@pytest.fixture -def zip_file_path(temp_directory): - zip_file_path = os.path.join(temp_directory, "test.zip") - with zipfile.ZipFile(zip_file_path, "w") as zipf: - zipf.writestr("folder/test.txt", "Test content") - return zip_file_path - - -@pytest.fixture -def create_test_tar(temp_directory): - 
tar_file = os.path.join(temp_directory, "test.tar") - temp_file = os.path.join(temp_directory, "test.txt") - - with tarfile.open(tar_file, "w") as tar: - with open(temp_file, "w") as f: - f.write("Test content") - tar.add(temp_file, arcname="test.txt") - - return tar_file - - -def test_zip_directory(temp_directory): - # Create a test file inside the temporary directory - test_file = os.path.join(temp_directory, "test.txt") - with open(test_file, "w") as f: - f.write("This is a test file.") - - # Zip the temporary directory - zip_file_path = zip_directory(temp_directory) - - # Check if the zip file is created - assert os.path.isfile(zip_file_path) - - # Check if the zip file name matches the directory name - assert os.path.basename(zip_file_path) == os.path.basename(temp_directory) + ".zip" - - # Check if the zip file contains the test file - with zipfile.ZipFile(zip_file_path, "r") as zipf: - assert "test.txt" in zipf.namelist() - - -def test_zip_directory_invalid_source(): - # Test invalid source path - with pytest.raises(ValueError): - zip_directory("/invalid/path") - - -def test_zip_directory_existing_destination(temp_directory): - # Create a test file inside the temporary directory - test_file = os.path.join(temp_directory, "test.txt") - with open(test_file, "w") as f: - f.write("This is a test file.") - - # Create a test zip file in the temporary directory - test_zip_path = os.path.join(temp_directory, "test.zip") - with zipfile.ZipFile(test_zip_path, "w") as zipf: - zipf.write(os.path.join(temp_directory, "test.txt"), "test.txt") - - # Test destination path already exists - with pytest.raises(ValueError): - zip_directory(temp_directory, test_zip_path) - - -def test_delete_file(): - temp_dir = tempfile.mkdtemp() - test_file = os.path.join(temp_dir, "test_iif.xml") - with open(test_file, "w") as f: - f.write("") - - delete_file(test_file) - assert not os.path.isfile(test_file) - os.rmdir(temp_dir) - - -def test_delete_file_not_existing(): - temp_dir = tempfile.mkdtemp() - test_file = os.path.join(temp_dir, "test_iif.xml") - with pytest.raises(Exception): - delete_file(test_file) - os.rmdir(temp_dir) - - -def test_delete_file_is_dir(): - temp_dir = tempfile.mkdtemp() - with pytest.raises(Exception): - delete_file(temp_dir) - os.rmdir(temp_dir) - - -# Test if the function raises an exception when the ZIP file does not exist -def test_unzip_file_not_existing(): - with pytest.raises(Exception, match="File does not exist"): - unzip_file("non_existing_file.zip") - - -# Test if the function correctly unzips a valid ZIP file and returns the correct path and removal status -def test_unzip_file_success(zip_file_path): - result = unzip_file(zip_file_path) - assert os.path.exists(result["scene_path"]) - assert os.path.isfile(os.path.join(result["scene_path"], "test.txt")) - assert result["zip_file_removed"] is True - - -# Test if the function handles errors during file extraction properly -def test_unzip_file_with_failed_files(zip_file_path): - # Simuliere einen Fehler beim Entpacken - with mock.patch.object(zipfile.ZipFile, "extract", side_effect=Exception("Extraction failed")): - with pytest.raises(Exception, match="Exceptions during unzipping"): - unzip_file(zip_file_path) - - -# Test if the function removes the ZIP file when 'remove_zip=True' -def test_unzip_file_remove_zip(zip_file_path): - result = unzip_file(zip_file_path, remove_zip=True) - assert result["zip_file_removed"] is True - assert not os.path.exists(zip_file_path) - - -# Test if the function does not remove the ZIP file when 
'remove_zip=False' -def test_unzip_file_keep_zip(zip_file_path): - result = unzip_file(zip_file_path, remove_zip=False) - assert result["zip_file_removed"] is False - assert os.path.exists(zip_file_path) - - -# Test if the function handles ZIP files without subfolders correctly -def test_unzip_file_no_subfolder(temp_directory): - zip_file_path = os.path.join(temp_directory, "test.zip") - with zipfile.ZipFile(zip_file_path, "w") as zipf: - zipf.writestr("test.txt", "Test content") - - with pytest.raises(Exception, match="Could not find sub-folder in zip file"): - unzip_file(zip_file_path) - - -# Test if the function raises an exception for invalid or corrupted ZIP files -def test_unzip_file_invalid_zip(temp_directory): - invalid_zip_file = os.path.join(temp_directory, "invalid.zip") - with open(invalid_zip_file, "w") as f: - f.write("This is not a valid zip file") - - with pytest.raises(zipfile.BadZipFile): - unzip_file(invalid_zip_file) - - -# Test if the function raises an exception for an unsupported checksum algorithm -def test_calculate_checksum_unsupported_algorithm(): - # Test with an unsupported algorithm name - with pytest.raises(Exception, match="Checksum algorithm not available"): - calculate_checksum("unsupported_algorithm", "file.txt") - - # Test with an empty algorithm name - with pytest.raises(Exception, match="Checksum algorithm not available"): - calculate_checksum("", "file.txt") - - -# Test if the function calculates the checksum correctly using a supported algorithm -def test_calculate_checksum_correct_algorithm(temp_directory): - test_file = os.path.join(temp_directory, "test.txt") - with open(test_file, "w") as f: - f.write("Test") - - # Expected checksum value (e.g., MD5 hash) - expected_checksum = "0cbc6611f5540bd0809a388dc95a615b" # Example value for "Test" with MD5 - result = calculate_checksum("MD5", test_file) - assert result == expected_checksum - - -# Test if the function handles missing files correctly -def test_calculate_checksum_file_not_found(): - with pytest.raises(FileNotFoundError): - calculate_checksum("MD5", "non_existing_file.txt") - - -# Test if the function handles empty files correctly -def test_calculate_checksum_empty_file(temp_directory): - empty_file = os.path.join(temp_directory, "empty.txt") - open(empty_file, "w").close() - - # Expected checksum value for an empty file (e.g., MD5 hash of an empty string) - expected_checksum = "d41d8cd98f00b204e9800998ecf8427e" # Example value for an empty file with MD5 - result = calculate_checksum("MD5", empty_file) - assert result == expected_checksum - - -# Test if the function handles binary files correctly -def test_calculate_checksum_binary_file(temp_directory): - binary_file = os.path.join(temp_directory, "binary.bin") - with open(binary_file, "wb") as f: - f.write(b"\x00\x01\x02\x03") - - # Expected checksum value for the binary content - expected_checksum = "37b59afd592725f9305e484a5d7f5168" # Example value for binary content with MD5 - result = calculate_checksum("MD5", binary_file) - assert result == expected_checksum - - -# Test if the function raises an exception for a non-existent folder -def test_get_folder_size_folder_not_exist(): - with pytest.raises(Exception, match="Folder .* does not exist!"): - get_folder_size("non_existing_folder") - - -# Test if the function returns 0 for an empty folder -def test_get_folder_size_empty_folder(temp_directory): - empty_folder = os.path.join(temp_directory, "empty") - os.makedirs(empty_folder) - size = get_folder_size(empty_folder) - assert size == 0 - - 
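For context, the get_file_size and get_folder_size helpers exercised by the surrounding tests come from lib.base.file, whose implementation is not part of this patch. A minimal sketch of what these tests assume, illustrative only since the maintained implementation now lives in the registration-library repository, could look like this:

import os


def get_file_size(file_path):
    # Size of a single file in bytes; missing files raise, matching test_get_file_size_file_not_found
    if not os.path.isfile(file_path):
        raise Exception("File %s does not exist!" % file_path)
    return os.path.getsize(file_path)


def get_folder_size(folder):
    # Recursive sum of all file sizes below the folder; missing folders raise,
    # matching test_get_folder_size_folder_not_exist
    if not os.path.isdir(folder):
        raise Exception("Folder %s does not exist!" % folder)
    total_size = 0
    for root, _dirs, files in os.walk(folder):
        for name in files:
            total_size += os.path.getsize(os.path.join(root, name))
    return total_size
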
-# Test if the function calculates the size correctly for a folder with a single file -def test_get_folder_size_single_file(temp_directory): - single_file_folder = os.path.join(temp_directory, "single_file") - os.makedirs(single_file_folder) - file_path = os.path.join(single_file_folder, "file.txt") - with open(file_path, "w") as f: - f.write("Test content") - size = get_folder_size(single_file_folder) - assert size == os.path.getsize(file_path) - - -# Test if the function calculates the total size correctly for a folder with multiple files -def test_get_folder_size_multiple_files(temp_directory): - multiple_files_folder = os.path.join(temp_directory, "multiple_files") - os.makedirs(multiple_files_folder) - file_paths = [os.path.join(multiple_files_folder, "file1.txt"), os.path.join(multiple_files_folder, "file2.txt")] - with open(file_paths[0], "w") as f: - f.write("Content 1") - with open(file_paths[1], "w") as f: - f.write("Content 2") - total_size = sum(os.path.getsize(fp) for fp in file_paths) - size = get_folder_size(multiple_files_folder) - assert size == total_size - - -# Test if the function calculates the size correctly for a folder with subfolders -def test_get_folder_size_with_subfolders(temp_directory): - main_folder = os.path.join(temp_directory, "main_folder") - os.makedirs(main_folder) - subfolder = os.path.join(main_folder, "subfolder") - os.makedirs(subfolder) - - file_in_main = os.path.join(main_folder, "main_file.txt") - file_in_sub = os.path.join(subfolder, "sub_file.txt") - - with open(file_in_main, "w") as f: - f.write("Main file content") - with open(file_in_sub, "w") as f: - f.write("Subfolder file content") - - total_size = os.path.getsize(file_in_main) + os.path.getsize(file_in_sub) - size = get_folder_size(main_folder) - assert size == total_size - - -# Test if the function raises an exception when the TAR file does not exist -def test_untar_file_file_not_exist(): - with pytest.raises(Exception, match="File does not exist: .*"): - untar_file("non_existing_file.tar") - - -# Test if the function successfully extracts a valid TAR file -def test_untar_file_successful_extraction(temp_directory, create_test_tar): - tar_file = create_test_tar - extract_dir = os.path.join(temp_directory, "extracted") - - result = untar_file(tar_file, base_folder=extract_dir) - - assert os.path.isfile(os.path.join(result["scene_path"], "test.txt")) - assert result["zip_file_removed"] is True - assert result["scene_path"] == extract_dir - - -# Test if the function removes the TAR file after extraction when remove_tar=True -def test_untar_file_remove_tar_after_extraction(temp_directory, create_test_tar): - tar_file = create_test_tar - - result = untar_file(tar_file, remove_tar=True, create_folder=True, base_folder=temp_directory) - - assert not os.path.exists(tar_file) - assert result["zip_file_removed"] is True - - -# Test if the function creates a new folder for the extracted files when create_folder=True -def test_untar_file_create_folder(temp_directory, create_test_tar): - tar_file = create_test_tar - - result = untar_file(tar_file, remove_tar=False, create_folder=True, base_folder=temp_directory) - - expected_folder = os.path.join(temp_directory, "test") - assert os.path.exists(expected_folder) - assert result["scene_path"] == expected_folder - - -# Test if the function extracts files to a custom base folder when base_folder is provided -def test_untar_file_custom_base_folder(temp_directory, create_test_tar): - tar_file = create_test_tar - custom_folder = os.path.join(temp_directory, 
"custom_folder") - os.makedirs(custom_folder) - - result = untar_file(tar_file, create_folder=False, base_folder=custom_folder) - - extracted_file = os.path.join(custom_folder, "test.txt") - assert os.path.exists(extracted_file) - assert result["scene_path"] == custom_folder - - -# Test if the function handles extraction failures correctly and logs failed files -def test_untar_file_failed_extraction(temp_directory): - tar_file = os.path.join(temp_directory, "test.tar") - - # Create a TAR file with invalid file paths - with tarfile.open(tar_file, "w") as tar: - tarinfo = tarfile.TarInfo("invalid/../test.txt") - tarinfo.size = len(b"Invalid content") - tar.addfile(tarinfo, io.BytesIO(b"Invalid content")) - - # Patch tarfile.extract to raise an exception - with mock.patch.object(tarfile.TarFile, "extract", side_effect=Exception("Mocked extraction failure")): - with pytest.raises(Exception, match="Exceptions during untaring: .*"): - untar_file(tar_file, remove_tar=False, base_folder=temp_directory) - - -# Test if the function raises an exception when given a file that is not a valid TAR file -def test_untar_file_invalid_tar(temp_directory): - not_a_tar_file = os.path.join(temp_directory, "not_a_tar.txt") - - with open(not_a_tar_file, "w") as f: - f.write("This is not a TAR file.") - - with pytest.raises(tarfile.ReadError): - untar_file(not_a_tar_file, remove_tar=False, base_folder=temp_directory) - - -def test_untar_file_tar_not_removed_on_delete_error(temp_directory, create_test_tar): - tar_file = create_test_tar - - with mock.patch("os.remove", side_effect=PermissionError("Mocked permission error")): - result = untar_file(tar_file, remove_tar=True, create_folder=False, base_folder=temp_directory) - - assert os.path.exists(tar_file) - assert result["zip_file_removed"] is False - - -# Test if the function correctly identifies when the file size matches the expected size -def test_check_file_size_success(temp_directory): - test_file = os.path.join(temp_directory, "test_file.txt") - with open(test_file, "w") as f: - f.write("Test") - - assert check_file_size(4, test_file) is True - - -# Test if the function raises an exception when the file does not exist -def test_check_file_size_file_not_found(temp_directory): - non_existent_file = os.path.join(temp_directory, "non_existent_file.txt") - - with pytest.raises(Exception, match="File not found: "): - check_file_size(0, non_existent_file) - - -# Test if the function returns the correct size for an existing file -def test_get_file_size_existing_file(temp_directory): - test_file = os.path.join(temp_directory, "test_file.txt") - with open(test_file, "w") as f: - f.write("Sample content") - - assert get_file_size(test_file) == len("Sample content") - - -# Test if the function raises an exception when the file does not exist -def test_get_file_size_file_not_found(temp_directory): - non_existent_file = os.path.join(temp_directory, "non_existent_file.txt") - - with pytest.raises(Exception, match="File .* does not exist!"): - get_file_size(non_existent_file)