From 63e998da04199bcbb44eb2b931e6cb31d9513439 Mon Sep 17 00:00:00 2001 From: Matthias Mohr Date: Mon, 12 Sep 2022 16:46:25 +0200 Subject: [PATCH] Add option to fix netCDF issues #17 (#18) --- CHANGELOG.md | 2 +- src/stactools/goes_glm/commands.py | 10 +- src/stactools/goes_glm/parquet.py | 29 +--- src/stactools/goes_glm/stac.py | 246 ++++++++++++++++------------- 4 files changed, 155 insertions(+), 132 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e5f3e33..f51d9b0 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/). ### Added -- Nothing. +- New option `fixnetcdf` (see issue [#17](https://github.com/stactools-packages/goes-glm/issues/17) for details) ### Deprecated diff --git a/src/stactools/goes_glm/commands.py b/src/stactools/goes_glm/commands.py index c2cd48f..c080f6d 100644 --- a/src/stactools/goes_glm/commands.py +++ b/src/stactools/goes_glm/commands.py @@ -96,12 +96,18 @@ def create_collection_command( default=False, help="Does not include the netCDF file in the created metadata if set to `TRUE`.", ) + @click.option( + "--fixnetcdf", + default=False, + help="Fixes missing _Unsigned attributes in some older netCDF files if set to `TRUE`.", + ) def create_item_command( source: str, destination: str, collection: str = "", nogeoparquet: bool = False, nonetcdf: bool = False, + fixnetcdf: bool = False, ) -> None: """Creates a STAC Item @@ -113,7 +119,9 @@ def create_item_command( if len(collection) > 0: stac_collection = Collection.from_file(collection) - item = stac.create_item(source, stac_collection, nogeoparquet, nonetcdf) + item = stac.create_item( + source, stac_collection, nogeoparquet, nonetcdf, fixnetcdf + ) item.save_object(dest_href=destination) return None diff --git a/src/stactools/goes_glm/parquet.py b/src/stactools/goes_glm/parquet.py index abb3748..8b5f295 100644 --- a/src/stactools/goes_glm/parquet.py +++ b/src/stactools/goes_glm/parquet.py @@ -1,7 +1,6 @@ import logging import os from datetime import datetime, timedelta, timezone -from math import isnan from typing import Any, Dict, List, Optional from geopandas import GeoDataFrame, GeoSeries @@ -200,20 +199,14 @@ def create_asset( base = datetime.fromisoformat(unit[14:]).replace(tzinfo=timezone.utc) new_data: List[Optional[datetime]] = [] - invalid_datetimes = 0 for val in data: - if val is None or isnan(val): - # Sometimes the values contain NaN/None values - new_data.append(None) - invalid_datetimes += 1 - else: - try: - delta = timedelta(seconds=val) - new_data.append(base + delta) - except TypeError: - raise Exception( - f"An invalid value '{val}' found in variable '{var_name}'" - ) + try: + delta = timedelta(seconds=val) + new_data.append(base + delta) + except TypeError: + raise Exception( + f"An invalid value '{val}' found in variable '{var_name}'" + ) table_data[new_col] = new_data col_info = { @@ -221,14 +214,6 @@ def create_asset( "type": constants.PARQUET_DATETIME_COL_TYPE, } - if invalid_datetimes > 0: - logger.warning( - f"The variable {var_name} contains {invalid_datetimes} `null` values" - ) - col_info[ - "description" - ] = f"The column contains {invalid_datetimes} `null` values" - table_cols.append(col_info) table_data[col] = data diff --git a/src/stactools/goes_glm/stac.py b/src/stactools/goes_glm/stac.py index 8d0711e..40d94db 100644 --- a/src/stactools/goes_glm/stac.py +++ b/src/stactools/goes_glm/stac.py @@ -161,6 +161,7 @@ def create_item( collection: Optional[Collection] = None, nogeoparquet: bool = False, nonetcdf: bool = False, + fixnetcdf: bool = False, ) -> Item: """Create a STAC Item @@ -174,130 +175,159 @@ def create_item( collection (pystac.Collection): HREF to an existing collection nogeoparquet (bool): If set to True, no geoparquet file is generated for the Item nonetcdf (bool): If set to True, the netCDF file is not added to the Item + fixnetcdf (bool): If set to True, fixes missing _Unsigned attributes in some of + the older netCDF files Returns: Item: STAC Item object """ - dataset = Dataset(asset_href, "r", format="NETCDF4") + with Dataset(asset_href, "a", format="NETCDF4") as dataset: + id = dataset.dataset_name.replace(".nc", "") + sys_env = id[:2] + if sys_env != "OR": + logger.warning("You are ingesting test data.") - id = dataset.dataset_name.replace(".nc", "") - sys_env = id[:2] - if sys_env != "OR": - logger.warning("You are ingesting test data.") - - computed_datetime = center_datetime( - dataset.time_coverage_start, dataset.time_coverage_end - ) - - try: - platform = constants.Platforms[dataset.platform_ID] - if platform == constants.Platforms.G18: - raise Exception("GOES-18/T is not supported yet") - except ValueError: - raise Exception( - f"The dataset contains an invalid platform identifier: {dataset.platform_ID}" - ) + var_count = len(dataset.variables) + if var_count != 45 and var_count != 48: + raise Exception( + f"The number of variables is expected to be 45 or 48, but it is {var_count}" + ) - try: - slot_str = dataset.orbital_slot.replace("-", "_") - slot = constants.OrbitalSlot[slot_str] - except KeyError: - # Some files seem to list "GOES_Test" as slot, use platform ID as indicator then. - logger.warning( - f"The value for 'orbital_slot' is invalid: {dataset.orbital_slot}" + # See page 14-15 for details: + # https://www.noaasis.noaa.gov/pdf/ps-pvr/goes-16/GLM/Full/GOES16_GLM_FullValidation_ProductPerformanceGuide.pdf + defect_vars = { + "event_time_offset": False, + "group_time_offset": False, + "flash_time_offset_of_first_event": False, + "flash_time_offset_of_last_event": False, + "group_frame_time_offset": False, + "flash_frame_time_offset_of_first_event": False, + "flash_frame_time_offset_of_last_event": False, + } + for key in defect_vars: + if key in dataset.variables: + if not hasattr(dataset.variables[key], "_Unsigned"): + dataset.variables[key]._Unsigned = "true" + defect_vars[key] = True + + computed_datetime = center_datetime( + dataset.time_coverage_start, dataset.time_coverage_end ) - # GOES-16 began drifting to the GOES-East operational location [...] on November 30, 2017. - # Drift was complete on December 11, 2017, - g16drift = datetime(2017, 12, 11, tzinfo=timezone.utc) - g17drift = datetime(2018, 11, 13, tzinfo=timezone.utc) - if platform == constants.Platforms.G16 and computed_datetime > g16drift: - slot = constants.OrbitalSlot.GOES_East - # GOES-17 began drifting to its GOES-West operational location [...] on October 24, 2018. - # Drift was completed on November 13, 2018 - elif platform == constants.Platforms.G17 and computed_datetime > g17drift: - slot = constants.OrbitalSlot.GOES_West - else: + try: + platform = constants.Platforms[dataset.platform_ID] + if platform == constants.Platforms.G18: + raise Exception("GOES-18/T is not supported yet") + except ValueError: raise Exception( - f"The dataset contains an invalid oribtal slot identifier: {dataset.orbital_slot}" + f"The dataset contains an invalid platform identifier: {dataset.platform_ID}" ) - properties = { - "start_datetime": dataset.time_coverage_start, - "end_datetime": dataset.time_coverage_end, - "mission": constants.MISSION, - "constellation": constants.CONSTELLATION, - "platform": platform, - "instruments": [dataset.instrument_ID], - "gsd": constants.RESOLUTION, - "processing:level": constants.PROCESSING_LEVEL, - "processing:facility": dataset.production_site, - "goes:orbital-slot": slot, - "goes:system-environment": sys_env, - } - - if slot == constants.OrbitalSlot.GOES_East: - bbox = constants.ITEM_BBOX_EAST - geometry = constants.GEOMETRY_EAST - elif slot == constants.OrbitalSlot.GOES_West: - bbox = constants.ITEM_BBOX_WEST - geometry = constants.GEOMETRY_WEST - else: - bbox = None - geometry = None - - centroid = {} - for key, var in dataset.variables.items(): - if len(var.dimensions) != 0: - continue - - ma = var[...] - if var.name == "lat_field_of_view": - centroid["lat"] = ma.tolist() - elif var.name == "lon_field_of_view": - centroid["lon"] = ma.tolist() - elif ma.count() > 0: - properties[f"goes-glm:{var.name}"] = ma.tolist() - - item = Item( - stac_extensions=[ - # todo: add extension again once released #12 - # constants.GOES_EXTENSION, - constants.PROCESSING_EXTENSION, - ], - id=id, - properties=properties, - geometry=geometry, - bbox=bbox, - datetime=computed_datetime, - collection=collection, - ) + try: + slot_str = dataset.orbital_slot.replace("-", "_") + slot = constants.OrbitalSlot[slot_str] + except KeyError: + # Some files seem to list "GOES_Test" as slot, use platform ID as indicator then. + logger.warning( + f"The value for 'orbital_slot' is invalid: {dataset.orbital_slot}" + ) + + # GOES-16 began drifting to the GOES-East operational location [...] on + # November 30, 2017. Drift was complete on December 11, 2017, + g16drift = datetime(2017, 12, 11, tzinfo=timezone.utc) + g17drift = datetime(2018, 11, 13, tzinfo=timezone.utc) + if platform == constants.Platforms.G16 and computed_datetime > g16drift: + slot = constants.OrbitalSlot.GOES_East + # GOES-17 began drifting to its GOES-West operational location [...] on + # October 24, 2018. Drift was completed on November 13, 2018 + elif platform == constants.Platforms.G17 and computed_datetime > g17drift: + slot = constants.OrbitalSlot.GOES_West + else: + raise Exception( + "The dataset contains an invalid oribtal slot identifier: " + + dataset.orbital_slot + ) + + properties = { + "start_datetime": dataset.time_coverage_start, + "end_datetime": dataset.time_coverage_end, + "mission": constants.MISSION, + "constellation": constants.CONSTELLATION, + "platform": platform, + "instruments": [dataset.instrument_ID], + "gsd": constants.RESOLUTION, + "processing:level": constants.PROCESSING_LEVEL, + "processing:facility": dataset.production_site, + "goes:orbital-slot": slot, + "goes:system-environment": sys_env, + } - proj = ProjectionExtension.ext(item, add_if_missing=True) - proj.epsg = constants.TARGET_CRS - if len(centroid) == 2: - proj.centroid = centroid + if slot == constants.OrbitalSlot.GOES_East: + bbox = constants.ITEM_BBOX_EAST + geometry = constants.GEOMETRY_EAST + elif slot == constants.OrbitalSlot.GOES_West: + bbox = constants.ITEM_BBOX_WEST + geometry = constants.GEOMETRY_WEST + else: + bbox = None + geometry = None + + centroid = {} + for key, var in dataset.variables.items(): + if len(var.dimensions) != 0: + continue + + ma = var[...] + if var.name == "lat_field_of_view": + centroid["lat"] = ma.tolist() + elif var.name == "lon_field_of_view": + centroid["lon"] = ma.tolist() + elif ma.count() > 0: + properties[f"goes-glm:{var.name}"] = ma.tolist() + + item = Item( + stac_extensions=[ + # todo: add extension again once released #12 + # constants.GOES_EXTENSION, + constants.PROCESSING_EXTENSION, + ], + id=id, + properties=properties, + geometry=geometry, + bbox=bbox, + datetime=computed_datetime, + collection=collection, + ) - if not nogeoparquet: - target_folder = os.path.dirname(asset_href) - assets = parquet.convert(dataset, target_folder) - for key, asset_dict in assets.items(): + proj = ProjectionExtension.ext(item, add_if_missing=True) + proj.epsg = constants.TARGET_CRS + if len(centroid) == 2: + proj.centroid = centroid + + if not nogeoparquet: + target_folder = os.path.dirname(asset_href) + assets = parquet.convert(dataset, target_folder) + for key, asset_dict in assets.items(): + asset = Asset.from_dict(asset_dict) + item.add_asset(key, asset) + TableExtension.ext(asset, add_if_missing=True) + + if not nonetcdf: + # todo: replace with DataCube extension from PySTAC #16 + item.stac_extensions.append(constants.DATACUBE_EXTENSION) + asset_dict = netcdf.create_asset(asset_href) + asset_dict["created"] = dataset.date_created + asset_dict["cube:dimensions"] = netcdf.to_cube_dimensions(dataset) + asset_dict["cube:variables"] = netcdf.to_cube_variables(dataset) asset = Asset.from_dict(asset_dict) - item.add_asset(key, asset) - TableExtension.ext(asset, add_if_missing=True) + item.add_asset(constants.NETCDF_KEY, asset) - if not nonetcdf: - # todo: replace with DataCube extension from PySTAC #16 - item.stac_extensions.append(constants.DATACUBE_EXTENSION) - asset_dict = netcdf.create_asset(asset_href) - asset_dict["created"] = dataset.date_created - asset_dict["cube:dimensions"] = netcdf.to_cube_dimensions(dataset) - asset_dict["cube:variables"] = netcdf.to_cube_variables(dataset) - asset = Asset.from_dict(asset_dict) - item.add_asset(constants.NETCDF_KEY, asset) - - return item + for key, is_defect in defect_vars.items(): + if is_defect: + del dataset.variables[key]._Unsigned + + return item def center_datetime(start: str, end: str) -> datetime: