Skip to content

Commit

Permalink
Add option to fix netCDF issues #17 (#18)
Browse files Browse the repository at this point in the history
  • Loading branch information
m-mohr authored Sep 12, 2022
1 parent 5110c74 commit 63e998d
Show file tree
Hide file tree
Showing 4 changed files with 155 additions and 132 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

### Added

- Nothing.
- New option `fixnetcdf` (see issue [#17](https://github.com/stactools-packages/goes-glm/issues/17) for details)

### Deprecated

Expand Down
10 changes: 9 additions & 1 deletion src/stactools/goes_glm/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,18 @@ def create_collection_command(
default=False,
help="Does not include the netCDF file in the created metadata if set to `TRUE`.",
)
@click.option(
"--fixnetcdf",
default=False,
help="Fixes missing _Unsigned attributes in some older netCDF files if set to `TRUE`.",
)
def create_item_command(
source: str,
destination: str,
collection: str = "",
nogeoparquet: bool = False,
nonetcdf: bool = False,
fixnetcdf: bool = False,
) -> None:
"""Creates a STAC Item
Expand All @@ -113,7 +119,9 @@ def create_item_command(
if len(collection) > 0:
stac_collection = Collection.from_file(collection)

item = stac.create_item(source, stac_collection, nogeoparquet, nonetcdf)
item = stac.create_item(
source, stac_collection, nogeoparquet, nonetcdf, fixnetcdf
)
item.save_object(dest_href=destination)

return None
Expand Down
29 changes: 7 additions & 22 deletions src/stactools/goes_glm/parquet.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import logging
import os
from datetime import datetime, timedelta, timezone
from math import isnan
from typing import Any, Dict, List, Optional

from geopandas import GeoDataFrame, GeoSeries
Expand Down Expand Up @@ -200,35 +199,21 @@ def create_asset(
base = datetime.fromisoformat(unit[14:]).replace(tzinfo=timezone.utc)

new_data: List[Optional[datetime]] = []
invalid_datetimes = 0
for val in data:
if val is None or isnan(val):
# Sometimes the values contain NaN/None values
new_data.append(None)
invalid_datetimes += 1
else:
try:
delta = timedelta(seconds=val)
new_data.append(base + delta)
except TypeError:
raise Exception(
f"An invalid value '{val}' found in variable '{var_name}'"
)
try:
delta = timedelta(seconds=val)
new_data.append(base + delta)
except TypeError:
raise Exception(
f"An invalid value '{val}' found in variable '{var_name}'"
)

table_data[new_col] = new_data
col_info = {
"name": new_col,
"type": constants.PARQUET_DATETIME_COL_TYPE,
}

if invalid_datetimes > 0:
logger.warning(
f"The variable {var_name} contains {invalid_datetimes} `null` values"
)
col_info[
"description"
] = f"The column contains {invalid_datetimes} `null` values"

table_cols.append(col_info)

table_data[col] = data
Expand Down
246 changes: 138 additions & 108 deletions src/stactools/goes_glm/stac.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,7 @@ def create_item(
collection: Optional[Collection] = None,
nogeoparquet: bool = False,
nonetcdf: bool = False,
fixnetcdf: bool = False,
) -> Item:
"""Create a STAC Item
Expand All @@ -174,130 +175,159 @@ def create_item(
collection (pystac.Collection): HREF to an existing collection
nogeoparquet (bool): If set to True, no geoparquet file is generated for the Item
nonetcdf (bool): If set to True, the netCDF file is not added to the Item
fixnetcdf (bool): If set to True, fixes missing _Unsigned attributes in some of
the older netCDF files
Returns:
Item: STAC Item object
"""

dataset = Dataset(asset_href, "r", format="NETCDF4")
with Dataset(asset_href, "a", format="NETCDF4") as dataset:
id = dataset.dataset_name.replace(".nc", "")
sys_env = id[:2]
if sys_env != "OR":
logger.warning("You are ingesting test data.")

id = dataset.dataset_name.replace(".nc", "")
sys_env = id[:2]
if sys_env != "OR":
logger.warning("You are ingesting test data.")

computed_datetime = center_datetime(
dataset.time_coverage_start, dataset.time_coverage_end
)

try:
platform = constants.Platforms[dataset.platform_ID]
if platform == constants.Platforms.G18:
raise Exception("GOES-18/T is not supported yet")
except ValueError:
raise Exception(
f"The dataset contains an invalid platform identifier: {dataset.platform_ID}"
)
var_count = len(dataset.variables)
if var_count != 45 and var_count != 48:
raise Exception(
f"The number of variables is expected to be 45 or 48, but it is {var_count}"
)

try:
slot_str = dataset.orbital_slot.replace("-", "_")
slot = constants.OrbitalSlot[slot_str]
except KeyError:
# Some files seem to list "GOES_Test" as slot, use platform ID as indicator then.
logger.warning(
f"The value for 'orbital_slot' is invalid: {dataset.orbital_slot}"
# See page 14-15 for details:
# https://www.noaasis.noaa.gov/pdf/ps-pvr/goes-16/GLM/Full/GOES16_GLM_FullValidation_ProductPerformanceGuide.pdf
defect_vars = {
"event_time_offset": False,
"group_time_offset": False,
"flash_time_offset_of_first_event": False,
"flash_time_offset_of_last_event": False,
"group_frame_time_offset": False,
"flash_frame_time_offset_of_first_event": False,
"flash_frame_time_offset_of_last_event": False,
}
for key in defect_vars:
if key in dataset.variables:
if not hasattr(dataset.variables[key], "_Unsigned"):
dataset.variables[key]._Unsigned = "true"
defect_vars[key] = True

computed_datetime = center_datetime(
dataset.time_coverage_start, dataset.time_coverage_end
)

# GOES-16 began drifting to the GOES-East operational location [...] on November 30, 2017.
# Drift was complete on December 11, 2017,
g16drift = datetime(2017, 12, 11, tzinfo=timezone.utc)
g17drift = datetime(2018, 11, 13, tzinfo=timezone.utc)
if platform == constants.Platforms.G16 and computed_datetime > g16drift:
slot = constants.OrbitalSlot.GOES_East
# GOES-17 began drifting to its GOES-West operational location [...] on October 24, 2018.
# Drift was completed on November 13, 2018
elif platform == constants.Platforms.G17 and computed_datetime > g17drift:
slot = constants.OrbitalSlot.GOES_West
else:
try:
platform = constants.Platforms[dataset.platform_ID]
if platform == constants.Platforms.G18:
raise Exception("GOES-18/T is not supported yet")
except ValueError:
raise Exception(
                    f"The dataset contains an invalid orbital slot identifier: {dataset.orbital_slot}"
f"The dataset contains an invalid platform identifier: {dataset.platform_ID}"
)

properties = {
"start_datetime": dataset.time_coverage_start,
"end_datetime": dataset.time_coverage_end,
"mission": constants.MISSION,
"constellation": constants.CONSTELLATION,
"platform": platform,
"instruments": [dataset.instrument_ID],
"gsd": constants.RESOLUTION,
"processing:level": constants.PROCESSING_LEVEL,
"processing:facility": dataset.production_site,
"goes:orbital-slot": slot,
"goes:system-environment": sys_env,
}

if slot == constants.OrbitalSlot.GOES_East:
bbox = constants.ITEM_BBOX_EAST
geometry = constants.GEOMETRY_EAST
elif slot == constants.OrbitalSlot.GOES_West:
bbox = constants.ITEM_BBOX_WEST
geometry = constants.GEOMETRY_WEST
else:
bbox = None
geometry = None

centroid = {}
for key, var in dataset.variables.items():
if len(var.dimensions) != 0:
continue

ma = var[...]
if var.name == "lat_field_of_view":
centroid["lat"] = ma.tolist()
elif var.name == "lon_field_of_view":
centroid["lon"] = ma.tolist()
elif ma.count() > 0:
properties[f"goes-glm:{var.name}"] = ma.tolist()

item = Item(
stac_extensions=[
# todo: add extension again once released #12
# constants.GOES_EXTENSION,
constants.PROCESSING_EXTENSION,
],
id=id,
properties=properties,
geometry=geometry,
bbox=bbox,
datetime=computed_datetime,
collection=collection,
)
try:
slot_str = dataset.orbital_slot.replace("-", "_")
slot = constants.OrbitalSlot[slot_str]
except KeyError:
# Some files seem to list "GOES_Test" as slot, use platform ID as indicator then.
logger.warning(
f"The value for 'orbital_slot' is invalid: {dataset.orbital_slot}"
)

# GOES-16 began drifting to the GOES-East operational location [...] on
# November 30, 2017. Drift was complete on December 11, 2017,
g16drift = datetime(2017, 12, 11, tzinfo=timezone.utc)
g17drift = datetime(2018, 11, 13, tzinfo=timezone.utc)
if platform == constants.Platforms.G16 and computed_datetime > g16drift:
slot = constants.OrbitalSlot.GOES_East
# GOES-17 began drifting to its GOES-West operational location [...] on
# October 24, 2018. Drift was completed on November 13, 2018
elif platform == constants.Platforms.G17 and computed_datetime > g17drift:
slot = constants.OrbitalSlot.GOES_West
else:
            raise Exception(
                "The dataset contains an invalid orbital slot identifier: "
                + dataset.orbital_slot
)

properties = {
"start_datetime": dataset.time_coverage_start,
"end_datetime": dataset.time_coverage_end,
"mission": constants.MISSION,
"constellation": constants.CONSTELLATION,
"platform": platform,
"instruments": [dataset.instrument_ID],
"gsd": constants.RESOLUTION,
"processing:level": constants.PROCESSING_LEVEL,
"processing:facility": dataset.production_site,
"goes:orbital-slot": slot,
"goes:system-environment": sys_env,
}

proj = ProjectionExtension.ext(item, add_if_missing=True)
proj.epsg = constants.TARGET_CRS
if len(centroid) == 2:
proj.centroid = centroid
if slot == constants.OrbitalSlot.GOES_East:
bbox = constants.ITEM_BBOX_EAST
geometry = constants.GEOMETRY_EAST
elif slot == constants.OrbitalSlot.GOES_West:
bbox = constants.ITEM_BBOX_WEST
geometry = constants.GEOMETRY_WEST
else:
bbox = None
geometry = None

centroid = {}
for key, var in dataset.variables.items():
if len(var.dimensions) != 0:
continue

ma = var[...]
if var.name == "lat_field_of_view":
centroid["lat"] = ma.tolist()
elif var.name == "lon_field_of_view":
centroid["lon"] = ma.tolist()
elif ma.count() > 0:
properties[f"goes-glm:{var.name}"] = ma.tolist()

item = Item(
stac_extensions=[
# todo: add extension again once released #12
# constants.GOES_EXTENSION,
constants.PROCESSING_EXTENSION,
],
id=id,
properties=properties,
geometry=geometry,
bbox=bbox,
datetime=computed_datetime,
collection=collection,
)

if not nogeoparquet:
target_folder = os.path.dirname(asset_href)
assets = parquet.convert(dataset, target_folder)
for key, asset_dict in assets.items():
proj = ProjectionExtension.ext(item, add_if_missing=True)
proj.epsg = constants.TARGET_CRS
if len(centroid) == 2:
proj.centroid = centroid

if not nogeoparquet:
target_folder = os.path.dirname(asset_href)
assets = parquet.convert(dataset, target_folder)
for key, asset_dict in assets.items():
asset = Asset.from_dict(asset_dict)
item.add_asset(key, asset)
TableExtension.ext(asset, add_if_missing=True)

if not nonetcdf:
# todo: replace with DataCube extension from PySTAC #16
item.stac_extensions.append(constants.DATACUBE_EXTENSION)
asset_dict = netcdf.create_asset(asset_href)
asset_dict["created"] = dataset.date_created
asset_dict["cube:dimensions"] = netcdf.to_cube_dimensions(dataset)
asset_dict["cube:variables"] = netcdf.to_cube_variables(dataset)
asset = Asset.from_dict(asset_dict)
item.add_asset(key, asset)
TableExtension.ext(asset, add_if_missing=True)
item.add_asset(constants.NETCDF_KEY, asset)

if not nonetcdf:
# todo: replace with DataCube extension from PySTAC #16
item.stac_extensions.append(constants.DATACUBE_EXTENSION)
asset_dict = netcdf.create_asset(asset_href)
asset_dict["created"] = dataset.date_created
asset_dict["cube:dimensions"] = netcdf.to_cube_dimensions(dataset)
asset_dict["cube:variables"] = netcdf.to_cube_variables(dataset)
asset = Asset.from_dict(asset_dict)
item.add_asset(constants.NETCDF_KEY, asset)

return item
for key, is_defect in defect_vars.items():
if is_defect:
del dataset.variables[key]._Unsigned

return item


def center_datetime(start: str, end: str) -> datetime:
Expand Down

0 comments on commit 63e998d

Please sign in to comment.