Pull request #18 (Merged): Add option to fix netCDF issues #17

Merged 2 commits on Sep 12, 2022.
CHANGELOG.md (2 changes: 1 addition & 1 deletion)

@@ -8,7 +8,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/).

 ### Added

-- Nothing.
+- New option `fixnetcdf` (see issue [#17](https://github.com/stactools-packages/goes-glm/issues/17) for details)

 ### Deprecated
src/stactools/goes_glm/commands.py (10 changes: 9 additions & 1 deletion)

@@ -96,12 +96,18 @@ def create_collection_command(
     default=False,
     help="Does not include the netCDF file in the created metadata if set to `TRUE`.",
 )
+@click.option(
+    "--fixnetcdf",
+    default=False,
+    help="Fixes missing _Unsigned attributes in some older netCDF files if set to `TRUE`.",
+)
 def create_item_command(
     source: str,
     destination: str,
     collection: str = "",
     nogeoparquet: bool = False,
     nonetcdf: bool = False,
+    fixnetcdf: bool = False,
 ) -> None:
     """Creates a STAC Item

@@ -113,7 +119,9 @@
     if len(collection) > 0:
         stac_collection = Collection.from_file(collection)

-    item = stac.create_item(source, stac_collection, nogeoparquet, nonetcdf)
+    item = stac.create_item(
+        source, stac_collection, nogeoparquet, nonetcdf, fixnetcdf
+    )
     item.save_object(dest_href=destination)

     return None
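For orientation, a minimal sketch of what the wired-up call amounts to from Python. The source file name is a hypothetical placeholder and no existing Collection is passed; the argument order follows the `create_item` signature in this diff.

from stactools.goes_glm import stac

item = stac.create_item(
    "OR_GLM-L2-LCFA_G16_example.nc",  # hypothetical local GLM netCDF file
    None,   # collection: no existing Collection to link against
    False,  # nogeoparquet: geoparquet assets are still generated
    False,  # nonetcdf: the netCDF asset is kept
    True,   # fixnetcdf: repair missing _Unsigned attributes before reading
)
item.save_object(dest_href="item.json")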
src/stactools/goes_glm/parquet.py (29 changes: 7 additions & 22 deletions)

@@ -1,7 +1,6 @@
 import logging
 import os
 from datetime import datetime, timedelta, timezone
-from math import isnan
 from typing import Any, Dict, List, Optional

 from geopandas import GeoDataFrame, GeoSeries

@@ -200,35 +199,21 @@ def create_asset(
         base = datetime.fromisoformat(unit[14:]).replace(tzinfo=timezone.utc)

         new_data: List[Optional[datetime]] = []
-        invalid_datetimes = 0
         for val in data:
-            if val is None or isnan(val):
-                # Sometimes the values contain NaN/None values
-                new_data.append(None)
-                invalid_datetimes += 1
-            else:
-                try:
-                    delta = timedelta(seconds=val)
-                    new_data.append(base + delta)
-                except TypeError:
-                    raise Exception(
-                        f"An invalid value '{val}' found in variable '{var_name}'"
-                    )
+            try:
+                delta = timedelta(seconds=val)
+                new_data.append(base + delta)
+            except TypeError:
+                raise Exception(
+                    f"An invalid value '{val}' found in variable '{var_name}'"
+                )

         table_data[new_col] = new_data
         col_info = {
             "name": new_col,
             "type": constants.PARQUET_DATETIME_COL_TYPE,
         }

-        if invalid_datetimes > 0:
-            logger.warning(
-                f"The variable {var_name} contains {invalid_datetimes} `null` values"
-            )
-            col_info[
-                "description"
-            ] = f"The column contains {invalid_datetimes} `null` values"
-
         table_cols.append(col_info)

         table_data[col] = data
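With the upstream `fixnetcdf` repair in place, the None/NaN branch above becomes dead weight, so an invalid offset now fails loudly instead of turning into a silent null column. A standalone illustration of the error path that remains (`event_time_offset` is one of the variables this PR targets):

from datetime import timedelta

val, var_name = None, "event_time_offset"
try:
    timedelta(seconds=val)  # raises TypeError: seconds is not a number
except TypeError:
    raise Exception(f"An invalid value '{val}' found in variable '{var_name}'")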
src/stactools/goes_glm/stac.py (246 changes: 138 additions & 108 deletions)

@@ -161,6 +161,7 @@ def create_item(
     collection: Optional[Collection] = None,
     nogeoparquet: bool = False,
     nonetcdf: bool = False,
+    fixnetcdf: bool = False,
 ) -> Item:
     """Create a STAC Item

@@ -174,130 +175,159 @@
         collection (pystac.Collection): HREF to an existing collection
         nogeoparquet (bool): If set to True, no geoparquet file is generated for the Item
         nonetcdf (bool): If set to True, the netCDF file is not added to the Item
+        fixnetcdf (bool): If set to True, fixes missing _Unsigned attributes in some of
+            the older netCDF files

     Returns:
         Item: STAC Item object
     """

-    dataset = Dataset(asset_href, "r", format="NETCDF4")
+    with Dataset(asset_href, "a", format="NETCDF4") as dataset:
Inline review thread on the line above:

PR author (Collaborator): Always opening the files for appending is not ideal. On the other hand, scaling the variables manually also doesn't seem ideal.

Reviewer (Contributor): 🤷 Since there is now an option to fix the netCDF, always opening them for append seems OK.
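If opening every file in append mode ever becomes a problem, the mode could instead be derived from the flag. A sketch of that alternative, not what this PR implements:

# Hypothetical variant: stay read-only unless a fix may actually be written.
mode = "a" if fixnetcdf else "r"
with Dataset(asset_href, mode, format="NETCDF4") as dataset:
    ...  # remainder of create_item unchanged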
-    id = dataset.dataset_name.replace(".nc", "")
-    sys_env = id[:2]
-    if sys_env != "OR":
-        logger.warning("You are ingesting test data.")
-
-    computed_datetime = center_datetime(
-        dataset.time_coverage_start, dataset.time_coverage_end
-    )
-
-    try:
-        platform = constants.Platforms[dataset.platform_ID]
-        if platform == constants.Platforms.G18:
-            raise Exception("GOES-18/T is not supported yet")
-    except ValueError:
-        raise Exception(
-            f"The dataset contains an invalid platform identifier: {dataset.platform_ID}"
-        )
-
-    try:
-        slot_str = dataset.orbital_slot.replace("-", "_")
-        slot = constants.OrbitalSlot[slot_str]
-    except KeyError:
-        # Some files seem to list "GOES_Test" as slot, use platform ID as indicator then.
-        logger.warning(
-            f"The value for 'orbital_slot' is invalid: {dataset.orbital_slot}"
-        )
-
-        # GOES-16 began drifting to the GOES-East operational location [...] on November 30, 2017.
-        # Drift was complete on December 11, 2017,
-        g16drift = datetime(2017, 12, 11, tzinfo=timezone.utc)
-        g17drift = datetime(2018, 11, 13, tzinfo=timezone.utc)
-        if platform == constants.Platforms.G16 and computed_datetime > g16drift:
-            slot = constants.OrbitalSlot.GOES_East
-        # GOES-17 began drifting to its GOES-West operational location [...] on October 24, 2018.
-        # Drift was completed on November 13, 2018
-        elif platform == constants.Platforms.G17 and computed_datetime > g17drift:
-            slot = constants.OrbitalSlot.GOES_West
-        else:
-            raise Exception(
-                f"The dataset contains an invalid orbital slot identifier: {dataset.orbital_slot}"
-            )
-
-    properties = {
-        "start_datetime": dataset.time_coverage_start,
-        "end_datetime": dataset.time_coverage_end,
-        "mission": constants.MISSION,
-        "constellation": constants.CONSTELLATION,
-        "platform": platform,
-        "instruments": [dataset.instrument_ID],
-        "gsd": constants.RESOLUTION,
-        "processing:level": constants.PROCESSING_LEVEL,
-        "processing:facility": dataset.production_site,
-        "goes:orbital-slot": slot,
-        "goes:system-environment": sys_env,
-    }
-
-    if slot == constants.OrbitalSlot.GOES_East:
-        bbox = constants.ITEM_BBOX_EAST
-        geometry = constants.GEOMETRY_EAST
-    elif slot == constants.OrbitalSlot.GOES_West:
-        bbox = constants.ITEM_BBOX_WEST
-        geometry = constants.GEOMETRY_WEST
-    else:
-        bbox = None
-        geometry = None
-
-    centroid = {}
-    for key, var in dataset.variables.items():
-        if len(var.dimensions) != 0:
-            continue
-
-        ma = var[...]
-        if var.name == "lat_field_of_view":
-            centroid["lat"] = ma.tolist()
-        elif var.name == "lon_field_of_view":
-            centroid["lon"] = ma.tolist()
-        elif ma.count() > 0:
-            properties[f"goes-glm:{var.name}"] = ma.tolist()
-
-    item = Item(
-        stac_extensions=[
-            # todo: add extension again once released #12
-            # constants.GOES_EXTENSION,
-            constants.PROCESSING_EXTENSION,
-        ],
-        id=id,
-        properties=properties,
-        geometry=geometry,
-        bbox=bbox,
-        datetime=computed_datetime,
-        collection=collection,
-    )
-
-    proj = ProjectionExtension.ext(item, add_if_missing=True)
-    proj.epsg = constants.TARGET_CRS
-    if len(centroid) == 2:
-        proj.centroid = centroid
-
-    if not nogeoparquet:
-        target_folder = os.path.dirname(asset_href)
-        assets = parquet.convert(dataset, target_folder)
-        for key, asset_dict in assets.items():
-            asset = Asset.from_dict(asset_dict)
-            item.add_asset(key, asset)
-            TableExtension.ext(asset, add_if_missing=True)
-
-    if not nonetcdf:
-        # todo: replace with DataCube extension from PySTAC #16
-        item.stac_extensions.append(constants.DATACUBE_EXTENSION)
-        asset_dict = netcdf.create_asset(asset_href)
-        asset_dict["created"] = dataset.date_created
-        asset_dict["cube:dimensions"] = netcdf.to_cube_dimensions(dataset)
-        asset_dict["cube:variables"] = netcdf.to_cube_variables(dataset)
-        asset = Asset.from_dict(asset_dict)
-        item.add_asset(constants.NETCDF_KEY, asset)
-
-    return item
+        id = dataset.dataset_name.replace(".nc", "")
+        sys_env = id[:2]
+        if sys_env != "OR":
+            logger.warning("You are ingesting test data.")
+
+        var_count = len(dataset.variables)
+        if var_count != 45 and var_count != 48:
+            raise Exception(
+                f"The number of variables is expected to be 45 or 48, but it is {var_count}"
+            )
+
+        # See page 14-15 for details:
+        # https://www.noaasis.noaa.gov/pdf/ps-pvr/goes-16/GLM/Full/GOES16_GLM_FullValidation_ProductPerformanceGuide.pdf
+        defect_vars = {
+            "event_time_offset": False,
+            "group_time_offset": False,
+            "flash_time_offset_of_first_event": False,
+            "flash_time_offset_of_last_event": False,
+            "group_frame_time_offset": False,
+            "flash_frame_time_offset_of_first_event": False,
+            "flash_frame_time_offset_of_last_event": False,
+        }
+        if fixnetcdf:
+            for key in defect_vars:
+                if key in dataset.variables:
+                    if not hasattr(dataset.variables[key], "_Unsigned"):
+                        dataset.variables[key]._Unsigned = "true"
+                        defect_vars[key] = True
+
+        computed_datetime = center_datetime(
+            dataset.time_coverage_start, dataset.time_coverage_end
+        )
+
+        try:
+            platform = constants.Platforms[dataset.platform_ID]
+            if platform == constants.Platforms.G18:
+                raise Exception("GOES-18/T is not supported yet")
+        except ValueError:
+            raise Exception(
+                f"The dataset contains an invalid platform identifier: {dataset.platform_ID}"
+            )
+
+        try:
+            slot_str = dataset.orbital_slot.replace("-", "_")
+            slot = constants.OrbitalSlot[slot_str]
+        except KeyError:
+            # Some files seem to list "GOES_Test" as slot, use platform ID as indicator then.
+            logger.warning(
+                f"The value for 'orbital_slot' is invalid: {dataset.orbital_slot}"
+            )
+
+            # GOES-16 began drifting to the GOES-East operational location [...] on
+            # November 30, 2017. Drift was complete on December 11, 2017,
+            g16drift = datetime(2017, 12, 11, tzinfo=timezone.utc)
+            g17drift = datetime(2018, 11, 13, tzinfo=timezone.utc)
+            if platform == constants.Platforms.G16 and computed_datetime > g16drift:
+                slot = constants.OrbitalSlot.GOES_East
+            # GOES-17 began drifting to its GOES-West operational location [...] on
+            # October 24, 2018. Drift was completed on November 13, 2018
+            elif platform == constants.Platforms.G17 and computed_datetime > g17drift:
+                slot = constants.OrbitalSlot.GOES_West
+            else:
+                raise Exception(
+                    "The dataset contains an invalid orbital slot identifier: "
+                    + dataset.orbital_slot
+                )
+
+        properties = {
+            "start_datetime": dataset.time_coverage_start,
+            "end_datetime": dataset.time_coverage_end,
+            "mission": constants.MISSION,
+            "constellation": constants.CONSTELLATION,
+            "platform": platform,
+            "instruments": [dataset.instrument_ID],
+            "gsd": constants.RESOLUTION,
+            "processing:level": constants.PROCESSING_LEVEL,
+            "processing:facility": dataset.production_site,
+            "goes:orbital-slot": slot,
+            "goes:system-environment": sys_env,
+        }
+
+        if slot == constants.OrbitalSlot.GOES_East:
+            bbox = constants.ITEM_BBOX_EAST
+            geometry = constants.GEOMETRY_EAST
+        elif slot == constants.OrbitalSlot.GOES_West:
+            bbox = constants.ITEM_BBOX_WEST
+            geometry = constants.GEOMETRY_WEST
+        else:
+            bbox = None
+            geometry = None
+
+        centroid = {}
+        for key, var in dataset.variables.items():
+            if len(var.dimensions) != 0:
+                continue
+
+            ma = var[...]
+            if var.name == "lat_field_of_view":
+                centroid["lat"] = ma.tolist()
+            elif var.name == "lon_field_of_view":
+                centroid["lon"] = ma.tolist()
+            elif ma.count() > 0:
+                properties[f"goes-glm:{var.name}"] = ma.tolist()
+
+        item = Item(
+            stac_extensions=[
+                # todo: add extension again once released #12
+                # constants.GOES_EXTENSION,
+                constants.PROCESSING_EXTENSION,
+            ],
+            id=id,
+            properties=properties,
+            geometry=geometry,
+            bbox=bbox,
+            datetime=computed_datetime,
+            collection=collection,
+        )
+
+        proj = ProjectionExtension.ext(item, add_if_missing=True)
+        proj.epsg = constants.TARGET_CRS
+        if len(centroid) == 2:
+            proj.centroid = centroid
+
+        if not nogeoparquet:
+            target_folder = os.path.dirname(asset_href)
+            assets = parquet.convert(dataset, target_folder)
+            for key, asset_dict in assets.items():
+                asset = Asset.from_dict(asset_dict)
+                item.add_asset(key, asset)
+                TableExtension.ext(asset, add_if_missing=True)

+        if not nonetcdf:
+            # todo: replace with DataCube extension from PySTAC #16
+            item.stac_extensions.append(constants.DATACUBE_EXTENSION)
+            asset_dict = netcdf.create_asset(asset_href)
+            asset_dict["created"] = dataset.date_created
+            asset_dict["cube:dimensions"] = netcdf.to_cube_dimensions(dataset)
+            asset_dict["cube:variables"] = netcdf.to_cube_variables(dataset)
+            asset = Asset.from_dict(asset_dict)
+            item.add_asset(constants.NETCDF_KEY, asset)
+
+        for key, is_defect in defect_vars.items():
+            if is_defect:
+                del dataset.variables[key]._Unsigned
+
+    return item


 def center_datetime(start: str, end: str) -> datetime:
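Background on the fix itself: the netCDF4 Python library honors the `_Unsigned = "true"` convention and reinterprets packed signed integers as unsigned during automatic scaling. Older GLM files omit that attribute on the time-offset variables, so stored values above the signed 16-bit range wrap to negative seconds. A self-contained numeric illustration; the scale factor is a hypothetical packing parameter, not read from a real file:

import numpy as np

raw = np.int16(-25536)          # what a signed read of the packed value yields
unsigned = raw.view(np.uint16)  # 40000, the value the producer intended
scale = 0.0002                  # hypothetical scale_factor

print(raw * scale)       # -5.1072 -> impossible negative time offset (seconds)
print(unsigned * scale)  # 8.0     -> plausible offset within a 20-second granule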