Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Move integration testing feedstock to tests directory #171

Merged
merged 9 commits into from
Feb 17, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions codecov.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
# Codecov configuration — see https://docs.codecov.com/docs/codecov-yaml
coverage:
  precision: 2   # report percentages to two decimal places
  round: down    # always round coverage down, never up
  status:
    project:
      default:
        target: 95            # aim for 95% project coverage...
        informational: true   # ...but report-only: never fail CI on it
    patch: off     # no status check on the patch diff
    changes: off   # no status check on unexpected coverage changes
ignore:
  - "setup.py"
  - "tests/*"
  - "**/__init__.py"
2 changes: 1 addition & 1 deletion docs/tutorial/flink.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ we must answer:
### Where Data is Discovered

Recipe input is defined in the recipe itself. Most recipes will define a `pangeo_forge_recipes.patterns.FilePattern` that provides the pipeline with input file locations.
The example below taken from the [integration test recipe](https://github.com/pforgetest/gpcp-from-gcs-feedstock/blob/main/feedstock/recipe.py)
The example below is taken from [this example recipe](https://github.com/pforgetest/gpcp-from-gcs-feedstock/blob/main/feedstock/recipe.py)

```python
import apache_beam as beam
Expand Down
13 changes: 7 additions & 6 deletions tests/integration/flink/test_flink_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,14 @@
import tempfile
import time
from importlib.metadata import version
from pathlib import Path

import pytest
import xarray as xr
from packaging.version import parse as parse_version

TEST_DATA_DIR = Path(__file__).parent.parent.parent / "test-data"


def test_flink_bake(minio_service, flinkversion, pythonversion, beamversion):
fsspec_args = {
Expand All @@ -18,7 +21,7 @@ def test_flink_bake(minio_service, flinkversion, pythonversion, beamversion):

pfr_version = parse_version(version("pangeo-forge-recipes"))
if pfr_version >= parse_version("0.10"):
recipe_version_ref = str(pfr_version)
recipe_version_ref = "0.10.x"
else:
recipe_version_ref = "0.9.x"
pytest.xfail(
Expand Down Expand Up @@ -71,11 +74,9 @@ def test_flink_bake(minio_service, flinkversion, pythonversion, beamversion):
"pangeo-forge-runner",
"bake",
"--repo",
"https://github.com/pforgetest/gpcp-from-gcs-feedstock.git",
"--ref",
# in the test feedstock, tags are named for
# the recipe version used to write the recipe module
recipe_version_ref,
str(TEST_DATA_DIR / "gpcp-from-gcs"),
"--feedstock-subdir",
f"feedstock-{recipe_version_ref}",
"-f",
f.name,
]
Expand Down
31 changes: 25 additions & 6 deletions tests/integration/test_dataflow_integration.py
Original file line number Diff line number Diff line change
@@ -1,25 +1,33 @@
import json
import subprocess
import sys
import tempfile
import time
from importlib.metadata import version
from pathlib import Path

import pytest
import xarray as xr
from packaging.version import parse as parse_version

TEST_DATA_DIR = Path(__file__).parent.parent / "test-data"


def test_dataflow_integration():
python_version = (
f"{sys.version_info.major}.{sys.version_info.minor}.{sys.version_info.micro}"
)
pfr_version = parse_version(version("pangeo-forge-recipes"))
if pfr_version >= parse_version("0.10"):
recipe_version_ref = str(pfr_version)
recipe_version_ref = "0.10.x"
Copy link
Collaborator

@ranchodeluxe ranchodeluxe Feb 13, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reviewer note:
given that the integration/unit tests only exercise the beginning and end of the supported version range, I made the unilateral decision to list just 0.10.x in test-data, which I think works fine

else:
recipe_version_ref = "0.9.x"
bucket = "gs://pangeo-forge-runner-ci-testing"
config = {
"Bake": {
"prune": True,
"bakery_class": "pangeo_forge_runner.bakery.dataflow.DataflowBakery",
"job_name": f"gpcp-from-gcs-py{python_version.replace('.','')}-v{recipe_version_ref.replace('.','')}",
},
"DataflowBakery": {"temp_gcs_location": bucket + "/temp"},
"TargetStorage": {
Expand All @@ -43,11 +51,9 @@ def test_dataflow_integration():
"pangeo-forge-runner",
"bake",
"--repo",
"https://github.com/pforgetest/gpcp-from-gcs-feedstock.git",
"--ref",
# in the test feedstock, tags are named for the recipes version
# which was used to write the recipe module
recipe_version_ref,
str(TEST_DATA_DIR / "gpcp-from-gcs"),
"--feedstock-subdir",
f"feedstock-{recipe_version_ref}",
"--json",
"-f",
f.name,
Expand All @@ -72,6 +78,15 @@ def test_dataflow_integration():

# okay, time to start checking if the job is done
show_job = f"gcloud dataflow jobs show {job_id} --format=json".split()
show_job_errors = [
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reviewer note: let's use this command to dump logs when job fails so we don't have to use GCP console

"gcloud",
"logging",
"read",
f'resource.type="dataflow_step" AND resource.labels.job_id="{job_id}" AND severity>=ERROR',
"--limit",
"500",
"--format=json",
]
while True:
elapsed = time.time() - start
print(f"Time {elapsed = }")
Expand All @@ -95,6 +110,10 @@ def test_dataflow_integration():
# still running, let's give it another 30s then check again
time.sleep(30)
else:
# try to get some output to the stdout so we don't have to log into the GCP console
state_proc = subprocess.run(show_job_errors, capture_output=True)
assert state_proc.returncode == 0
print(json.loads(state_proc.stdout))
# consider any other state a failure
pytest.fail(f"{state = } is neither 'Done' nor 'Running'")

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Test fixture: a feedstock whose recipe module exports a *mapping* of recipes
# (`recipe:recipes` resolves to a dict object, not a single recipe instance).
title: "Test meta.yaml with list of recipes"
recipes:
- dict_object: 'recipe:recipes'
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
recipes = {"test_1": "test_1", "test_2": "test_2"}
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
pangeo-forge-recipes>=0.10.0,<0.11
31 changes: 31 additions & 0 deletions tests/test-data/gpcp-from-gcs/feedstock-0.10.x/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Feedstock metadata for the GPCP integration-test recipe (recipes 0.10.x style).
title: "Global Precipitation Climatology Project"
description: >
  Global Precipitation Climatology Project (GPCP) Daily Version 1.3 gridded, merged
  satellite/gauge precipitation Climate Data Record (CDR) from 1996 to present.
# NOTE(review): still says 0.9.0 although this feedstock targets recipes 0.10.x — confirm
pangeo_forge_version: "0.9.0"
pangeo_notebook_version: "2022.06.02"
recipes:
  - id: gpcp-from-gcs
    object: "recipe:recipe"
provenance:
  providers:
    - name: "NOAA NCEI"
      description: "National Oceanographic & Atmospheric Administration National Centers for Environmental Information"
      roles:
        - host
        - licensor
      url: https://www.ncei.noaa.gov/products/global-precipitation-climatology-project
    - name: "University of Maryland"
      description: >
        University of Maryland College Park Earth System Science Interdisciplinary Center
        (ESSIC) and Cooperative Institute for Climate and Satellites (CICS).
      roles:
        - producer
      url: http://gpcp.umd.edu/
license: "No constraints on data access or use."
maintainers:
  - name: "Charles Stern"
    orcid: "0000-0002-4078-0852"
    github: cisaacstern
bakery:
  id: "pangeo-ldeo-nsf-earthcube"
35 changes: 35 additions & 0 deletions tests/test-data/gpcp-from-gcs/feedstock-0.10.x/recipe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import apache_beam as beam
import pandas as pd
from pangeo_forge_recipes.patterns import ConcatDim, FilePattern
from pangeo_forge_recipes.transforms import (
OpenURLWithFSSpec,
OpenWithXarray,
StoreToZarr,
)

# Daily "YYYYMMDD" labels spanning the GPCP test subset (inclusive of both ends).
dates = [
    stamp.strftime("%Y%m%d")
    for stamp in pd.date_range("1996-10-01", "1999-02-01", freq="D")
]


def make_url(time):
    """Return the GPCP source-file URL for a ``YYYYMMDD`` date string."""
    return (
        "https://storage.googleapis.com/pforge-test-data"
        f"/gpcp/v01r03_daily_d{time}.nc"
    )


# One file per day along the "time" dimension.
concat_dim = ConcatDim("time", dates, nitems_per_file=1)
pattern = FilePattern(make_url, concat_dim)


# Beam pipeline (recipes >= 0.10 transform style): fetch each URL, open it as
# xarray, and append everything into a single Zarr store named "gpcp".
recipe = (
    beam.Create(pattern.items())
    | OpenURLWithFSSpec()
    | OpenWithXarray(
        # "decode_coords": "all" so coordinate variables flagged in attrs are decoded
        file_type=pattern.file_type, xarray_open_kwargs={"decode_coords": "all"}
    )
    | StoreToZarr(
        store_name="gpcp",
        combine_dims=pattern.combine_dim_keys,
    )
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
gcsfs
pangeo-forge-recipes>=0.10.0,<0.11
s3fs
31 changes: 31 additions & 0 deletions tests/test-data/gpcp-from-gcs/feedstock-0.9.x/meta.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Feedstock metadata for the GPCP integration-test recipe (recipes 0.9.x style).
title: "Global Precipitation Climatology Project"
description: >
  Global Precipitation Climatology Project (GPCP) Daily Version 1.3 gridded, merged
  satellite/gauge precipitation Climate Data Record (CDR) from 1996 to present.
pangeo_forge_version: "0.9.0"
pangeo_notebook_version: "2022.06.02"
recipes:
  - id: gpcp-from-gcs
    object: "recipe:recipe"
provenance:
  providers:
    - name: "NOAA NCEI"
      description: "National Oceanographic & Atmospheric Administration National Centers for Environmental Information"
      roles:
        - host
        - licensor
      url: https://www.ncei.noaa.gov/products/global-precipitation-climatology-project
    - name: "University of Maryland"
      description: >
        University of Maryland College Park Earth System Science Interdisciplinary Center
        (ESSIC) and Cooperative Institute for Climate and Satellites (CICS).
      roles:
        - producer
      url: http://gpcp.umd.edu/
license: "No constraints on data access or use."
maintainers:
  - name: "Charles Stern"
    orcid: "0000-0002-4078-0852"
    github: cisaacstern
bakery:
  id: "pangeo-ldeo-nsf-earthcube"
21 changes: 21 additions & 0 deletions tests/test-data/gpcp-from-gcs/feedstock-0.9.x/recipe.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import pandas as pd
from pangeo_forge_recipes.patterns import ConcatDim, FilePattern
from pangeo_forge_recipes.recipes import XarrayZarrRecipe

# Daily "YYYYMMDD" labels spanning the GPCP test subset (inclusive of both ends).
dates = [
    stamp.strftime("%Y%m%d")
    for stamp in pd.date_range("1996-10-01", "1999-02-01", freq="D")
]


def make_url(time):
    """Return the GPCP source-file URL for a ``YYYYMMDD`` date string."""
    return (
        "https://storage.googleapis.com/pforge-test-data"
        f"/gpcp/v01r03_daily_d{time}.nc"
    )


# One file per day along the "time" dimension.
concat_dim = ConcatDim("time", dates, nitems_per_file=1)
pattern = FilePattern(make_url, concat_dim)

# Legacy (recipes 0.9.x) class-based recipe: batch 200 inputs per Zarr chunk;
# "decode_coords": "all" so coordinate variables flagged in attrs are decoded.
recipe = XarrayZarrRecipe(
    pattern, inputs_per_chunk=200, xarray_open_kwargs={"decode_coords": "all"}
)
18 changes: 12 additions & 6 deletions tests/unit/test_bake.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,13 +5,16 @@
import sys
import tempfile
from importlib.metadata import distributions, version
from pathlib import Path

import pytest
import xarray as xr
from packaging.version import parse as parse_version

from pangeo_forge_runner.commands.bake import Bake

TEST_DATA_DIR = Path(__file__).parent.parent / "test-data"


@pytest.fixture
def recipes_uninstalled():
Expand Down Expand Up @@ -193,11 +196,9 @@ def test_gpcp_bake(
"pangeo-forge-runner",
"bake",
"--repo",
"https://github.com/pforgetest/gpcp-from-gcs-feedstock.git",
"--ref",
# in the test feedstock, tags are named for the recipes version
# which was used to write the recipe module
recipes_version_ref,
str(TEST_DATA_DIR / "gpcp-from-gcs"),
"--feedstock-subdir",
f"feedstock-{recipes_version_ref}",
"--json",
"-f",
f.name,
Expand All @@ -222,7 +223,7 @@ def test_gpcp_bake(
if custom_job_name:
assert job_name.startswith(custom_job_name)
else:
assert job_name.startswith("gh-pforgetest-gpcp-from-gcs-")
assert job_name.startswith("local-gpcp-2dfrom-2dgcs-feedstock-")

if "dictobj" in recipes_version_ref:
assert job_name.endswith(
Expand All @@ -246,6 +247,11 @@ def test_gpcp_bake(
else:
zarr_store_full_paths = [config["TargetStorage"]["root_path"]]

# dictobj runs do not generate any datasets b/c they are not recipes
# so we've asserted what we can already, just move on
if recipes_version_ref.endswith("dictobj"):
return
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

reviewer note: this couldn't have been actually producing output for 0.10.x so I think this is fine


# Open the generated datasets with xarray!
for path in zarr_store_full_paths:
print(f"Opening dataset for {path = }")
Expand Down
Loading