Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add pygmt.read to read a dataset/grid/image into pandas.DataFrame/xarray.DataArray #3673

Open
wants to merge 26 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
d913c86
Add pygmt.read to read a dataset/grid/image into pandas.DataFrame/xar…
seisman Dec 2, 2024
f456bf8
Set GMT accessor
seisman Dec 5, 2024
c3cbb6e
Need to set 'source' encoding to make GMT accessor work
seisman Dec 5, 2024
f2a4ce4
Merge branch 'main' into feature/read
seisman Dec 5, 2024
1dd97c6
Fix the source encoding
seisman Dec 5, 2024
7790ea3
No need to set the source encoding in load_remote_dataset.py
seisman Dec 5, 2024
e588008
Revert changes in pygmt/datasets/load_remote_dataset.py
seisman Dec 6, 2024
40d12ee
Improve docstring in pygmt/helpers/testing.py
seisman Dec 6, 2024
fa1021d
Improve docstrings
seisman Dec 6, 2024
c378225
Get rid of decorators
seisman Dec 8, 2024
7b749e0
Improve comment
seisman Dec 8, 2024
8befa58
Get rid of the fmt_docstring alias
seisman Dec 8, 2024
a758752
Fix type hints issue with overload
seisman Dec 9, 2024
9d66cf4
Remove the type ignore flag
seisman Dec 9, 2024
a05383a
region defaults to None
seisman Dec 9, 2024
6ca4ef2
Merge branch 'main' into feature/read
seisman Dec 9, 2024
7851ced
Improve type hints and add tests
seisman Dec 9, 2024
084b87a
Improve the checking of return value of which
seisman Dec 9, 2024
b21997c
Use the read function in pygmt/tests/test_datatypes_dataset.py
seisman Dec 9, 2024
a812317
Use the read function instead of the load_dataarray method
seisman Dec 9, 2024
1f0f158
Add one test to make sure that read and load_dataarray returns the sa…
seisman Dec 9, 2024
957c7eb
Simplify pygmt/tests/test_clib_read_data.py with read
seisman Dec 9, 2024
6aef3ca
Fix a typo
seisman Dec 9, 2024
72afbfe
Replace xr.open_dataarray with read
seisman Dec 9, 2024
03de9b7
Fix a typo
seisman Dec 9, 2024
85c533d
Merge branch 'main' into feature/read
seisman Dec 19, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/api/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -172,6 +172,7 @@ Input/output
:toctree: generated

load_dataarray
read
Comment on lines 174 to +175
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The load_dataarray function was put under the pygmt.io namespace. Should we consider putting read under pygmt.io too? (Thinking about whether we need a low-level pygmt.clib.read and high-level pygmt.io.read in my other comment).


GMT Defaults
------------
Expand Down
1 change: 1 addition & 0 deletions pygmt/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@
makecpt,
nearneighbor,
project,
read,
select,
sph2grd,
sphdistance,
Expand Down
6 changes: 2 additions & 4 deletions pygmt/datasets/samples.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,7 @@
import pandas as pd
import xarray as xr
from pygmt.exceptions import GMTInvalidInput
from pygmt.io import load_dataarray
from pygmt.src import which
from pygmt.src import read, which


def _load_japan_quakes() -> pd.DataFrame:
Expand Down Expand Up @@ -203,8 +202,7 @@ def _load_earth_relief_holes() -> xr.DataArray:
The Earth relief grid. Coordinates are latitude and longitude in degrees. Relief
is in meters.
"""
fname = which("@earth_relief_20m_holes.grd", download="c")
return load_dataarray(fname, engine="netcdf4")
return read("@earth_relief_20m_holes.grd", kind="grid") # type: ignore[return-value]


class GMTSampleData(NamedTuple):
Expand Down
13 changes: 6 additions & 7 deletions pygmt/helpers/testing.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,9 @@
import string
from pathlib import Path

import xarray as xr
from pygmt.exceptions import GMTImageComparisonFailure
from pygmt.io import load_dataarray
from pygmt.src import which
from pygmt.src import read


def check_figures_equal(*, extensions=("png",), tol=0.0, result_dir="result_images"):
Expand Down Expand Up @@ -144,17 +144,16 @@ def wrapper(*args, ext="png", request=None, **kwargs):
return decorator


def load_static_earth_relief() -> xr.DataArray:
    """
    Load the static_earth_relief.nc file for internal testing.

    The file is referenced by its remote name ``@static_earth_relief.nc`` and is
    read as a grid through :func:`pygmt.src.read`.

    Returns
    -------
    data
        A grid of Earth relief for internal tests.
    """
    # ``read`` is annotated as returning DataFrame | DataArray; with kind="grid" the
    # result is the DataArray case, hence the type-ignore on the return value.
    return read("@static_earth_relief.nc", kind="grid")  # type: ignore[return-value]


def skip_if_no(package):
Expand Down
1 change: 1 addition & 0 deletions pygmt/src/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from pygmt.src.plot3d import plot3d
from pygmt.src.project import project
from pygmt.src.psconvert import psconvert
from pygmt.src.read import read
from pygmt.src.rose import rose
from pygmt.src.select import select
from pygmt.src.shift_origin import shift_origin
Expand Down
118 changes: 118 additions & 0 deletions pygmt/src/read.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
"""
Read a file into an appropriate object.
"""

from collections.abc import Mapping, Sequence
from pathlib import PurePath
from typing import Any, Literal

import pandas as pd
import xarray as xr
from pygmt.clib import Session
from pygmt.helpers import build_arg_list, is_nonstr_iter
from pygmt.src.which import which


def read(
file: str | PurePath,
kind: Literal["dataset", "grid", "image"],
region: Sequence[float] | str | None = None,
header: int | None = None,
column_names: pd.Index | None = None,
dtype: type | Mapping[Any, type] | None = None,
index_col: str | int | None = None,
) -> pd.DataFrame | xr.DataArray:
"""
Read a dataset, grid, or image from a file and return the appropriate object.

The returned object is a :class:`pandas.DataFrame` for datasets, and
:class:`xarray.DataArray` for grids and images.

For datasets, keyword arguments ``column_names``, ``header``, ``dtype``, and
``index_col`` are supported.

Parameters
----------
file
The file name to read.
kind
The kind of data to read. Valid values are ``"dataset"``, ``"grid"``, and
``"image"``.
region
The region of interest. Only data within this region will be read.
column_names
A list of column names.
header
Row number containing column names. ``header=None`` means not to parse the
column names from table header. Ignored if the row number is larger than the
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
column names from table header. Ignored if the row number is larger than the
column names from the table header. Ignored if the row number is larger than the

number of headers in the table.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
number of headers in the table.
number of header lines in the table.

dtype
Data type. Can be a single type for all columns or a dictionary mapping
column names to types.
index_col
Column to set as index.

Returns
-------
data
Return type depends on the ``kind`` argument:

- ``"dataset"``: :class:`pandas.DataFrame`
- ``"grid"`` or ``"image"``: :class:`xarray.DataArray`


Examples
--------
Read a dataset into a :class:`pandas.DataFrame` object:

>>> from pygmt import read
>>> df = read("@hotspots.txt", kind="dataset")
>>> type(df)
<class 'pandas.core.frame.DataFrame'>

Read a grid into an :class:`xarray.DataArray` object:

>>> dataarray = read("@earth_relief_01d", kind="grid")
>>> type(dataarray)
<class 'xarray.core.dataarray.DataArray'>
"""
if kind not in {"dataset", "grid", "image"}:
msg = f"Invalid kind {kind}: must be one of 'dataset', 'grid', or 'image'."
raise ValueError(msg)

if kind != "dataset" and any(
v is not None for v in [column_names, header, dtype, index_col]
):
msg = (
"Only the 'dataset' kind supports the 'column_names', 'header', "
"'dtype', and 'index_col' arguments."
)
raise ValueError(msg)

kwdict = {
"R": "/".join(f"{v}" for v in region) if is_nonstr_iter(region) else region, # type: ignore[union-attr]
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line is used here to avoid using the kwargs_to_string, use_alias decorators:

"R": "/".join(f"{v}" for v in region) if is_nonstr_iter(region) else region

"T": {"dataset": "d", "grid": "g", "image": "i"}[kind],
}

with Session() as lib:
with lib.virtualfile_out(kind=kind) as voutfile:
lib.call_module("read", args=[file, voutfile, *build_arg_list(kwdict)])

match kind:
case "dataset":
return lib.virtualfile_to_dataset(
vfname=voutfile,
column_names=column_names,
header=header,
dtype=dtype,
index_col=index_col,
)
case "grid" | "image":
raster = lib.virtualfile_to_raster(vfname=voutfile, kind=kind)
Comment on lines +102 to +111
Copy link
Member

@weiji14 weiji14 Dec 19, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Debating on whether we should have a low-level clib read that reads into a GMT virtualfile, and a high-level read that wraps around that to do both read + convert virtualfile to a pandas.DataFrame or xarray.DataArray.

# Add "source" encoding
source = which(fname=file)
raster.encoding["source"] = (
source[0] if isinstance(source, list) else source
)
_ = raster.gmt # Load GMTDataArray accessor information
return raster
9 changes: 2 additions & 7 deletions pygmt/tests/test_datatypes_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,7 @@

import pandas as pd
import pytest
from pygmt import which
from pygmt.clib import Session
from pygmt import read, which
from pygmt.helpers import GMTTempFile


Expand Down Expand Up @@ -44,11 +43,7 @@ def dataframe_from_gmt(fname, **kwargs):
"""
Read tabular data as pandas.DataFrame using GMT virtual file.
"""
with Session() as lib:
with lib.virtualfile_out(kind="dataset") as vouttbl:
lib.call_module("read", [fname, vouttbl, "-Td"])
df = lib.virtualfile_to_dataset(vfname=vouttbl, **kwargs)
return df
return read(fname, kind="dataset", **kwargs)


@pytest.mark.benchmark
Expand Down
28 changes: 28 additions & 0 deletions pygmt/tests/test_read.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""
Test the read function.
"""

import pytest
from pygmt import read


def test_read_invalid_kind():
    """
    Check that read rejects a kind other than 'dataset', 'grid', or 'image'.
    """
    expect_invalid_kind = pytest.raises(ValueError, match="Invalid kind")
    with expect_invalid_kind:
        read("file.cpt", kind="cpt")


def test_read_invalid_arguments():
    """
    Test that invalid arguments raise a ValueError for non-'dataset' kind.

    Each of the dataset-only arguments ('column_names', 'header', 'dtype', and
    'index_col') is passed on its own together with kind="grid".
    """
    dataset_only_kwargs = [
        {"column_names": "foo"},
        {"header": 1},
        {"dtype": "float"},
        {"index_col": 0},
    ]
    for kwargs in dataset_only_kwargs:
        with pytest.raises(ValueError, match="Only the 'dataset' kind supports"):
            read("file.nc", kind="grid", **kwargs)
Loading