Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

clib.Session.virtualfile_from_vectors: Now takes a sequence of vectors as its single argument (Passing multiple arguments will be unsupported in v0.16.0) #3522

Merged
merged 14 commits into from
Oct 22, 2024
Merged
113 changes: 62 additions & 51 deletions pygmt/clib/session.py
Original file line number Diff line number Diff line change
Expand Up @@ -1324,37 +1324,36 @@ def open_virtual_file(self, family, geometry, direction, data):
return self.open_virtualfile(family, geometry, direction, data)

@contextlib.contextmanager
def virtualfile_from_vectors(self, *vectors):
def virtualfile_from_vectors(
self, vectors: Sequence, *args
) -> Generator[str, None, None]:
"""
Store 1-D arrays as columns of a table inside a virtual file.
Store a sequence of 1-D vectors as columns of a dataset inside a virtual file.

Use the virtual file name to pass in the data in your vectors to a GMT
module.
Use the virtual file name to pass the dataset with your vectors to a GMT module.

Context manager (use in a ``with`` block). Yields the virtual file name
that you can pass as an argument to a GMT module call. Closes the
virtual file upon exit of the ``with`` block.
Context manager (use in a ``with`` block). Yields the virtual file name that you
can pass as an argument to a GMT module call. Closes the virtual file upon exit
of the ``with`` block.

Use this instead of creating the data container and virtual file by
hand with :meth:`pygmt.clib.Session.create_data`,
:meth:`pygmt.clib.Session.put_vector`, and
:meth:`pygmt.clib.Session.open_virtualfile`.
Use this instead of creating the data container and virtual file by hand with
:meth:`pygmt.clib.Session.create_data`, :meth:`pygmt.clib.Session.put_vector`,
and :meth:`pygmt.clib.Session.open_virtualfile`.

If the arrays are C contiguous blocks of memory, they will be passed
without copying to GMT. If they are not (e.g., they are columns of a
2-D array), they will need to be copied to a contiguous block.
If the arrays are C contiguous blocks of memory, they will be passed without
copying to GMT. If they are not (e.g., they are columns of a 2-D array), they
will need to be copied to a contiguous block.

Parameters
----------
vectors : 1-D arrays
The vectors that will be included in the array. All must be of the
vectors
A sequence of vectors that will be stored in the dataset. All must be of the
same size.

Yields
------
fname : str
The name of virtual file. Pass this as a file name argument to a
GMT module.
fname
The name of virtual file. Pass this as a file name argument to a GMT module.

Examples
--------
Expand All @@ -1366,34 +1365,49 @@ def virtualfile_from_vectors(self, *vectors):
>>> y = np.array([4, 5, 6])
>>> z = pd.Series([7, 8, 9])
>>> with Session() as ses:
... with ses.virtualfile_from_vectors(x, y, z) as fin:
... with ses.virtualfile_from_vectors((x, y, z)) as fin:
... # Send the output to a file so that we can read it
... with GMTTempFile() as fout:
... ses.call_module("info", [fin, f"->{fout.name}"])
... print(fout.read().strip())
<vector memory>: N = 3 <1/3> <4/6> <7/9>
"""
# Conversion to a C-contiguous array needs to be done here and not in
# put_vector or put_strings because we need to maintain a reference to
# the copy while it is being used by the C API. Otherwise, the array
# would be garbage collected and the memory freed. Creating it in this
# context manager guarantees that the copy will be around until the
# virtual file is closed. The conversion is implicit in
# "*args" is added in v0.14.0 for backward-compatibility with the deprecated
# syntax of passing multiple vectors as positional arguments.
# Remove it in v0.16.0.
if len(args) > 0:
msg = (
"Passing multiple arguments to Session.virtualfile_from_vectors is "
"deprecated since v0.14.0 and will be unsupported in v0.16.0. "
"Put all vectors in a sequence (a tuple or a list) instead and pass "
"the sequence as the single argument to this function. "
"E.g., use `with lib.virtualfile_from_vectors((x, y, z)) as vfile` "
"instead of `with lib.virtualfile_from_vectors(x, y, z) as vfile`."
)
warnings.warn(message=msg, category=FutureWarning, stacklevel=3)
vectors = (vectors, *args)

# Conversion to a C-contiguous array needs to be done here and not in put_vector
# or put_strings because we need to maintain a reference to the copy while it is
# being used by the C API. Otherwise, the array would be garbage collected and
# the memory freed. Creating it in this context manager guarantees that the copy
# will be around until the virtual file is closed. The conversion is implicit in
# vectors_to_arrays.
arrays = vectors_to_arrays(vectors)

columns = len(arrays)
# Find arrays that are of string dtype from column 3 onwards
# Assumes that first 2 columns contains coordinates like longitude
# latitude, or datetime string types.
# Find arrays that are of string dtype from column 3 onwards. Assumes that first
# 2 columns contains coordinates like longitude latitude, or datetime string
seisman marked this conversation as resolved.
Show resolved Hide resolved
# types.
for col, array in enumerate(arrays[2:]):
if pd.api.types.is_string_dtype(array.dtype):
columns = col + 2
break

rows = len(arrays[0])
if not all(len(i) == rows for i in arrays):
raise GMTInvalidInput("All arrays must have same size.")
msg = "All arrays must have same size."
raise GMTInvalidInput(msg)

family = "GMT_IS_DATASET|GMT_VIA_VECTOR"
geometry = "GMT_IS_POINT"
Expand All @@ -1406,8 +1420,8 @@ def virtualfile_from_vectors(self, *vectors):
for col, array in enumerate(arrays[:columns]):
self.put_vector(dataset, column=col, vector=array)

# Use put_strings for last column(s) with string type data
# Have to use modifier "GMT_IS_DUPLICATE" to duplicate the strings
# Use put_strings for last column(s) with string type data.
# Have to use modifier "GMT_IS_DUPLICATE" to duplicate the strings.
string_arrays = arrays[columns:]
if string_arrays:
if len(string_arrays) == 1:
Expand Down Expand Up @@ -1682,7 +1696,7 @@ def virtualfile_from_stringio(
seg.header = None
seg.text = None

def virtualfile_in( # noqa: PLR0912
def virtualfile_in(
self,
check_kind=None,
data=None,
Expand Down Expand Up @@ -1781,19 +1795,18 @@ def virtualfile_in( # noqa: PLR0912
"vectors": self.virtualfile_from_vectors,
}[kind]

# Ensure the data is an iterable (Python list or tuple).
# "_data" is the data that will be passed to the _virtualfile_from function.
# "_data" defaults to "data" but should be adjusted for some cases.
_data = data
match kind:
case "arg" | "file" | "geojson" | "grid" | "image" | "stringio":
Comment on lines +1798 to -1786
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe just put _data = data in the case _ below? I'd actually prefer if we make it more explicit like:

case "arg" | "file" | "geojson" | "grid" | "stringio":
    _data = data

So that if there's a new kind in the future, we know to add it explicitly.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd actually prefer if we make it more explicit like:

case "arg" | "file" | "geojson" | "grid" | "stringio":
    _data = data

So that if there's a new kind in the future, we know to add it explicitly.

We can't do it like this, because we're currently writing case statements like case "image" if data.dtype != "uint8":, which only catches the case when kind="image" and data.dtype != "uint8".

We can refactor the codes to:

case "arg" | "file" | "geojson" | "grid" | "stringio":
    _data = data
case "image":
    _data = data
    if data.dtype != "uint8":
        # do something
case "matrix":
    _data = data
    if data.dtype.kind not in "iuf":
    # do something

or

case "arg" | "file" | "geojson" | "grid" | "stringio":
    _data = data
case "image" if data.dtype != "uint8":
    raise ...
case "matrix" if data.dtype.kind not in "iuf":
    ...
case _:
    _data = data # this is necessary to catch cases like kind="matrix" and dtype in "iuf"  

I still prefer the current codes, which set _data to data by default and the match-case statements only need to deal with exceptions.

So that if there's a new kind in the future, we know to add it explicitly.

If there is a new kind in the future, _data=data is still the default. If needs special handling and we forget to add it, then the related tests will fail anyway.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah ok, that makes sense, let's keep _data = data at the top then.

_data = (data,)
if kind == "image" and data.dtype != "uint8":
msg = (
f"Input image has dtype: {data.dtype} which is unsupported, "
"and may result in an incorrect output. Please recast image "
"to a uint8 dtype and/or scale to 0-255 range, e.g. "
"using a histogram equalization function like "
"skimage.exposure.equalize_hist."
)
warnings.warn(message=msg, category=RuntimeWarning, stacklevel=2)
case "image" if data.dtype != "uint8":
msg = (
f"Input image has dtype: {data.dtype} which is unsupported, and "
"may result in an incorrect output. Please recast image to a uint8 "
"dtype and/or scale to 0-255 range, e.g. using a histogram "
"equalization function like skimage.exposure.equalize_hist."
)
warnings.warn(message=msg, category=RuntimeWarning, stacklevel=2)
case "empty": # data is None, so data must be given via x/y/z.
_data = [x, y]
if z is not None:
Expand All @@ -1808,19 +1821,17 @@ def virtualfile_in( # noqa: PLR0912
else:
# Python list, tuple, numpy.ndarray, and pandas.Series types
_data = np.atleast_2d(np.asanyarray(data).T)
case "matrix":
case "matrix" if data.dtype.kind not in "iuf":
# GMT can only accept a 2-D matrix which are signed integer (i),
# unsigned integer (u) or floating point (f) types. For other data
# types, we need to use virtualfile_from_vectors instead, which turns
# the matrix into a list of vectors and allows for better handling of
# non-integer/float type inputs (e.g. for string or datetime data types)
_data = (data,)
if data.dtype.kind not in "iuf":
_virtualfile_from = self.virtualfile_from_vectors
_data = data.T
_virtualfile_from = self.virtualfile_from_vectors
_data = data.T

# Finally create the virtualfile from the data, to be passed into GMT
file_context = _virtualfile_from(*_data)
file_context = _virtualfile_from(_data)
return file_context

def virtualfile_from_data(
Expand Down
37 changes: 30 additions & 7 deletions pygmt/tests/test_clib_virtualfile_from_vectors.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def test_virtualfile_from_vectors(dtypes):
y = np.arange(size, size * 2, 1, dtype=dtype)
z = np.arange(size * 2, size * 3, 1, dtype=dtype)
with clib.Session() as lib:
with lib.virtualfile_from_vectors(x, y, z) as vfile:
with lib.virtualfile_from_vectors((x, y, z)) as vfile:
with GMTTempFile() as outfile:
lib.call_module("info", [vfile, f"->{outfile.name}"])
output = outfile.read(keep_tabs=True)
Expand All @@ -64,7 +64,7 @@ def test_virtualfile_from_vectors_one_string_or_object_column(dtype):
y = np.arange(size, size * 2, 1, dtype=np.int32)
strings = np.array(["a", "bc", "defg", "hijklmn", "opqrst"], dtype=dtype)
with clib.Session() as lib:
with lib.virtualfile_from_vectors(x, y, strings) as vfile:
with lib.virtualfile_from_vectors((x, y, strings)) as vfile:
with GMTTempFile() as outfile:
lib.call_module("convert", [vfile, f"->{outfile.name}"])
output = outfile.read(keep_tabs=True)
Expand All @@ -86,7 +86,7 @@ def test_virtualfile_from_vectors_two_string_or_object_columns(dtype):
strings1 = np.array(["a", "bc", "def", "ghij", "klmnolooong"], dtype=dtype)
strings2 = np.array(["pqrst", "uvwx", "yz!", "@#", "$"], dtype=dtype)
with clib.Session() as lib:
with lib.virtualfile_from_vectors(x, y, strings1, strings2) as vfile:
with lib.virtualfile_from_vectors((x, y, strings1, strings2)) as vfile:
with GMTTempFile() as outfile:
lib.call_module("convert", [vfile, f"->{outfile.name}"])
output = outfile.read(keep_tabs=True)
Expand All @@ -105,7 +105,7 @@ def test_virtualfile_from_vectors_transpose(dtypes):
for dtype in dtypes:
data = np.arange(shape[0] * shape[1], dtype=dtype).reshape(shape)
with clib.Session() as lib:
with lib.virtualfile_from_vectors(*data.T) as vfile:
with lib.virtualfile_from_vectors(data.T) as vfile:
with GMTTempFile() as outfile:
lib.call_module("info", [vfile, "-C", f"->{outfile.name}"])
output = outfile.read(keep_tabs=True)
Expand All @@ -122,7 +122,7 @@ def test_virtualfile_from_vectors_diff_size():
y = np.arange(6)
with clib.Session() as lib:
with pytest.raises(GMTInvalidInput):
with lib.virtualfile_from_vectors(x, y):
with lib.virtualfile_from_vectors((x, y)):
pass


Expand All @@ -143,7 +143,7 @@ def test_virtualfile_from_vectors_pandas(dtypes_pandas):
dtype=dtype,
)
with clib.Session() as lib:
with lib.virtualfile_from_vectors(data.x, data.y, data.z) as vfile:
with lib.virtualfile_from_vectors((data.x, data.y, data.z)) as vfile:
with GMTTempFile() as outfile:
lib.call_module("info", [vfile, f"->{outfile.name}"])
output = outfile.read(keep_tabs=True)
Expand All @@ -163,10 +163,33 @@ def test_virtualfile_from_vectors_arraylike():
y = tuple(range(size, size * 2, 1))
z = range(size * 2, size * 3, 1)
with clib.Session() as lib:
with lib.virtualfile_from_vectors(x, y, z) as vfile:
with lib.virtualfile_from_vectors((x, y, z)) as vfile:
with GMTTempFile() as outfile:
lib.call_module("info", [vfile, f"->{outfile.name}"])
output = outfile.read(keep_tabs=True)
bounds = "\t".join([f"<{min(i):.0f}/{max(i):.0f}>" for i in (x, y, z)])
expected = f"<vector memory>: N = {size}\t{bounds}\n"
assert output == expected


def test_virtualfile_from_vectors_args():
"""
Test the backward compatibility of the deprecated syntax for passing multiple
vectors.

This test is the same as test_virtualfile_from_vectors_arraylike, but using the
old syntax.
"""
size = 13
x = list(range(0, size, 1))
y = tuple(range(size, size * 2, 1))
z = range(size * 2, size * 3, 1)
with pytest.warns(FutureWarning, match="virtualfile_from_vectors"):
with clib.Session() as lib:
with lib.virtualfile_from_vectors(x, y, z) as vfile:
with GMTTempFile() as outfile:
lib.call_module("info", [vfile, f"->{outfile.name}"])
output = outfile.read(keep_tabs=True)
bounds = "\t".join([f"<{min(i):.0f}/{max(i):.0f}>" for i in (x, y, z)])
expected = f"<vector memory>: N = {size}\t{bounds}\n"
assert output == expected