Skip to content

Commit

Permalink
Add dtypes module, refactor, document
Browse files Browse the repository at this point in the history
  • Loading branch information
otsaloma committed Dec 8, 2024
1 parent 64946a5 commit 9f6a106
Show file tree
Hide file tree
Showing 9 changed files with 92 additions and 22 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ clean:
rm -rf */*/.pytest_cache

doc:
$(MAKE) -C doc clean html
$(MAKE) SPHINXBUILD=../venv/bin/sphinx-build -C doc clean html

doc-check:
PYTHONPATH=. doc/check.py
Expand All @@ -34,7 +34,7 @@ doc-open:
xdg-open doc/_build/html/index.html

doc-watch:
watchexec -e py,rst --workdir doc $(MAKE) html
watchexec -e py,rst --workdir doc $(MAKE) SPHINXBUILD=../venv/bin/sphinx-build html

install:
pip3 install --break-system-packages .
Expand Down
19 changes: 19 additions & 0 deletions dataiter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,28 @@
PRINT_MAX_ELEMENTS = 100
PRINT_MAX_ITEMS = 10
PRINT_MAX_ROWS = 100

#: Maximum number of columns to wrap print output to. Note that this is only a
#: fallback in case Python's ``shutil.get_terminal_size`` fails to detect the
#: width of your terminal. By default the detected full width is used.
PRINT_MAX_WIDTH = 80

#: Thousand separator to use when formatting numbers. By default this is blank,
#: meaning no thousand separators are rendered.
PRINT_THOUSAND_SEPARATOR = ""

#: Maximum width to truncate string columns to in :class:`DataFrame` print
#: output. When this is exceeded, strings will be cut with an ellipsis (``…``)
#: at the end.
PRINT_TRUNCATE_WIDTH = 36

#: ``True`` to use Numba, if available, to speed up :doc:`aggregations
#: </aggregation>`.
USE_NUMBA = False

#: ``True`` to use Numba cache for JIT-compiled :doc:`aggregations
#: </aggregation>`, ``False`` to only keep compiled code in memory for the
#: duration of the session.
USE_NUMBA_CACHE = True

if not np.__version__.startswith("2."):
Expand Down Expand Up @@ -69,6 +87,7 @@ def check(x):
globals().pop("numba", None)
globals().pop("util", None)

from dataiter import dtypes # noqa
from dataiter.vector import Vector # noqa
from dataiter.data_frame import DataFrame # noqa
from dataiter.data_frame import DataFrameColumn # noqa
Expand Down
31 changes: 31 additions & 0 deletions dataiter/dtypes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# -*- coding: utf-8 -*-

# Copyright (c) 2024 Osmo Salomaa
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

from numpy.dtypes import StringDType

#: Instance of NumPy's variable-width ``StringDType`` used for string vectors.
#: Missing values are represented by a blank string (``na_object=""``).
string = StringDType(na_object="")

# Use a blank string as the missing value sentinel (1) because that's what we
# used prior to the NumPy 2.0 StringDType and (2) because in many cases, such
# as CSV input, a distinction between NA and blank cannot be made.
# https://numpy.org/doc/stable/user/basics.strings.html#missing-data-support
27 changes: 11 additions & 16 deletions dataiter/vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,11 @@
import numpy as np
import sys

from dataiter import dtypes
from dataiter import util
from math import inf
from numpy.dtypes import StringDType

# Use a blank string as missing value sentinel (1) because that's what we used
# prior to the NumPy 2.0 StringDType and (2) because with the common case of
# CSV input, a distinction between NA and blank cannot usually be made.
# https://numpy.org/doc/stable/user/basics.strings.html#missing-data-support
string_dtype = StringDType(na_object="")

TYPE_CONVERSIONS = {
datetime.date: "datetime64[D]",
datetime.datetime: "datetime64[us]",
Expand All @@ -55,7 +50,7 @@ class Vector(np.ndarray):

def __new__(cls, object, dtype=None):
if dtype is str:
dtype = string_dtype
dtype = dtypes.string
# If given a NumPy array, we can do a fast initialization.
if isinstance(object, np.ndarray):
dtype = dtype or object.dtype
Expand Down Expand Up @@ -178,7 +173,7 @@ def as_string(self):
>>> vector = di.Vector([1, 2, 3])
>>> vector.as_string()
"""
return self.astype(string_dtype)
return self.astype(dtypes.string)

def _check_dimensions(self):
if self.ndim == 1: return
Expand Down Expand Up @@ -256,7 +251,7 @@ def fast(cls, object, dtype=None):
# Evaluate generator/iterator.
object = list(object)
if dtype is str:
dtype = string_dtype
dtype = dtypes.string
return cls._np_array(object, dtype).view(cls)

def get_memory_use(self):
Expand Down Expand Up @@ -329,7 +324,7 @@ def is_na(self):
if self.is_float():
return np.isnan(self)
if self.is_string():
return self == string_dtype.na_object
return self == dtypes.string.na_object
# Can't use np.isin here since elements can be arrays.
return self.fast([x is None for x in self], bool)

Expand Down Expand Up @@ -377,7 +372,7 @@ def map(self, function, *args, dtype=None, **kwargs):
>>> vector.map(math.pow, 2)
"""
if dtype is str:
dtype = string_dtype
dtype = dtypes.string
return self.__class__((function(x, *args, **kwargs) for x in self), dtype)

@property
Expand Down Expand Up @@ -445,7 +440,7 @@ def na_value(self):
if self.is_integer():
return np.nan
if self.is_string():
return string_dtype.na_object
return dtypes.string.na_object
# Note that using None, e.g. for a boolean vector,
# might not work directly as it requires upcasting to object.
return None
Expand All @@ -456,13 +451,13 @@ def _np_array(object, dtype=None):
# In some cases we can only fix the dtype ex-post.
if dtype is None:
if util.unique_types(object) == {str}:
dtype = string_dtype
dtype = dtypes.string
if dtype is str:
dtype = string_dtype
dtype = dtypes.string
array = np.array(object, dtype)
if dtype is None:
if np.issubdtype(array.dtype, np.str_):
array = array.astype(string_dtype)
array = array.astype(dtypes.string)
return array

def range(self):
Expand Down Expand Up @@ -593,7 +588,7 @@ def _std_to_np(cls, seq, dtype=None):
# Convert missing values in seq to NumPy equivalents.
# Can be empty if all of seq are missing values.
if dtype is str:
dtype = string_dtype
dtype = dtypes.string
types = util.unique_types(seq)
if dtype is not None:
na = Vector.fast([], dtype).na_value
Expand Down
16 changes: 15 additions & 1 deletion doc/dataiter.rst
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,15 @@ type, i.e. :class:`.DataFrame` for CSV, NPZ and Parquet,
:func:`~dataiter.read_npz`
:func:`~dataiter.read_parquet`

The following constants can be used to customize certain defaults, such as
formatting and limits for printing.

:attr:`dataiter.PRINT_MAX_WIDTH`
:attr:`dataiter.PRINT_THOUSAND_SEPARATOR`
:attr:`dataiter.PRINT_TRUNCATE_WIDTH`
:attr:`dataiter.USE_NUMBA`
:attr:`dataiter.USE_NUMBA_CACHE`

.. automodule:: dataiter
:members: all,
any,
Expand All @@ -54,4 +63,9 @@ type, i.e. :class:`.DataFrame` for CSV, NPZ and Parquet,
read_parquet,
std,
sum,
var
var,
PRINT_MAX_WIDTH,
PRINT_THOUSAND_SEPARATOR,
PRINT_TRUNCATE_WIDTH,
USE_NUMBA,
USE_NUMBA_CACHE
7 changes: 7 additions & 0 deletions doc/dtypes.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
dataiter.dtypes
===============

Custom data types for vectors.

.. automodule:: dataiter.dtypes
:members:
1 change: 1 addition & 0 deletions doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,5 @@ Currently included are the following classes.
geojson
list-of-dicts
vector
dtypes
dt
6 changes: 3 additions & 3 deletions doc/requirements.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
attd==1.0
jinja2==3.1.3
numpy==2.0.1
pandas==2.2.2
pyarrow==17.0.0
numpy==2.0.2
pandas==2.2.3
pyarrow==18.1.0
sphinx==7.2.6
sphinx-rtd-theme==2.0.0
wcwidth==0.2.13
3 changes: 3 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
attd==1.0
click==8.1.7
flake8==7.1.1
jinja2==3.1.3
numba==0.60.0
numpy==2.0.2
pandas==2.2.3
pyarrow==18.1.0
pytest==8.3.4
sphinx==7.2.6
sphinx-rtd-theme==2.0.0
wcwidth==0.2.13

0 comments on commit 9f6a106

Please sign in to comment.