Add dtypes module, refactor, document

otsaloma · Dec 8, 2024 · 9f6a106 · 9f6a106
1 parent 64946a5
commit 9f6a106
Show file tree

Hide file tree

Showing 9 changed files with 92 additions and 22 deletions.
diff --git a/Makefile b/Makefile
@@ -25,7 +25,7 @@ clean:
 	rm -rf */*/.pytest_cache
 
 doc:
-	$(MAKE) -C doc clean html
+	$(MAKE) SPHINXBUILD=../venv/bin/sphinx-build -C doc clean html
 
 doc-check:
 	PYTHONPATH=. doc/check.py
@@ -34,7 +34,7 @@ doc-open:
 	xdg-open doc/_build/html/index.html
 
 doc-watch:
-	watchexec -e py,rst --workdir doc $(MAKE) html
+	watchexec -e py,rst --workdir doc $(MAKE) SPHINXBUILD=../venv/bin/sphinx-build html
 
 install:
 	pip3 install --break-system-packages .

diff --git a/dataiter/__init__.py b/dataiter/__init__.py
@@ -34,10 +34,28 @@
 PRINT_MAX_ELEMENTS = 100
 PRINT_MAX_ITEMS = 10
 PRINT_MAX_ROWS = 100
+
+#: Maximum number of columns to wrap print output to. Note that this is only a
+#: fallback in case Python's ``shutil.get_terminal_size`` fails to detect the
+#: width of your terminal. By default the detected full width is used.
 PRINT_MAX_WIDTH = 80
+
+#: Thousand separator to use when formatting numbers. By default this is blank,
+#: meaning no thousand separators are rendered.
 PRINT_THOUSAND_SEPARATOR = ""
+
+#: Maximum width to truncate string columns to in :class:`DataFrame` print
+#: output. When this is exceeded, strings will be cut with an ellipsis (``…``)
+#: at the end.
 PRINT_TRUNCATE_WIDTH = 36
+
+#: ``True`` to use Numba, if available, to speed up :doc:`aggregations
+#: </aggregation>`.
 USE_NUMBA = False
+
+#: ``True`` to use Numba cache for JIT-compiled :doc:`aggregations
+#: </aggregation>`, ``False`` to only keep compiled code in memory for the
+#: duration of the session.
 USE_NUMBA_CACHE = True
 
 if not np.__version__.startswith("2."):
@@ -69,6 +87,7 @@ def check(x):
 globals().pop("numba", None)
 globals().pop("util", None)
 
+from dataiter import dtypes # noqa
 from dataiter.vector import Vector # noqa
 from dataiter.data_frame import DataFrame # noqa
 from dataiter.data_frame import DataFrameColumn # noqa

diff --git a/dataiter/dtypes.py b/dataiter/dtypes.py
@@ -0,0 +1,31 @@
+# -*- coding: utf-8 -*-
+
+# Copyright (c) 2024 Osmo Salomaa
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+
+from numpy.dtypes import StringDType
+
+#: Instance of NumPy's variable-width ``StringDType`` used for string vectors.
+string = StringDType(na_object="")
+
+# Use a blank string as missing value sentinel (1) because that's what we used
+# prior to the NumPy 2.0 StringDType and (2) because in many cases, such as CSV
+# input, a distinction between NA and blank cannot be made.
+# https://numpy.org/doc/stable/user/basics.strings.html#missing-data-support
diff --git a/dataiter/vector.py b/dataiter/vector.py
@@ -25,16 +25,11 @@
 import numpy as np
 import sys
 
+from dataiter import dtypes
 from dataiter import util
 from math import inf
 from numpy.dtypes import StringDType
 
-# Use a blank string as missing value sentinel (1) because that's what we used
-# prior to the NumPy 2.0 StringDType and (2) because with the common case of
-# CSV input, a distinction between NA and blank cannot usually be made.
-# https://numpy.org/doc/stable/user/basics.strings.html#missing-data-support
-string_dtype = StringDType(na_object="")
-
 TYPE_CONVERSIONS = {
     datetime.date: "datetime64[D]",
     datetime.datetime: "datetime64[us]",
@@ -55,7 +50,7 @@ class Vector(np.ndarray):
 
     def __new__(cls, object, dtype=None):
         if dtype is str:
-            dtype = string_dtype
+            dtype = dtypes.string
         # If given a NumPy array, we can do a fast initialization.
         if isinstance(object, np.ndarray):
             dtype = dtype or object.dtype
@@ -178,7 +173,7 @@ def as_string(self):
         >>> vector = di.Vector([1, 2, 3])
         >>> vector.as_string()
         """
-        return self.astype(string_dtype)
+        return self.astype(dtypes.string)
 
     def _check_dimensions(self):
         if self.ndim == 1: return
@@ -256,7 +251,7 @@ def fast(cls, object, dtype=None):
             # Evaluate generator/iterator.
             object = list(object)
         if dtype is str:
-            dtype = string_dtype
+            dtype = dtypes.string
         return cls._np_array(object, dtype).view(cls)
 
     def get_memory_use(self):
@@ -329,7 +324,7 @@ def is_na(self):
         if self.is_float():
             return np.isnan(self)
         if self.is_string():
-            return self == string_dtype.na_object
+            return self == dtypes.string.na_object
         # Can't use np.isin here since elements can be arrays.
         return self.fast([x is None for x in self], bool)
 
@@ -377,7 +372,7 @@ def map(self, function, *args, dtype=None, **kwargs):
         >>> vector.map(math.pow, 2)
         """
         if dtype is str:
-            dtype = string_dtype
+            dtype = dtypes.string
         return self.__class__((function(x, *args, **kwargs) for x in self), dtype)
 
     @property
@@ -445,7 +440,7 @@ def na_value(self):
         if self.is_integer():
             return np.nan
         if self.is_string():
-            return string_dtype.na_object
+            return dtypes.string.na_object
         # Note that using None, e.g. for a boolean vector,
         # might not work directly as it requires upcasting to object.
         return None
@@ -456,13 +451,13 @@ def _np_array(object, dtype=None):
         # In some cases we can only fix the dtype ex-post.
         if dtype is None:
             if util.unique_types(object) == {str}:
-                dtype = string_dtype
+                dtype = dtypes.string
         if dtype is str:
-            dtype = string_dtype
+            dtype = dtypes.string
         array = np.array(object, dtype)
         if dtype is None:
             if np.issubdtype(array.dtype, np.str_):
-                array = array.astype(string_dtype)
+                array = array.astype(dtypes.string)
         return array
 
     def range(self):
@@ -593,7 +588,7 @@ def _std_to_np(cls, seq, dtype=None):
         # Convert missing values in seq to NumPy equivalents.
         # Can be empty if all of seq are missing values.
         if dtype is str:
-            dtype = string_dtype
+            dtype = dtypes.string
         types = util.unique_types(seq)
         if dtype is not None:
             na = Vector.fast([], dtype).na_value

diff --git a/doc/dataiter.rst b/doc/dataiter.rst
@@ -33,6 +33,15 @@ type, i.e. :class:`.DataFrame` for CSV, NPZ and Parquet,
 :func:`~dataiter.read_npz`
 :func:`~dataiter.read_parquet`
 
+The following constants can be used to customize certain defaults, such as
+formatting and limits for printing.
+
+:attr:`dataiter.PRINT_MAX_WIDTH`
+:attr:`dataiter.PRINT_THOUSAND_SEPARATOR`
+:attr:`dataiter.PRINT_TRUNCATE_WIDTH`
+:attr:`dataiter.USE_NUMBA`
+:attr:`dataiter.USE_NUMBA_CACHE`
+
 .. automodule:: dataiter
    :members: all,
              any,
@@ -54,4 +63,9 @@ type, i.e. :class:`.DataFrame` for CSV, NPZ and Parquet,
              read_parquet,
              std,
              sum,
-             var
+             var,
+             PRINT_MAX_WIDTH,
+             PRINT_THOUSAND_SEPARATOR,
+             PRINT_TRUNCATE_WIDTH,
+             USE_NUMBA,
+             USE_NUMBA_CACHE
diff --git a/doc/dtypes.rst b/doc/dtypes.rst
@@ -0,0 +1,7 @@
+dataiter.dtypes
+===============
+
+Custom data types for vectors.
+
+.. automodule:: dataiter.dtypes
+   :members:
diff --git a/doc/index.rst b/doc/index.rst
@@ -49,4 +49,5 @@ Currently included are the following classes.
    geojson
    list-of-dicts
    vector
+   dtypes
    dt
diff --git a/doc/requirements.txt b/doc/requirements.txt
@@ -1,8 +1,8 @@
 attd==1.0
 jinja2==3.1.3
-numpy==2.0.1
-pandas==2.2.2
-pyarrow==17.0.0
+numpy==2.0.2
+pandas==2.2.3
+pyarrow==18.1.0
 sphinx==7.2.6
 sphinx-rtd-theme==2.0.0
 wcwidth==0.2.13
diff --git a/requirements.txt b/requirements.txt
@@ -1,9 +1,12 @@
 attd==1.0
 click==8.1.7
 flake8==7.1.1
+jinja2==3.1.3
 numba==0.60.0
 numpy==2.0.2
 pandas==2.2.3
 pyarrow==18.1.0
 pytest==8.3.4
+sphinx==7.2.6
+sphinx-rtd-theme==2.0.0
 wcwidth==0.2.13
-Original file line number
+Diff line change
@@ Expand Up / @@ -49,4 +49,5 @@ Currently included are the following classes. @@
        geojson
        list-of-dicts
        vector
+       dtypes
        dt