Skip to content

Commit

Permalink
Refactor
Browse files — browse the repository at this point in the history
  • Loading branch information
otsaloma committed Dec 14, 2024
1 parent bc77799 commit 58565cf
Show file tree
Hide file tree
Showing 9 changed files with 56 additions and 60 deletions.
9 changes: 4 additions & 5 deletions bin/di-csv2json
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,18 @@

import click
import dataiter as di
import re

from pathlib import Path

@click.command()
@click.command(no_args_is_help=True)
@click.option("-f", "--force", is_flag=True, default=False, help="Overwrite existing file")
@click.argument("file", nargs=-1, type=click.Path(exists=True))
def main(force, file):
"""Convert CSV file to JSON file."""
for input in file:
output = re.sub(r"\.(csv|CSV)$", "", input) + ".json"
for input in map(Path, file):
output = input.with_suffix(".json")
click.echo(f"{input}{output}")
if Path(output).exists() and not force:
if output.exists() and not force:
raise SystemExit(
f"Output file {output} exists, "
f"use -f/--force to overwrite")
Expand Down
2 changes: 1 addition & 1 deletion bin/di-format-geojson
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import dataiter as di
import shutil
import time

@click.command()
@click.command(no_args_is_help=True)
@click.option("-i", "--indent", default=2, help="Indent level")
@click.option("-p", "--precision", default=9, help="Coordinate precision")
@click.argument("file", nargs=-1, type=click.Path(exists=True))
Expand Down
9 changes: 4 additions & 5 deletions bin/di-geojson2csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,18 @@

import click
import dataiter as di
import re

from pathlib import Path

@click.command()
@click.command(no_args_is_help=True)
@click.option("-f", "--force", is_flag=True, default=False, help="Overwrite existing file")
@click.argument("file", nargs=-1, type=click.Path(exists=True))
def main(force, file):
"""Convert GeoJSON file to CSV file."""
for input in file:
output = re.sub(r"\.(geojson|GEOJSON)$", "", input) + ".csv"
for input in map(Path, file):
output = input.with_suffix(".csv")
click.echo(f"{input}{output}")
if Path(output).exists() and not force:
if output.exists() and not force:
raise SystemExit(
f"Output file {output} exists, "
f"use -f/--force to overwrite")
Expand Down
9 changes: 4 additions & 5 deletions bin/di-json2csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,18 @@

import click
import dataiter as di
import re

from pathlib import Path

@click.command()
@click.command(no_args_is_help=True)
@click.option("-f", "--force", is_flag=True, default=False, help="Overwrite existing file")
@click.argument("file", nargs=-1, type=click.Path(exists=True))
def main(force, file):
"""Convert JSON file to CSV file."""
for input in file:
output = re.sub(r"\.(json|JSON)$", "", input) + ".csv"
for input in map(Path, file):
output = input.with_suffix(".csv")
click.echo(f"{input}{output}")
if Path(output).exists() and not force:
if output.exists() and not force:
raise SystemExit(
f"Output file {output} exists, "
f"use -f/--force to overwrite")
Expand Down
7 changes: 0 additions & 7 deletions data/neighbourhoods.py

This file was deleted.

20 changes: 11 additions & 9 deletions dataiter/test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,24 +37,26 @@ def wrapper(path):
return wrapper

@cached
def data_frame(name):
    """Read test data file *name* as a DataFrame, cached across calls.

    The reader is picked from the file extension, e.g. "foo.csv" uses
    ``DataFrame.read_csv``.
    """
    path = get_data_path(name)
    reader = getattr(DataFrame, "read_" + path.suffix.lstrip("."))
    return reader(path)

@cached
def geojson(name):
    """Read test data file *name* as GeoJSON, cached across calls."""
    return GeoJSON.read(get_data_path(name))

def get_data_path(name):
    """Return the full path to test data file *name*.

    Look for a ``data`` directory in each parent directory of this
    file, nearest first, and return the first match.

    Raises ``FileNotFoundError`` if no parent has ``data/<name>``.
    """
    for parent in Path(__file__).parents:
        path = parent / "data" / name
        if path.exists():
            return path
    # Fail loudly here instead of implicitly returning None, which
    # would otherwise surface later as a confusing AttributeError
    # (e.g. None.suffix) in the callers.
    raise FileNotFoundError(f"Data file {name!r} not found")

@cached
def list_of_dicts(name):
    """Read test data file *name* as a ListOfDicts, cached across calls.

    The reader is picked from the file extension, e.g. "foo.json" uses
    ``ListOfDicts.read_json``.
    """
    path = get_data_path(name)
    reader = getattr(ListOfDicts, "read_" + path.suffix.lstrip("."))
    return reader(path)
5 changes: 5 additions & 0 deletions dataiter/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,8 @@ def test_read_npz(self):
s1 = inspect.signature(io.read_npz)
s2 = inspect.signature(DataFrame.read_npz)
assert s1 == s2

def test_read_parquet(self):
    # read_parquet should expose the same signature at module level
    # and as a DataFrame method.
    expected = inspect.signature(DataFrame.read_parquet)
    assert inspect.signature(io.read_parquet) == expected
13 changes: 11 additions & 2 deletions dataiter/test/test_vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,11 +168,16 @@ def test__array_wrap___expect_scalar(self):
def test___len__(self):
    # len() should report the number of elements.
    vector = Vector([1, 2, 3])
    assert len(vector) == 3

def test_as_boolean_int(self):
    # Integer input: nonzero maps to True, zero to False.
    result = Vector([1, 0]).as_boolean()
    assert result.is_boolean()
    assert np.all(result == [True, False])

def test_as_boolean_string(self):
    # String input: any non-empty string is truthy, so both
    # "1" and "0" convert to True.
    result = Vector(["1", "0"]).as_boolean()
    assert result.is_boolean()
    assert np.all(result == [True, True])

def test_as_bytes(self):
a = Vector([0, 1]).as_bytes()
assert a.is_bytes()
Expand Down Expand Up @@ -619,6 +624,10 @@ def test_tolist_timedelta_datetime(self):
b = [datetime.timedelta(microseconds=1), datetime.timedelta(microseconds=1)]
assert Vector(a).tolist() == b

def test_unique_int(self):
    # unique() keeps the first occurrence of each value, in order,
    # with the missing value (None) counted as a distinct value.
    vector = Vector([1, 2, None, 1, 2, 3])
    assert vector.unique().tolist() == [1, 2, None, 3]

def test_unique_string(self):
    # Same first-occurrence semantics for string input.
    vector = Vector(["a", "b", None, "a", "b", "c"])
    assert vector.unique().tolist() == ["a", "b", None, "c"]
42 changes: 16 additions & 26 deletions dataiter/vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,15 @@ class Vector(np.ndarray):
"""

def __new__(cls, object, dtype=None):
    # Accept the builtin str as an alias for the canonical string
    # dtype (see _map_input_dtype).
    dtype = cls._map_input_dtype(dtype)
    # If given a NumPy array, we can do a fast initialization.
    if isinstance(object, np.ndarray):
        dtype = dtype or object.dtype
        return cls._np_array(object, dtype).view(cls)
    # If given a Python list, or something else generic, we need
    # to convert certain types and special values. This is really
    # slow, see Vector.fast for faster initialization.
    # NOTE(review): util.sequencify presumably coerces iterators and
    # scalars into a list/tuple — confirm against dataiter.util.
    object = util.sequencify(object)
    return cls._std_to_np(object, dtype).view(cls)

def __init__(self, object, dtype=None):
Expand Down Expand Up @@ -102,11 +98,6 @@ def as_boolean(self):
>>> vector = di.Vector([0, 1])
>>> vector.as_boolean()
"""
if self.is_string():
# NumPy does bool(int(str)), which is weird.
# https://github.com/numpy/numpy/issues/20898
# https://github.com/numpy/numpy/pull/21024
return self.map(bool)
return self.astype(bool)

def as_bytes(self):
Expand Down Expand Up @@ -246,12 +237,8 @@ def fast(cls, object, dtype=None):
`object`. Use this only if you know `object` doesn't contain special
values or if you know they are already of the correct type.
"""
if (hasattr(object, "__iter__") and
not isinstance(object, (np.ndarray, list, tuple))):
# Evaluate generator/iterator.
object = list(object)
if dtype is str:
dtype = dtypes.string
object = util.sequencify(object)
dtype = cls._map_input_dtype(dtype)
return cls._np_array(object, dtype).view(cls)

def get_memory_use(self):
Expand Down Expand Up @@ -371,10 +358,15 @@ def map(self, function, *args, dtype=None, **kwargs):
>>> vector = di.Vector(range(10))
>>> vector.map(math.pow, 2)
"""
if dtype is str:
dtype = dtypes.string
dtype = self._map_input_dtype(dtype)
return self.__class__((function(x, *args, **kwargs) for x in self), dtype)

@classmethod
def _map_input_dtype(cls, dtype):
    # Allow callers to pass the builtin str as a shorthand for the
    # canonical string dtype; anything else passes through as-is.
    return dtypes.string if dtype is str else dtype

@property
def na_dtype(self):
"""
Expand Down Expand Up @@ -445,15 +437,14 @@ def na_value(self):
# might not work directly as it requires upcasting to object.
return None

@staticmethod
def _np_array(object, dtype=None):
@classmethod
def _np_array(cls, object, dtype=None):
# NumPy still defaults to fixed width strings.
# In some cases we can only fix the dtype ex-post.
if dtype is None:
if util.unique_types(object) == {str}:
dtype = dtypes.string
if dtype is str:
dtype = dtypes.string
dtype = cls._map_input_dtype(dtype)
array = np.array(object, dtype)
if dtype is None:
if np.issubdtype(array.dtype, np.str_):
Expand Down Expand Up @@ -587,8 +578,7 @@ def sort(self, *, dir=1):
def _std_to_np(cls, seq, dtype=None):
# Convert missing values in seq to NumPy equivalents.
# Can be empty if all of seq are missing values.
if dtype is str:
dtype = dtypes.string
dtype = cls._map_input_dtype(dtype)
types = util.unique_types(seq)
if dtype is not None:
na = Vector.fast([], dtype).na_value
Expand Down Expand Up @@ -622,7 +612,7 @@ def _std_to_np_na_value(cls, types):
if not types:
return None
if str in types:
return ""
return dtypes.string.na_object
if all(x in [float, int] or
np.issubdtype(x, np.floating) or
np.issubdtype(x, np.integer)
Expand Down

0 comments on commit 58565cf

Please sign in to comment.