Skip to content

Commit

Permalink
Refactor
Browse files — browse the repository at this point in the history
  • Loading branch information
otsaloma committed Dec 14, 2024
1 parent bc77799 commit 58565cf
Show file tree
Hide file tree
Showing 9 changed files with 56 additions and 60 deletions.
9 changes: 4 additions & 5 deletions bin/di-csv2json
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,18 @@

import click
import dataiter as di
import re

from pathlib import Path

@click.command()
@click.command(no_args_is_help=True)
@click.option("-f", "--force", is_flag=True, default=False, help="Overwrite existing file")
@click.argument("file", nargs=-1, type=click.Path(exists=True))
def main(force, file):
"""Convert CSV file to JSON file."""
for input in file:
output = re.sub(r"\.(csv|CSV)$", "", input) + ".json"
for input in map(Path, file):
output = input.with_suffix(".json")
click.echo(f"{input}{output}")
if Path(output).exists() and not force:
if output.exists() and not force:
raise SystemExit(
f"Output file {output} exists, "
f"use -f/--force to overwrite")
Expand Down
2 changes: 1 addition & 1 deletion bin/di-format-geojson
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ import dataiter as di
import shutil
import time

@click.command()
@click.command(no_args_is_help=True)
@click.option("-i", "--indent", default=2, help="Indent level")
@click.option("-p", "--precision", default=9, help="Coordinate precision")
@click.argument("file", nargs=-1, type=click.Path(exists=True))
Expand Down
9 changes: 4 additions & 5 deletions bin/di-geojson2csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,18 @@

import click
import dataiter as di
import re

from pathlib import Path

@click.command()
@click.command(no_args_is_help=True)
@click.option("-f", "--force", is_flag=True, default=False, help="Overwrite existing file")
@click.argument("file", nargs=-1, type=click.Path(exists=True))
def main(force, file):
"""Convert GeoJSON file to CSV file."""
for input in file:
output = re.sub(r"\.(geojson|GEOJSON)$", "", input) + ".csv"
for input in map(Path, file):
output = input.with_suffix(".csv")
click.echo(f"{input}{output}")
if Path(output).exists() and not force:
if output.exists() and not force:
raise SystemExit(
f"Output file {output} exists, "
f"use -f/--force to overwrite")
Expand Down
9 changes: 4 additions & 5 deletions bin/di-json2csv
Original file line number Diff line number Diff line change
Expand Up @@ -2,19 +2,18 @@

import click
import dataiter as di
import re

from pathlib import Path

@click.command()
@click.command(no_args_is_help=True)
@click.option("-f", "--force", is_flag=True, default=False, help="Overwrite existing file")
@click.argument("file", nargs=-1, type=click.Path(exists=True))
def main(force, file):
"""Convert JSON file to CSV file."""
for input in file:
output = re.sub(r"\.(json|JSON)$", "", input) + ".csv"
for input in map(Path, file):
output = input.with_suffix(".csv")
click.echo(f"{input}{output}")
if Path(output).exists() and not force:
if output.exists() and not force:
raise SystemExit(
f"Output file {output} exists, "
f"use -f/--force to overwrite")
Expand Down
7 changes: 0 additions & 7 deletions data/neighbourhoods.py

This file was deleted.

20 changes: 11 additions & 9 deletions dataiter/test/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,24 +37,26 @@ def wrapper(path):
return wrapper

@cached
def data_frame(name):
    """Read test data file *name* as a DataFrame, cached across calls.

    The reader is picked from the file extension, e.g. "foo.csv" uses
    ``DataFrame.read_csv``.
    """
    path = get_data_path(name)
    reader = getattr(DataFrame, "read_" + path.suffix.lstrip("."))
    return reader(path)

@cached
def geojson(name):
    """Read test data file *name* as GeoJSON, cached across calls."""
    return GeoJSON.read(get_data_path(name))

def get_data_path(name):
    """Return the full path to test data file *name*.

    Look for a ``data`` directory in each parent directory of this
    file, nearest first, and return the first match.

    Raises ``FileNotFoundError`` if no parent has ``data/<name>``.
    """
    for parent in Path(__file__).parents:
        path = parent / "data" / name
        if path.exists():
            return path
    # Fail loudly here instead of implicitly returning None, which
    # would otherwise surface later as a confusing AttributeError
    # (e.g. None.suffix) in the callers.
    raise FileNotFoundError(f"Data file {name!r} not found")

@cached
def list_of_dicts(name):
    """Read test data file *name* as a ListOfDicts, cached across calls.

    The reader is picked from the file extension, e.g. "foo.json" uses
    ``ListOfDicts.read_json``.
    """
    path = get_data_path(name)
    reader = getattr(ListOfDicts, "read_" + path.suffix.lstrip("."))
    return reader(path)
5 changes: 5 additions & 0 deletions dataiter/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,3 +48,8 @@ def test_read_npz(self):
s1 = inspect.signature(io.read_npz)
s2 = inspect.signature(DataFrame.read_npz)
assert s1 == s2

def test_read_parquet(self):
    # read_parquet should expose the same signature at module level
    # and as a DataFrame method.
    expected = inspect.signature(DataFrame.read_parquet)
    assert inspect.signature(io.read_parquet) == expected
13 changes: 11 additions & 2 deletions dataiter/test/test_vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,11 +168,16 @@ def test__array_wrap___expect_scalar(self):
def test___len__(self):
    # len() should report the number of elements.
    vector = Vector([1, 2, 3])
    assert len(vector) == 3

def test_as_boolean_int(self):
    # Integer input: nonzero maps to True, zero to False.
    result = Vector([1, 0]).as_boolean()
    assert result.is_boolean()
    assert np.all(result == [True, False])

def test_as_boolean_string(self):
    # String input: any non-empty string is truthy, so both
    # "1" and "0" convert to True.
    result = Vector(["1", "0"]).as_boolean()
    assert result.is_boolean()
    assert np.all(result == [True, True])

def test_as_bytes(self):
a = Vector([0, 1]).as_bytes()
assert a.is_bytes()
Expand Down Expand Up @@ -619,6 +624,10 @@ def test_tolist_timedelta_datetime(self):
b = [datetime.timedelta(microseconds=1), datetime.timedelta(microseconds=1)]
assert Vector(a).tolist() == b

def test_unique_int(self):
    # unique() keeps the first occurrence of each value, in order,
    # with the missing value (None) counted as a distinct value.
    vector = Vector([1, 2, None, 1, 2, 3])
    assert vector.unique().tolist() == [1, 2, None, 3]

def test_unique_string(self):
    # Same first-occurrence semantics for string input.
    vector = Vector(["a", "b", None, "a", "b", "c"])
    assert vector.unique().tolist() == ["a", "b", None, "c"]
42 changes: 16 additions & 26 deletions dataiter/vector.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,19 +49,15 @@ class Vector(np.ndarray):
"""

def __new__(cls, object, dtype=None):
    # Accept the builtin str as an alias for the canonical string
    # dtype (see _map_input_dtype).
    dtype = cls._map_input_dtype(dtype)
    # If given a NumPy array, we can do a fast initialization.
    if isinstance(object, np.ndarray):
        dtype = dtype or object.dtype
        return cls._np_array(object, dtype).view(cls)
    # If given a Python list, or something else generic, we need
    # to convert certain types and special values. This is really
    # slow, see Vector.fast for faster initialization.
    # NOTE(review): util.sequencify presumably coerces iterators and
    # scalars into a list/tuple — confirm against dataiter.util.
    object = util.sequencify(object)
    return cls._std_to_np(object, dtype).view(cls)

def __init__(self, object, dtype=None):
Expand Down Expand Up @@ -102,11 +98,6 @@ def as_boolean(self):
>>> vector = di.Vector([0, 1])
>>> vector.as_boolean()
"""
if self.is_string():
# NumPy does bool(int(str)), which is weird.
# https://github.com/numpy/numpy/issues/20898
# https://github.com/numpy/numpy/pull/21024
return self.map(bool)
return self.astype(bool)

def as_bytes(self):
Expand Down Expand Up @@ -246,12 +237,8 @@ def fast(cls, object, dtype=None):
`object`. Use this only if you know `object` doesn't contain special
values or if you know they are already of the correct type.
"""
if (hasattr(object, "__iter__") and
not isinstance(object, (np.ndarray, list, tuple))):
# Evaluate generator/iterator.
object = list(object)
if dtype is str:
dtype = dtypes.string
object = util.sequencify(object)
dtype = cls._map_input_dtype(dtype)
return cls._np_array(object, dtype).view(cls)

def get_memory_use(self):
Expand Down Expand Up @@ -371,10 +358,15 @@ def map(self, function, *args, dtype=None, **kwargs):
>>> vector = di.Vector(range(10))
>>> vector.map(math.pow, 2)
"""
if dtype is str:
dtype = dtypes.string
dtype = self._map_input_dtype(dtype)
return self.__class__((function(x, *args, **kwargs) for x in self), dtype)

@classmethod
def _map_input_dtype(cls, dtype):
    # Allow callers to pass the builtin str as a shorthand for the
    # canonical string dtype; anything else passes through as-is.
    return dtypes.string if dtype is str else dtype

@property
def na_dtype(self):
"""
Expand Down Expand Up @@ -445,15 +437,14 @@ def na_value(self):
# might not work directly as it requires upcasting to object.
return None

@staticmethod
def _np_array(object, dtype=None):
@classmethod
def _np_array(cls, object, dtype=None):
# NumPy still defaults to fixed width strings.
# In some cases we can only fix the dtype ex-post.
if dtype is None:
if util.unique_types(object) == {str}:
dtype = dtypes.string
if dtype is str:
dtype = dtypes.string
dtype = cls._map_input_dtype(dtype)
array = np.array(object, dtype)
if dtype is None:
if np.issubdtype(array.dtype, np.str_):
Expand Down Expand Up @@ -587,8 +578,7 @@ def sort(self, *, dir=1):
def _std_to_np(cls, seq, dtype=None):
# Convert missing values in seq to NumPy equivalents.
# Can be empty if all of seq are missing values.
if dtype is str:
dtype = dtypes.string
dtype = cls._map_input_dtype(dtype)
types = util.unique_types(seq)
if dtype is not None:
na = Vector.fast([], dtype).na_value
Expand Down Expand Up @@ -622,7 +612,7 @@ def _std_to_np_na_value(cls, types):
if not types:
return None
if str in types:
return ""
return dtypes.string.na_object
if all(x in [float, int] or
np.issubdtype(x, np.floating) or
np.issubdtype(x, np.integer)
Expand Down

0 comments on commit 58565cf

Please sign in to comment.