Skip to content

Commit

Permalink
Add more integer dtypes (#769)
Browse files Browse the repository at this point in the history
* Add support for more signed integer dtypes

Now it's possible to read files that contain those types.

* Add support for encoding 8/16/32 bit series and elements

* Support creating 8-bit and 16-bit signed integer series

* Lift conversion of integer dtypes when reading df from IO

* Add unsigned integer dtypes support

This also removes some conversions and simplifies things on the Rust side.
  • Loading branch information
Philip Sampaio authored Dec 13, 2023
1 parent 7550921 commit 4dab528
Show file tree
Hide file tree
Showing 15 changed files with 255 additions and 83 deletions.
7 changes: 6 additions & 1 deletion lib/explorer/backend/lazy_series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -848,7 +848,12 @@ defmodule Explorer.Backend.LazySeries do

open = A.color("(", :list, opts)
close = A.color(")", :list, opts)
dtype = A.color("#{Series.dtype(series)}", :atom, opts)

dtype =
series
|> Series.dtype()
|> Explorer.Shared.dtype_to_string()
|> A.color(:atom, opts)

A.concat([
A.color("LazySeries[???]", :atom, opts),
Expand Down
6 changes: 3 additions & 3 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5623,9 +5623,9 @@ defmodule Explorer.DataFrame do
iex> Explorer.DataFrame.nil_count(df)
#Explorer.DataFrame<
Polars[1 x 3]
a integer [1]
b integer [2]
c integer [0]
a u32 [1]
b u32 [2]
c u32 [0]
>
"""
@doc type: :single
Expand Down
6 changes: 6 additions & 0 deletions lib/explorer/polars_backend/native.ex
Original file line number Diff line number Diff line change
Expand Up @@ -323,8 +323,14 @@ defmodule Explorer.PolarsBackend.Native do
# Series constructors from a list of values: one stub per element type,
# each taking the series name and the list (plus a precision for durations).
# Every stub here just calls err().
# NOTE(review): presumably these are placeholders overridden by the native
# implementation when it is loaded — confirm against err/0 and the NIF setup.
def s_from_list_duration(_name, _val, _precision), do: err()
def s_from_list_f32(_name, _val), do: err()
def s_from_list_f64(_name, _val), do: err()
def s_from_list_i8(_name, _val), do: err()
def s_from_list_i16(_name, _val), do: err()
def s_from_list_i32(_name, _val), do: err()
def s_from_list_i64(_name, _val), do: err()
def s_from_list_u8(_name, _val), do: err()
def s_from_list_u16(_name, _val), do: err()
def s_from_list_u32(_name, _val), do: err()
def s_from_list_u64(_name, _val), do: err()
def s_from_list_str(_name, _val), do: err()
def s_from_list_binary(_name, _val), do: err()
def s_from_list_categories(_name, _val), do: err()
Expand Down
11 changes: 11 additions & 0 deletions lib/explorer/polars_backend/shared.ex
Original file line number Diff line number Diff line change
Expand Up @@ -133,6 +133,17 @@ defmodule Explorer.PolarsBackend.Shared do
def from_list(list, dtype, name) when is_list(list) do
case dtype do
:integer -> Native.s_from_list_i64(name, list)
# Signed integers
{:s, 8} -> Native.s_from_list_i8(name, list)
{:s, 16} -> Native.s_from_list_i16(name, list)
{:s, 32} -> Native.s_from_list_i32(name, list)
{:s, 64} -> Native.s_from_list_i64(name, list)
# Unsigned integers
{:u, 8} -> Native.s_from_list_u8(name, list)
{:u, 16} -> Native.s_from_list_u16(name, list)
{:u, 32} -> Native.s_from_list_u32(name, list)
{:u, 64} -> Native.s_from_list_u64(name, list)
# Floats
{:f, 32} -> Native.s_from_list_f32(name, list)
{:f, 64} -> Native.s_from_list_f64(name, list)
:boolean -> Native.s_from_list_bool(name, list)
Expand Down
60 changes: 30 additions & 30 deletions lib/explorer/series.ex
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ defmodule Explorer.Series do
* `{:datetime, precision}` - DateTime type with millisecond/microsecond/nanosecond precision that unwraps to `Elixir.NaiveDateTime`
* `{:duration, precision}` - Duration type with millisecond/microsecond/nanosecond precision that unwraps to `Explorer.Duration`
* `{:f, size}` - a 64-bit or 32-bit floating point number
* `{:s, size}` - an 8-bit, 16-bit, 32-bit or 64-bit signed integer number
* `{:u, size}` - an 8-bit, 16-bit, 32-bit or 64-bit unsigned integer number
* `:integer` - 64-bit signed integer
* `:string` - UTF-8 encoded binary
* `:time` - Time type that unwraps to `Elixir.Time`
Expand Down Expand Up @@ -99,9 +101,11 @@ defmodule Explorer.Series do
@datetime_dtypes Explorer.Shared.datetime_types()
@duration_dtypes Explorer.Shared.duration_types()
@float_dtypes Explorer.Shared.float_types()
@integer_types Explorer.Shared.integer_types()

@date_or_datetime_dtypes [:date | @datetime_dtypes]
@temporal_dtypes [:time | @date_or_datetime_dtypes ++ @duration_dtypes]
@numeric_dtypes [:integer | @float_dtypes]
@numeric_dtypes @integer_types ++ @float_dtypes
@numeric_or_temporal_dtypes @numeric_dtypes ++ @temporal_dtypes

@io_dtypes Shared.dtypes() -- [:binary, :string, {:list, :any}, {:struct, :any}]
Expand All @@ -116,6 +120,14 @@ defmodule Explorer.Series do
| duration_dtype
| {:f, 32}
| {:f, 64}
| {:s, 8}
| {:s, 16}
| {:s, 32}
| {:s, 64}
| {:u, 8}
| {:u, 16}
| {:u, 32}
| {:u, 64}
| :integer
| :string
| list_dtype
Expand Down Expand Up @@ -151,25 +163,13 @@ defmodule Explorer.Series do

defguardp is_io_dtype(dtype) when K.in(dtype, @io_dtypes)

defguardp is_numeric_dtype(dtype) when K.in(dtype, [{:f, 32}, {:f, 64}, :integer])
defguardp is_numeric_dtype(dtype) when K.in(dtype, @numeric_dtypes)

defguardp is_numeric_or_bool_dtype(dtype)
when K.in(dtype, [{:f, 32}, {:f, 64}, :integer, :boolean])
when K.in(dtype, [:boolean | @numeric_dtypes])

defguardp is_numeric_or_temporal_dtype(dtype)
when K.in(dtype, [
{:f, 32},
{:f, 64},
:integer,
:date,
:time,
{:datetime, :nanosecond},
{:datetime, :microsecond},
{:datetime, :millisecond},
{:duration, :nanosecond},
{:duration, :microsecond},
{:duration, :millisecond}
])
when K.in(dtype, @numeric_or_temporal_dtypes)

@impl true
def fetch(series, idx) when is_integer(idx), do: {:ok, fetch!(series, idx)}
Expand Down Expand Up @@ -2138,15 +2138,15 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list([~D[2021-01-01], ~D[1999-12-31]])
iex> Explorer.Series.sum(s)
** (ArgumentError) Explorer.Series.sum/1 not implemented for dtype :date. Valid dtypes are [:integer, {:f, 32}, {:f, 64}, :boolean]
** (ArgumentError) Explorer.Series.sum/1 not implemented for dtype :date. Valid dtypes are [:boolean, {:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}]
"""
@doc type: :aggregation
@spec sum(series :: Series.t()) :: number() | non_finite() | nil
def sum(%Series{dtype: dtype} = series) when is_numeric_or_bool_dtype(dtype),
do: apply_series(series, :sum)

def sum(%Series{dtype: dtype}),
do: dtype_error("sum/1", dtype, [:integer, {:f, 32}, {:f, 64}, :boolean])
do: dtype_error("sum/1", dtype, [:boolean | @numeric_dtypes])

@doc """
Gets the minimum value of the series.
Expand Down Expand Up @@ -2185,7 +2185,7 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list(["a", "b", "c"])
iex> Explorer.Series.min(s)
** (ArgumentError) Explorer.Series.min/1 not implemented for dtype :string. Valid dtypes are [:integer, {:f, 32}, {:f, 64}, :time, :date, {:datetime, :nanosecond}, {:datetime, :microsecond}, {:datetime, :millisecond}, {:duration, :nanosecond}, {:duration, :microsecond}, {:duration, :millisecond}]
** (ArgumentError) Explorer.Series.min/1 not implemented for dtype :string. Valid dtypes are [{:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}, :time, :date, {:datetime, :nanosecond}, {:datetime, :microsecond}, {:datetime, :millisecond}, {:duration, :nanosecond}, {:duration, :microsecond}, {:duration, :millisecond}]
"""
@doc type: :aggregation
@spec min(series :: Series.t()) ::
Expand Down Expand Up @@ -2232,7 +2232,7 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list(["a", "b", "c"])
iex> Explorer.Series.max(s)
** (ArgumentError) Explorer.Series.max/1 not implemented for dtype :string. Valid dtypes are [:integer, {:f, 32}, {:f, 64}, :time, :date, {:datetime, :nanosecond}, {:datetime, :microsecond}, {:datetime, :millisecond}, {:duration, :nanosecond}, {:duration, :microsecond}, {:duration, :millisecond}]
** (ArgumentError) Explorer.Series.max/1 not implemented for dtype :string. Valid dtypes are [{:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}, :time, :date, {:datetime, :nanosecond}, {:datetime, :microsecond}, {:datetime, :millisecond}, {:duration, :nanosecond}, {:duration, :microsecond}, {:duration, :millisecond}]
"""
@doc type: :aggregation
@spec max(series :: Series.t()) ::
Expand Down Expand Up @@ -2283,7 +2283,7 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list(["a", "b", "c"])
iex> Explorer.Series.argmax(s)
** (ArgumentError) Explorer.Series.argmax/1 not implemented for dtype :string. Valid dtypes are [:integer, {:f, 32}, {:f, 64}, :time, :date, {:datetime, :nanosecond}, {:datetime, :microsecond}, {:datetime, :millisecond}, {:duration, :nanosecond}, {:duration, :microsecond}, {:duration, :millisecond}]
** (ArgumentError) Explorer.Series.argmax/1 not implemented for dtype :string. Valid dtypes are [{:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}, :time, :date, {:datetime, :nanosecond}, {:datetime, :microsecond}, {:datetime, :millisecond}, {:duration, :nanosecond}, {:duration, :microsecond}, {:duration, :millisecond}]
"""
@doc type: :aggregation
@spec argmax(series :: Series.t()) :: number() | non_finite() | nil
Expand Down Expand Up @@ -2342,7 +2342,7 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list(["a", "b", "c"])
iex> Explorer.Series.argmin(s)
** (ArgumentError) Explorer.Series.argmin/1 not implemented for dtype :string. Valid dtypes are [:integer, {:f, 32}, {:f, 64}, :time, :date, {:datetime, :nanosecond}, {:datetime, :microsecond}, {:datetime, :millisecond}, {:duration, :nanosecond}, {:duration, :microsecond}, {:duration, :millisecond}]
** (ArgumentError) Explorer.Series.argmin/1 not implemented for dtype :string. Valid dtypes are [{:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}, :time, :date, {:datetime, :nanosecond}, {:datetime, :microsecond}, {:datetime, :millisecond}, {:duration, :nanosecond}, {:duration, :microsecond}, {:duration, :millisecond}]
"""
@doc type: :aggregation
@spec argmin(series :: Series.t()) :: number() | non_finite() | nil
Expand Down Expand Up @@ -2373,7 +2373,7 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list([~D[2021-01-01], ~D[1999-12-31]])
iex> Explorer.Series.mean(s)
** (ArgumentError) Explorer.Series.mean/1 not implemented for dtype :date. Valid dtypes are [:integer, {:f, 32}, {:f, 64}]
** (ArgumentError) Explorer.Series.mean/1 not implemented for dtype :date. Valid dtypes are [{:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}]
"""
@doc type: :aggregation
@spec mean(series :: Series.t()) :: float() | non_finite() | nil
Expand Down Expand Up @@ -2444,7 +2444,7 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list([~D[2021-01-01], ~D[1999-12-31]])
iex> Explorer.Series.median(s)
** (ArgumentError) Explorer.Series.median/1 not implemented for dtype :date. Valid dtypes are [:integer, {:f, 32}, {:f, 64}]
** (ArgumentError) Explorer.Series.median/1 not implemented for dtype :date. Valid dtypes are [{:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}]
"""
@doc type: :aggregation
@spec median(series :: Series.t()) :: float() | non_finite() | nil
Expand Down Expand Up @@ -2479,7 +2479,7 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list([~N[2021-01-01 00:00:00], ~N[1999-12-31 00:00:00]])
iex> Explorer.Series.variance(s)
** (ArgumentError) Explorer.Series.variance/1 not implemented for dtype {:datetime, :microsecond}. Valid dtypes are [:integer, {:f, 32}, {:f, 64}]
** (ArgumentError) Explorer.Series.variance/1 not implemented for dtype {:datetime, :microsecond}. Valid dtypes are [{:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}]
"""
@doc type: :aggregation
@spec variance(series :: Series.t(), ddof :: non_neg_integer()) :: float() | non_finite() | nil
Expand Down Expand Up @@ -2515,7 +2515,7 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list(["a", "b", "c"])
iex> Explorer.Series.standard_deviation(s)
** (ArgumentError) Explorer.Series.standard_deviation/1 not implemented for dtype :string. Valid dtypes are [:integer, {:f, 32}, {:f, 64}]
** (ArgumentError) Explorer.Series.standard_deviation/1 not implemented for dtype :string. Valid dtypes are [{:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}]
"""
@doc type: :aggregation
@spec standard_deviation(series :: Series.t(), ddof :: non_neg_integer()) ::
Expand Down Expand Up @@ -2552,7 +2552,7 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list([true, false, true])
iex> Explorer.Series.product(s)
** (ArgumentError) Explorer.Series.product/1 not implemented for dtype :boolean. Valid dtypes are [:integer, {:f, 32}, {:f, 64}]
** (ArgumentError) Explorer.Series.product/1 not implemented for dtype :boolean. Valid dtypes are [{:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}]
"""
@doc type: :aggregation
@spec product(series :: Series.t()) :: float() | non_finite() | nil
Expand Down Expand Up @@ -2599,7 +2599,7 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list([true, false, true])
iex> Explorer.Series.quantile(s, 0.5)
** (ArgumentError) Explorer.Series.quantile/2 not implemented for dtype :boolean. Valid dtypes are [:integer, {:f, 32}, {:f, 64}, :time, :date, {:datetime, :nanosecond}, {:datetime, :microsecond}, {:datetime, :millisecond}, {:duration, :nanosecond}, {:duration, :microsecond}, {:duration, :millisecond}]
** (ArgumentError) Explorer.Series.quantile/2 not implemented for dtype :boolean. Valid dtypes are [{:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}, :time, :date, {:datetime, :nanosecond}, {:datetime, :microsecond}, {:datetime, :millisecond}, {:duration, :nanosecond}, {:duration, :microsecond}, {:duration, :millisecond}]
"""
@doc type: :aggregation
@spec quantile(series :: Series.t(), quantile :: float()) :: any()
Expand Down Expand Up @@ -2645,7 +2645,7 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list([true, false, true])
iex> Explorer.Series.skew(s, false)
** (ArgumentError) Explorer.Series.skew/2 not implemented for dtype :boolean. Valid dtypes are [:integer, {:f, 32}, {:f, 64}]
** (ArgumentError) Explorer.Series.skew/2 not implemented for dtype :boolean. Valid dtypes are [{:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}]
"""
@doc type: :aggregation
@spec skew(series :: Series.t(), opts :: Keyword.t()) :: float() | non_finite() | nil
Expand Down Expand Up @@ -5022,7 +5022,7 @@ defmodule Explorer.Series do
iex> s = Explorer.Series.from_list(["a", "b", "c"])
iex> Explorer.Series.abs(s)
** (ArgumentError) Explorer.Series.abs/1 not implemented for dtype :string. Valid dtypes are [:integer, {:f, 32}, {:f, 64}]
** (ArgumentError) Explorer.Series.abs/1 not implemented for dtype :string. Valid dtypes are [{:s, 8}, {:s, 16}, {:s, 32}, {:s, 64}, :integer, {:u, 8}, {:u, 16}, {:u, 32}, {:u, 64}, {:f, 32}, {:f, 64}]
"""
@doc type: :element_wise
@spec abs(series :: Series.t()) :: Series.t()
Expand Down
44 changes: 44 additions & 0 deletions lib/explorer/shared.ex
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,15 @@ defmodule Explorer.Shared do
:date,
{:f, 32},
{:f, 64},
{:s, 8},
{:s, 16},
{:s, 32},
{:s, 64},
{:u, 8},
{:u, 16},
{:u, 32},
{:u, 64},
# TODO: remove this integer
:integer,
:string,
:time,
Expand Down Expand Up @@ -53,7 +62,15 @@ defmodule Explorer.Shared do

# Members of `@scalar_types` pass through untouched.
def normalise_dtype(dtype) when dtype in @scalar_types, do: dtype

# Short atom aliases map to their tuple dtypes. Signed integers accept both the
# `:iN` spelling and the `:sN` spelling; the latter matches what
# `dtype_to_string/1` prints for `{:s, N}` (for example "s8"), so a printed
# dtype name can be fed back in as an alias.
# NOTE(review): if `@scalar_types` still contains `:integer`, the clause above
# returns it unchanged and the `:integer` entry below is never reached — confirm.
def normalise_dtype(dtype) when dtype in [:float, :f64], do: {:f, 64}
def normalise_dtype(dtype) when dtype in [:integer, :i64, :s64], do: {:s, 64}
def normalise_dtype(:f32), do: {:f, 32}
def normalise_dtype(dtype) when dtype in [:i8, :s8], do: {:s, 8}
def normalise_dtype(dtype) when dtype in [:i16, :s16], do: {:s, 16}
def normalise_dtype(dtype) when dtype in [:i32, :s32], do: {:s, 32}
def normalise_dtype(:u8), do: {:u, 8}
def normalise_dtype(:u16), do: {:u, 16}
def normalise_dtype(:u32), do: {:u, 32}
def normalise_dtype(:u64), do: {:u, 64}

# Anything not recognised above yields nil.
def normalise_dtype(_dtype), do: nil

@doc """
Expand Down Expand Up @@ -85,6 +102,21 @@ defmodule Explorer.Shared do
"""
def float_types, do: [{:f, 32}, {:f, 64}]

@doc """
Returns the signed integer dtypes: `{:s, size}` for sizes 8, 16, 32 and 64,
followed by `:integer`.
"""
def signed_integer_types do
  sized = for bits <- [8, 16, 32, 64], do: {:s, bits}
  sized ++ [:integer]
end

@doc """
Returns the unsigned integer dtypes: `{:u, size}` for sizes 8, 16, 32 and 64.
"""
def unsigned_integer_types do
  for bits <- [8, 16, 32, 64], do: {:u, bits}
end

@doc """
Returns every integer dtype: the signed ones first, then the unsigned ones.
"""
def integer_types do
  Enum.concat(signed_integer_types(), unsigned_integer_types())
end

@doc """
Gets the backend from a `Keyword.t()` or `nil`.
"""
Expand Down Expand Up @@ -225,6 +257,14 @@ defmodule Explorer.Shared do
:binary,
{:f, 32},
{:f, 64},
{:s, 8},
{:s, 16},
{:s, 32},
{:s, 64},
{:u, 8},
{:u, 16},
{:u, 32},
{:u, 64},
:integer,
:category
],
Expand Down Expand Up @@ -259,6 +299,8 @@ defmodule Explorer.Shared do
type in [:integer, {:f, 32}, {:f, 64}, :numeric],
do: :numeric

# Integer items. Clause order matters here:
# - with a `{:s, _}` second argument, any integer keeps that signed dtype;
# - with a `{:u, _}` second argument, only non-negative integers keep the
#   unsigned dtype; a negative item fails the guard and falls through;
# - every remaining integer (including a negative item paired with `{:u, _}`)
#   is typed as `:integer`.
defp type(item, {:s, _} = integer_type) when is_integer(item), do: integer_type
defp type(item, {:u, _} = integer_type) when is_integer(item) and item >= 0, do: integer_type
defp type(item, _type) when is_integer(item), do: :integer
# Float items keep a `{:f, _}` second argument as their dtype; otherwise they
# are typed as `{:f, 64}`.
defp type(item, {:f, _} = float_dtype) when is_float(item), do: float_dtype
defp type(item, _type) when is_float(item), do: {:f, 64}
Expand Down Expand Up @@ -447,6 +489,8 @@ defmodule Explorer.Shared do
# Container dtypes: a list renders its inner dtype, a struct only its field count.
def dtype_to_string({:list, inner}), do: "list[#{dtype_to_string(inner)}]"

def dtype_to_string({:struct, fields}),
  do: "struct[" <> Integer.to_string(map_size(fields)) <> "]"

# Sized numeric dtypes render as the kind letter followed by the bit size,
# e.g. "f64", "s8", "u32".
def dtype_to_string({kind, size}) when kind in [:f, :s, :u],
  do: Atom.to_string(kind) <> Integer.to_string(size)

# Plain atom dtypes render as their own name.
def dtype_to_string(name) when is_atom(name), do: Atom.to_string(name)

@threshold 0.77
Expand Down
8 changes: 8 additions & 0 deletions native/explorer/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

5 changes: 1 addition & 4 deletions native/explorer/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,7 @@ features = [
"cov",
"decompress-fast",
"describe",
"dtype-date",
"dtype-time",
"dtype-datetime",
"dtype-categorical",
"dtype-full",
"ipc",
"ipc_streaming",
"lazy",
Expand Down
Loading

0 comments on commit 4dab528

Please sign in to comment.