Skip to content

Commit

Permalink
Better concat_rows error messages (#1057)
Browse files Browse the repository at this point in the history
* Change error message to be more explicit

* Update tests with new error message pattern

* No need to rebuild the head set each time

* Fix typo in error message

* Put back old algorithm

* Fix bug in dtypes message

* Be consistent about quotes around column names
  • Loading branch information
billylanchantin authored Jan 19, 2025
1 parent 1095bcb commit 29d7b36
Show file tree
Hide file tree
Showing 3 changed files with 127 additions and 10 deletions.
61 changes: 56 additions & 5 deletions lib/explorer/data_frame.ex
Original file line number Diff line number Diff line change
Expand Up @@ -5458,10 +5458,11 @@ defmodule Explorer.DataFrame do
defp compute_changed_types_concat_rows([head | tail]) do
types = head.dtypes

Enum.reduce(tail, %{}, fn df, changed_types ->
tail
|> Enum.with_index(1)
|> Enum.reduce(%{}, fn {df, index}, changed_types ->
if n_columns(df) != map_size(types) do
raise ArgumentError,
"dataframes must have the same columns"
raise ArgumentError, concat_rows_mismatched_columns_message(head, df, index)
end

Enum.reduce(df.dtypes, changed_types, fn {name, type}, changed_types ->
Expand All @@ -5471,16 +5472,66 @@ defmodule Explorer.DataFrame do
Map.put(changed_types, name, dtype)
else
raise ArgumentError,
"columns and dtypes must be identical for all dataframes"
concat_rows_incompatible_dtypes_message(name, current_type, type, index)
end

%{} ->
raise ArgumentError, "dataframes must have the same columns"
raise ArgumentError, concat_rows_mismatched_columns_message(head, df, index)
end
end)
end)
end

defp concat_rows_mismatched_columns_message(df_0, df_i, i) do
df_0_cols = df_0 |> names() |> MapSet.new()
df_i_cols = df_i |> names() |> MapSet.new()
mismatched_cols = MapSet.symmetric_difference(df_0_cols, df_i_cols)
in_0_only = df_0_cols |> MapSet.intersection(mismatched_cols) |> Enum.sort()
in_i_only = df_i_cols |> MapSet.intersection(mismatched_cols) |> Enum.sort()

message_0 =
if in_0_only == [] do
""
else
"""
* dataframe 0 has these columns not present in dataframe #{i}:
#{inspect(in_0_only)}
"""
end

message_i =
if in_i_only == [] do
""
else
"""
* dataframe #{i} has these columns not present in dataframe 0:
#{inspect(in_i_only)}
"""
end

"dataframes must have the same columns\n#{message_0}#{message_i}"
end

defp concat_rows_incompatible_dtypes_message(column_name, df_0_dtype, df_i_dtype, i) do
"""
column dtypes must be compatible for all dataframes
* dataframe 0, column #{inspect(column_name)} has dtype:
#{inspect(df_0_dtype)}
* dataframe #{i}, column #{inspect(column_name)} has dtype:
#{inspect(df_i_dtype)}
these types are incompatible
"""
end

@doc """
Combine two dataframes row-wise.
Expand Down
50 changes: 47 additions & 3 deletions test/explorer/data_frame/lazy_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -1476,15 +1476,59 @@ defmodule Explorer.DataFrame.LazyTest do
df1 = DF.new([x: [7, 8, 9], y: ["g", "h", nil]], lazy: true)

assert_raise ArgumentError,
"dataframes must have the same columns",
"""
dataframes must have the same columns
* dataframe 0 has these columns not present in dataframe 1:
[\"x\", \"y\"]
* dataframe 1 has these columns not present in dataframe 0:
[\"z\"]
""",
fn -> DF.concat_rows(df1, DF.new([z: [7, 8, 9]], lazy: true)) end

assert_raise ArgumentError,
"dataframes must have the same columns",
"""
dataframes must have the same columns
* dataframe 0 has these columns not present in dataframe 1:
[\"y\"]
* dataframe 1 has these columns not present in dataframe 0:
[\"z\"]
""",
fn -> DF.concat_rows(df1, DF.new([x: [7, 8, 9], z: [7, 8, 9]], lazy: true)) end

assert_raise ArgumentError,
"columns and dtypes must be identical for all dataframes",
"""
dataframes must have the same columns
* dataframe 1 has these columns not present in dataframe 0:
[\"z\"]
""",
fn ->
DF.concat_rows(df1, DF.new([x: [7], y: [8], z: [9]], lazy: true))
end

assert_raise ArgumentError,
"""
column dtypes must be compatible for all dataframes
* dataframe 0, column \"y\" has dtype:
:string
* dataframe 1, column \"y\" has dtype:
{:s, 64}
these types are incompatible
""",
fn ->
DF.concat_rows(df1, DF.new([x: [7, 8, 9], y: [10, 11, 12]], lazy: true))
end
Expand Down
26 changes: 24 additions & 2 deletions test/explorer/data_frame_test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -2883,15 +2883,37 @@ defmodule Explorer.DataFrameTest do
df1 = DF.new(x: [1, 2, 3], y: ["a", "b", "c"])

assert_raise ArgumentError,
"dataframes must have the same columns",
"""
dataframes must have the same columns
* dataframe 0 has these columns not present in dataframe 1:
[\"x\", \"y\"]
* dataframe 1 has these columns not present in dataframe 0:
[\"z\"]
""",
fn -> DF.concat_rows(df1, DF.new(z: [7, 8, 9])) end
end

test "with incompatible column dtypes" do
df1 = DF.new(x: [1, 2, 3], y: ["a", "b", "c"])

assert_raise ArgumentError,
"columns and dtypes must be identical for all dataframes",
"""
column dtypes must be compatible for all dataframes
* dataframe 0, column \"y\" has dtype:
:string
* dataframe 1, column \"y\" has dtype:
{:s, 64}
these types are incompatible
""",
fn -> DF.concat_rows(df1, DF.new(x: [7, 8, 9], y: [10, 11, 12])) end
end

Expand Down

0 comments on commit 29d7b36

Please sign in to comment.