Skip to content

Commit

Permalink
Support truncate_ragged_lines option for reading CSV files (#53)
Browse files Browse the repository at this point in the history
  • Loading branch information
jvdp authored Mar 24, 2024
1 parent 39a877e commit 10bff31
Show file tree
Hide file tree
Showing 7 changed files with 39 additions and 14 deletions.
4 changes: 3 additions & 1 deletion ext/polars/src/batched_csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ impl RbBatchedCsv {
let row_index = Option::<(String, IdxSize)>::try_convert(arguments[21])?;
let sample_size = usize::try_convert(arguments[22])?;
let eol_char = String::try_convert(arguments[23])?;
let truncate_ragged_lines = bool::try_convert(arguments[24])?;
// end arguments

let null_values = null_values.map(|w| w.0);
Expand Down Expand Up @@ -107,7 +108,8 @@ impl RbBatchedCsv {
.with_end_of_line_char(eol_char)
.with_skip_rows_after_header(skip_rows_after_header)
.with_row_index(row_index)
.sample_size(sample_size);
.sample_size(sample_size)
.truncate_ragged_lines(truncate_ragged_lines);

let reader = if low_memory {
let reader = reader
Expand Down
2 changes: 2 additions & 0 deletions ext/polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ impl RbDataFrame {
let row_index = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
let sample_size = usize::try_convert(arguments[23])?;
let eol_char = String::try_convert(arguments[24])?;
let truncate_ragged_lines = bool::try_convert(arguments[25])?;
// end arguments

let null_values = null_values.map(|w| w.0);
Expand Down Expand Up @@ -196,6 +197,7 @@ impl RbDataFrame {
.with_skip_rows_after_header(skip_rows_after_header)
.with_row_index(row_index)
.sample_size(sample_size)
.truncate_ragged_lines(truncate_ragged_lines)
.finish()
.map_err(RbPolarsErr::from)?;
Ok(df.into())
Expand Down
4 changes: 3 additions & 1 deletion ext/polars/src/lazyframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ impl RbLazyFrame {
let row_index = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
let try_parse_dates = bool::try_convert(arguments[18])?;
let eol_char = String::try_convert(arguments[19])?;
let truncate_ragged_lines = bool::try_convert(arguments[20])?;
// end arguments

let null_values = null_values.map(|w| w.0);
Expand Down Expand Up @@ -133,7 +134,8 @@ impl RbLazyFrame {
.with_encoding(encoding.0)
.with_row_index(row_index)
.with_try_parse_dates(try_parse_dates)
.with_null_values(null_values);
.with_null_values(null_values)
.truncate_ragged_lines(truncate_ragged_lines);

if let Some(_lambda) = with_schema_modify {
todo!();
Expand Down
6 changes: 4 additions & 2 deletions lib/polars/batched_csv_reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ def initialize(
row_count_offset: 0,
sample_size: 1024,
eol_char: "\n",
new_columns: nil
new_columns: nil,
truncate_ragged_lines: false
)
if Utils.pathlike?(file)
path = Utils.normalise_filepath(file)
Expand Down Expand Up @@ -75,7 +76,8 @@ def initialize(
skip_rows_after_header,
Utils._prepare_row_count_args(row_count_name, row_count_offset),
sample_size,
eol_char
eol_char,
truncate_ragged_lines
)
self.new_columns = new_columns
end
Expand Down
9 changes: 6 additions & 3 deletions lib/polars/data_frame.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ def self._read_csv(
row_count_name: nil,
row_count_offset: 0,
sample_size: 1024,
eol_char: "\n"
eol_char: "\n",
truncate_ragged_lines: false
)
if Utils.pathlike?(file)
path = Utils.normalise_filepath(file)
Expand Down Expand Up @@ -147,7 +148,8 @@ def self._read_csv(
skip_rows_after_header: skip_rows_after_header,
row_count_name: row_count_name,
row_count_offset: row_count_offset,
eol_char: eol_char
eol_char: eol_char,
truncate_ragged_lines: truncate_ragged_lines
)
if columns.nil?
return _from_rbdf(scan.collect._df)
Expand Down Expand Up @@ -186,7 +188,8 @@ def self._read_csv(
skip_rows_after_header,
Utils._prepare_row_count_args(row_count_name, row_count_offset),
sample_size,
eol_char
eol_char,
truncate_ragged_lines
)
)
end
Expand Down
22 changes: 17 additions & 5 deletions lib/polars/io.rb
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,8 @@ module IO
# allocation needed.
# @param eol_char [String]
# Single byte end of line character.
# @param truncate_ragged_lines [Boolean]
# Truncate lines that have more fields than the schema instead of raising an error.
#
# @return [DataFrame]
#
Expand Down Expand Up @@ -113,7 +115,8 @@ def read_csv(
row_count_name: nil,
row_count_offset: 0,
sample_size: 1024,
eol_char: "\n"
eol_char: "\n",
truncate_ragged_lines: false
)
Utils._check_arg_is_1byte("sep", sep, false)
Utils._check_arg_is_1byte("comment_char", comment_char, false)
Expand Down Expand Up @@ -161,7 +164,8 @@ def read_csv(
row_count_name: row_count_name,
row_count_offset: row_count_offset,
sample_size: sample_size,
eol_char: eol_char
eol_char: eol_char,
truncate_ragged_lines: truncate_ragged_lines
)
end

Expand Down Expand Up @@ -239,6 +243,8 @@ def read_csv(
# the column remains of data type `:str`.
# @param eol_char [String]
# Single byte end of line character.
# @param truncate_ragged_lines [Boolean]
# Truncate lines that have more fields than the schema instead of raising an error.
#
# @return [LazyFrame]
def scan_csv(
Expand All @@ -262,7 +268,8 @@ def scan_csv(
row_count_name: nil,
row_count_offset: 0,
parse_dates: false,
eol_char: "\n"
eol_char: "\n",
truncate_ragged_lines: false
)
Utils._check_arg_is_1byte("sep", sep, false)
Utils._check_arg_is_1byte("comment_char", comment_char, false)
Expand Down Expand Up @@ -294,6 +301,7 @@ def scan_csv(
row_count_offset: row_count_offset,
parse_dates: parse_dates,
eol_char: eol_char,
truncate_ragged_lines: truncate_ragged_lines
)
end

Expand Down Expand Up @@ -755,6 +763,8 @@ def read_database(query, schema_overrides: nil)
# allocation needed.
# @param eol_char [String]
# Single byte end of line character.
# @param truncate_ragged_lines [Boolean]
# Truncate lines that have more fields than the schema instead of raising an error.
#
# @return [BatchedCsvReader]
#
Expand Down Expand Up @@ -787,7 +797,8 @@ def read_csv_batched(
row_count_name: nil,
row_count_offset: 0,
sample_size: 1024,
eol_char: "\n"
eol_char: "\n",
truncate_ragged_lines: false
)
projection, columns = Utils.handle_projection_columns(columns)

Expand Down Expand Up @@ -827,7 +838,8 @@ def read_csv_batched(
row_count_offset: row_count_offset,
sample_size: sample_size,
eol_char: eol_char,
new_columns: new_columns
new_columns: new_columns,
truncate_ragged_lines: truncate_ragged_lines
)
end

Expand Down
6 changes: 4 additions & 2 deletions lib/polars/lazy_frame.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ def self._scan_csv(
row_count_name: nil,
row_count_offset: 0,
parse_dates: false,
eol_char: "\n"
eol_char: "\n",
truncate_ragged_lines: true
)
dtype_list = nil
if !dtypes.nil?
Expand Down Expand Up @@ -81,7 +82,8 @@ def self._scan_csv(
encoding,
Utils._prepare_row_count_args(row_count_name, row_count_offset),
parse_dates,
eol_char
eol_char,
truncate_ragged_lines
)
)
end
Expand Down

0 comments on commit 10bff31

Please sign in to comment.