Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support truncate_ragged_lines option for reading CSV files #53

Merged
merged 3 commits into from
Mar 24, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions ext/polars/src/batched_csv.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@ impl RbBatchedCsv {
let row_index = Option::<(String, IdxSize)>::try_convert(arguments[21])?;
let sample_size = usize::try_convert(arguments[22])?;
let eol_char = String::try_convert(arguments[23])?;
let truncate_ragged_lines = bool::try_convert(arguments[24])?;
// end arguments

let null_values = null_values.map(|w| w.0);
Expand Down Expand Up @@ -92,6 +93,7 @@ impl RbBatchedCsv {
.with_separator(separator.as_bytes()[0])
.with_skip_rows(skip_rows)
.with_ignore_errors(ignore_errors)
.truncate_ragged_lines(truncate_ragged_lines)
jvdp marked this conversation as resolved.
Show resolved Hide resolved
.with_projection(projection)
.with_rechunk(rechunk)
.with_chunk_size(chunk_size)
Expand Down
2 changes: 2 additions & 0 deletions ext/polars/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ impl RbDataFrame {
let row_index = Option::<(String, IdxSize)>::try_convert(arguments[22])?;
let sample_size = usize::try_convert(arguments[23])?;
let eol_char = String::try_convert(arguments[24])?;
let truncate_ragged_lines = bool::try_convert(arguments[25])?;
// end arguments

let null_values = null_values.map(|w| w.0);
Expand Down Expand Up @@ -178,6 +179,7 @@ impl RbDataFrame {
.with_separator(separator.as_bytes()[0])
.with_skip_rows(skip_rows)
.with_ignore_errors(ignore_errors)
.truncate_ragged_lines(truncate_ragged_lines)
.with_projection(projection)
.with_rechunk(rechunk)
.with_chunk_size(chunk_size)
Expand Down
2 changes: 2 additions & 0 deletions ext/polars/src/lazyframe/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ impl RbLazyFrame {
let row_index = Option::<(String, IdxSize)>::try_convert(arguments[17])?;
let try_parse_dates = bool::try_convert(arguments[18])?;
let eol_char = String::try_convert(arguments[19])?;
let truncate_ragged_lines = bool::try_convert(arguments[20])?;
// end arguments

let null_values = null_values.map(|w| w.0);
Expand All @@ -120,6 +121,7 @@ impl RbLazyFrame {
.with_separator(separator)
.has_header(has_header)
.with_ignore_errors(ignore_errors)
.truncate_ragged_lines(truncate_ragged_lines)
.with_skip_rows(skip_rows)
.with_n_rows(n_rows)
.with_cache(cache)
Expand Down
6 changes: 4 additions & 2 deletions lib/polars/batched_csv_reader.rb
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,8 @@ def initialize(
row_count_offset: 0,
sample_size: 1024,
eol_char: "\n",
new_columns: nil
new_columns: nil,
truncate_ragged_lines: false
)
if Utils.pathlike?(file)
path = Utils.normalise_filepath(file)
Expand Down Expand Up @@ -75,7 +76,8 @@ def initialize(
skip_rows_after_header,
Utils._prepare_row_count_args(row_count_name, row_count_offset),
sample_size,
eol_char
eol_char,
truncate_ragged_lines
)
self.new_columns = new_columns
end
Expand Down
7 changes: 5 additions & 2 deletions lib/polars/data_frame.rb
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,8 @@ def self._read_csv(
row_count_name: nil,
row_count_offset: 0,
sample_size: 1024,
eol_char: "\n"
eol_char: "\n",
truncate_ragged_lines: false
)
if Utils.pathlike?(file)
path = Utils.normalise_filepath(file)
Expand Down Expand Up @@ -140,6 +141,7 @@ def self._read_csv(
dtypes: dtypes_dict,
null_values: null_values,
ignore_errors: ignore_errors,
truncate_ragged_lines: truncate_ragged_lines,
jvdp marked this conversation as resolved.
Show resolved Hide resolved
infer_schema_length: infer_schema_length,
n_rows: n_rows,
low_memory: low_memory,
Expand Down Expand Up @@ -186,7 +188,8 @@ def self._read_csv(
skip_rows_after_header,
Utils._prepare_row_count_args(row_count_name, row_count_offset),
sample_size,
eol_char
eol_char,
truncate_ragged_lines
)
)
end
Expand Down
18 changes: 15 additions & 3 deletions lib/polars/io.rb
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ module IO
# Try to keep reading lines if some lines yield errors.
# First try `infer_schema_length: 0` to read all columns as
# `:str` to check which values might cause an issue.
# @param truncate_ragged_lines [Boolean]
jvdp marked this conversation as resolved.
Show resolved Hide resolved
# Truncate lines that have more fields than the schema.
# @param parse_dates [Boolean]
# Try to automatically parse dates. If this does not succeed,
# the column remains of data type `:str`.
Expand Down Expand Up @@ -113,7 +115,8 @@ def read_csv(
row_count_name: nil,
row_count_offset: 0,
sample_size: 1024,
eol_char: "\n"
eol_char: "\n",
truncate_ragged_lines: false
)
Utils._check_arg_is_1byte("sep", sep, false)
Utils._check_arg_is_1byte("comment_char", comment_char, false)
Expand Down Expand Up @@ -149,6 +152,7 @@ def read_csv(
dtypes: dtypes,
null_values: null_values,
ignore_errors: ignore_errors,
truncate_ragged_lines: truncate_ragged_lines,
parse_dates: parse_dates,
n_threads: n_threads,
infer_schema_length: infer_schema_length,
Expand Down Expand Up @@ -208,6 +212,8 @@ def read_csv(
# Try to keep reading lines if some lines yield errors.
# First try `infer_schema_length: 0` to read all columns as
# `:str` to check which values might cause an issue.
# @param truncate_ragged_lines [Boolean]
# Truncate lines that have more fields than the schema.
# @param cache [Boolean]
# Cache the result after reading.
# @param with_column_names [Object]
Expand Down Expand Up @@ -262,7 +268,8 @@ def scan_csv(
row_count_name: nil,
row_count_offset: 0,
parse_dates: false,
eol_char: "\n"
eol_char: "\n",
truncate_ragged_lines: false
)
Utils._check_arg_is_1byte("sep", sep, false)
Utils._check_arg_is_1byte("comment_char", comment_char, false)
Expand All @@ -282,6 +289,7 @@ def scan_csv(
dtypes: dtypes,
null_values: null_values,
ignore_errors: ignore_errors,
truncate_ragged_lines: truncate_ragged_lines,
cache: cache,
with_column_names: with_column_names,
infer_schema_length: infer_schema_length,
Expand Down Expand Up @@ -716,6 +724,8 @@ def read_database(query, schema_overrides: nil)
# Try to keep reading lines if some lines yield errors.
# First try `infer_schema_length: 0` to read all columns as
# `:str` to check which values might cause an issue.
# @param truncate_ragged_lines [Boolean]
# Truncate lines that have more fields than the schema.
# @param parse_dates [Boolean]
# Try to automatically parse dates. If this does not succeed,
# the column remains of data type `:str`.
Expand Down Expand Up @@ -787,7 +797,8 @@ def read_csv_batched(
row_count_name: nil,
row_count_offset: 0,
sample_size: 1024,
eol_char: "\n"
eol_char: "\n",
truncate_ragged_lines: false
)
projection, columns = Utils.handle_projection_columns(columns)

Expand All @@ -814,6 +825,7 @@ def read_csv_batched(
dtypes: dtypes,
null_values: null_values,
ignore_errors: ignore_errors,
truncate_ragged_lines: truncate_ragged_lines,
parse_dates: parse_dates,
n_threads: n_threads,
infer_schema_length: infer_schema_length,
Expand Down
6 changes: 4 additions & 2 deletions lib/polars/lazy_frame.rb
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,8 @@ def self._scan_csv(
row_count_name: nil,
row_count_offset: 0,
parse_dates: false,
eol_char: "\n"
eol_char: "\n",
truncate_ragged_lines: true
)
dtype_list = nil
if !dtypes.nil?
Expand Down Expand Up @@ -81,7 +82,8 @@ def self._scan_csv(
encoding,
Utils._prepare_row_count_args(row_count_name, row_count_offset),
parse_dates,
eol_char
eol_char,
truncate_ragged_lines
)
)
end
Expand Down