Skip to content

Commit

Permalink
feat: parse_float_as_decimal supports scientific notation and Decim…
Browse files Browse the repository at this point in the history
…al256 (#13806)

* feat: `parse_float_as_decimal` supports scientific notation and Decimal256

* Fix test

* Add test

* Add test

* Refine negative scales

* Update comment

* Refine bigint_to_i256

* UT for bigint_to_i256

* Add ut for parse_decimal
  • Loading branch information
jonahgao authored Dec 19, 2024
1 parent 32a13d8 commit b7fb32a
Show file tree
Hide file tree
Showing 4 changed files with 245 additions and 47 deletions.
1 change: 1 addition & 0 deletions datafusion-cli/Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions datafusion/sql/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ unparser = []
arrow = { workspace = true }
arrow-array = { workspace = true }
arrow-schema = { workspace = true }
bigdecimal = { workspace = true }
datafusion-common = { workspace = true, default-features = true }
datafusion-expr = { workspace = true }
indexmap = { workspace = true }
Expand Down
194 changes: 157 additions & 37 deletions datafusion/sql/src/expr/value.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel};
use arrow::compute::kernels::cast_utils::{
parse_interval_month_day_nano_config, IntervalParseConfig, IntervalUnit,
};
use arrow::datatypes::DECIMAL128_MAX_PRECISION;
use arrow_schema::DataType;
use arrow::datatypes::{i256, DECIMAL128_MAX_PRECISION};
use arrow_schema::{DataType, DECIMAL256_MAX_PRECISION};
use bigdecimal::num_bigint::BigInt;
use bigdecimal::{BigDecimal, Signed, ToPrimitive};
use datafusion_common::{
internal_err, not_impl_err, plan_err, DFSchema, DataFusionError, Result, ScalarValue,
internal_datafusion_err, internal_err, not_impl_err, plan_err, DFSchema,
DataFusionError, Result, ScalarValue,
};
use datafusion_expr::expr::{BinaryExpr, Placeholder};
use datafusion_expr::planner::PlannerResult;
Expand All @@ -31,6 +34,9 @@ use log::debug;
use sqlparser::ast::{BinaryOperator, Expr as SQLExpr, Interval, UnaryOperator, Value};
use sqlparser::parser::ParserError::ParserError;
use std::borrow::Cow;
use std::cmp::Ordering;
use std::ops::Neg;
use std::str::FromStr;

impl<S: ContextProvider> SqlToRel<'_, S> {
pub(crate) fn parse_value(
Expand Down Expand Up @@ -84,7 +90,7 @@ impl<S: ContextProvider> SqlToRel<'_, S> {
}

if self.options.parse_float_as_decimal {
parse_decimal_128(unsigned_number, negative)
parse_decimal(unsigned_number, negative)
} else {
signed_number.parse::<f64>().map(lit).map_err(|_| {
DataFusionError::from(ParserError(format!(
Expand Down Expand Up @@ -315,45 +321,84 @@ const fn try_decode_hex_char(c: u8) -> Option<u8> {
}
}

/// Parse Decimal128 from a string
///
/// TODO: support parsing from scientific notation
fn parse_decimal_128(unsigned_number: &str, negative: bool) -> Result<Expr> {
// remove leading zeroes
let trimmed = unsigned_number.trim_start_matches('0');
// Parse precision and scale, remove decimal point if exists
let (precision, scale, replaced_str) = if trimmed == "." {
// Special cases for numbers such as “0.”, “000.”, and so on.
(1, 0, Cow::Borrowed("0"))
} else if let Some(i) = trimmed.find('.') {
(
trimmed.len() - 1,
trimmed.len() - i - 1,
Cow::Owned(trimmed.replace('.', "")),
)
} else {
// No decimal point, keep as is
(trimmed.len(), 0, Cow::Borrowed(trimmed))
};
/// Returns None if the value can't be converted to i256.
/// Modified from <https://github.com/apache/arrow-rs/blob/c4dbf0d8af6ca5a19b8b2ea777da3c276807fc5e/arrow-buffer/src/bigint/mod.rs#L303>
fn bigint_to_i256(v: &BigInt) -> Option<i256> {
let v_bytes = v.to_signed_bytes_le();
match v_bytes.len().cmp(&32) {
Ordering::Less => {
let mut bytes = if v.is_negative() {
[255_u8; 32]
} else {
[0; 32]
};
bytes[0..v_bytes.len()].copy_from_slice(&v_bytes[..v_bytes.len()]);
Some(i256::from_le_bytes(bytes))
}
Ordering::Equal => Some(i256::from_le_bytes(v_bytes.try_into().unwrap())),
Ordering::Greater => None,
}
}

let number = replaced_str.parse::<i128>().map_err(|e| {
fn parse_decimal(unsigned_number: &str, negative: bool) -> Result<Expr> {
let mut dec = BigDecimal::from_str(unsigned_number).map_err(|e| {
DataFusionError::from(ParserError(format!(
"Cannot parse {replaced_str} as i128 when building decimal: {e}"
"Cannot parse {unsigned_number} as BigDecimal: {e}"
)))
})?;

// Check precision overflow
if precision as u8 > DECIMAL128_MAX_PRECISION {
return Err(DataFusionError::from(ParserError(format!(
"Cannot parse {replaced_str} as i128 when building decimal: precision overflow"
))));
if negative {
dec = dec.neg();
}

Ok(Expr::Literal(ScalarValue::Decimal128(
Some(if negative { -number } else { number }),
precision as u8,
scale as i8,
)))
let digits = dec.digits();
let (int_val, scale) = dec.into_bigint_and_exponent();
if scale < i8::MIN as i64 {
return not_impl_err!(
"Decimal scale {} exceeds the minimum supported scale: {}",
scale,
i8::MIN
);
}
let precision = if scale > 0 {
// arrow-rs requires the precision to include the positive scale.
// See <https://github.com/apache/arrow-rs/blob/123045cc766d42d1eb06ee8bb3f09e39ea995ddc/arrow-array/src/types.rs#L1230>
std::cmp::max(digits, scale.unsigned_abs())
} else {
digits
};
if precision <= DECIMAL128_MAX_PRECISION as u64 {
let val = int_val.to_i128().ok_or_else(|| {
// Failures are unexpected here as we have already checked the precision
internal_datafusion_err!(
"Unexpected overflow when converting {} to i128",
int_val
)
})?;
Ok(Expr::Literal(ScalarValue::Decimal128(
Some(val),
precision as u8,
scale as i8,
)))
} else if precision <= DECIMAL256_MAX_PRECISION as u64 {
let val = bigint_to_i256(&int_val).ok_or_else(|| {
// Failures are unexpected here as we have already checked the precision
internal_datafusion_err!(
"Unexpected overflow when converting {} to i256",
int_val
)
})?;
Ok(Expr::Literal(ScalarValue::Decimal256(
Some(val),
precision as u8,
scale as i8,
)))
} else {
not_impl_err!(
"Decimal precision {} exceeds the maximum supported precision: {}",
precision,
DECIMAL256_MAX_PRECISION
)
}
}

#[cfg(test)]
Expand All @@ -379,4 +424,79 @@ mod tests {
assert_eq!(output, expect);
}
}

#[test]
fn test_bigint_to_i256() {
let cases = [
(BigInt::from(0), Some(i256::from(0))),
(BigInt::from(1), Some(i256::from(1))),
(BigInt::from(-1), Some(i256::from(-1))),
(
BigInt::from_str(i256::MAX.to_string().as_str()).unwrap(),
Some(i256::MAX),
),
(
BigInt::from_str(i256::MIN.to_string().as_str()).unwrap(),
Some(i256::MIN),
),
(
// Can't fit into i256
BigInt::from_str((i256::MAX.to_string() + "1").as_str()).unwrap(),
None,
),
];

for (input, expect) in cases {
let output = bigint_to_i256(&input);
assert_eq!(output, expect);
}
}

#[test]
fn test_parse_decimal() {
// Supported cases
let cases = [
("0", ScalarValue::Decimal128(Some(0), 1, 0)),
("1", ScalarValue::Decimal128(Some(1), 1, 0)),
("123.45", ScalarValue::Decimal128(Some(12345), 5, 2)),
// Digit count is less than scale
("0.001", ScalarValue::Decimal128(Some(1), 3, 3)),
// Scientific notation
("123.456e-2", ScalarValue::Decimal128(Some(123456), 6, 5)),
// Negative scale
("123456e128", ScalarValue::Decimal128(Some(123456), 6, -128)),
// Decimal256
(
&("9".repeat(39) + "." + "99999"),
ScalarValue::Decimal256(
Some(i256::from_string(&"9".repeat(44)).unwrap()),
44,
5,
),
),
];
for (input, expect) in cases {
let output = parse_decimal(input, true).unwrap();
assert_eq!(output, Expr::Literal(expect.arithmetic_negate().unwrap()));

let output = parse_decimal(input, false).unwrap();
assert_eq!(output, Expr::Literal(expect));
}

// scale < i8::MIN
assert_eq!(
parse_decimal("1e129", false)
.unwrap_err()
.strip_backtrace(),
"This feature is not implemented: Decimal scale -129 exceeds the minimum supported scale: -128"
);

// Unsupported precision
assert_eq!(
parse_decimal(&"1".repeat(77), false)
.unwrap_err()
.strip_backtrace(),
"This feature is not implemented: Decimal precision 77 exceeds the maximum supported precision: 76"
);
}
}
96 changes: 86 additions & 10 deletions datafusion/sqllogictest/test_files/options.slt
Original file line number Diff line number Diff line change
Expand Up @@ -192,19 +192,95 @@ select arrow_typeof(00009999999999999999999999999999999999.9999),
----
Decimal128(38, 4) Decimal128(38, 4) Decimal128(20, 0)

# precision overflow
statement error DataFusion error: SQL error: ParserError\("Cannot parse 123456789012345678901234567890123456789 as i128 when building decimal: precision overflow"\)
select 123456789.012345678901234567890123456789
# scientific notation
query RTRTRT
select 1.23e3, arrow_typeof(1.23e3),
+1.23e1, arrow_typeof(+1.23e1),
-1234.56e-3, arrow_typeof(-1234.56e-3)
----
1230 Decimal128(3, -1) 12.3 Decimal128(3, 1) -1.23456 Decimal128(6, 5)

query RTRTRT
select 1.23e-2, arrow_typeof(1.23e-2),
1.23456e0, arrow_typeof(1.23456e0),
-.0123e2, arrow_typeof(-.0123e2)
----
0.0123 Decimal128(4, 4) 1.23456 Decimal128(6, 5) -1.23 Decimal128(3, 2)

# Decimal256 cases
query RT
select 123456789.0123456789012345678901234567890,
arrow_typeof(123456789.0123456789012345678901234567890)
----
123456789.012345678901 Decimal256(40, 31)

query RT
select -123456789.0123456789012345678901234567890,
arrow_typeof(-123456789.0123456789012345678901234567890)
----
-123456789.012345678901 Decimal256(40, 31)

# max precision and scale of Decimal256
query RTRT
select -1e-76, arrow_typeof(-1e-76),
-1.234567e-70, arrow_typeof(-1.234567e-70)
----
0 Decimal256(76, 76) 0 Decimal256(76, 76)

statement error SQL error: ParserError\("Cannot parse 123456789012345678901234567890123456789 as i128 when building decimal: precision overflow"\)
select -123456789.012345678901234567890123456789
# Decimal256::MAX for nonnegative scale
query RT
select 9999999999999999999999999999999999999999999999999999999999999999999999999999,
arrow_typeof(9999999999999999999999999999999999999999999999999999999999999999999999999999);
----
9999999999999999999999999999999999999999999999999999999999999999999999999999 Decimal256(76, 0)

# can not fit in i128
statement error SQL error: ParserError\("Cannot parse 1234567890123456789012345678901234567890 as i128 when building decimal: number too large to fit in target type"\)
select 123456789.0123456789012345678901234567890
# Decimal256::MIN
query RT
select -9999999999999999999999999999999999999999999999999999999999999999999999999999,
arrow_typeof(-9999999999999999999999999999999999999999999999999999999999999999999999999999);
----
-9999999999999999999999999999999999999999999999999999999999999999999999999999 Decimal256(76, 0)

statement error SQL error: ParserError\("Cannot parse 1234567890123456789012345678901234567890 as i128 when building decimal: number too large to fit in target type"\)
select -123456789.0123456789012345678901234567890
# boundaries between decimal128 and decimal256
query RTRT
select 1e-38, arrow_typeof(1e-38),
1e-39, arrow_typeof(1e-39);
----
0 Decimal128(38, 38) 0 Decimal256(39, 39)

query RTRT
select -1e-38, arrow_typeof(-1e-38),
-1e-39, arrow_typeof(-1e-39);
----
0 Decimal128(38, 38) 0 Decimal256(39, 39)

# unsupported precision
query error Decimal precision 77 exceeds the maximum supported precision: 76
select -1e-77;

query error Decimal precision 79 exceeds the maximum supported precision: 76
select 1.000000000000000000000000000000000000000000000000000000000000000000000000000001;

# negative scales
query TR
select arrow_typeof(1e77), 1e77
----
Decimal128(1, -77) 100000000000000000000000000000000000000000000000000000000000000000000000000000

query T
select arrow_typeof(1e128)
----
Decimal128(1, -128)

query error Decimal scale \-129 exceeds the minimum supported scale: \-128
select 1e129

# simple arithmetic
query RTRT
select 1e40 + 1e40, arrow_typeof(1e40 + 1e40),
1e-40 + -1e-40, arrow_typeof(1e-40 + -1e-40)
----
20000000000000000000000000000000000000000 Decimal128(2, -40) 0 Decimal256(41, 40)

# Restore option to default value
statement ok
Expand Down

0 comments on commit b7fb32a

Please sign in to comment.