From e757c8594272a6adff5530ecbb2af688e5f30d82 Mon Sep 17 00:00:00 2001 From: Alex Huszagh Date: Fri, 17 Jan 2025 17:09:09 -0600 Subject: [PATCH] Improve parsing floats to use our better base prefix logic. --- lexical-parse-float/src/parse.rs | 98 ++++++++++---------------- lexical-parse-integer/src/algorithm.rs | 12 ++-- lexical-util/src/feature_format.rs | 16 +++-- lexical-util/src/format_builder.rs | 35 +++++++-- lexical-util/src/not_feature_format.rs | 16 +++-- lexical-util/src/skip.rs | 2 +- 6 files changed, 95 insertions(+), 84 deletions(-) diff --git a/lexical-parse-float/src/parse.rs b/lexical-parse-float/src/parse.rs index 2dc85f00..683620bf 100644 --- a/lexical-parse-float/src/parse.rs +++ b/lexical-parse-float/src/parse.rs @@ -251,11 +251,10 @@ pub fn parse_complete( options: &Options, ) -> Result { let mut byte = bytes.bytes::<{ FORMAT }>(); + let format = NumberFormat:: {}; let is_negative = parse_mantissa_sign(&mut byte)?; if byte.integer_iter().is_consumed() { - if NumberFormat::::REQUIRED_INTEGER_DIGITS - || NumberFormat::::REQUIRED_MANTISSA_DIGITS - { + if format.required_integer_digits() || format.required_mantissa_digits() { return Err(Error::Empty(byte.cursor())); } else { return Ok(F::ZERO); @@ -294,11 +293,10 @@ pub fn fast_path_complete( options: &Options, ) -> Result { let mut byte = bytes.bytes::<{ FORMAT }>(); + let format = NumberFormat:: {}; let is_negative = parse_mantissa_sign(&mut byte)?; if byte.integer_iter().is_consumed() { - if NumberFormat::::REQUIRED_INTEGER_DIGITS - || NumberFormat::::REQUIRED_MANTISSA_DIGITS - { + if format.required_integer_digits() || format.required_mantissa_digits() { return Err(Error::Empty(byte.cursor())); } else { return Ok(F::ZERO); @@ -319,11 +317,10 @@ pub fn parse_partial( options: &Options, ) -> Result<(F, usize)> { let mut byte = bytes.bytes::<{ FORMAT }>(); + let format = NumberFormat:: {}; let is_negative = parse_mantissa_sign(&mut byte)?; if byte.integer_iter().is_consumed() { - if NumberFormat::::REQUIRED_INTEGER_DIGITS - || NumberFormat::::REQUIRED_MANTISSA_DIGITS - { + if format.required_integer_digits() || format.required_mantissa_digits() { return Err(Error::Empty(byte.cursor())); } else { return Ok((F::ZERO, byte.cursor())); @@ -368,11 +365,10 @@ pub fn fast_path_partial( options: &Options, ) -> Result<(F, usize)> { let mut byte = bytes.bytes::<{ FORMAT }>(); + let format = NumberFormat:: {}; let is_negative = parse_mantissa_sign(&mut byte)?; if byte.integer_iter().is_consumed() { - if NumberFormat::::REQUIRED_INTEGER_DIGITS - || NumberFormat::::REQUIRED_MANTISSA_DIGITS - { + if format.required_integer_digits() || format.required_mantissa_digits() { return Err(Error::Empty(byte.cursor())); } else { return Ok((F::ZERO, byte.cursor())); @@ -535,51 +531,31 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>( let bits_per_digit = shared::log2(format.mantissa_radix()) as i64; let bits_per_base = shared::log2(format.exponent_base()) as i64; - // INTEGER - - // Check to see if we have a valid base prefix. - // NOTE: `lz_prefix` is if we had a leading zero when - // checking for a base prefix: it is not if the prefix - // exists or not. - // TODO: MIGRATE TO BASE PREFIX LOGIC - #[allow(unused_variables)] - let mut lz_prefix = false; - #[cfg(all(feature = "format", feature = "power-of-two"))] - { - let base_prefix = format.base_prefix(); - let mut has_prefix = false; - let mut iter = byte.integer_iter(); - if base_prefix != 0 && iter.read_if_value_cased(b'0').is_some() { - // Check to see if the next character is the base prefix. - // We must have a format like `0x`, `0d`, `0o`. - // NOTE: The check for empty integer digits happens below so - // we don't need a redundant check here. - lz_prefix = true; - let prefix = iter.read_if_value(base_prefix, format.case_sensitive_base_prefix()); - has_prefix = prefix.is_some(); - if has_prefix && iter.is_buffer_empty() && format.required_integer_digits() { - return Err(Error::EmptyInteger(iter.cursor())); - } - } - if format.required_base_prefix() && !has_prefix { - return Err(Error::MissingBasePrefix(iter.cursor())); + // skip and validate an optional base prefix + let has_base_prefix = cfg!(feature = "format") && byte.integer_iter().read_base_prefix(); + if cfg!(feature = "format") && has_base_prefix { + if byte.is_buffer_empty() && format.required_integer_digits() { + return Err(Error::EmptyInteger(byte.cursor())); } + } else if format.required_base_prefix() { + return Err(Error::MissingBasePrefix(byte.cursor())); } - // Parse our integral digits. - let mut mantissa = 0_u64; + // INTEGER + let start = byte.clone(); + let mut mantissa = 0_u64; let mut integer_iter = byte.integer_iter(); - let start_count = integer_iter.digits(); + let integer_start = integer_iter.digits(); + + // Parse our integral digits. #[cfg(not(feature = "compact"))] parse_8digits::<_, FORMAT>(&mut integer_iter, &mut mantissa); parse_digits(&mut integer_iter, format.mantissa_radix(), |digit| { mantissa = mantissa.wrapping_mul(format.radix() as u64).wrapping_add(digit as u64); }); - let mut n_digits = integer_iter.digits_since(start_count); - #[cfg(feature = "format")] + let mut n_digits = integer_iter.digits_since(integer_start); let n_before_dot = n_digits; - #[cfg(feature = "format")] if format.required_integer_digits() && n_digits == 0 { return Err(Error::EmptyInteger(byte.cursor())); } @@ -610,10 +586,13 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>( let integer_digits = unsafe { start.as_slice().get_unchecked(..b_digits) }; // Check if integer leading zeros are disabled. - #[cfg(feature = "format")] - if !lz_prefix && format.no_float_leading_zeros() { + if cfg!(feature = "format") + && format.no_float_leading_zeros() + && !has_base_prefix + && n_before_dot > 1 + { let mut integer = integer_digits.bytes::(); - if integer_digits.len() > 1 && integer.integer_iter().peek() == Some(&b'0') { + if integer.integer_iter().peek() == Some(&b'0') { return Err(Error::InvalidLeadingZeros(start.cursor())); } } @@ -632,13 +611,13 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>( unsafe { byte.step_unchecked() }; let before = byte.clone(); let mut fraction_iter = byte.fraction_iter(); - let start_count = fraction_iter.digits(); + let fraction_count = fraction_iter.digits(); #[cfg(not(feature = "compact"))] parse_8digits::<_, FORMAT>(&mut fraction_iter, &mut mantissa); parse_digits(&mut fraction_iter, format.mantissa_radix(), |digit| { mantissa = mantissa.wrapping_mul(format.radix() as u64).wrapping_add(digit as u64); }); - n_after_dot = fraction_iter.digits_since(start_count); + n_after_dot = fraction_iter.digits_since(fraction_count); // NOTE: We can't use the number of digits to extract the slice for // non-contiguous iterators, but we also need to the number of digits // for our value calculation. We store both, and let the compiler know @@ -674,7 +653,7 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>( // NOTE: Check if we have our exponent **BEFORE** checking if the // mantissa is empty, so we can ensure let has_exponent = byte - .first_is(exponent_character, format.case_sensitive_exponent() && cfg!(feature = "format")); + .first_is(exponent_character, cfg!(feature = "format") && format.case_sensitive_exponent()); // check to see if we have any invalid leading zeros n_digits += n_after_dot; @@ -701,8 +680,7 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>( unsafe { byte.step_unchecked() }; // Check float format syntax checks. - #[cfg(feature = "format")] - { + if cfg!(feature = "format") { // NOTE: We've overstepped for the safety invariant before. if format.no_exponent_notation() { return Err(Error::InvalidExponent(byte.cursor() - 1)); @@ -736,14 +714,14 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>( let is_negative_exponent = parse_exponent_sign(&mut byte)?; let mut exponent_iter = byte.exponent_iter(); - let start_count = exponent_iter.digits(); + let exponent_start = exponent_iter.digits(); parse_digits(&mut exponent_iter, format.exponent_radix(), |digit| { if explicit_exponent < 0x10000000 { explicit_exponent *= format.exponent_radix() as i64; explicit_exponent += digit as i64; } }); - if format.required_exponent_digits() && exponent_iter.digits_since(start_count) == 0 { + if format.required_exponent_digits() && exponent_iter.digits_since(exponent_start) == 0 { return Err(Error::EmptyExponent(byte.cursor())); } // Handle our sign, and get the explicit part of the exponent. @@ -760,9 +738,8 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>( // Check to see if we have a valid base suffix. // We've already trimmed any leading digit separators here, so we can be safe // that the first character **is not** a digit separator. - // FIXME: Improve parsing of this - #[cfg(all(feature = "format", feature = "power-of-two"))] - if format.has_base_suffix() { + // TODO: Improve parsing of this using a base suffix method + if cfg!(all(feature = "format", feature = "power-of-two")) && format.has_base_suffix() { let base_suffix = format.base_suffix(); let is_suffix = byte.first_is(base_suffix, format.case_sensitive_base_suffix()); if is_suffix { @@ -779,8 +756,7 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>( let end = byte.cursor(); let mut step = u64_step(format.mantissa_radix()); let mut many_digits = false; - #[cfg(feature = "format")] - if !format.required_mantissa_digits() && n_digits == 0 { + if cfg!(feature = "format") && !format.required_mantissa_digits() && n_digits == 0 { exponent = 0; } if n_digits <= step { diff --git a/lexical-parse-integer/src/algorithm.rs b/lexical-parse-integer/src/algorithm.rs index e982003e..cfbf808e 100644 --- a/lexical-parse-integer/src/algorithm.rs +++ b/lexical-parse-integer/src/algorithm.rs @@ -674,14 +674,15 @@ macro_rules! algorithm { let mut byte = $bytes.bytes::(); let format = NumberFormat:: {}; let radix = format.mantissa_radix(); + debug_assert!(format.is_valid(), "should have already checked for an invalid number format"); let is_negative = parse_sign::(&mut byte)?; let mut iter = byte.integer_iter(); maybe_into_empty!(iter, $into_ok); // skip and validate an optional base prefix - #[cfg(all(feature = "format", feature = "power-of-two"))] - if iter.read_base_prefix() { + let has_base_prefix = cfg!(feature = "format") && iter.read_base_prefix(); + if cfg!(feature = "format") && has_base_prefix { maybe_into_empty!(iter, $into_ok); } else if format.required_base_prefix() { return Err(Error::MissingBasePrefix(iter.cursor())); @@ -689,8 +690,7 @@ macro_rules! algorithm { // NOTE: always do a peek so any leading digit separators // are skipped, and we can get the correct index - #[cfg(feature = "format")] - if format.no_integer_leading_zeros() && iter.peek() == Some(&b'0') { + if cfg!(feature = "format") && format.no_integer_leading_zeros() && !has_base_prefix && iter.peek() == Some(&b'0') { // NOTE: Skipping zeros is **EXPENSIVE* so we skip that without our format feature let index = iter.cursor(); let zeros = iter.skip_zeros(); @@ -719,7 +719,6 @@ macro_rules! algorithm { // and even if parsing a 64-bit integer is marginally faster, it // culminates in **way** slower performance overall for simple // integers, and no improvement for large integers. - #[allow(unused)] let mut has_suffix = false; // FIXME: This is only used for the parsing of the base suffix. #[allow(unused)] @@ -776,8 +775,7 @@ macro_rules! algorithm { ); } - #[cfg(all(feature = "format", feature = "power-of-two"))] - if format.required_base_suffix() && !has_suffix { + if cfg!(all(feature = "format", feature = "power-of-two")) && format.required_base_suffix() && !has_suffix { return Err(Error::MissingBaseSuffix(iter.cursor())); } diff --git a/lexical-util/src/feature_format.rs b/lexical-util/src/feature_format.rs index ff276b1b..b58d4d66 100644 --- a/lexical-util/src/feature_format.rs +++ b/lexical-util/src/feature_format.rs @@ -516,16 +516,20 @@ impl NumberFormat { /// Get if leading zeros before an integer are not allowed. /// - /// Can only be modified with [`feature`][crate#features] `format`. Defaults - /// to [`false`]. + /// Can only be modified with [`feature`][crate#features] `format`. This + /// only applies if there is no base prefix: that is, the zeros are + /// at the absolute start of the number. Defaults to [`false`]. /// /// # Examples /// + /// With a base prefix of `x`. + /// /// | Input | Valid? | /// |:-:|:-:| /// | `01` | ❌ | /// | `0` | ✔️ | /// | `10` | ✔️ | + /// | `0x01` | ✔️ | /// /// # Used For /// @@ -544,17 +548,21 @@ impl NumberFormat { /// /// This is before the significant digits of the float, that is, if there is /// 1 or more digits in the integral component and the leading digit is 0, - /// Can only be modified with [`feature`][crate#features] `format`. Defaults - /// to [`false`]. + /// Can only be modified with [`feature`][crate#features] `format`. This + /// only applies if there is no base prefix: that is, the zeros are + /// at the absolute start of the number. Defaults to [`false`]. /// /// # Examples /// + /// With a base prefix of `x`. + /// /// | Input | Valid? | /// |:-:|:-:| /// | `01` | ❌ | /// | `01.0` | ❌ | /// | `0` | ✔️ | /// | `10` | ✔️ | + /// | `0x01.0` | ✔️ | /// | `0.1` | ✔️ | /// /// # Used For diff --git a/lexical-util/src/format_builder.rs b/lexical-util/src/format_builder.rs index 14cfa72b..8c24ae88 100644 --- a/lexical-util/src/format_builder.rs +++ b/lexical-util/src/format_builder.rs @@ -1339,16 +1339,20 @@ impl NumberFormatBuilder { /// Get if leading zeros before an integer are not allowed. /// - /// Can only be modified with [`feature`][crate#features] `format`. Defaults - /// to [`false`]. + /// Can only be modified with [`feature`][crate#features] `format`. This + /// only applies if there is no base prefix: that is, the zeros are + /// at the absolute start of the number. Defaults to [`false`]. /// /// # Examples /// + /// With a base prefix of `x`. + /// /// | Input | Valid? | /// |:-:|:-:| /// | `01` | ❌ | /// | `0` | ✔️ | /// | `10` | ✔️ | + /// | `0x01` | ✔️ | /// /// # Used For /// @@ -1362,17 +1366,21 @@ impl NumberFormatBuilder { /// /// This is before the significant digits of the float, that is, if there is /// 1 or more digits in the integral component and the leading digit is 0, - /// Can only be modified with [`feature`][crate#features] `format`. Defaults - /// to [`false`]. + /// Can only be modified with [`feature`][crate#features] `format`. This + /// only applies if there is no base prefix: that is, the zeros are + /// at the absolute start of the number. Defaults to [`false`]. /// /// # Examples /// + /// With a base prefix of `x`. + /// /// | Input | Valid? | /// |:-:|:-:| /// | `01` | ❌ | /// | `01.0` | ❌ | /// | `0` | ✔️ | /// | `10` | ✔️ | + /// | `0x01.0` | ✔️ | /// | `0.1` | ✔️ | /// /// # Used For @@ -3148,15 +3156,19 @@ impl NumberFormatBuilder { /// Set if leading zeros before an integer are not allowed. /// - /// Defaults to [`false`]. + /// This only applies if there is no base prefix: that is, the zeros are + /// at the absolute start of the number. Defaults to [`false`]. /// /// # Examples /// + /// With a base prefix of `x`. + /// /// | Input | Valid? | /// |:-:|:-:| /// | `01` | ❌ | /// | `0` | ✔️ | /// | `10` | ✔️ | + /// | `0x01` | ✔️ | /// /// # Used For /// @@ -3165,6 +3177,7 @@ impl NumberFormatBuilder { /// #[inline(always)] @@ -3189,16 +3203,20 @@ impl NumberFormatBuilder { /// /// This is before the significant digits of the float, that is, if there is /// 1 or more digits in the integral component and the leading digit is 0, - /// Defaults to [`false`]. + /// This only applies if there is no base prefix: that is, the zeros are + /// at the absolute start of the number. Defaults to [`false`]. /// /// # Examples /// + /// With a base prefix of `x`. + /// /// | Input | Valid? | /// |:-:|:-:| /// | `01` | ❌ | /// | `01.0` | ❌ | /// | `0` | ✔️ | /// | `10` | ✔️ | + /// | `0x01.0` | ✔️ | /// | `0.1` | ✔️ | /// /// # Used For @@ -3208,6 +3226,7 @@ impl NumberFormatBuilder { /// #[inline(always)] diff --git a/lexical-util/src/not_feature_format.rs b/lexical-util/src/not_feature_format.rs index 41adddca..b87677d1 100644 --- a/lexical-util/src/not_feature_format.rs +++ b/lexical-util/src/not_feature_format.rs @@ -474,16 +474,20 @@ impl NumberFormat { /// Get if leading zeros before an integer are not allowed. /// - /// Can only be modified with [`feature`][crate#features] `format`. Defaults - /// to [`false`]. + /// Can only be modified with [`feature`][crate#features] `format`. This + /// only applies if there is no base prefix: that is, the zeros are + /// at the absolute start of the number. Defaults to [`false`]. /// /// # Examples /// + /// With a base prefix of `x`. + /// /// | Input | Valid? | /// |:-:|:-:| /// | `01` | ❌ | /// | `0` | ✔️ | /// | `10` | ✔️ | + /// | `0x01` | ✔️ | /// /// # Used For /// @@ -502,17 +506,21 @@ impl NumberFormat { /// /// This is before the significant digits of the float, that is, if there is /// 1 or more digits in the integral component and the leading digit is 0, - /// Can only be modified with [`feature`][crate#features] `format`. Defaults - /// to [`false`]. + /// Can only be modified with [`feature`][crate#features] `format`. This + /// only applies if there is no base prefix: that is, the zeros are + /// at the absolute start of the number. Defaults to [`false`]. /// /// # Examples /// + /// With a base prefix of `x`. + /// /// | Input | Valid? | /// |:-:|:-:| /// | `01` | ❌ | /// | `01.0` | ❌ | /// | `0` | ✔️ | /// | `10` | ✔️ | + /// | `0x01.0` | ✔️ | /// | `0.1` | ✔️ | /// /// # Used For diff --git a/lexical-util/src/skip.rs b/lexical-util/src/skip.rs index 884d1af7..0811a9f1 100644 --- a/lexical-util/src/skip.rs +++ b/lexical-util/src/skip.rs @@ -1258,7 +1258,7 @@ macro_rules! skip_iterator_impl { #[inline(always)] pub fn new(byte: &'b mut Bytes<'a, FORMAT>) -> Self { Self { - byte: byte, + byte, digits: 0, } }