Skip to content

Commit

Permalink
Improve parsing floats to use our better base prefix logic.
Browse files Browse the repository at this point in the history
  • Loading branch information
Alexhuszagh committed Jan 17, 2025
1 parent d8ec91f commit e757c85
Show file tree
Hide file tree
Showing 6 changed files with 95 additions and 84 deletions.
98 changes: 37 additions & 61 deletions lexical-parse-float/src/parse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -251,11 +251,10 @@ pub fn parse_complete<F: LemireFloat, const FORMAT: u128>(
options: &Options,
) -> Result<F> {
let mut byte = bytes.bytes::<{ FORMAT }>();
let format = NumberFormat::<FORMAT> {};
let is_negative = parse_mantissa_sign(&mut byte)?;
if byte.integer_iter().is_consumed() {
if NumberFormat::<FORMAT>::REQUIRED_INTEGER_DIGITS
|| NumberFormat::<FORMAT>::REQUIRED_MANTISSA_DIGITS
{
if format.required_integer_digits() || format.required_mantissa_digits() {
return Err(Error::Empty(byte.cursor()));
} else {
return Ok(F::ZERO);
Expand Down Expand Up @@ -294,11 +293,10 @@ pub fn fast_path_complete<F: LemireFloat, const FORMAT: u128>(
options: &Options,
) -> Result<F> {
let mut byte = bytes.bytes::<{ FORMAT }>();
let format = NumberFormat::<FORMAT> {};
let is_negative = parse_mantissa_sign(&mut byte)?;
if byte.integer_iter().is_consumed() {
if NumberFormat::<FORMAT>::REQUIRED_INTEGER_DIGITS
|| NumberFormat::<FORMAT>::REQUIRED_MANTISSA_DIGITS
{
if format.required_integer_digits() || format.required_mantissa_digits() {
return Err(Error::Empty(byte.cursor()));
} else {
return Ok(F::ZERO);
Expand All @@ -319,11 +317,10 @@ pub fn parse_partial<F: LemireFloat, const FORMAT: u128>(
options: &Options,
) -> Result<(F, usize)> {
let mut byte = bytes.bytes::<{ FORMAT }>();
let format = NumberFormat::<FORMAT> {};
let is_negative = parse_mantissa_sign(&mut byte)?;
if byte.integer_iter().is_consumed() {
if NumberFormat::<FORMAT>::REQUIRED_INTEGER_DIGITS
|| NumberFormat::<FORMAT>::REQUIRED_MANTISSA_DIGITS
{
if format.required_integer_digits() || format.required_mantissa_digits() {
return Err(Error::Empty(byte.cursor()));
} else {
return Ok((F::ZERO, byte.cursor()));
Expand Down Expand Up @@ -368,11 +365,10 @@ pub fn fast_path_partial<F: LemireFloat, const FORMAT: u128>(
options: &Options,
) -> Result<(F, usize)> {
let mut byte = bytes.bytes::<{ FORMAT }>();
let format = NumberFormat::<FORMAT> {};
let is_negative = parse_mantissa_sign(&mut byte)?;
if byte.integer_iter().is_consumed() {
if NumberFormat::<FORMAT>::REQUIRED_INTEGER_DIGITS
|| NumberFormat::<FORMAT>::REQUIRED_MANTISSA_DIGITS
{
if format.required_integer_digits() || format.required_mantissa_digits() {
return Err(Error::Empty(byte.cursor()));
} else {
return Ok((F::ZERO, byte.cursor()));
Expand Down Expand Up @@ -535,51 +531,31 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>(
let bits_per_digit = shared::log2(format.mantissa_radix()) as i64;
let bits_per_base = shared::log2(format.exponent_base()) as i64;

// INTEGER

// Check to see if we have a valid base prefix.
// NOTE: `lz_prefix` records whether we saw a leading zero while
// checking for a base prefix: it does not indicate whether the
// prefix itself exists.
// TODO: MIGRATE TO BASE PREFIX LOGIC
#[allow(unused_variables)]
let mut lz_prefix = false;
#[cfg(all(feature = "format", feature = "power-of-two"))]
{
let base_prefix = format.base_prefix();
let mut has_prefix = false;
let mut iter = byte.integer_iter();
if base_prefix != 0 && iter.read_if_value_cased(b'0').is_some() {
// Check to see if the next character is the base prefix.
// We must have a format like `0x`, `0d`, `0o`.
// NOTE: The check for empty integer digits happens below so
// we don't need a redundant check here.
lz_prefix = true;
let prefix = iter.read_if_value(base_prefix, format.case_sensitive_base_prefix());
has_prefix = prefix.is_some();
if has_prefix && iter.is_buffer_empty() && format.required_integer_digits() {
return Err(Error::EmptyInteger(iter.cursor()));
}
}
if format.required_base_prefix() && !has_prefix {
return Err(Error::MissingBasePrefix(iter.cursor()));
// skip and validate an optional base prefix
let has_base_prefix = cfg!(feature = "format") && byte.integer_iter().read_base_prefix();
if cfg!(feature = "format") && has_base_prefix {
if byte.is_buffer_empty() && format.required_integer_digits() {
return Err(Error::EmptyInteger(byte.cursor()));
}
} else if format.required_base_prefix() {
return Err(Error::MissingBasePrefix(byte.cursor()));
}

// Parse our integral digits.
let mut mantissa = 0_u64;
// INTEGER

let start = byte.clone();
let mut mantissa = 0_u64;
let mut integer_iter = byte.integer_iter();
let start_count = integer_iter.digits();
let integer_start = integer_iter.digits();

// Parse our integral digits.
#[cfg(not(feature = "compact"))]
parse_8digits::<_, FORMAT>(&mut integer_iter, &mut mantissa);
parse_digits(&mut integer_iter, format.mantissa_radix(), |digit| {
mantissa = mantissa.wrapping_mul(format.radix() as u64).wrapping_add(digit as u64);
});
let mut n_digits = integer_iter.digits_since(start_count);
#[cfg(feature = "format")]
let mut n_digits = integer_iter.digits_since(integer_start);
let n_before_dot = n_digits;
#[cfg(feature = "format")]
if format.required_integer_digits() && n_digits == 0 {
return Err(Error::EmptyInteger(byte.cursor()));
}
Expand Down Expand Up @@ -610,10 +586,13 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>(
let integer_digits = unsafe { start.as_slice().get_unchecked(..b_digits) };

// Check if integer leading zeros are disabled.
#[cfg(feature = "format")]
if !lz_prefix && format.no_float_leading_zeros() {
if cfg!(feature = "format")
&& format.no_float_leading_zeros()
&& !has_base_prefix
&& n_before_dot > 1
{
let mut integer = integer_digits.bytes::<FORMAT>();
if integer_digits.len() > 1 && integer.integer_iter().peek() == Some(&b'0') {
if integer.integer_iter().peek() == Some(&b'0') {
return Err(Error::InvalidLeadingZeros(start.cursor()));
}
}
Expand All @@ -632,13 +611,13 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>(
unsafe { byte.step_unchecked() };
let before = byte.clone();
let mut fraction_iter = byte.fraction_iter();
let start_count = fraction_iter.digits();
let fraction_count = fraction_iter.digits();
#[cfg(not(feature = "compact"))]
parse_8digits::<_, FORMAT>(&mut fraction_iter, &mut mantissa);
parse_digits(&mut fraction_iter, format.mantissa_radix(), |digit| {
mantissa = mantissa.wrapping_mul(format.radix() as u64).wrapping_add(digit as u64);
});
n_after_dot = fraction_iter.digits_since(start_count);
n_after_dot = fraction_iter.digits_since(fraction_count);
// NOTE: We can't use the number of digits to extract the slice for
// non-contiguous iterators, but we also need the number of digits
// for our value calculation. We store both, and let the compiler know
Expand Down Expand Up @@ -674,7 +653,7 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>(
// NOTE: Check if we have our exponent **BEFORE** checking if the
// mantissa is empty, so we can ensure
let has_exponent = byte
.first_is(exponent_character, format.case_sensitive_exponent() && cfg!(feature = "format"));
.first_is(exponent_character, cfg!(feature = "format") && format.case_sensitive_exponent());

// check to see if we have any invalid leading zeros
n_digits += n_after_dot;
Expand All @@ -701,8 +680,7 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>(
unsafe { byte.step_unchecked() };

// Check float format syntax checks.
#[cfg(feature = "format")]
{
if cfg!(feature = "format") {
// NOTE: We've overstepped for the safety invariant before.
if format.no_exponent_notation() {
return Err(Error::InvalidExponent(byte.cursor() - 1));
Expand Down Expand Up @@ -736,14 +714,14 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>(

let is_negative_exponent = parse_exponent_sign(&mut byte)?;
let mut exponent_iter = byte.exponent_iter();
let start_count = exponent_iter.digits();
let exponent_start = exponent_iter.digits();
parse_digits(&mut exponent_iter, format.exponent_radix(), |digit| {
if explicit_exponent < 0x10000000 {
explicit_exponent *= format.exponent_radix() as i64;
explicit_exponent += digit as i64;
}
});
if format.required_exponent_digits() && exponent_iter.digits_since(start_count) == 0 {
if format.required_exponent_digits() && exponent_iter.digits_since(exponent_start) == 0 {
return Err(Error::EmptyExponent(byte.cursor()));
}
// Handle our sign, and get the explicit part of the exponent.
Expand All @@ -760,9 +738,8 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>(
// Check to see if we have a valid base suffix.
// We've already trimmed any leading digit separators here, so we can be sure
// that the first character **is not** a digit separator.
// FIXME: Improve parsing of this
#[cfg(all(feature = "format", feature = "power-of-two"))]
if format.has_base_suffix() {
// TODO: Improve parsing of this using a base suffix method
if cfg!(all(feature = "format", feature = "power-of-two")) && format.has_base_suffix() {
let base_suffix = format.base_suffix();
let is_suffix = byte.first_is(base_suffix, format.case_sensitive_base_suffix());
if is_suffix {
Expand All @@ -779,8 +756,7 @@ pub fn parse_number<'a, const FORMAT: u128, const IS_PARTIAL: bool>(
let end = byte.cursor();
let mut step = u64_step(format.mantissa_radix());
let mut many_digits = false;
#[cfg(feature = "format")]
if !format.required_mantissa_digits() && n_digits == 0 {
if cfg!(feature = "format") && !format.required_mantissa_digits() && n_digits == 0 {
exponent = 0;
}
if n_digits <= step {
Expand Down
12 changes: 5 additions & 7 deletions lexical-parse-integer/src/algorithm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -674,23 +674,23 @@ macro_rules! algorithm {
let mut byte = $bytes.bytes::<FORMAT>();
let format = NumberFormat::<FORMAT> {};
let radix = format.mantissa_radix();
debug_assert!(format.is_valid(), "should have already checked for an invalid number format");

let is_negative = parse_sign::<T, FORMAT>(&mut byte)?;
let mut iter = byte.integer_iter();
maybe_into_empty!(iter, $into_ok);

// skip and validate an optional base prefix
#[cfg(all(feature = "format", feature = "power-of-two"))]
if iter.read_base_prefix() {
let has_base_prefix = cfg!(feature = "format") && iter.read_base_prefix();
if cfg!(feature = "format") && has_base_prefix {
maybe_into_empty!(iter, $into_ok);
} else if format.required_base_prefix() {
return Err(Error::MissingBasePrefix(iter.cursor()));
}

// NOTE: always do a peek so any leading digit separators
// are skipped, and we can get the correct index
#[cfg(feature = "format")]
if format.no_integer_leading_zeros() && iter.peek() == Some(&b'0') {
if cfg!(feature = "format") && format.no_integer_leading_zeros() && !has_base_prefix && iter.peek() == Some(&b'0') {
// NOTE: Skipping zeros is **EXPENSIVE**, so we skip that without our format feature
let index = iter.cursor();
let zeros = iter.skip_zeros();
Expand Down Expand Up @@ -719,7 +719,6 @@ macro_rules! algorithm {
// and even if parsing a 64-bit integer is marginally faster, it
// culminates in **way** slower performance overall for simple
// integers, and no improvement for large integers.
#[allow(unused)]
let mut has_suffix = false;
// FIXME: This is only used for the parsing of the base suffix.
#[allow(unused)]
Expand Down Expand Up @@ -776,8 +775,7 @@ macro_rules! algorithm {
);
}

#[cfg(all(feature = "format", feature = "power-of-two"))]
if format.required_base_suffix() && !has_suffix {
if cfg!(all(feature = "format", feature = "power-of-two")) && format.required_base_suffix() && !has_suffix {
return Err(Error::MissingBaseSuffix(iter.cursor()));
}

Expand Down
16 changes: 12 additions & 4 deletions lexical-util/src/feature_format.rs
Original file line number Diff line number Diff line change
Expand Up @@ -516,16 +516,20 @@ impl<const FORMAT: u128> NumberFormat<FORMAT> {

/// Get if leading zeros before an integer are not allowed.
///
/// Can only be modified with [`feature`][crate#features] `format`. Defaults
/// to [`false`].
/// Can only be modified with [`feature`][crate#features] `format`. This
/// only applies if there is no base prefix: that is, the zeros are
/// at the absolute start of the number. Defaults to [`false`].
///
/// # Examples
///
/// With a base prefix of `x`.
///
/// | Input | Valid? |
/// |:-:|:-:|
/// | `01` | ❌ |
/// | `0` | ✔️ |
/// | `10` | ✔️ |
/// | `0x01` | ✔️ |
///
/// # Used For
///
Expand All @@ -544,17 +548,21 @@ impl<const FORMAT: u128> NumberFormat<FORMAT> {
///
/// This is before the significant digits of the float, that is, if there are
/// 2 or more digits in the integral component and the leading digit is 0.
/// Can only be modified with [`feature`][crate#features] `format`. Defaults
/// to [`false`].
/// Can only be modified with [`feature`][crate#features] `format`. This
/// only applies if there is no base prefix: that is, the zeros are
/// at the absolute start of the number. Defaults to [`false`].
///
/// # Examples
///
/// With a base prefix of `x`.
///
/// | Input | Valid? |
/// |:-:|:-:|
/// | `01` | ❌ |
/// | `01.0` | ❌ |
/// | `0` | ✔️ |
/// | `10` | ✔️ |
/// | `0x01.0` | ✔️ |
/// | `0.1` | ✔️ |
///
/// # Used For
Expand Down
Loading

0 comments on commit e757c85

Please sign in to comment.