From 0dd996907c34d86d8c1f85dbd1efb12c948e786a Mon Sep 17 00:00:00 2001 From: hkalbasi Date: Mon, 16 Dec 2024 22:25:51 +0330 Subject: [PATCH 1/3] Use divide and conquer in to_radix_digits --- benches/bigint.rs | 10 +++++ src/biguint/convert.rs | 92 ++++++++++++++++++++++++++++++------------ 2 files changed, 76 insertions(+), 26 deletions(-) diff --git a/benches/bigint.rs b/benches/bigint.rs index 22795274..9b1ae2ff 100644 --- a/benches/bigint.rs +++ b/benches/bigint.rs @@ -210,6 +210,16 @@ fn to_str_radix_10_2(b: &mut Bencher) { to_str_radix_bench(b, 10, 10009); } +#[bench] +fn to_str_radix_10_3(b: &mut Bencher) { + to_str_radix_bench(b, 10, 100009); +} + +#[bench] +fn to_str_radix_10_4(b: &mut Bencher) { + to_str_radix_bench(b, 10, 1000009); +} + #[bench] fn to_str_radix_16(b: &mut Bencher) { to_str_radix_bench(b, 16, 1009); diff --git a/src/biguint/convert.rs b/src/biguint/convert.rs index 3daf3dcb..757d932d 100644 --- a/src/biguint/convert.rs +++ b/src/biguint/convert.rs @@ -1,7 +1,7 @@ // This uses stdlib features higher than the MSRV #![allow(clippy::manual_range_contains)] // 1.35 -use super::{biguint_from_vec, BigUint, ToBigUint}; +use super::{biguint_from_vec, BigUint, IntDigits, ToBigUint}; use super::addition::add2; use super::division::{div_rem_digit, FAST_DIV_WIDE}; @@ -701,34 +701,48 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec { // The threshold for this was chosen by anecdotal performance measurements to // approximate where this starts to make a noticeable difference. if digits.data.len() >= 64 { - let mut big_base = BigUint::from(base); - let mut big_power = 1usize; - - // Choose a target base length near √n. - let target_len = digits.data.len().sqrt(); - while big_base.data.len() < target_len { - big_base = &big_base * &big_base; - big_power *= 2; - } - - // This outer loop will run approximately √n times. - while digits > big_base { - // This is still the dominating factor, with n digits divided by √n digits. - let (q, mut big_r) = digits.div_rem(&big_base); - digits = q; - - // This inner loop now has O(√n²)=O(n) behavior altogether. - for _ in 0..big_power { - let (q, mut r) = div_rem_digit(big_r, base); - big_r = q; - for _ in 0..power { - res.push((r % radix) as u8); - r /= radix; - } + let mut big_bases = Vec::with_capacity(32); + big_bases.push((BigUint::from(base), power)); + + loop { + let (big_base, power) = big_bases.last().unwrap(); + if big_base.data.len() > digits.data.len() / 2 + 1 { + break; } + let next_big_base = big_base * big_base; + let next_power = *power * 2; + big_bases.push((next_big_base, next_power)); + } + + to_radix_digits_le_divide_and_conquer( + digits, + base, + power, + &big_bases, + big_bases.len() - 1, + &mut res, + radix, + ); + while res.last() == Some(&0) { + res.pop(); } + return res; } + to_radix_digits_le_small(digits, base, power, &mut res, radix); + + res +} + +// Extract little-endian radix digits for small numbers +#[inline(always)] // forced inline to get const-prop for radix=10 +fn to_radix_digits_le_small( + mut digits: BigUint, + base: u64, + power: usize, + res: &mut Vec, + radix: u64, +) { while digits.data.len() > 1 { let (q, mut r) = div_rem_digit(digits, base); for _ in 0..power { @@ -743,8 +757,34 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec { res.push((r % radix) as u8); r /= radix; } +} - res +fn to_radix_digits_le_divide_and_conquer( + number: BigUint, + base: u64, + power: usize, + big_bases: &[(BigUint, usize)], + k: usize, + res: &mut Vec, + radix: u64, +) { + let &(ref big_base, result_len) = &big_bases[k]; + if number.data.len() < 8 { + let prev_res_len = res.len(); + if !number.is_zero() { + to_radix_digits_le_small(number, base, power, res, radix); + } + while res.len() < prev_res_len + result_len * 2 { + res.push(0); + } + return; + } + // number always has two digits in the big base + let (digit_1, digit_2) = number.div_rem(big_base); + assert!(&digit_1 < big_base); + assert!(&digit_2 < big_base); + to_radix_digits_le_divide_and_conquer(digit_2, base, power, big_bases, k - 1, res, radix); + to_radix_digits_le_divide_and_conquer(digit_1, base, power, big_bases, k - 1, res, radix); } pub(super) fn to_radix_le(u: &BigUint, radix: u32) -> Vec { From d6313e2d08b20f68bc30d2053c284e72f76f7e30 Mon Sep 17 00:00:00 2001 From: hkalbasi Date: Tue, 17 Dec 2024 00:20:39 +0330 Subject: [PATCH 2/3] Update the threshold in to_radix_digits --- src/biguint/convert.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/biguint/convert.rs b/src/biguint/convert.rs index 757d932d..0aa67d31 100644 --- a/src/biguint/convert.rs +++ b/src/biguint/convert.rs @@ -700,7 +700,7 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec { // performance. We can mitigate this by dividing into chunks of a larger base first. // The threshold for this was chosen by anecdotal performance measurements to // approximate where this starts to make a noticeable difference. - if digits.data.len() >= 64 { + if digits.data.len() >= 32 { let mut big_bases = Vec::with_capacity(32); big_bases.push((BigUint::from(base), power)); From 018dd8f499f4f971638a612c9cc2edff02e11e3a Mon Sep 17 00:00:00 2001 From: hkalbasi Date: Thu, 2 Jan 2025 14:38:27 +0330 Subject: [PATCH 3/3] Add Burnikel Ziegler algorithm for division --- benches/bigint.rs | 5 ++ src/bigint.rs | 10 ++++ src/biguint/division.rs | 130 +++++++++++++++++++++++++++++++++++++--- 3 files changed, 137 insertions(+), 8 deletions(-) diff --git a/benches/bigint.rs b/benches/bigint.rs index 9b1ae2ff..d916a7de 100644 --- a/benches/bigint.rs +++ b/benches/bigint.rs @@ -112,6 +112,11 @@ fn divide_2(b: &mut Bencher) { divide_bench(b, 1 << 16, 1 << 12); } +#[bench] +fn divide_3(b: &mut Bencher) { + divide_bench(b, 1 << 20, 1 << 16); +} + #[bench] fn divide_big_little(b: &mut Bencher) { divide_bench(b, 1 << 16, 1 << 4); diff --git a/src/bigint.rs b/src/bigint.rs index b4f84b9e..f6fb983a 100644 --- a/src/bigint.rs +++ b/src/bigint.rs @@ -975,6 +975,16 @@ impl BigInt { self.data.bits() } + /// Converts this [`BigInt`] into a [`BigUint`], if it's not negative. + #[inline] + pub fn into_biguint(self) -> Option { + match self.sign { + Plus => Some(self.data), + NoSign => Some(BigUint::ZERO), + Minus => None, + } + } + /// Converts this [`BigInt`] into a [`BigUint`], if it's not negative. #[inline] pub fn to_biguint(&self) -> Option { diff --git a/src/biguint/division.rs b/src/biguint/division.rs index 3dfb0bbb..cf6f484b 100644 --- a/src/biguint/division.rs +++ b/src/biguint/division.rs @@ -1,14 +1,15 @@ use super::addition::__add2; use super::{cmp_slice, BigUint}; -use crate::big_digit::{self, BigDigit, DoubleBigDigit}; -use crate::UsizePromotion; +use crate::big_digit::{self, BigDigit, DoubleBigDigit, BITS}; +use crate::biguint::IntDigits; +use crate::{BigInt, UsizePromotion}; use core::cmp::Ordering::{Equal, Greater, Less}; use core::mem; use core::ops::{Div, DivAssign, Rem, RemAssign}; use num_integer::Integer; -use num_traits::{CheckedDiv, CheckedEuclid, Euclid, One, ToPrimitive, Zero}; +use num_traits::{CheckedDiv, CheckedEuclid, Euclid, One, Signed, ToPrimitive, Zero}; pub(super) const FAST_DIV_WIDE: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64")); @@ -197,9 +198,9 @@ fn div_rem(mut u: BigUint, mut d: BigUint) -> (BigUint, BigUint) { if shift == 0 { // no need to clone d - div_rem_core(u, &d.data) + div_rem_core(u, &d) } else { - let (q, r) = div_rem_core(u << shift, &(d << shift).data); + let (q, r) = div_rem_core(u << shift, &(d << shift)); // renormalize the remainder (q, r >> shift) } @@ -239,17 +240,130 @@ pub(super) fn div_rem_ref(u: &BigUint, d: &BigUint) -> (BigUint, BigUint) { if shift == 0 { // no need to clone d - div_rem_core(u.clone(), &d.data) + div_rem_core(u.clone(), &d) } else { - let (q, r) = div_rem_core(u << shift, &(d << shift).data); + let (q, r) = div_rem_core(u << shift, &(d << shift)); // renormalize the remainder (q, r >> shift) } } +const BURNIKEL_ZIEGLER_THRESHOLD: usize = 64; + +/// This algorithm is from Burnikel and Ziegler, "Fast Recursive Division", Algorithm 1. +/// It is a recursive algorithm that divides the dividend and divisor into blocks of digits +/// and uses a divide-and-conquer approach to find the quotient. +/// +/// The algorithm is more complex than the base algorithm, but it is faster for large operands. +/// +/// Time complexity of this algorithm is the same as the algorithm used for the multiplication. +/// +/// link: https://pure.mpg.de/rest/items/item_1819444_4/component/file_2599480/content +fn div_rem_burnikel_ziegler(u: &BigUint, d: &BigUint) -> (BigUint, BigUint) { + fn divide_biguint(mut b: BigUint, level: usize) -> (BigUint, BigUint) { + if b.len() <= level { + return (BigUint::ZERO, b); + } + let b1_data = b.data[level..].to_vec(); + b.data.truncate(level); + ( + BigUint { data: b1_data }.normalized(), + b.normalized(), + ) + } + + fn normalizing_shift_amount(b: &BigUint, level: usize) -> usize { + (level - b.len() + 1) * BITS as usize - b.data.last().unwrap().ilog2() as usize - 1 + } + + fn concat_biguint(b1: &BigUint, b2: BigUint, level: usize) -> BigUint { + let mut data = b2.data; + data.reserve(level + b1.len() - data.len()); + data.extend(std::iter::repeat(0).take(level - data.len())); + data.extend_from_slice(&b1.data); + BigUint { data }.normalized() + } + + fn div_two_digit_by_one( + ah: BigUint, + al: BigUint, + b: BigUint, + level: usize, + ) -> (BigUint, BigUint) { + // A precondition of this function is that q fits into a single digit. + debug_assert!(ah < b); + if level <= BURNIKEL_ZIEGLER_THRESHOLD { + return div_rem(concat_biguint(&ah, al.clone(), level), b); + } + let shift = normalizing_shift_amount(&b, level); + if shift != 0 { + let b = b << shift; + let (ah, al) = divide_biguint(concat_biguint(&ah, al.clone(), level) << shift, level); + let (q, r) = div_two_digit_by_one_normalized(ah, al, b, level); + (q, r >> shift) + } else { + div_two_digit_by_one_normalized(ah, al, b, level) + } + } + + fn div_two_digit_by_one_normalized( + ah: BigUint, + al: BigUint, + b: BigUint, + level: usize, + ) -> (BigUint, BigUint) { + let level = level / 2; + let (a1, a2) = divide_biguint(ah, level); + let (a3, a4) = divide_biguint(al, level); + let (b1, b2) = divide_biguint(b, level); + let (q1, r) = div_three_halves_by_two(a1, a2, a3, b1.clone(), b2.clone(), level); + let (r1, r2) = divide_biguint(r, level); + let (q2, s) = div_three_halves_by_two(r1, r2, a4, b1, b2, level); + (concat_biguint(&q1, q2, level), s) + } + + fn div_three_halves_by_two( + a1: BigUint, + a2: BigUint, + a3: BigUint, + b1: BigUint, + b2: BigUint, + level: usize, + ) -> (BigUint, BigUint) { + let (mut q, c) = div_two_digit_by_one(a1, a2, b1.clone(), level); + let mut r = BigInt::from(concat_biguint(&c, a3, level)) - BigInt::from(&q * &b2); + let b = concat_biguint(&b1, b2, level); + if r.is_negative() { + q -= 1u32; + r += BigInt::from(b.clone()); + if r.is_negative() { + q -= 1u32; + r += BigInt::from(b.clone()); + } + } + (q, r.into_biguint().unwrap()) + } + + let mut level = 1 << (u.data.len().ilog2()); + if d.len() > level { + level *= 2; + } + let (u1, u2) = divide_biguint(u.clone(), level); + if &u1 > d { + div_two_digit_by_one(BigUint::ZERO, u.clone(), d.clone(), level * 2) + } else { + div_two_digit_by_one(u1, u2, d.clone(), level) + } +} + /// An implementation of the base division algorithm. /// Knuth, TAOCP vol 2 section 4.3.1, algorithm D, with an improvement from exercises 19-21. -fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) { +fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) { + if a.len() > BURNIKEL_ZIEGLER_THRESHOLD * 2 && b.len() > BURNIKEL_ZIEGLER_THRESHOLD { + return div_rem_burnikel_ziegler(&a, b); + } + + let b = &*b.data; debug_assert!(a.data.len() >= b.len() && b.len() > 1); debug_assert!(b.last().unwrap().leading_zeros() == 0);