diff --git a/benches/bigint.rs b/benches/bigint.rs
index 22795274..d916a7de 100644
--- a/benches/bigint.rs
+++ b/benches/bigint.rs
@@ -112,6 +112,11 @@ fn divide_2(b: &mut Bencher) {
     divide_bench(b, 1 << 16, 1 << 12);
 }
 
+#[bench]
+fn divide_3(b: &mut Bencher) {
+    divide_bench(b, 1 << 20, 1 << 16);
+}
+
 #[bench]
 fn divide_big_little(b: &mut Bencher) {
     divide_bench(b, 1 << 16, 1 << 4);
@@ -210,6 +215,16 @@ fn to_str_radix_10_2(b: &mut Bencher) {
     to_str_radix_bench(b, 10, 10009);
 }
 
+#[bench]
+fn to_str_radix_10_3(b: &mut Bencher) {
+    to_str_radix_bench(b, 10, 100009);
+}
+
+#[bench]
+fn to_str_radix_10_4(b: &mut Bencher) {
+    to_str_radix_bench(b, 10, 1000009);
+}
+
 #[bench]
 fn to_str_radix_16(b: &mut Bencher) {
     to_str_radix_bench(b, 16, 1009);
diff --git a/src/bigint.rs b/src/bigint.rs
index b4f84b9e..f6fb983a 100644
--- a/src/bigint.rs
+++ b/src/bigint.rs
@@ -975,6 +975,16 @@ impl BigInt {
         self.data.bits()
     }
 
+    /// Converts this [`BigInt`] into a [`BigUint`], if it's not negative.
+    #[inline]
+    pub fn into_biguint(self) -> Option<BigUint> {
+        match self.sign {
+            Plus => Some(self.data),
+            NoSign => Some(BigUint::ZERO),
+            Minus => None,
+        }
+    }
+    
     /// Converts this [`BigInt`] into a [`BigUint`], if it's not negative.
     #[inline]
     pub fn to_biguint(&self) -> Option<BigUint> {
diff --git a/src/biguint/convert.rs b/src/biguint/convert.rs
index 3daf3dcb..0aa67d31 100644
--- a/src/biguint/convert.rs
+++ b/src/biguint/convert.rs
@@ -1,7 +1,7 @@
 // This uses stdlib features higher than the MSRV
 #![allow(clippy::manual_range_contains)] // 1.35
 
-use super::{biguint_from_vec, BigUint, ToBigUint};
+use super::{biguint_from_vec, BigUint, IntDigits, ToBigUint};
 
 use super::addition::add2;
 use super::division::{div_rem_digit, FAST_DIV_WIDE};
@@ -700,35 +700,49 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec<u8> {
     // performance. We can mitigate this by dividing into chunks of a larger base first.
     // The threshold for this was chosen by anecdotal performance measurements to
     // approximate where this starts to make a noticeable difference.
-    if digits.data.len() >= 64 {
-        let mut big_base = BigUint::from(base);
-        let mut big_power = 1usize;
-
-        // Choose a target base length near √n.
-        let target_len = digits.data.len().sqrt();
-        while big_base.data.len() < target_len {
-            big_base = &big_base * &big_base;
-            big_power *= 2;
-        }
-
-        // This outer loop will run approximately √n times.
-        while digits > big_base {
-            // This is still the dominating factor, with n digits divided by √n digits.
-            let (q, mut big_r) = digits.div_rem(&big_base);
-            digits = q;
-
-            // This inner loop now has O(√n²)=O(n) behavior altogether.
-            for _ in 0..big_power {
-                let (q, mut r) = div_rem_digit(big_r, base);
-                big_r = q;
-                for _ in 0..power {
-                    res.push((r % radix) as u8);
-                    r /= radix;
-                }
+    if digits.data.len() >= 32 {
+        let mut big_bases = Vec::with_capacity(32);
+        big_bases.push((BigUint::from(base), power));
+
+        loop {
+            let (big_base, power) = big_bases.last().unwrap();
+            if big_base.data.len() > digits.data.len() / 2 + 1 {
+                break;
             }
+            let next_big_base = big_base * big_base;
+            let next_power = *power * 2;
+            big_bases.push((next_big_base, next_power));
+        }
+
+        to_radix_digits_le_divide_and_conquer(
+            digits,
+            base,
+            power,
+            &big_bases,
+            big_bases.len() - 1,
+            &mut res,
+            radix,
+        );
+        while res.last() == Some(&0) {
+            res.pop();
         }
+        return res;
     }
 
+    to_radix_digits_le_small(digits, base, power, &mut res, radix);
+
+    res
+}
+
+// Extract little-endian radix digits for small numbers
+#[inline(always)] // forced inline to get const-prop for radix=10
+fn to_radix_digits_le_small(
+    mut digits: BigUint,
+    base: u64,
+    power: usize,
+    res: &mut Vec<u8>,
+    radix: u64,
+) {
     while digits.data.len() > 1 {
         let (q, mut r) = div_rem_digit(digits, base);
         for _ in 0..power {
@@ -743,8 +757,34 @@ pub(super) fn to_radix_digits_le(u: &BigUint, radix: u32) -> Vec<u8> {
         res.push((r % radix) as u8);
         r /= radix;
     }
+}
 
-    res
+fn to_radix_digits_le_divide_and_conquer(
+    number: BigUint,
+    base: u64,
+    power: usize,
+    big_bases: &[(BigUint, usize)],
+    k: usize,
+    res: &mut Vec<u8>,
+    radix: u64,
+) {
+    let &(ref big_base, result_len) = &big_bases[k];
+    if number.data.len() < 8 {
+        let prev_res_len = res.len();
+        if !number.is_zero() {
+            to_radix_digits_le_small(number, base, power, res, radix);
+        }
+        while res.len() < prev_res_len + result_len * 2 {
+            res.push(0);
+        }
+        return;
+    }
+    // number always has two digits in the big base
+    let (digit_1, digit_2) = number.div_rem(big_base);
+    assert!(&digit_1 < big_base);
+    assert!(&digit_2 < big_base);
+    to_radix_digits_le_divide_and_conquer(digit_2, base, power, big_bases, k - 1, res, radix);
+    to_radix_digits_le_divide_and_conquer(digit_1, base, power, big_bases, k - 1, res, radix);
 }
 
 pub(super) fn to_radix_le(u: &BigUint, radix: u32) -> Vec<u8> {
diff --git a/src/biguint/division.rs b/src/biguint/division.rs
index 3dfb0bbb..cf6f484b 100644
--- a/src/biguint/division.rs
+++ b/src/biguint/division.rs
@@ -1,14 +1,15 @@
 use super::addition::__add2;
 use super::{cmp_slice, BigUint};
 
-use crate::big_digit::{self, BigDigit, DoubleBigDigit};
-use crate::UsizePromotion;
+use crate::big_digit::{self, BigDigit, DoubleBigDigit, BITS};
+use crate::biguint::IntDigits;
+use crate::{BigInt, UsizePromotion};
 
 use core::cmp::Ordering::{Equal, Greater, Less};
 use core::mem;
 use core::ops::{Div, DivAssign, Rem, RemAssign};
 use num_integer::Integer;
-use num_traits::{CheckedDiv, CheckedEuclid, Euclid, One, ToPrimitive, Zero};
+use num_traits::{CheckedDiv, CheckedEuclid, Euclid, One, Signed, ToPrimitive, Zero};
 
 pub(super) const FAST_DIV_WIDE: bool = cfg!(any(target_arch = "x86", target_arch = "x86_64"));
 
@@ -197,9 +198,9 @@ fn div_rem(mut u: BigUint, mut d: BigUint) -> (BigUint, BigUint) {
 
     if shift == 0 {
         // no need to clone d
-        div_rem_core(u, &d.data)
+        div_rem_core(u, &d)
     } else {
-        let (q, r) = div_rem_core(u << shift, &(d << shift).data);
+        let (q, r) = div_rem_core(u << shift, &(d << shift));
         // renormalize the remainder
         (q, r >> shift)
     }
@@ -239,17 +240,130 @@ pub(super) fn div_rem_ref(u: &BigUint, d: &BigUint) -> (BigUint, BigUint) {
 
     if shift == 0 {
         // no need to clone d
-        div_rem_core(u.clone(), &d.data)
+        div_rem_core(u.clone(), &d)
     } else {
-        let (q, r) = div_rem_core(u << shift, &(d << shift).data);
+        let (q, r) = div_rem_core(u << shift, &(d << shift));
         // renormalize the remainder
         (q, r >> shift)
     }
 }
 
+const BURNIKEL_ZIEGLER_THRESHOLD: usize = 64;
+
+/// This algorithm is from Burnikel and Ziegler, "Fast Recursive Division", Algorithm 1.
+/// It is a recursive algorithm that divides the dividend and divisor into blocks of digits
+/// and uses a divide-and-conquer approach to find the quotient.
+///
+/// The algorithm is more complex than the base algorithm, but it is faster for large operands.
+///
+/// Time complexity of this algorithm is the same as the algorithm used for the multiplication.
+/// 
+/// link: https://pure.mpg.de/rest/items/item_1819444_4/component/file_2599480/content 
+fn div_rem_burnikel_ziegler(u: &BigUint, d: &BigUint) -> (BigUint, BigUint) {
+    fn divide_biguint(mut b: BigUint, level: usize) -> (BigUint, BigUint) {
+        if b.len() <= level {
+            return (BigUint::ZERO, b);
+        }
+        let b1_data = b.data[level..].to_vec();
+        b.data.truncate(level);
+        (
+            BigUint { data: b1_data }.normalized(),
+            b.normalized(),
+        )
+    }
+
+    fn normalizing_shift_amount(b: &BigUint, level: usize) -> usize {
+        (level - b.len() + 1) * BITS as usize - b.data.last().unwrap().ilog2() as usize - 1
+    }
+
+    fn concat_biguint(b1: &BigUint, b2: BigUint, level: usize) -> BigUint {
+        let mut data = b2.data;
+        data.reserve(level + b1.len() - data.len());
+        data.extend(std::iter::repeat(0).take(level - data.len()));
+        data.extend_from_slice(&b1.data);
+        BigUint { data }.normalized()
+    }
+
+    fn div_two_digit_by_one(
+        ah: BigUint,
+        al: BigUint,
+        b: BigUint,
+        level: usize,
+    ) -> (BigUint, BigUint) {
+        // A precondition of this function is that q fits into a single digit.
+        debug_assert!(ah < b);
+        if level <= BURNIKEL_ZIEGLER_THRESHOLD {
+            return div_rem(concat_biguint(&ah, al.clone(), level), b);
+        }
+        let shift = normalizing_shift_amount(&b, level);
+        if shift != 0 {
+            let b = b << shift;
+            let (ah, al) = divide_biguint(concat_biguint(&ah, al.clone(), level) << shift, level);
+            let (q, r) = div_two_digit_by_one_normalized(ah, al, b, level);
+            (q, r >> shift)
+        } else {
+            div_two_digit_by_one_normalized(ah, al, b, level)
+        }
+    }
+
+    fn div_two_digit_by_one_normalized(
+        ah: BigUint,
+        al: BigUint,
+        b: BigUint,
+        level: usize,
+    ) -> (BigUint, BigUint) {
+        let level = level / 2;
+        let (a1, a2) = divide_biguint(ah, level);
+        let (a3, a4) = divide_biguint(al, level);
+        let (b1, b2) = divide_biguint(b, level);
+        let (q1, r) = div_three_halves_by_two(a1, a2, a3, b1.clone(), b2.clone(), level);
+        let (r1, r2) = divide_biguint(r, level);
+        let (q2, s) = div_three_halves_by_two(r1, r2, a4, b1, b2, level);
+        (concat_biguint(&q1, q2, level), s)
+    }
+
+    fn div_three_halves_by_two(
+        a1: BigUint,
+        a2: BigUint,
+        a3: BigUint,
+        b1: BigUint,
+        b2: BigUint,
+        level: usize,
+    ) -> (BigUint, BigUint) {
+        let (mut q, c) = div_two_digit_by_one(a1, a2, b1.clone(), level);
+        let mut r = BigInt::from(concat_biguint(&c, a3, level)) - BigInt::from(&q * &b2);
+        let b = concat_biguint(&b1, b2, level);
+        if r.is_negative() {
+            q -= 1u32;
+            r += BigInt::from(b.clone());
+            if r.is_negative() {
+                q -= 1u32;
+                r += BigInt::from(b.clone());
+            }
+        }
+        (q, r.into_biguint().unwrap())
+    }
+
+    let mut level = 1 << (u.data.len().ilog2());
+    if d.len() > level {
+        level *= 2;
+    }
+    let (u1, u2) = divide_biguint(u.clone(), level);
+    if &u1 > d {
+        div_two_digit_by_one(BigUint::ZERO, u.clone(), d.clone(), level * 2)    
+    } else {
+        div_two_digit_by_one(u1, u2, d.clone(), level)
+    }
+}
+
 /// An implementation of the base division algorithm.
 /// Knuth, TAOCP vol 2 section 4.3.1, algorithm D, with an improvement from exercises 19-21.
-fn div_rem_core(mut a: BigUint, b: &[BigDigit]) -> (BigUint, BigUint) {
+fn div_rem_core(mut a: BigUint, b: &BigUint) -> (BigUint, BigUint) {
+    if a.len() > BURNIKEL_ZIEGLER_THRESHOLD * 2 && b.len() > BURNIKEL_ZIEGLER_THRESHOLD {
+        return div_rem_burnikel_ziegler(&a, b);
+    }
+
+    let b = &*b.data;
     debug_assert!(a.data.len() >= b.len() && b.len() > 1);
     debug_assert!(b.last().unwrap().leading_zeros() == 0);