The RAM myth
purplesyringa committed Dec 18, 2024
1 parent 828394a commit 4b42774
Showing 8 changed files with 491 additions and 2 deletions.
19 changes: 18 additions & 1 deletion blog/feed.rss
@@ -7,11 +7,28 @@
<copyright>Alisa Sireneva, CC BY</copyright>
<managingEditor>[email protected] (Alisa Sireneva)</managingEditor>
<webMaster>[email protected] (Alisa Sireneva)</webMaster>
<lastBuildDate>Thu, 12 Dec 2024 14:50:16 GMT</lastBuildDate>
<lastBuildDate>Wed, 18 Dec 2024 21:58:51 GMT</lastBuildDate>
<docs>https://www.rssboard.org/rss-specification</docs>
<ttl>60</ttl>
<atom:link href="https://purplesyringa.moe/blog/feed.rss" rel="self" type="application/rss+xml" />

<item>
<title>The RAM myth</title>
<link>https://purplesyringa.moe/blog/./the-ram-myth/</link>
<description>The RAM myth is a belief that modern computer memory resembles perfect random-access memory. Cache is seen as an optimization for small data: if it fits in L2, it’s going to be processed faster; if it doesn’t, there’s nothing we can do.
Most likely, you believe that code like this is the fastest way to shard data:
groups = [[] for _ in range(n_groups)]
for element in elements:
groups[element.group].append(element)

Indeed, it’s linear (i.e. asymptotically optimal), and we have to access random indices anyway, so cache isn’t going to help us in any case.
In reality, this is leaving a lot of performance on the table, and certain asymptotically slower algorithms can perform sharding significantly faster on large input. They are mostly used by on-disk databases, but, surprisingly, they are useful even for in-RAM data.</description>
<author>[email protected] (Alisa Sireneva)</author>

<guid>https://purplesyringa.moe/blog/./the-ram-myth/</guid>
<pubDate>Thu, 19 Dec 2024 00:00:00 GMT</pubDate>
</item>

<item>
<title>Thoughts on Rust hashing</title>
<link>https://purplesyringa.moe/blog/./thoughts-on-rust-hashing/</link>
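The Python snippet in the feed description above is the baseline the post argues against. For reference, here is a minimal Rust rendering of the same naive sharding; the `Element` type, its `group` field, and the `shard` function are illustrative names, not part of this commit:

struct Element {
    group: usize,
    value: u64, // payload
}

// One vector per group; every push lands at an effectively random index,
// which is exactly the access pattern that defeats the cache on large inputs.
fn shard(elements: Vec<Element>, n_groups: usize) -> Vec<Vec<Element>> {
    let mut groups: Vec<Vec<Element>> = (0..n_groups).map(|_| Vec::new()).collect();
    for element in elements {
        let g = element.group;
        groups[g].push(element);
    }
    groups
}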
5 changes: 4 additions & 1 deletion blog/index.html

Large diffs are not rendered by default.

163 changes: 163 additions & 0 deletions blog/the-ram-myth/benchmark.rs
@@ -0,0 +1,163 @@
use core::mem::MaybeUninit;
use criterion::{
    BenchmarkId, Criterion, SamplingMode, Throughput, {criterion_group, criterion_main},
};
use fixed_slice_vec::FixedSliceVec;
use std::time::Duration;
use wyrand::WyRand;

// const CUTOFF: usize = 50_000;
const CUTOFF: usize = 200_000;
// const CUTOFF: usize = 1_000_000;

// Counting sort: tally the group sizes, turn the tallies into prefix-sum
// offsets, then scatter the elements into a single buffer in one linear pass.
#[inline(never)]
fn fallback(
    elements: impl Iterator<Item = u64> + Clone,
    elements_len: usize,
    key: &mut impl FnMut(u64) -> usize,
    key_bitness: u32,
    callback: &mut impl FnMut(&mut dyn Iterator<Item = u64>),
) {
    let n_groups = 1 << key_bitness;

    let mut counts: Vec<usize> = vec![0; n_groups];
    for element in elements.clone() {
        counts[key(element) & (n_groups - 1)] += 1;
    }

    let mut group_ptrs: Vec<usize> = vec![0; n_groups];
    for i in 1..n_groups {
        group_ptrs[i] = group_ptrs[i - 1] + counts[i - 1];
    }

    let mut buffer = vec![MaybeUninit::uninit(); elements_len];
    for element in elements {
        let group_ptr = &mut group_ptrs[key(element) & ((1 << key_bitness) - 1)];
        buffer[*group_ptr].write(element);
        *group_ptr += 1;
    }

    let mut end_ptr = 0;
    for i in 0..n_groups {
        let start_ptr = end_ptr;
        end_ptr += counts[i];
        if counts[i] > 0 {
            // The write pointer reached the group's end, so every slot in
            // `start_ptr..end_ptr` has been initialized.
            assert_eq!(end_ptr, group_ptrs[i]);
            let group = &buffer[start_ptr..end_ptr];
            let group = unsafe { &*(group as *const [MaybeUninit<u64>] as *const [u64]) };
            callback(&mut group.iter().copied());
        }
    }
}

// A bucket backed by a pre-reserved fixed-capacity slice; elements that don't
// fit spill over into a heap-allocated vector.
struct Bucket<'buffer, T> {
    reserved: FixedSliceVec<'buffer, T>,
    overflow: Vec<T>,
}

impl<'buffer, T> Bucket<'buffer, T> {
    fn new(reserved: FixedSliceVec<'buffer, T>) -> Self {
        Self {
            reserved,
            overflow: Vec::new(),
        }
    }

    fn push(&mut self, element: T) {
        if let Err(element) = self.reserved.try_push(element) {
            self.overflow.push(element.0);
        }
    }

    fn len(&self) -> usize {
        self.reserved.len() + self.overflow.len()
    }

    fn iter(&self) -> core::iter::Chain<core::slice::Iter<T>, core::slice::Iter<T>> {
        self.reserved.iter().chain(self.overflow.iter())
    }
}

// MSD radix partitioning: split the input by the top `BITS` bits of the key,
// then recurse into each bucket until it is small enough for `fallback`.
pub fn radix_sort(
    elements: impl Iterator<Item = u64> + Clone,
    elements_len: usize,
    key: &mut impl FnMut(u64) -> usize,
    key_bitness: u32,
    callback: &mut impl FnMut(&mut dyn Iterator<Item = u64>),
) {
    // The step at which `key` is consumed. `2 ** BITS` buckets are allocated.
    const BITS: u32 = 8;

    if elements_len <= CUTOFF || key_bitness <= BITS {
        fallback(elements, elements_len, key, key_bitness, callback);
        return;
    }

    let shift = key_bitness - BITS;

    let reserved_capacity = (elements_len >> BITS).max(1); // 0 breaks `chunks_mut`

    // Partitioning a single allocation is more efficient than allocating multiple times
    let mut buffer = vec![MaybeUninit::uninit(); reserved_capacity << BITS];
    let mut reserved = buffer.chunks_mut(reserved_capacity);
    let mut buckets: [Bucket<u64>; 1 << BITS] = core::array::from_fn(|_| {
        Bucket::new(FixedSliceVec::new(reserved.next().unwrap_or(&mut [])))
    });

    for element in elements {
        buckets[(key(element) >> shift) & ((1 << BITS) - 1)].push(element);
    }

    for bucket in buckets {
        radix_sort(
            bucket.iter().copied(),
            bucket.len(),
            key,
            key_bitness - BITS,
            callback,
        );
    }
}

// Drives either grouping function with an `$m`-bit multiplicative-hash key and
// folds each group's minimum into a checksum so the work can't be optimized out.
macro_rules! run {
    ($fn:ident, $input:expr, $n:expr, $m:expr) => {{
        let mut total = 0;
        $fn(
            $input,
            $n,
            &mut |element| (element.wrapping_mul(0x9a08c0ebcf5bc11b) >> (64 - $m)) as usize,
            $m,
            &mut |group| {
                total += group.min().unwrap();
            },
        );
        total
    }};
}

fn bench_grouping(c: &mut Criterion) {
    let mut group = c.benchmark_group("grouping");
    group
        .warm_up_time(Duration::from_secs(1))
        .measurement_time(Duration::from_secs(1))
        .sampling_mode(SamplingMode::Flat);
    // Scale the element count and the key width together, so the average
    // group size stays roughly constant as the input grows.
    for shift in 0..10 {
        let n = 80000usize << shift;
        let m = 13 + shift;

        let mut rng = WyRand::new(0x9a08c0ebcf5bc11b);
        let input = (0..n).map(move |_| rng.rand());

        group.throughput(Throughput::Elements(n as u64));
        group.bench_with_input(BenchmarkId::new("old", n), &m, |b, &m| {
            b.iter(|| run!(fallback, input.clone(), n, m));
        });
        group.bench_with_input(BenchmarkId::new("new", n), &m, |b, &m| {
            b.iter(|| run!(radix_sort, input.clone(), n, m));
        });
    }
    group.finish();
}

criterion_group!(benches, bench_grouping);
criterion_main!(benches);
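To try the routine outside of Criterion, a minimal driver might look like the following. This is a hypothetical sketch, not part of the commit: it assumes the definitions above are in scope, picks an arbitrary scrambling constant for the input, and uses the top 16 bits of each element as the key, summing each group's minimum the same way the `run!` macro does.

fn main() {
    // Hypothetical input: a million u64s scrambled by a fixed odd constant.
    let elements: Vec<u64> = (0..1_000_000u64)
        .map(|i| i.wrapping_mul(0x9e3779b97f4a7c15))
        .collect();

    let mut total = 0;
    radix_sort(
        elements.iter().copied(),
        elements.len(),
        &mut |element| (element >> 48) as usize, // top 16 bits as the key
        16,
        &mut |group| total += group.min().unwrap(),
    );
    println!("checksum: {total}");
}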
1 change: 1 addition & 0 deletions blog/the-ram-myth/benchmark.svg
1 change: 1 addition & 0 deletions blog/the-ram-myth/improvement.svg
