The RAM myth
purplesyringa committed Dec 18, 2024
1 parent 828394a commit 4b42774
Showing 8 changed files with 491 additions and 2 deletions.
19 changes: 18 additions & 1 deletion blog/feed.rss
@@ -7,11 +7,28 @@
<copyright>Alisa Sireneva, CC BY</copyright>
<managingEditor>[email protected] (Alisa Sireneva)</managingEditor>
<webMaster>[email protected] (Alisa Sireneva)</webMaster>
<lastBuildDate>Thu, 12 Dec 2024 14:50:16 GMT</lastBuildDate>
<lastBuildDate>Wed, 18 Dec 2024 21:58:51 GMT</lastBuildDate>
<docs>https://www.rssboard.org/rss-specification</docs>
<ttl>60</ttl>
<atom:link href="https://purplesyringa.moe/blog/feed.rss" rel="self" type="application/rss+xml" />

<item>
<title>The RAM myth</title>
<link>https://purplesyringa.moe/blog/./the-ram-myth/</link>
<description>The RAM myth is a belief that modern computer memory resembles perfect random-access memory. Cache is seen as an optimization for small data: if it fits in L2, it’s going to be processed faster; if it doesn’t, there’s nothing we can do.
Most likely, you believe that code like this is the fastest way to shard data:
groups = [[] for _ in range(n_groups)]
for element in elements:
groups[element.group].append(element)

Indeed, it’s linear (i.e. asymptotically optimal), and we have to access random indices anyway, so cache isn’t going to help us in any case.
In reality, this is leaving a lot of performance on the table, and certain asymptotically slower algorithms can perform sharding significantly faster on large input. They are mostly used by on-disk databases, but, surprisingly, they are useful even for in-RAM data.</description>
<author>[email protected] (Alisa Sireneva)</author>

<guid>https://purplesyringa.moe/blog/./the-ram-myth/</guid>
<pubDate>Thu, 19 Dec 2024 00:00:00 GMT</pubDate>
</item>

<item>
<title>Thoughts on Rust hashing</title>
<link>https://purplesyringa.moe/blog/./thoughts-on-rust-hashing/</link>
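The Python snippet in the feed description above is the baseline the post argues against. For reference, here is a minimal Rust rendering of the same naive sharding; the `Element` type, its `group` field, and the `shard` function are illustrative names, not part of this commit:

struct Element {
    group: usize,
    value: u64, // payload
}

// One vector per group; every push lands at an effectively random index,
// which is exactly the access pattern that defeats the cache on large inputs.
fn shard(elements: Vec<Element>, n_groups: usize) -> Vec<Vec<Element>> {
    let mut groups: Vec<Vec<Element>> = (0..n_groups).map(|_| Vec::new()).collect();
    for element in elements {
        let g = element.group;
        groups[g].push(element);
    }
    groups
}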
5 changes: 4 additions & 1 deletion blog/index.html

Large diffs are not rendered by default.

163 changes: 163 additions & 0 deletions blog/the-ram-myth/benchmark.rs
@@ -0,0 +1,163 @@
use core::mem::MaybeUninit;
use criterion::{
    BenchmarkId, Criterion, SamplingMode, Throughput, {criterion_group, criterion_main},
};
use fixed_slice_vec::FixedSliceVec;
use std::time::Duration;
use wyrand::WyRand;

// const CUTOFF: usize = 50_000;
const CUTOFF: usize = 200_000;
// const CUTOFF: usize = 1_000_000;

// Counting sort: tally the group sizes, turn the tallies into prefix-sum
// offsets, then scatter the elements into a single buffer in one linear pass.
#[inline(never)]
fn fallback(
    elements: impl Iterator<Item = u64> + Clone,
    elements_len: usize,
    key: &mut impl FnMut(u64) -> usize,
    key_bitness: u32,
    callback: &mut impl FnMut(&mut dyn Iterator<Item = u64>),
) {
    let n_groups = 1 << key_bitness;

    let mut counts: Vec<usize> = vec![0; n_groups];
    for element in elements.clone() {
        counts[key(element) & (n_groups - 1)] += 1;
    }

    let mut group_ptrs: Vec<usize> = vec![0; n_groups];
    for i in 1..n_groups {
        group_ptrs[i] = group_ptrs[i - 1] + counts[i - 1];
    }

    let mut buffer = vec![MaybeUninit::uninit(); elements_len];
    for element in elements {
        let group_ptr = &mut group_ptrs[key(element) & ((1 << key_bitness) - 1)];
        buffer[*group_ptr].write(element);
        *group_ptr += 1;
    }

    let mut end_ptr = 0;
    for i in 0..n_groups {
        let start_ptr = end_ptr;
        end_ptr += counts[i];
        if counts[i] > 0 {
            // The write pointer reached the group's end, so every slot in
            // `start_ptr..end_ptr` has been initialized.
            assert_eq!(end_ptr, group_ptrs[i]);
            let group = &buffer[start_ptr..end_ptr];
            let group = unsafe { &*(group as *const [MaybeUninit<u64>] as *const [u64]) };
            callback(&mut group.iter().copied());
        }
    }
}

// A bucket backed by a pre-reserved fixed-capacity slice; elements that don't
// fit spill over into a heap-allocated vector.
struct Bucket<'buffer, T> {
    reserved: FixedSliceVec<'buffer, T>,
    overflow: Vec<T>,
}

impl<'buffer, T> Bucket<'buffer, T> {
    fn new(reserved: FixedSliceVec<'buffer, T>) -> Self {
        Self {
            reserved,
            overflow: Vec::new(),
        }
    }

    fn push(&mut self, element: T) {
        if let Err(element) = self.reserved.try_push(element) {
            self.overflow.push(element.0);
        }
    }

    fn len(&self) -> usize {
        self.reserved.len() + self.overflow.len()
    }

    fn iter(&self) -> core::iter::Chain<core::slice::Iter<T>, core::slice::Iter<T>> {
        self.reserved.iter().chain(self.overflow.iter())
    }
}

// MSD radix partitioning: split the input by the top `BITS` bits of the key,
// then recurse into each bucket until it is small enough for `fallback`.
pub fn radix_sort(
    elements: impl Iterator<Item = u64> + Clone,
    elements_len: usize,
    key: &mut impl FnMut(u64) -> usize,
    key_bitness: u32,
    callback: &mut impl FnMut(&mut dyn Iterator<Item = u64>),
) {
    // The step at which `key` is consumed. `2 ** BITS` buckets are allocated.
    const BITS: u32 = 8;

    if elements_len <= CUTOFF || key_bitness <= BITS {
        fallback(elements, elements_len, key, key_bitness, callback);
        return;
    }

    let shift = key_bitness - BITS;

    let reserved_capacity = (elements_len >> BITS).max(1); // 0 breaks `chunks_mut`

    // Partitioning a single allocation is more efficient than allocating multiple times
    let mut buffer = vec![MaybeUninit::uninit(); reserved_capacity << BITS];
    let mut reserved = buffer.chunks_mut(reserved_capacity);
    let mut buckets: [Bucket<u64>; 1 << BITS] = core::array::from_fn(|_| {
        Bucket::new(FixedSliceVec::new(reserved.next().unwrap_or(&mut [])))
    });

    for element in elements {
        buckets[(key(element) >> shift) & ((1 << BITS) - 1)].push(element);
    }

    for bucket in buckets {
        radix_sort(
            bucket.iter().copied(),
            bucket.len(),
            key,
            key_bitness - BITS,
            callback,
        );
    }
}

// Drives either grouping function with an `$m`-bit multiplicative-hash key and
// folds each group's minimum into a checksum so the work can't be optimized out.
macro_rules! run {
    ($fn:ident, $input:expr, $n:expr, $m:expr) => {{
        let mut total = 0;
        $fn(
            $input,
            $n,
            &mut |element| (element.wrapping_mul(0x9a08c0ebcf5bc11b) >> (64 - $m)) as usize,
            $m,
            &mut |group| {
                total += group.min().unwrap();
            },
        );
        total
    }};
}

fn bench_grouping(c: &mut Criterion) {
    let mut group = c.benchmark_group("grouping");
    group
        .warm_up_time(Duration::from_secs(1))
        .measurement_time(Duration::from_secs(1))
        .sampling_mode(SamplingMode::Flat);
    // Scale the element count and the key width together, so the average
    // group size stays roughly constant as the input grows.
    for shift in 0..10 {
        let n = 80000usize << shift;
        let m = 13 + shift;

        let mut rng = WyRand::new(0x9a08c0ebcf5bc11b);
        let input = (0..n).map(move |_| rng.rand());

        group.throughput(Throughput::Elements(n as u64));
        group.bench_with_input(BenchmarkId::new("old", n), &m, |b, &m| {
            b.iter(|| run!(fallback, input.clone(), n, m));
        });
        group.bench_with_input(BenchmarkId::new("new", n), &m, |b, &m| {
            b.iter(|| run!(radix_sort, input.clone(), n, m));
        });
    }
    group.finish();
}

criterion_group!(benches, bench_grouping);
criterion_main!(benches);
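To try the routine outside of Criterion, a minimal driver might look like the following. This is a hypothetical sketch, not part of the commit: it assumes the definitions above are in scope, picks an arbitrary scrambling constant for the input, and uses the top 16 bits of each element as the key, summing each group's minimum the same way the `run!` macro does.

fn main() {
    // Hypothetical input: a million u64s scrambled by a fixed odd constant.
    let elements: Vec<u64> = (0..1_000_000u64)
        .map(|i| i.wrapping_mul(0x9e3779b97f4a7c15))
        .collect();

    let mut total = 0;
    radix_sort(
        elements.iter().copied(),
        elements.len(),
        &mut |element| (element >> 48) as usize, // top 16 bits as the key
        16,
        &mut |group| total += group.min().unwrap(),
    );
    println!("checksum: {total}");
}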
1 change: 1 addition & 0 deletions blog/the-ram-myth/benchmark.svg
1 change: 1 addition & 0 deletions blog/the-ram-myth/improvement.svg
