-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
828394a
commit 4b42774
Showing
8 changed files
with
491 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -7,11 +7,28 @@ | |
<copyright>Alisa Sireneva, CC BY</copyright> | ||
<managingEditor>[email protected] (Alisa Sireneva)</managingEditor> | ||
<webMaster>[email protected] (Alisa Sireneva)</webMaster> | ||
<lastBuildDate>Thu, 12 Dec 2024 14:50:16 GMT</lastBuildDate> | ||
<lastBuildDate>Wed, 18 Dec 2024 21:58:51 GMT</lastBuildDate> | ||
<docs>https://www.rssboard.org/rss-specification</docs> | ||
<ttl>60</ttl> | ||
<atom:link href="https://purplesyringa.moe/blog/feed.rss" rel="self" type="application/rss+xml" /> | ||
|
||
<item> | ||
<title>The RAM myth</title> | ||
<link>https://purplesyringa.moe/blog/./the-ram-myth/</link> | ||
<description>The RAM myth is a belief that modern computer memory resembles perfect random-access memory. Cache is seen as an optimization for small data: if it fits in L2, it’s going to be processed faster; if it doesn’t, there’s nothing we can do. | ||
Most likely, you believe that code like this is the fastest way to shard data: | ||
groups = [[] for _ in range(n_groups)] | ||
for element in elements: | ||
groups[element.group].append(element) | ||
|
||
Indeed, it’s linear (i.e. asymptotically optimal), and we have to access random indices anyway, so cache isn’t going to help us in any case. | ||
In reality, this is leaving a lot of performance on the table, and certain asymptotically slower algorithms can perform sharding significantly faster on large input. They are mostly used by on-disk databases, but, surprisingly, they are useful even for in-RAM data.</description> | ||
<author>[email protected] (Alisa Sireneva)</author> | ||
|
||
<guid>https://purplesyringa.moe/blog/./the-ram-myth/</guid> | ||
<pubDate>Thu, 19 Dec 2024 00:00:00 GMT</pubDate> | ||
</item> | ||
|
||
<item> | ||
<title>Thoughts on Rust hashing</title> | ||
<link>https://purplesyringa.moe/blog/./thoughts-on-rust-hashing/</link> | ||
|
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
use core::mem::MaybeUninit; | ||
use criterion::{ | ||
BenchmarkId, Criterion, SamplingMode, Throughput, {criterion_group, criterion_main}, | ||
}; | ||
use fixed_slice_vec::FixedSliceVec; | ||
use std::time::Duration; | ||
use wyrand::WyRand; | ||
|
||
// Input-size threshold below which `radix_sort` stops recursing and hands the
// work to the single-pass `fallback`. The commented values are alternative
// tuning points kept for benchmarking experiments.
// const CUTOFF: usize = 50_000;
const CUTOFF: usize = 200_000;
// const CUTOFF: usize = 1_000_000;
|
||
/// Groups `elements` by the low `key_bitness` bits of `key(element)` and
/// invokes `callback` once per non-empty group, in increasing key order.
///
/// Classic two-pass counting sort: one pass over `elements` to count group
/// sizes, an exclusive prefix sum to compute each group's offset, and a
/// second pass to scatter elements into one shared buffer.
///
/// `elements_len` must equal the number of items `elements` yields; `key`
/// must be deterministic across the two passes (otherwise the initialization
/// assertion below fires).
#[inline(never)]
fn fallback(
    elements: impl Iterator<Item = u64> + Clone,
    elements_len: usize,
    key: &mut impl FnMut(u64) -> usize,
    key_bitness: u32,
    callback: &mut impl FnMut(&mut dyn Iterator<Item = u64>),
) {
    let n_groups = 1 << key_bitness;
    // Mask selecting the low `key_bitness` bits of a key; used consistently
    // in both passes (the original mixed `n_groups - 1` and
    // `(1 << key_bitness) - 1`, which are equal).
    let mask = n_groups - 1;

    // Pass 1: count how many elements land in each group.
    let mut counts: Vec<usize> = vec![0; n_groups];
    for element in elements.clone() {
        counts[key(element) & mask] += 1;
    }

    // Exclusive prefix sum: `group_ptrs[i]` becomes the write cursor for
    // group `i` within the shared buffer.
    let mut group_ptrs: Vec<usize> = vec![0; n_groups];
    for i in 1..n_groups {
        group_ptrs[i] = group_ptrs[i - 1] + counts[i - 1];
    }

    // Pass 2: scatter each element into the next free slot of its group,
    // preserving input order within a group.
    let mut buffer = vec![MaybeUninit::uninit(); elements_len];
    for element in elements {
        let group_ptr = &mut group_ptrs[key(element) & mask];
        buffer[*group_ptr].write(element);
        *group_ptr += 1;
    }

    // Hand each non-empty group to the callback, walking groups in order.
    let mut end_ptr = 0;
    for i in 0..n_groups {
        let start_ptr = end_ptr;
        end_ptr += counts[i];
        if counts[i] > 0 {
            // The scatter pass must have advanced this group's cursor exactly
            // to its end, i.e. filled `buffer[start_ptr..end_ptr]` completely.
            assert_eq!(end_ptr, group_ptrs[i]); // safety check for initialization!
            let group = &buffer[start_ptr..end_ptr];
            // SAFETY: the assertion above proves every slot in this range was
            // written during the scatter pass, and `MaybeUninit<u64>` has the
            // same layout as `u64`, so reinterpreting the initialized slice
            // is sound.
            let group = unsafe { &*(group as *const [MaybeUninit<u64>] as *const [u64]) };
            callback(&mut group.iter().copied());
        }
    }
}
|
||
/// One output bucket of the radix partition.
///
/// Elements first fill `reserved`, a fixed-capacity view into a shared,
/// pre-partitioned buffer; once that is full, the excess spills into the
/// heap-allocated `overflow` vector.
struct Bucket<'buffer, T> {
    // Fixed-capacity region carved out of the caller's single allocation.
    reserved: FixedSliceVec<'buffer, T>,
    // Spill space for elements beyond `reserved`'s capacity.
    overflow: Vec<T>,
}
|
||
impl<'buffer, T> Bucket<'buffer, T> { | ||
fn new(reserved: FixedSliceVec<'buffer, T>) -> Self { | ||
Self { | ||
reserved, | ||
overflow: Vec::new(), | ||
} | ||
} | ||
|
||
fn push(&mut self, element: T) { | ||
if let Err(element) = self.reserved.try_push(element) { | ||
self.overflow.push(element.0); | ||
} | ||
} | ||
|
||
fn len(&self) -> usize { | ||
self.reserved.len() + self.overflow.len() | ||
} | ||
|
||
fn iter(&self) -> core::iter::Chain<core::slice::Iter<T>, core::slice::Iter<T>> { | ||
self.reserved.iter().chain(self.overflow.iter()) | ||
} | ||
} | ||
|
||
/// Recursively shards `elements` into groups determined by `key`, consuming
/// the key `BITS` high bits at a time (MSB radix partitioning), and invokes
/// `callback` once per final group.
///
/// Small inputs (<= `CUTOFF`) or narrow keys (<= `BITS` bits) are delegated
/// to `fallback`, which groups in a single counting-sort pass.
pub fn radix_sort(
    elements: impl Iterator<Item = u64> + Clone,
    elements_len: usize,
    key: &mut impl FnMut(u64) -> usize,
    key_bitness: u32,
    callback: &mut impl FnMut(&mut dyn Iterator<Item = u64>),
) {
    // The step at which `key` is consumed. `2 ** BITS` buckets are allocated.
    const BITS: u32 = 8;

    // Base case. The `key_bitness <= BITS` guard also ensures
    // `key_bitness - BITS` below cannot underflow.
    if elements_len <= CUTOFF || key_bitness <= BITS {
        fallback(elements, elements_len, key, key_bitness, callback);
        return;
    }

    // Shift selecting the top `BITS` bits of the `key_bitness`-bit key.
    let shift = key_bitness - BITS;

    let reserved_capacity = (elements_len >> BITS).max(1); // 0 breaks `chunks_mut`

    // Partitioning a single allocation is more efficient than allocating multiple times
    let mut buffer = vec![MaybeUninit::uninit(); reserved_capacity << BITS];
    let mut reserved = buffer.chunks_mut(reserved_capacity);
    // One bucket per possible value of the top `BITS` bits, each owning one
    // chunk of the shared buffer as its reserved region.
    let mut buckets: [Bucket<u64>; 1 << BITS] = core::array::from_fn(|_| {
        Bucket::new(FixedSliceVec::new(reserved.next().unwrap_or(&mut [])))
    });

    // Scatter pass: route each element to the bucket for its top bits.
    for element in elements {
        buckets[(key(element) >> shift) & ((1 << BITS) - 1)].push(element);
    }

    // Recurse into each bucket with the consumed `BITS` stripped off the key.
    for bucket in buckets {
        radix_sort(
            bucket.iter().copied(),
            bucket.len(),
            key,
            key_bitness - BITS,
            callback,
        );
    }
}
|
||
/// Drives grouping function `$fn` over `$input` (`$n` elements, `$m`-bit
/// multiplicative-hash keys) and folds each group's minimum into a checksum,
/// so the grouping work cannot be optimized away.
macro_rules! run {
    ($fn:ident, $input:expr, $n:expr, $m:expr) => {{
        let mut acc = 0;
        $fn(
            $input,
            $n,
            &mut |element| (element.wrapping_mul(0x9a08c0ebcf5bc11b) >> (64 - $m)) as usize,
            $m,
            &mut |group| acc += group.min().unwrap(),
        );
        acc
    }};
}
|
||
fn bench_grouping(c: &mut Criterion) { | ||
let mut group = c.benchmark_group("grouping"); | ||
group | ||
.warm_up_time(Duration::from_secs(1)) | ||
.measurement_time(Duration::from_secs(1)) | ||
.sampling_mode(SamplingMode::Flat); | ||
for shift in 0..10 { | ||
let n = 80000usize << shift; | ||
let m = 13 + shift; | ||
|
||
let mut rng = WyRand::new(0x9a08c0ebcf5bc11b); | ||
let input = (0..n).map(move |_| rng.rand()); | ||
|
||
group.throughput(Throughput::Elements(n as u64)); | ||
group.bench_with_input(BenchmarkId::new("old", n), &m, |b, &m| { | ||
b.iter(|| run!(fallback, input.clone(), n, m)); | ||
}); | ||
group.bench_with_input(BenchmarkId::new("new", n), &m, |b, &m| { | ||
b.iter(|| run!(radix_sort, input.clone(), n, m)); | ||
}); | ||
} | ||
group.finish(); | ||
} | ||
|
||
// Register the benchmark with Criterion's harness and generate `main`.
criterion_group!(benches, bench_grouping);
criterion_main!(benches);
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.