diff --git a/Cargo.toml b/Cargo.toml index a31829e..6a37e09 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -9,7 +9,7 @@ description = "Fast integer compression/decompression via SIMD bit-packing. Port edition = "2018" [dependencies] -crunchy = "0.2" +unroll = "0.1" [dev-dependencies] rand = "0.8" diff --git a/src/bitpacker1x.rs b/src/bitpacker1x.rs index 0025749..4b1c452 100644 --- a/src/bitpacker1x.rs +++ b/src/bitpacker1x.rs @@ -11,7 +11,7 @@ mod scalar { type DataType = u32; - fn set1(el: i32) -> DataType { + const fn set1(el: i32) -> DataType { el as u32 } diff --git a/src/bitpacker4x.rs b/src/bitpacker4x.rs index be56241..6b3a87c 100644 --- a/src/bitpacker4x.rs +++ b/src/bitpacker4x.rs @@ -73,7 +73,7 @@ mod scalar { type DataType = [u32; 4]; - fn set1(el: i32) -> DataType { + const fn set1(el: i32) -> DataType { [el as u32; 4] } diff --git a/src/bitpacker8x.rs b/src/bitpacker8x.rs index 8a0c591..fb00fd0 100644 --- a/src/bitpacker8x.rs +++ b/src/bitpacker8x.rs @@ -82,7 +82,7 @@ mod scalar { type DataType = [u32; 8]; - fn set1(el: i32) -> DataType { + const fn set1(el: i32) -> DataType { [el as u32; 8] } diff --git a/src/macros.rs b/src/macros.rs index 0e61b0e..75497c9 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -1,107 +1,131 @@ macro_rules! pack_unpack_with_bits { - - ($name:ident, $n:expr, $cpufeature:meta) => { - - + ($name:ident, $n:literal, $cycle:literal, $cpufeature:meta) => { mod $name { - use crunchy::unroll; use super::BLOCK_LEN; + use super::{ + left_shift_32, load_unaligned, op_and, op_or, right_shift_32, set1, + store_unaligned, DataType, + }; use super::{Sink, Transformer}; - use super::{DataType, - set1, - right_shift_32, - left_shift_32, - op_or, - op_and, - load_unaligned, - store_unaligned}; + use unroll::unroll_for_loops; const NUM_BITS: usize = $n; const NUM_BYTES_PER_BLOCK: usize = NUM_BITS * BLOCK_LEN / 8; + const REPEAT: usize = 32 / $cycle; + #[$cpufeature] - pub(crate) unsafe fn pack(input_arr: &[u32], output_arr: &mut [u8], mut delta_computer: TDeltaComputer) -> usize { - assert_eq!(input_arr.len(), BLOCK_LEN, "Input block too small {}, (expected {})", input_arr.len(), BLOCK_LEN); - assert!(output_arr.len() >= NUM_BYTES_PER_BLOCK, "Output array too small (numbits {}). {} <= {}", NUM_BITS, output_arr.len(), NUM_BYTES_PER_BLOCK); + #[unroll_for_loops] + pub(crate) unsafe fn pack( + input_arr: &[u32], + output_arr: &mut [u8], + mut delta_computer: TDeltaComputer, + ) -> usize { + assert_eq!( + input_arr.len(), + BLOCK_LEN, + "Input block too small {}, (expected {})", + input_arr.len(), + BLOCK_LEN + ); + assert!( + output_arr.len() >= NUM_BYTES_PER_BLOCK, + "Output array too small (numbits {}). {} <= {}", + NUM_BITS, + output_arr.len(), + NUM_BYTES_PER_BLOCK + ); - let input_ptr = input_arr.as_ptr() as *const DataType; + let mut input_ptr = input_arr.as_ptr() as *const DataType; let mut output_ptr = output_arr.as_mut_ptr() as *mut DataType; - let mut out_register: DataType = delta_computer.transform(load_unaligned(input_ptr)); - - unroll! { - for iter in 0..30 { - const i: usize = 1 + iter; - - const bits_filled: usize = i * NUM_BITS; - const inner_cursor: usize = bits_filled % 32; - const remaining: usize = 32 - inner_cursor; - let offset_ptr = input_ptr.add(i); - let in_register: DataType = delta_computer.transform(load_unaligned(offset_ptr)); - - out_register = - if inner_cursor > 0 { - let shifted = left_shift_32(in_register, inner_cursor as i32); - op_or(out_register, shifted) - } else { - in_register - }; - - if remaining <= NUM_BITS { + for _ in 0..REPEAT { + let mut out_register: DataType = + delta_computer.transform(load_unaligned(input_ptr)); + input_ptr = input_ptr.add(1); + + // Ideally we would have liked to write `for input_id in 1..$cycle-1` + // here, unfortunately unroll_for_loops needs a literal here to kick + // in. + for iter in 2..$cycle { + const BITS_FILLED: usize = (iter - 1) * NUM_BITS; + const INNER_CURSOR: usize = BITS_FILLED % 32; + const REMAINING: usize = 32 - INNER_CURSOR; + + let in_register: DataType = + delta_computer.transform(load_unaligned(input_ptr)); + input_ptr = input_ptr.add(1); + + out_register = if INNER_CURSOR > 0 { + let shifted = left_shift_32(in_register, INNER_CURSOR as i32); + op_or(out_register, shifted) + } else { + in_register + }; + + if REMAINING <= NUM_BITS { store_unaligned(output_ptr, out_register); output_ptr = output_ptr.offset(1); - if remaining < NUM_BITS { - out_register = right_shift_32(in_register, remaining as i32); + if REMAINING < NUM_BITS { + out_register = right_shift_32(in_register, REMAINING as i32); } } } + + let in_register: DataType = delta_computer.transform(load_unaligned(input_ptr)); + input_ptr = input_ptr.add(1); + let shifted = left_shift_32(in_register, 32 - NUM_BITS as i32); + out_register = op_or(out_register, shifted); + store_unaligned(output_ptr, out_register); + output_ptr = output_ptr.add(1); } - let in_register: DataType = delta_computer.transform(load_unaligned(input_ptr.add(31))); - let shifted = left_shift_32(in_register, 32 - NUM_BITS as i32); - out_register = op_or(out_register, shifted); - store_unaligned(output_ptr, out_register); NUM_BYTES_PER_BLOCK } #[$cpufeature] - pub(crate) unsafe fn unpack(compressed: &[u8], mut output: Output) -> usize { - - assert!(compressed.len() >= NUM_BYTES_PER_BLOCK, "Compressed array seems too small. ({} < {}) ", compressed.len(), NUM_BYTES_PER_BLOCK); + #[unroll_for_loops] + pub(crate) unsafe fn unpack( + compressed: &[u8], + mut output: Output, + ) -> usize { + assert!( + compressed.len() >= NUM_BYTES_PER_BLOCK, + "Compressed array seems too small. ({} < {}) ", + compressed.len(), + NUM_BYTES_PER_BLOCK + ); let mut input_ptr = compressed.as_ptr() as *const DataType; - let mask_scalar: u32 = ((1u64 << NUM_BITS) - 1u64) as u32; - let mask = set1(mask_scalar as i32); - - let mut in_register: DataType = load_unaligned(input_ptr); - - let out_register = op_and(in_register, mask); - output.process(out_register); + const MASK_SCALAR: u32 = ((1u64 << NUM_BITS) - 1u64) as u32; + let mask: DataType = set1(MASK_SCALAR as i32); - unroll! { - for iter in 0..31 { - const i: usize = iter + 1; + for _ in 0..REPEAT { + let mut in_register: DataType = load_unaligned(input_ptr); + let out_register = op_and(in_register, mask); + output.process(out_register); - const inner_cursor: usize = (i * NUM_BITS) % 32; - const inner_capacity: usize = 32 - inner_cursor; + for i in 1..$cycle { + const INNER_CURSOR: usize = (i * NUM_BITS) % 32; + const INNER_CAPACITY: usize = 32 - INNER_CURSOR; // LLVM will not emit the shift operand if - // `inner_cursor` is 0. - let shifted_in_register = right_shift_32(in_register, inner_cursor as i32); + // `INNER_CURSOR` is 0. + let shifted_in_register = right_shift_32(in_register, INNER_CURSOR as i32); let mut out_register: DataType = op_and(shifted_in_register, mask); // We consumed our current quadruplets entirely. // We therefore read another one. - if inner_capacity <= NUM_BITS && i != 31 { + if INNER_CAPACITY <= NUM_BITS && i != $cycle - 1 { input_ptr = input_ptr.add(1); in_register = load_unaligned(input_ptr); // This quadruplets is actually cutting one of // our `DataType`. We need to read the next one. - if inner_capacity < NUM_BITS { - let shifted = left_shift_32(in_register, inner_capacity as i32); + if INNER_CAPACITY < NUM_BITS { + let shifted = left_shift_32(in_register, INNER_CAPACITY as i32); let masked = op_and(shifted, mask); out_register = op_or(out_register, masked); } @@ -109,27 +133,28 @@ macro_rules! pack_unpack_with_bits { output.process(out_register); } + input_ptr = input_ptr.add(1); } - - NUM_BYTES_PER_BLOCK } } - } + }; } macro_rules! pack_unpack_with_bits_32 { ($cpufeature:meta) => { mod pack_unpack_with_bits_32 { + use super::BLOCK_LEN; use super::{load_unaligned, store_unaligned, DataType}; use super::{Sink, Transformer}; - use crunchy::unroll; + use unroll::unroll_for_loops; const NUM_BITS: usize = 32; const NUM_BYTES_PER_BLOCK: usize = NUM_BITS * BLOCK_LEN / 8; #[$cpufeature] + #[unroll_for_loops] pub(crate) unsafe fn pack( input_arr: &[u32], output_arr: &mut [u8], @@ -152,14 +177,12 @@ macro_rules! pack_unpack_with_bits_32 { let input_ptr: *const DataType = input_arr.as_ptr() as *const DataType; let output_ptr = output_arr.as_mut_ptr() as *mut DataType; - unroll! { - for i in 0..32 { - let input_offset_ptr = input_ptr.offset(i as isize); - let output_offset_ptr = output_ptr.offset(i as isize); - let input_register = load_unaligned(input_offset_ptr); - let output_register = delta_computer.transform(input_register); - store_unaligned(output_offset_ptr, output_register); - } + for i in 0..32 { + let input_offset_ptr = input_ptr.offset(i as isize); + let output_offset_ptr = output_ptr.offset(i as isize); + let input_register = load_unaligned(input_offset_ptr); + let output_register = delta_computer.transform(input_register); + store_unaligned(output_offset_ptr, output_register); } NUM_BYTES_PER_BLOCK } @@ -191,39 +214,39 @@ macro_rules! declare_bitpacker { ($cpufeature:meta) => { use super::super::UnsafeBitPacker; use crate::most_significant_bit; - use crunchy::unroll; - - pack_unpack_with_bits!(pack_unpack_with_bits_1, 1, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_2, 2, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_3, 3, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_4, 4, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_5, 5, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_6, 6, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_7, 7, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_8, 8, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_9, 9, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_10, 10, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_11, 11, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_12, 12, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_13, 13, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_14, 14, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_15, 15, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_16, 16, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_17, 17, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_18, 18, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_19, 19, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_20, 20, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_21, 21, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_22, 22, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_23, 23, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_24, 24, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_25, 25, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_26, 26, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_27, 27, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_28, 28, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_29, 29, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_30, 30, $cpufeature); - pack_unpack_with_bits!(pack_unpack_with_bits_31, 31, $cpufeature); + use unroll::unroll_for_loops; + + pack_unpack_with_bits!(pack_unpack_with_bits_1, 1, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_2, 2, 16, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_3, 3, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_4, 4, 8, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_5, 5, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_6, 6, 16, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_7, 7, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_8, 8, 4, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_9, 9, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_10, 10, 16, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_11, 11, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_12, 12, 8, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_13, 13, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_14, 14, 16, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_15, 15, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_16, 16, 2, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_17, 17, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_18, 18, 16, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_20, 20, 8, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_19, 19, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_21, 21, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_22, 22, 16, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_23, 23, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_24, 24, 4, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_25, 25, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_26, 26, 16, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_27, 27, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_28, 28, 8, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_29, 29, 32, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_30, 30, 16, $cpufeature); + pack_unpack_with_bits!(pack_unpack_with_bits_31, 31, 32, $cpufeature); pack_unpack_with_bits_32!($cpufeature); unsafe fn compress_generic( @@ -454,6 +477,7 @@ macro_rules! declare_bitpacker { } #[$cpufeature] + #[unroll_for_loops] unsafe fn num_bits(decompressed: &[u32]) -> u8 { assert_eq!( decompressed.len(), @@ -463,17 +487,16 @@ macro_rules! declare_bitpacker { ); let data: *const DataType = decompressed.as_ptr() as *const DataType; let mut accumulator = load_unaligned(data); - unroll! { - for iter in 0..31 { - let i = iter + 1; - let newvec = load_unaligned(data.add(i)); - accumulator = op_or(accumulator, newvec); - } + for iter in 0..31 { + let i = iter + 1; + let newvec = load_unaligned(data.add(i)); + accumulator = op_or(accumulator, newvec); } most_significant_bit(or_collapse_to_u32(accumulator)) } #[$cpufeature] + #[unroll_for_loops] unsafe fn num_bits_sorted(initial: u32, decompressed: &[u32]) -> u8 { assert_eq!( decompressed.len(), @@ -488,14 +511,12 @@ macro_rules! declare_bitpacker { let mut accumulator = compute_delta(load_unaligned(data), initial_vec); let mut previous = first; - unroll! { - for iter in 0..30 { - let i = iter + 1; - let current = load_unaligned(data.add(i)); - let delta = compute_delta(current, previous); - accumulator = op_or(accumulator, delta); - previous = current; - } + for iter in 0..30 { + let i = iter + 1; + let current = load_unaligned(data.add(i)); + let delta = compute_delta(current, previous); + accumulator = op_or(accumulator, delta); + previous = current; } let current = load_unaligned(data.add(31)); let delta = compute_delta(current, previous);