From 58a18c03586ec17566c5291a9800f033c4f24094 Mon Sep 17 00:00:00 2001 From: Carlos Rolo <3799585+cjrolo@users.noreply.github.com> Date: Thu, 19 Oct 2023 12:06:28 +0100 Subject: [PATCH 1/7] Chunk sizes calculations --- brro-compressor/src/main.rs | 5 ++- brro-compressor/src/optimizer/mod.rs | 52 ++++++++++++++++++---------- brro-compressor/src/utils/mod.rs | 10 +++++- 3 files changed, 45 insertions(+), 22 deletions(-) diff --git a/brro-compressor/src/main.rs b/brro-compressor/src/main.rs index 53076cd..7b85179 100644 --- a/brro-compressor/src/main.rs +++ b/brro-compressor/src/main.rs @@ -84,8 +84,7 @@ fn process_single_file(arguments: &Args) -> Result<(), std::io::Error> { fn compress_data(vec: &Vec, tag: &MetricTag, arguments: &Args) -> Vec { debug!("Compressing data!"); let optimizer_results = optimizer::process_data(vec, tag); - let _optimizer_results_f: Vec = optimizer_results.iter().map(|&x| x as f64).collect(); - + debug!("Samples in: {}, Samples out: {}", vec.len(), optimizer_results.len()); let mut cs = CompressedStream::new(); let compressor = match arguments.compressor { CompressorType::Noop => Compressor::Noop, @@ -96,7 +95,7 @@ fn compress_data(vec: &Vec, tag: &MetricTag, arguments: &Args) -> Vec { CompressorType::Wavelet => Compressor::Wavelet }; - cs.compress_chunk_with(vec, compressor); + cs.compress_chunk_with(&optimizer_results, compressor); cs.to_bytes() } diff --git a/brro-compressor/src/optimizer/mod.rs b/brro-compressor/src/optimizer/mod.rs index 685e98c..c662a86 100644 --- a/brro-compressor/src/optimizer/mod.rs +++ b/brro-compressor/src/optimizer/mod.rs @@ -6,6 +6,15 @@ use log::debug; use types::metric_tag::MetricTag; use crate::types; +/// Max Frame size, this can aprox. 36h of data at 1point/sec rate, a little more than 1 week at 1point/5sec +/// and 1 month (30 days) at 1 point/20sec. +/// This would be aprox. 1MB of Raw data (131072 * 64bits). 
+/// We wouldn't want to decompressed a ton of uncessary data, but for historical view of the data, looking into 1day/week/month at once is very reasonable +const MAX_FRAME_SIZE: usize = 131072; // 2^17 +/// The Min frame size is one that allows our compressors potentially achieve 100x compression. Currently the most +/// limited one is the FFT compressor, that needs 3 frequencies at minimum, 3x100 = 300, next power of 2 is 512. +const MIN_FRAME_SIZE: usize = 512; // 2^9 + impl MetricTag { #[allow(clippy::wrong_self_convention)] fn from_float(&self, x: f64) -> i64 { @@ -46,25 +55,32 @@ fn to_median_filter(data: &Vec) -> Vec { filtered } +/// This function gets a length and returns a vector with the chunk sizes to feed to the different compressors +/// A lot of assumptions go into selecting the chunk size, including: +/// 1. Collection rate - It is not expected that the collection rate exceeds 1point sec (it is expected actually less) +/// 2. Maximum compression achievable - A compressed frame as overhead and a minimum number of segments, small frames don't allow great compressions +/// 3. FFT operates faster under power of 2 +fn get_chunks_sizes(len: usize) -> Vec { + Vec::::with_capacity(MIN_FRAME_SIZE) +} + /// This should look at the data and return an optimized dataset for a specific compressor, /// If a compressor is hand picked, this should be skipped. -/// TODO: Make it do that -pub fn process_data(wav_data: &Vec, tag: &MetricTag) -> Vec { - let mut _bitdepth = 64; - let mut _dc_component: i64 = 0; - let mut _fractional = true; - - debug!("Tag: {:?}", tag); - let data = match tag { - MetricTag::Other => Vec::new(), - MetricTag::QuasiRandom => to_median_filter(wav_data), - _ => { - wav_data - .iter() - .map(|x| tag.from_float(*x)) +pub fn process_data(wav_data: &Vec, tag: &MetricTag) -> Vec { + // My idea here: + // 1. Clean data + // 2. Split into good sized chunks (aka power of 2) + // 3. Get each chunk into the compressor that it should go + // 3.1. 
Chunks should be at least of a size that it can allow a 100x compression for that given compressor (FFT is 512) + let len = wav_data.len(); + if !len.is_power_of_two() { + todo!() + } + // Cleaning data, removing NaN, etc. This might reduce sample count + debug!("Tag: {:?} Len: {}", tag, wav_data.len()); + // Is len a power of 2? If not try to get the previous power of 2 + wav_data.iter() + .filter(|x| !(x.is_nan() || x.is_infinite())) + .copied() .collect() - } - }; - _fractional = false; - data } \ No newline at end of file diff --git a/brro-compressor/src/utils/mod.rs b/brro-compressor/src/utils/mod.rs index 540e337..3aaf196 100644 --- a/brro-compressor/src/utils/mod.rs +++ b/brro-compressor/src/utils/mod.rs @@ -1,4 +1,12 @@ pub mod reader; pub mod writer; pub mod error; -mod file_type_detector; \ No newline at end of file +mod file_type_detector; + +// Is this the right place? +pub const fn prev_power_of_two(n: u64) -> u64 { + // n = 0 gives highest_bit_set_idx = 0. + let highest_bit_set_idx = 63 - (n|1).leading_zeros(); + // Binary AND of highest bit with n is a no-op, except zero gets wiped. + (1 << highest_bit_set_idx) & n +} \ No newline at end of file From dda913d5c66f992a5bae868bc6fc62dfc8be516b Mon Sep 17 00:00:00 2001 From: Carlos Rolo <3799585+cjrolo@users.noreply.github.com> Date: Thu, 19 Oct 2023 14:18:06 +0100 Subject: [PATCH 2/7] WIP in optimization --- brro-compressor/src/optimizer/mod.rs | 123 +++++++++++++++++++++------ brro-compressor/src/utils/mod.rs | 2 +- 2 files changed, 100 insertions(+), 25 deletions(-) diff --git a/brro-compressor/src/optimizer/mod.rs b/brro-compressor/src/optimizer/mod.rs index c662a86..6739741 100644 --- a/brro-compressor/src/optimizer/mod.rs +++ b/brro-compressor/src/optimizer/mod.rs @@ -4,7 +4,7 @@ use median::Filter; use log::debug; use types::metric_tag::MetricTag; -use crate::types; +use crate::{types, utils::prev_power_of_two, compressor::Compressor}; /// Max Frame size, this can aprox. 
36h of data at 1point/sec rate, a little more than 1 week at 1point/5sec /// and 1 month (30 days) at 1 point/20sec. @@ -15,6 +15,84 @@ const MAX_FRAME_SIZE: usize = 131072; // 2^17 /// limited one is the FFT compressor, that needs 3 frequencies at minimum, 3x100 = 300, next power of 2 is 512. const MIN_FRAME_SIZE: usize = 512; // 2^9 +// My idea here: +// 1. Clean data +// 2. Split into good sized chunks (aka power of 2) +// 3. Get each chunk into the compressor that it should go +// 3.1. Chunks should be at least of a size that it can allow a 100x compression for that given compressor (FFT is 512) +// 4. From the clean data and chunk sizes, assign an optimizer for each chunk +struct OptimizerPlan { + pub data: Vec, + pub chunk_sizes: Vec, + pub compressors: Vec, +} + +impl OptimizerPlan { + pub fn create_plan(data: Vec) -> Self { + let c_data = OptimizerPlan::clean_data(&data); + let chunks = OptimizerPlan::get_chunks_sizes(c_data.len()); + let optimizer = OptimizerPlan::assign_compressor(&c_data, &chunks, None); + OptimizerPlan { data: c_data, + chunk_sizes: chunks, + compressors: optimizer } + } + + pub fn create_plan_bounded(data: Vec, max_error: f32) -> Self { + let c_data = OptimizerPlan::clean_data(&data); + let chunks = OptimizerPlan::get_chunks_sizes(c_data.len()); + let optimizer = OptimizerPlan::assign_compressor(&c_data, &chunks, Some(max_error)); + OptimizerPlan { data: c_data, + chunk_sizes: chunks, + compressors: optimizer } + } + + /// Removes NaN and infinite references from the data + pub fn clean_data(wav_data: &Vec) -> Vec { + // Cleaning data, removing NaN, etc. This might reduce sample count + wav_data.iter() + .filter(|x| !(x.is_nan() || x.is_infinite())) + .copied() + .collect() + } + + /// This function gets a length and returns a vector with the chunk sizes to feed to the different compressors + /// A lot of assumptions go into selecting the chunk size, including: + /// 1. 
Collection rate - It is not expected that the collection rate exceeds 1point sec (it is expected actually less) + /// 2. Maximum compression achievable - A compressed frame as overhead and a minimum number of segments, small frames don't allow great compressions + /// 3. FFT operates faster under power of 2 + fn get_chunks_sizes(mut len: usize) -> Vec { + let mut chunk_sizes = Vec::::new(); + while len > 0 { + match len { + _ if len >= MAX_FRAME_SIZE => { + chunk_sizes.push(MAX_FRAME_SIZE); + len -= MAX_FRAME_SIZE; + }, + _ if len <= MIN_FRAME_SIZE => { + chunk_sizes.push(len); + len = 0; + }, + _ => { + let size = prev_power_of_two(len); + chunk_sizes.push(size); + len -= size; + } + } + } + chunk_sizes + } + + /// Assigns a compressor to a chunk of data + fn assign_compressor(clean_data: &Vec, chunks: &Vec, max_error: Option) -> Vec { + let selection = Vec::with_capacity(chunks.len()); + match max_error { + Some(err) => todo!(), + None => return selection, + } + } + +} + impl MetricTag { #[allow(clippy::wrong_self_convention)] fn from_float(&self, x: f64) -> i64 { @@ -55,32 +133,29 @@ fn to_median_filter(data: &Vec) -> Vec { filtered } -/// This function gets a length and returns a vector with the chunk sizes to feed to the different compressors -/// A lot of assumptions go into selecting the chunk size, including: -/// 1. Collection rate - It is not expected that the collection rate exceeds 1point sec (it is expected actually less) -/// 2. Maximum compression achievable - A compressed frame as overhead and a minimum number of segments, small frames don't allow great compressions -/// 3. FFT operates faster under power of 2 -fn get_chunks_sizes(len: usize) -> Vec { - Vec::::with_capacity(MIN_FRAME_SIZE) -} - /// This should look at the data and return an optimized dataset for a specific compressor, /// If a compressor is hand picked, this should be skipped. pub fn process_data(wav_data: &Vec, tag: &MetricTag) -> Vec { - // My idea here: - // 1. Clean data - // 2. 
Split into good sized chunks (aka power of 2) - // 3. Get each chunk into the compressor that it should go - // 3.1. Chunks should be at least of a size that it can allow a 100x compression for that given compressor (FFT is 512) - let len = wav_data.len(); - if !len.is_power_of_two() { - todo!() - } - // Cleaning data, removing NaN, etc. This might reduce sample count debug!("Tag: {:?} Len: {}", tag, wav_data.len()); - // Is len a power of 2? If not try to get the previous power of 2 wav_data.iter() - .filter(|x| !(x.is_nan() || x.is_infinite())) - .copied() - .collect() + .filter(|x| !(x.is_nan() || x.is_infinite())) + .copied() + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_get_chunks_sizes() { + let len_very_large: usize = 131072 * 3 + 1765; + let len_small: usize = 31; + let len_right_sized: usize = 2048; + let len_some_size: usize = 12032; + assert_eq!(OptimizerPlan::get_chunks_sizes(len_very_large), [131072, 131072, 131072, 1024, 512, 229]); + assert_eq!(OptimizerPlan::get_chunks_sizes(len_small), [31]); + assert_eq!(OptimizerPlan::get_chunks_sizes(len_right_sized), [2048]); + assert_eq!(OptimizerPlan::get_chunks_sizes(len_some_size), [8192, 2048, 1024, 512, 256]); + } } \ No newline at end of file diff --git a/brro-compressor/src/utils/mod.rs b/brro-compressor/src/utils/mod.rs index 3aaf196..4d28aef 100644 --- a/brro-compressor/src/utils/mod.rs +++ b/brro-compressor/src/utils/mod.rs @@ -4,7 +4,7 @@ pub mod error; mod file_type_detector; // Is this the right place? -pub const fn prev_power_of_two(n: u64) -> u64 { +pub fn prev_power_of_two(n: usize) -> usize { // n = 0 gives highest_bit_set_idx = 0. let highest_bit_set_idx = 63 - (n|1).leading_zeros(); // Binary AND of highest bit with n is a no-op, except zero gets wiped. 
From 7f891cd05d8a5b049139b40fc3cb3cee5c2b68aa Mon Sep 17 00:00:00 2001 From: Carlos Rolo <3799585+cjrolo@users.noreply.github.com> Date: Thu, 19 Oct 2023 16:55:49 +0100 Subject: [PATCH 3/7] Moved code around. Implemented Optimizer struct --- Cargo.lock | 34 +++++++++++ brro-compressor/Cargo.toml | 2 +- brro-compressor/src/optimizer/mod.rs | 81 ++++++++++--------------- brro-compressor/src/types/metric_tag.rs | 42 +++++++++++++ brro-compressor/src/utils/mod.rs | 7 +++ 5 files changed, 115 insertions(+), 51 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a650878..a4f090a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -118,6 +118,17 @@ version = "1.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d468802bab17cbc0cc575e9b053f41e72aa36bfa6b7f55e3529ffa43161b97fa" +[[package]] +name = "average" +version = "0.14.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d804c74bb2d66e9b7047658d21af0f1c937d7d2466410cbf1aed3b0c04048d4" +dependencies = [ + "easy-cast", + "float-ord", + "num-traits", +] + [[package]] name = "backtrace" version = "0.3.68" @@ -189,6 +200,7 @@ dependencies = [ name = "brro-compressor" version = "0.1.0" dependencies = [ + "average", "bincode", "clap", "env_logger", @@ -387,6 +399,15 @@ version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "347675b2993d588e8506457ea2de0e64a89ad0fcbc0e79d07d25f50542f40b59" +[[package]] +name = "easy-cast" +version = "0.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "10936778145f3bea71fd9bf61332cce28c28e96a380714f7ab34838b80733fd6" +dependencies = [ + "libm", +] + [[package]] name = "either" version = "1.8.1" @@ -451,6 +472,12 @@ version = "0.4.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" +[[package]] +name = "float-ord" +version = "0.3.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "8ce81f49ae8a0482e4c55ea62ebbd7e5a686af544c00b9d090bba3ff9be97b3d" + [[package]] name = "fnv" version = "1.0.7" @@ -824,6 +851,12 @@ version = "0.2.147" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b4668fb0ea861c1df094127ac5f1da3409a82116a4ba74fca2e58ef927159bb3" +[[package]] +name = "libm" +version = "0.2.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ec2a862134d2a7d32d7983ddcdd1c4923530833c9f2ea1a44fc5fa473989058" + [[package]] name = "linux-raw-sys" version = "0.3.8" @@ -971,6 +1004,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "578ede34cf02f8924ab9447f50c28075b4d3e5b269972345e7e0372b38c6cdcd" dependencies = [ "autocfg", + "libm", ] [[package]] diff --git a/brro-compressor/Cargo.toml b/brro-compressor/Cargo.toml index 19f642f..b82e3fb 100644 --- a/brro-compressor/Cargo.toml +++ b/brro-compressor/Cargo.toml @@ -14,7 +14,7 @@ clap = {version = "4.3.14", features = ["derive"] } bincode = "2.0.0-rc.3" rustfft = "6.1.0" tempfile = "3.2" - +average = "0.14.1" regex = "1.9.1" hound = "3.5" median = "0.3.2" \ No newline at end of file diff --git a/brro-compressor/src/optimizer/mod.rs b/brro-compressor/src/optimizer/mod.rs index 6739741..d628c64 100644 --- a/brro-compressor/src/optimizer/mod.rs +++ b/brro-compressor/src/optimizer/mod.rs @@ -1,10 +1,6 @@ -// Lucas - Once the project is far enough along I strongly reccomend reenabling dead code checks -#![allow(dead_code)] - -use median::Filter; use log::debug; use types::metric_tag::MetricTag; -use crate::{types, utils::prev_power_of_two, compressor::Compressor}; +use crate::{types, utils::{prev_power_of_two, f64_to_u64}, compressor::Compressor}; /// Max Frame size, this can aprox. 36h of data at 1point/sec rate, a little more than 1 week at 1point/5sec /// and 1 month (30 days) at 1 point/20sec. 
@@ -21,6 +17,7 @@ const MIN_FRAME_SIZE: usize = 512; // 2^9 // 3. Get each chunk into the compressor that it should go // 3.1. Chunks should be at least of a size that it can allow a 100x compression for that given compressor (FFT is 512) // 4. From the clean data and chunk sizes, assign an optimizer for each chunk +#[derive(Debug, Clone)] struct OptimizerPlan { pub data: Vec, pub chunk_sizes: Vec, @@ -28,7 +25,7 @@ struct OptimizerPlan { } impl OptimizerPlan { - pub fn create_plan(data: Vec) -> Self { + pub fn plan(data: Vec) -> Self { let c_data = OptimizerPlan::clean_data(&data); let chunks = OptimizerPlan::get_chunks_sizes(c_data.len()); let optimizer = OptimizerPlan::assign_compressor(&c_data, &chunks, None); @@ -37,7 +34,7 @@ impl OptimizerPlan { compressors: optimizer } } - pub fn create_plan_bounded(data: Vec, max_error: f32) -> Self { + pub fn plan_bounded(data: Vec, max_error: f32) -> Self { let c_data = OptimizerPlan::clean_data(&data); let chunks = OptimizerPlan::get_chunks_sizes(c_data.len()); let optimizer = OptimizerPlan::assign_compressor(&c_data, &chunks, Some(max_error)); @@ -82,55 +79,30 @@ impl OptimizerPlan { chunk_sizes } + /// Walks the data, checks how much variability is in the data, and assigns a compressor based on that + /// NOTE: Is this any good? 
+ fn best_compressor(data: &[f64]) -> Compressor { + let _ = data.iter().map(|&f| f64_to_u64(f, 0)); + // For now, let's just return FFT + Compressor::FFT + } + /// Assigns a compressor to a chunk of data fn assign_compressor(clean_data: &Vec, chunks: &Vec, max_error: Option) -> Vec { - let selection = Vec::with_capacity(chunks.len()); + let mut selection = Vec::with_capacity(chunks.len()); match max_error { - Some(err) => todo!(), - None => return selection, - } - } - -} - -impl MetricTag { - #[allow(clippy::wrong_self_convention)] - fn from_float(&self, x: f64) -> i64 { - match self { - MetricTag::Other => { - 0 - } - MetricTag::NotFloat | MetricTag::QuasiRandom => { - x as i64 - } - MetricTag::Percent(y) => { - to_multiply_and_truncate(x, *y) - } - MetricTag::Duration(y) => { - to_multiply_and_truncate(x, *y) - } - MetricTag::Bytes(y) => { - (x as i64) / (*y as i64) - } + Some(_err) => todo!(), + None => { + let mut s = 0; + for size in chunks.iter() { + selection.push(OptimizerPlan::best_compressor(&clean_data[s..(s+*size-1)])); + s += *size; + } + }, } + selection } -} - -/// Converts a float via multiplication and truncation -fn to_multiply_and_truncate(number: f64, mul: i32) -> i64 { - (number * mul as f64) as i64 -} -fn to_median_filter(data: &Vec) -> Vec { - let mut filtered = Vec::with_capacity(data.len()); - // 10minutes of data - let mut filter = Filter::new(50); - for point in data { - let point_int = MetricTag::QuasiRandom.from_float(*point); - let median = filter.consume(point_int); - filtered.push(median) - } - filtered } /// This should look at the data and return an optimized dataset for a specific compressor, @@ -158,4 +130,13 @@ mod tests { assert_eq!(OptimizerPlan::get_chunks_sizes(len_right_sized), [2048]); assert_eq!(OptimizerPlan::get_chunks_sizes(len_some_size), [8192, 2048, 1024, 512, 256]); } + + #[test] + fn assign_compressor() { + let fake_data = vec![12.23; 132671]; + let chunks = OptimizerPlan::get_chunks_sizes(fake_data.len()); + 
println!("{:?}", chunks); + let compressor_vec = OptimizerPlan::assign_compressor(&fake_data, &chunks, None); + assert_eq!(compressor_vec.len(), 4); + } } \ No newline at end of file diff --git a/brro-compressor/src/types/metric_tag.rs b/brro-compressor/src/types/metric_tag.rs index 4fa5236..34cc3ac 100644 --- a/brro-compressor/src/types/metric_tag.rs +++ b/brro-compressor/src/types/metric_tag.rs @@ -1,3 +1,5 @@ +use median::Filter; + #[derive(Debug)] pub enum MetricTag { Percent(i32), @@ -11,4 +13,44 @@ pub enum MetricTag { Bytes(i32), // Data that is in bytes... Make it MB, or KB Other, // Everything else +} + +impl MetricTag { + #[allow(clippy::wrong_self_convention)] + fn from_float(&self, x: f64) -> i64 { + match self { + MetricTag::Other => { + 0 + } + MetricTag::NotFloat | MetricTag::QuasiRandom => { + x as i64 + } + MetricTag::Percent(y) => { + Self::to_multiply_and_truncate(x, *y) + } + MetricTag::Duration(y) => { + Self::to_multiply_and_truncate(x, *y) + } + MetricTag::Bytes(y) => { + (x as i64) / (*y as i64) + } + } + } + + /// Converts a float via multiplication and truncation + fn to_multiply_and_truncate(number: f64, mul: i32) -> i64 { + (number * mul as f64) as i64 + } + + fn to_median_filter(data: &Vec) -> Vec { + let mut filtered = Vec::with_capacity(data.len()); + // 10minutes of data + let mut filter = Filter::new(50); + for point in data { + let point_int = MetricTag::QuasiRandom.from_float(*point); + let median = filter.consume(point_int); + filtered.push(median) + } + filtered + } } \ No newline at end of file diff --git a/brro-compressor/src/utils/mod.rs b/brro-compressor/src/utils/mod.rs index 4d28aef..7374a9b 100644 --- a/brro-compressor/src/utils/mod.rs +++ b/brro-compressor/src/utils/mod.rs @@ -9,4 +9,11 @@ pub fn prev_power_of_two(n: usize) -> usize { let highest_bit_set_idx = 63 - (n|1).leading_zeros(); // Binary AND of highest bit with n is a no-op, except zero gets wiped. 
(1 << highest_bit_set_idx) & n +} + +/// Converts a float to u64 with a given precision +pub fn f64_to_u64(number: f64, precision: usize) -> u64 { + if precision > 6 { panic!("Precision only available up to 6 digits!")} + let mul = [1, 10, 100, 1_000, 10_000, 100_000, 1_000_000][precision]; + (number * mul as f64) as u64 } \ No newline at end of file From edb14206350d1b005b0edf11f81ddcbd1368c264 Mon Sep 17 00:00:00 2001 From: Carlos Rolo <3799585+cjrolo@users.noreply.github.com> Date: Thu, 19 Oct 2023 17:42:43 +0100 Subject: [PATCH 4/7] Tests and execution plan --- brro-compressor/src/optimizer/mod.rs | 38 ++++++++++++++++++++++++---- 1 file changed, 33 insertions(+), 5 deletions(-) diff --git a/brro-compressor/src/optimizer/mod.rs b/brro-compressor/src/optimizer/mod.rs index d628c64..65c4857 100644 --- a/brro-compressor/src/optimizer/mod.rs +++ b/brro-compressor/src/optimizer/mod.rs @@ -25,6 +25,8 @@ struct OptimizerPlan { } impl OptimizerPlan { + + /// Creates an optimal data compression plan pub fn plan(data: Vec) -> Self { let c_data = OptimizerPlan::clean_data(&data); let chunks = OptimizerPlan::get_chunks_sizes(c_data.len()); @@ -34,7 +36,9 @@ impl OptimizerPlan { compressors: optimizer } } + /// Creates an optimal plan for compression for the data set provided bound by a given error pub fn plan_bounded(data: Vec, max_error: f32) -> Self { + // TODO: Check error limits let c_data = OptimizerPlan::clean_data(&data); let chunks = OptimizerPlan::get_chunks_sizes(c_data.len()); let optimizer = OptimizerPlan::assign_compressor(&c_data, &chunks, Some(max_error)); @@ -43,8 +47,14 @@ impl OptimizerPlan { compressors: optimizer } } + /// Sets a given compressor for all data chunks + pub fn set_compressor(&mut self, compressor: Compressor) { + let new_compressors = vec![compressor; self.compressors.len()]; + self.compressors = new_compressors; + } + /// Removes NaN and infinite references from the data - pub fn clean_data(wav_data: &Vec) -> Vec { + pub fn 
clean_data(wav_data: &[f64]) -> Vec { // Cleaning data, removing NaN, etc. This might reduce sample count wav_data.iter() .filter(|x| !(x.is_nan() || x.is_infinite())) @@ -79,23 +89,34 @@ impl OptimizerPlan { chunk_sizes } + /// Returns an iterator with the data slice and the compressor associated + pub fn get_execution(&self) -> Vec<(&Compressor, &[f64])> { + let mut output = Vec::with_capacity(self.chunk_sizes.len()); + let mut s = 0; + for (i,size) in self.chunk_sizes.iter().enumerate() { + output.push((&self.compressors[i] ,&self.data[s..(s+*size)])); + s += *size; + } + output + } + /// Walks the data, checks how much variability is in the data, and assigns a compressor based on that /// NOTE: Is this any good? - fn best_compressor(data: &[f64]) -> Compressor { + fn get_compressor(data: &[f64]) -> Compressor { let _ = data.iter().map(|&f| f64_to_u64(f, 0)); // For now, let's just return FFT Compressor::FFT } /// Assigns a compressor to a chunk of data - fn assign_compressor(clean_data: &Vec, chunks: &Vec, max_error: Option) -> Vec { + fn assign_compressor(clean_data: &[f64], chunks: &Vec, max_error: Option) -> Vec { let mut selection = Vec::with_capacity(chunks.len()); match max_error { Some(_err) => todo!(), None => { let mut s = 0; for size in chunks.iter() { - selection.push(OptimizerPlan::best_compressor(&clean_data[s..(s+*size-1)])); + selection.push(OptimizerPlan::get_compressor(&clean_data[s..(s+*size)])); s += *size; } }, @@ -119,6 +140,14 @@ pub fn process_data(wav_data: &Vec, tag: &MetricTag) -> Vec { mod tests { use super::*; + #[test] + fn optimizer() { + let fake_data = vec![12.23; 2049]; + let op = OptimizerPlan::plan(fake_data); + let plan_vec = op.get_execution(); + assert_eq!(plan_vec.len(), 2); + } + #[test] fn test_get_chunks_sizes() { let len_very_large: usize = 131072 * 3 + 1765; @@ -135,7 +164,6 @@ mod tests { fn assign_compressor() { let fake_data = vec![12.23; 132671]; let chunks = OptimizerPlan::get_chunks_sizes(fake_data.len()); - 
println!("{:?}", chunks); let compressor_vec = OptimizerPlan::assign_compressor(&fake_data, &chunks, None); assert_eq!(compressor_vec.len(), 4); } From 310ce1d041ab4e213efbe2fa651a4645d144e32f Mon Sep 17 00:00:00 2001 From: Carlos Rolo <3799585+cjrolo@users.noreply.github.com> Date: Thu, 19 Oct 2023 19:06:05 +0100 Subject: [PATCH 5/7] small fix --- brro-compressor/src/utils/mod.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/brro-compressor/src/utils/mod.rs b/brro-compressor/src/utils/mod.rs index 7374a9b..e2e0cb8 100644 --- a/brro-compressor/src/utils/mod.rs +++ b/brro-compressor/src/utils/mod.rs @@ -13,6 +13,7 @@ pub fn prev_power_of_two(n: usize) -> usize { /// Converts a float to u64 with a given precision pub fn f64_to_u64(number: f64, precision: usize) -> u64 { + // TODO: Panic on overflow if precision > 6 { panic!("Precision only available up to 6 digits!")} let mul = [1, 10, 100, 1_000, 10_000, 100_000, 1_000_000][precision]; (number * mul as f64) as u64 From 037544afd09386a9bbae5f8c6f8d2af84a979689 Mon Sep 17 00:00:00 2001 From: Carlos Rolo <3799585+cjrolo@users.noreply.github.com> Date: Thu, 19 Oct 2023 19:12:25 +0100 Subject: [PATCH 6/7] Minor fixes --- brro-compressor/src/optimizer/mod.rs | 2 +- brro-compressor/src/utils/mod.rs | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/brro-compressor/src/optimizer/mod.rs b/brro-compressor/src/optimizer/mod.rs index 65c4857..0903a48 100644 --- a/brro-compressor/src/optimizer/mod.rs +++ b/brro-compressor/src/optimizer/mod.rs @@ -128,7 +128,7 @@ impl OptimizerPlan { /// This should look at the data and return an optimized dataset for a specific compressor, /// If a compressor is hand picked, this should be skipped. 
-pub fn process_data(wav_data: &Vec, tag: &MetricTag) -> Vec { +pub fn process_data(wav_data: &[f64], tag: &MetricTag) -> Vec { debug!("Tag: {:?} Len: {}", tag, wav_data.len()); wav_data.iter() .filter(|x| !(x.is_nan() || x.is_infinite())) diff --git a/brro-compressor/src/utils/mod.rs b/brro-compressor/src/utils/mod.rs index f5308b5..c4f2e84 100644 --- a/brro-compressor/src/utils/mod.rs +++ b/brro-compressor/src/utils/mod.rs @@ -1,6 +1,7 @@ pub mod error; -pub mod readers; pub mod writers; +pub mod readers; + mod file_type_detector; // Is this the right place? From 247d2ea0fcbe3a96642a098767ac4c4154f3f9e2 Mon Sep 17 00:00:00 2001 From: Carlos Rolo <3799585+cjrolo@users.noreply.github.com> Date: Fri, 20 Oct 2023 09:27:48 +0100 Subject: [PATCH 7/7] PR fixes --- brro-compressor/src/optimizer/mod.rs | 2 +- brro-compressor/src/types/metric_tag.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/brro-compressor/src/optimizer/mod.rs b/brro-compressor/src/optimizer/mod.rs index 0903a48..986f12c 100644 --- a/brro-compressor/src/optimizer/mod.rs +++ b/brro-compressor/src/optimizer/mod.rs @@ -109,7 +109,7 @@ impl OptimizerPlan { } /// Assigns a compressor to a chunk of data - fn assign_compressor(clean_data: &[f64], chunks: &Vec, max_error: Option) -> Vec { + fn assign_compressor(clean_data: &[f64], chunks: &[usize], max_error: Option) -> Vec { let mut selection = Vec::with_capacity(chunks.len()); match max_error { Some(_err) => todo!(), diff --git a/brro-compressor/src/types/metric_tag.rs b/brro-compressor/src/types/metric_tag.rs index 34cc3ac..03d8c3c 100644 --- a/brro-compressor/src/types/metric_tag.rs +++ b/brro-compressor/src/types/metric_tag.rs @@ -42,7 +42,7 @@ impl MetricTag { (number * mul as f64) as i64 } - fn to_median_filter(data: &Vec) -> Vec { + fn to_median_filter(data: &[f64]) -> Vec { let mut filtered = Vec::with_capacity(data.len()); // 10minutes of data let mut filter = Filter::new(50);