diff --git a/gtars/src/tokenizers/meta_tokenizer.rs b/gtars/src/tokenizers/meta_tokenizer.rs
index b27e2ae1..517d779b 100644
--- a/gtars/src/tokenizers/meta_tokenizer.rs
+++ b/gtars/src/tokenizers/meta_tokenizer.rs
@@ -52,8 +52,10 @@ impl TryFrom<&Path> for MetaTokenizer {
             _ => Some(&config.universes[1..]),
         };
 
+        let primary_universe = value.parent().unwrap().join(primary_universe);
+
         // parse first universe
-        let reader = get_dynamic_reader(Path::new(primary_universe))?;
+        let reader = get_dynamic_reader(Path::new(&primary_universe))?;
         let mut universe = Universe::default();
         let mut intervals: HashMap<String, Vec<Interval<u32, u32>>> = HashMap::new();
         let mut region_to_metatoken: HashMap<Region, Region> = HashMap::new();
@@ -139,7 +141,9 @@ impl TryFrom<&Path> for MetaTokenizer {
 
         for (u_num, other_universe) in other_universes.iter().enumerate() {
-            let reader = get_dynamic_reader(Path::new(other_universe))?;
+            let other_universe = value.parent().unwrap().join(other_universe);
+
+            let reader = get_dynamic_reader(Path::new(&other_universe))?;
             let mut intervals: HashMap<String, Vec<Interval<u32, u32>>> = HashMap::new();
 
             for line in reader.lines() {
 
@@ -277,3 +281,29 @@ impl TryFrom<&Path> for MetaTokenizer {
         })
     }
 }
+
+
+// tests
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    use pretty_assertions::assert_eq;
+    use rstest::*;
+
+    #[fixture]
+    fn path_to_config_file() -> &'static str {
+        "tests/data/tokenizer.meta.toml"
+    }
+
+    #[fixture]
+    fn path_to_tokenize_bed_file() -> &'static str {
+        "tests/data/to_tokenize.bed"
+    }
+
+    #[rstest]
+    fn test_create_tokenizer(path_to_config_file: &str) {
+        let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
+        assert_eq!(tokenizer.universe.len(), 27);
+    }
+}
\ No newline at end of file
diff --git a/gtars/src/tokenizers/mod.rs b/gtars/src/tokenizers/mod.rs
index cbb2f804..19b94ad4 100644
--- a/gtars/src/tokenizers/mod.rs
+++ b/gtars/src/tokenizers/mod.rs
@@ -41,134 +41,4 @@ pub mod consts {
 pub use config::TokenizerConfig;
 pub use fragment_tokenizer::FragmentTokenizer;
 pub use traits::{SingleCellTokenizer, Tokenizer};
-pub use tree_tokenizer::TreeTokenizer;
-
-#[cfg(test)]
-mod tests {
-
-    use crate::common::models::{Region, RegionSet};
-    use crate::tokenizers::traits::SpecialTokens;
-    use std::path::Path;
-
-    use super::*;
-    use pretty_assertions::assert_eq;
-    use rstest::*;
-
-    #[fixture]
-    fn path_to_bed_file() -> &'static str {
-        "tests/data/peaks.bed"
-    }
-
-    #[fixture]
-    fn path_to_config_file() -> &'static str {
-        "tests/data/tokenizer.toml"
-    }
-
-    #[fixture]
-    fn path_to_bad_config_file() -> &'static str {
-        "tests/data/tokenizer_bad.toml"
-    }
-
-    #[fixture]
-    fn path_to_tokenize_bed_file() -> &'static str {
-        "tests/data/to_tokenize.bed"
-    }
-
-    #[rstest]
-    fn test_create_tokenizer_from_bed(path_to_bed_file: &str) {
-        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
-        assert_eq!(tokenizer.vocab_size(), 32); // 25 regions + 7 special tokens
-    }
-
-    #[rstest]
-    fn test_create_tokenizer_from_config(path_to_config_file: &str) {
-        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
-        assert_eq!(tokenizer.vocab_size(), 56); // 25 regions in main universe + 24 in hierarchical + 7 special tokens
-    }
-
-    #[rstest]
-    #[should_panic]
-    fn test_bad_config_file(path_to_bad_config_file: &str) {
-        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bad_config_file));
-        let _tokenizer = tokenizer.unwrap();
-    }
-
-    #[rstest]
-    fn test_get_special_token_ids(path_to_bed_file: &str) {
-        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
-        let unk_id = tokenizer.unknown_token_id();
-        let pad_id = tokenizer.padding_token_id();
-        let mask_id = tokenizer.mask_token_id();
-        let eos_id = tokenizer.eos_token_id();
-        let bos_id = tokenizer.bos_token_id();
-        let cls_id = tokenizer.cls_token_id();
-        let sep_id = tokenizer.sep_token_id();
-
-        assert_eq!(unk_id, 25);
-        assert_eq!(pad_id, 26);
-        assert_eq!(mask_id, 27);
-        assert_eq!(eos_id, 28);
-        assert_eq!(bos_id, 29);
-        assert_eq!(cls_id, 30);
-        assert_eq!(sep_id, 31);
-    }
-
-    #[rstest]
-    fn test_tokenize_bed_file(path_to_bed_file: &str, path_to_tokenize_bed_file: &str) {
-        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
-        let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap();
-        let tokenized_regions = tokenizer.tokenize_region_set(&rs);
-
-        println!("{}", tokenized_regions.len());
-        assert_eq!(tokenized_regions.len(), 4);
-
-        // last should be the unknown token
-        let unknown_token = tokenizer
-            .universe
-            .convert_id_to_region(tokenized_regions[3])
-            .unwrap();
-        assert!(unknown_token.chr == "chrUNK");
-    }
-
-    #[rstest]
-    fn test_hierarchical_universe_hit(path_to_config_file: &str) {
-        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
-        let res = tokenizer.tokenize_region(&Region {
-            chr: "chr1".to_string(),
-            start: 100,
-            end: 200,
-        });
-        assert_eq!(res.len(), 1);
-
-        // check the id, it should be len(primary_universe) + 1 (since its chr1)
-        assert_eq!(res.ids, vec![25]);
-
-        let res = res.into_region_vec();
-        let region = &res[0];
-
-        assert_eq!(region.chr, "chr1");
-        assert_eq!(region.start, 0);
-        assert_eq!(region.end, 248_956_422);
-    }
-
-    #[rstest]
-    fn test_hierarchical_universe_no_hit(path_to_config_file: &str) {
-        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
-        let res = tokenizer.tokenize_region(&Region {
-            chr: "chrFOO".to_string(),
-            start: 100,
-            end: 200,
-        });
-        assert_eq!(res.len(), 1);
-
-        // check the id, it should be the id of the UNK token
-        assert_eq!(res.ids, vec![49]);
-
-        let res = res.into_region_vec();
-        let region = &res[0];
-
-        assert_eq!(region.chr, "chrUNK");
-        assert_eq!(region.start, 0);
-        assert_eq!(region.end, 0);
-    }
-}
+pub use tree_tokenizer::TreeTokenizer;
\ No newline at end of file
diff --git a/gtars/src/tokenizers/tree_tokenizer.rs b/gtars/src/tokenizers/tree_tokenizer.rs
index 096d9605..48c09316 100644
--- a/gtars/src/tokenizers/tree_tokenizer.rs
+++ b/gtars/src/tokenizers/tree_tokenizer.rs
@@ -414,3 +414,133 @@ impl TreeTokenizer {
 
 // use default implementation
 impl Pad for TreeTokenizer {}
+
+#[cfg(test)]
+mod tests {
+
+    use crate::common::models::{Region, RegionSet};
+    use crate::tokenizers::traits::SpecialTokens;
+    use std::path::Path;
+
+    use super::*;
+    use pretty_assertions::assert_eq;
+    use rstest::*;
+
+    #[fixture]
+    fn path_to_bed_file() -> &'static str {
+        "tests/data/peaks.bed"
+    }
+
+    #[fixture]
+    fn path_to_config_file() -> &'static str {
+        "tests/data/tokenizer.toml"
+    }
+
+    #[fixture]
+    fn path_to_bad_config_file() -> &'static str {
+        "tests/data/tokenizer_bad.toml"
+    }
+
+    #[fixture]
+    fn path_to_tokenize_bed_file() -> &'static str {
+        "tests/data/to_tokenize.bed"
+    }
+
+    #[rstest]
+    fn test_create_tokenizer_from_bed(path_to_bed_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
+        assert_eq!(tokenizer.vocab_size(), 32); // 25 regions + 7 special tokens
+    }
+
+    #[rstest]
+    fn test_create_tokenizer_from_config(path_to_config_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
+        assert_eq!(tokenizer.vocab_size(), 56); // 25 regions in main universe + 24 in hierarchical + 7 special tokens
+    }
+
+    #[rstest]
+    #[should_panic]
+    fn test_bad_config_file(path_to_bad_config_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bad_config_file));
+        let _tokenizer = tokenizer.unwrap();
+    }
+
+    #[rstest]
+    fn test_get_special_token_ids(path_to_bed_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
+        let unk_id = tokenizer.unknown_token_id();
+        let pad_id = tokenizer.padding_token_id();
+        let mask_id = tokenizer.mask_token_id();
+        let eos_id = tokenizer.eos_token_id();
+        let bos_id = tokenizer.bos_token_id();
+        let cls_id = tokenizer.cls_token_id();
+        let sep_id = tokenizer.sep_token_id();
+
+        assert_eq!(unk_id, 25);
+        assert_eq!(pad_id, 26);
+        assert_eq!(mask_id, 27);
+        assert_eq!(eos_id, 28);
+        assert_eq!(bos_id, 29);
+        assert_eq!(cls_id, 30);
+        assert_eq!(sep_id, 31);
+    }
+
+    #[rstest]
+    fn test_tokenize_bed_file(path_to_bed_file: &str, path_to_tokenize_bed_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
+        let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap();
+        let tokenized_regions = tokenizer.tokenize_region_set(&rs);
+
+        println!("{}", tokenized_regions.len());
+        assert_eq!(tokenized_regions.len(), 4);
+
+        // last should be the unknown token
+        let unknown_token = tokenizer
+            .universe
+            .convert_id_to_region(tokenized_regions[3])
+            .unwrap();
+        assert!(unknown_token.chr == "chrUNK");
+    }
+
+    #[rstest]
+    fn test_hierarchical_universe_hit(path_to_config_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
+        let res = tokenizer.tokenize_region(&Region {
+            chr: "chr1".to_string(),
+            start: 100,
+            end: 200,
+        });
+        assert_eq!(res.len(), 1);
+
+        // check the id, it should be len(primary_universe) + 1 (since its chr1)
+        assert_eq!(res.ids, vec![25]);
+
+        let res = res.into_region_vec();
+        let region = &res[0];
+
+        assert_eq!(region.chr, "chr1");
+        assert_eq!(region.start, 0);
+        assert_eq!(region.end, 248_956_422);
+    }
+
+    #[rstest]
+    fn test_hierarchical_universe_no_hit(path_to_config_file: &str) {
+        let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
+        let res = tokenizer.tokenize_region(&Region {
+            chr: "chrFOO".to_string(),
+            start: 100,
+            end: 200,
+        });
+        assert_eq!(res.len(), 1);
+
+        // check the id, it should be the id of the UNK token
+        assert_eq!(res.ids, vec![49]);
+
+        let res = res.into_region_vec();
+        let region = &res[0];
+
+        assert_eq!(region.chr, "chrUNK");
+        assert_eq!(region.start, 0);
+        assert_eq!(region.end, 0);
+    }
+}
diff --git a/gtars/tests/data/chroms.meta.bed b/gtars/tests/data/chroms.meta.bed
new file mode 100644
index 00000000..f75c730e
--- /dev/null
+++ b/gtars/tests/data/chroms.meta.bed
@@ -0,0 +1,24 @@
+chr1	0	248956422	1
+chr2	0	242193529	1
+chr3	0	198295559	2
+chr4	0	190214555	2
+chr5	0	181538259	3
+chr6	0	170805979	3
+chr7	0	159345973	4
+chr8	0	145138636	4
+chr9	0	138394717	5
+chr10	0	133797422	5
+chr11	0	135086622	6
+chr12	0	133275309	6
+chr13	0	114364328	6
+chr14	0	107043718	7
+chr15	0	101991189	7
+chr16	0	90338345	8
+chr17	0	83257441	8
+chr18	0	80373285	8
+chr19	0	58617616	8
+chr20	0	64444167	9
+chr21	0	46709983	9
+chr22	0	50818468	10
+chrX	0	156040895	11
+chrY	0	57227415	11
\ No newline at end of file
diff --git a/gtars/tests/data/tokenizer.meta.toml b/gtars/tests/data/tokenizer.meta.toml
new file mode 100644
index 00000000..219834f0
--- /dev/null
+++ b/gtars/tests/data/tokenizer.meta.toml
@@ -0,0 +1 @@
+universes = ["peaks.meta.bed", "chroms.meta.bed"]
\ No newline at end of file
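For context on the path handling in `meta_tokenizer.rs` above: universe files named in the TOML config are joined onto the config file's parent directory, so an entry like `"chroms.meta.bed"` resolves next to `tokenizer.meta.toml` rather than against the process working directory. A minimal sketch of that resolution logic (the helper name `resolve_universe` is hypothetical, for illustration only):

```rust
use std::path::{Path, PathBuf};

// Hypothetical helper mirroring the patch's
// `value.parent().unwrap().join(primary_universe)` logic:
// universe entries are interpreted relative to the config file's directory.
fn resolve_universe(config_path: &Path, universe: &str) -> PathBuf {
    config_path
        .parent()
        .expect("config path should have a parent directory")
        .join(universe)
}

fn main() {
    let config = Path::new("gtars/tests/data/tokenizer.meta.toml");
    let resolved = resolve_universe(config, "chroms.meta.bed");
    assert_eq!(resolved, PathBuf::from("gtars/tests/data/chroms.meta.bed"));
    println!("resolved universe path: {}", resolved.display());
}
```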