basic implementation of the meta tokenizer
nleroy917 committed Jun 16, 2024
1 parent 3b52388 commit 74518ef
Showing 5 changed files with 188 additions and 133 deletions.
34 changes: 32 additions & 2 deletions gtars/src/tokenizers/meta_tokenizer.rs
@@ -52,8 +52,10 @@ impl TryFrom<&Path> for MetaTokenizer {
             _ => Some(&config.universes[1..]),
         };
 
+        let primary_universe = value.parent().unwrap().join(primary_universe);
+
         // parse first universe
-        let reader = get_dynamic_reader(Path::new(primary_universe))?;
+        let reader = get_dynamic_reader(Path::new(&primary_universe))?;
         let mut universe = Universe::default();
         let mut intervals: HashMap<String, Vec<Interval<u32, u32>>> = HashMap::new();
         let mut region_to_metatoken: HashMap<Region, Region> = HashMap::new();
@@ -139,7 +141,9 @@ impl TryFrom<&Path> for MetaTokenizer {
 
         for (u_num, other_universe) in other_universes.iter().enumerate() {
 
-            let reader = get_dynamic_reader(Path::new(other_universe))?;
+            let other_universe = value.parent().unwrap().join(other_universe);
+
+            let reader = get_dynamic_reader(Path::new(&other_universe))?;
             let mut intervals: HashMap<String, Vec<Interval<u32, u32>>> = HashMap::new();
 
             for line in reader.lines() {
@@ -277,3 +281,29 @@ impl TryFrom<&Path> for MetaTokenizer {
         })
     }
 }
+
+
+// tests
+#[cfg(test)]
+mod tests {
+
+    use super::*;
+    use pretty_assertions::assert_eq;
+    use rstest::*;
+
+    #[fixture]
+    fn path_to_config_file() -> &'static str {
+        "tests/data/tokenizer.meta.toml"
+    }
+
+    #[fixture]
+    fn path_to_tokenize_bed_file() -> &'static str {
+        "tests/data/to_tokenize.bed"
+    }
+
+    #[rstest]
+    fn test_create_tokenizer(path_to_config_file: &str) {
+        let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
+        assert_eq!(tokenizer.universe.len(), 27);
+    }
+}
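Note: the two small hunks above change how universe paths from the config are resolved. Instead of handing the configured filename straight to get_dynamic_reader, both the primary and the secondary universes are now joined onto the parent directory of the config file, so the BED paths listed in tokenizer.meta.toml are interpreted relative to the config's location rather than the current working directory. A minimal standalone sketch of that resolution step (the helper name is hypothetical; the real logic lives inline in try_from):

use std::path::{Path, PathBuf};

// Hypothetical helper mirroring the change above: universe files listed in the
// config are resolved relative to the directory containing the config file.
fn resolve_universe_path(config_path: &Path, universe: &str) -> PathBuf {
    config_path
        .parent()
        .expect("config file should have a parent directory")
        .join(universe)
}

fn main() {
    let resolved = resolve_universe_path(
        Path::new("tests/data/tokenizer.meta.toml"),
        "chroms.meta.bed",
    );
    assert_eq!(resolved, PathBuf::from("tests/data/chroms.meta.bed"));
}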
132 changes: 1 addition & 131 deletions gtars/src/tokenizers/mod.rs
@@ -41,134 +41,4 @@ pub mod consts {
pub use config::TokenizerConfig;
pub use fragment_tokenizer::FragmentTokenizer;
pub use traits::{SingleCellTokenizer, Tokenizer};
pub use tree_tokenizer::TreeTokenizer;

#[cfg(test)]
mod tests {

use crate::common::models::{Region, RegionSet};
use crate::tokenizers::traits::SpecialTokens;
use std::path::Path;

use super::*;
use pretty_assertions::assert_eq;
use rstest::*;

#[fixture]
fn path_to_bed_file() -> &'static str {
"tests/data/peaks.bed"
}

#[fixture]
fn path_to_config_file() -> &'static str {
"tests/data/tokenizer.toml"
}

#[fixture]
fn path_to_bad_config_file() -> &'static str {
"tests/data/tokenizer_bad.toml"
}

#[fixture]
fn path_to_tokenize_bed_file() -> &'static str {
"tests/data/to_tokenize.bed"
}

#[rstest]
fn test_create_tokenizer_from_bed(path_to_bed_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
assert_eq!(tokenizer.vocab_size(), 32); // 25 regions + 7 special tokens
}

#[rstest]
fn test_create_tokenizer_from_config(path_to_config_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
assert_eq!(tokenizer.vocab_size(), 56); // 25 regions in main universe + 24 in hierarchical + 7 special tokens
}

#[rstest]
#[should_panic]
fn test_bad_config_file(path_to_bad_config_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bad_config_file));
let _tokenizer = tokenizer.unwrap();
}

#[rstest]
fn test_get_special_token_ids(path_to_bed_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
let unk_id = tokenizer.unknown_token_id();
let pad_id = tokenizer.padding_token_id();
let mask_id = tokenizer.mask_token_id();
let eos_id = tokenizer.eos_token_id();
let bos_id = tokenizer.bos_token_id();
let cls_id = tokenizer.cls_token_id();
let sep_id = tokenizer.sep_token_id();

assert_eq!(unk_id, 25);
assert_eq!(pad_id, 26);
assert_eq!(mask_id, 27);
assert_eq!(eos_id, 28);
assert_eq!(bos_id, 29);
assert_eq!(cls_id, 30);
assert_eq!(sep_id, 31);
}

#[rstest]
fn test_tokenize_bed_file(path_to_bed_file: &str, path_to_tokenize_bed_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap();
let tokenized_regions = tokenizer.tokenize_region_set(&rs);

println!("{}", tokenized_regions.len());
assert_eq!(tokenized_regions.len(), 4);

// last should be the unknown token
let unknown_token = tokenizer
.universe
.convert_id_to_region(tokenized_regions[3])
.unwrap();
assert!(unknown_token.chr == "chrUNK");
}

#[rstest]
fn test_hierarchical_universe_hit(path_to_config_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
let res = tokenizer.tokenize_region(&Region {
chr: "chr1".to_string(),
start: 100,
end: 200,
});
assert_eq!(res.len(), 1);

// check the id, it should be len(primary_universe) + 1 (since its chr1)
assert_eq!(res.ids, vec![25]);

let res = res.into_region_vec();
let region = &res[0];

assert_eq!(region.chr, "chr1");
assert_eq!(region.start, 0);
assert_eq!(region.end, 248_956_422);
}

#[rstest]
fn test_hierarchical_universe_no_hit(path_to_config_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
let res = tokenizer.tokenize_region(&Region {
chr: "chrFOO".to_string(),
start: 100,
end: 200,
});
assert_eq!(res.len(), 1);

// check the id, it should be the id of the UNK token
assert_eq!(res.ids, vec![49]);

let res = res.into_region_vec();
let region = &res[0];

assert_eq!(region.chr, "chrUNK");
assert_eq!(region.start, 0);
assert_eq!(region.end, 0);
}
}
pub use tree_tokenizer::TreeTokenizer;
130 changes: 130 additions & 0 deletions gtars/src/tokenizers/tree_tokenizer.rs
@@ -414,3 +414,133 @@ impl TreeTokenizer {

// use default implementation
impl Pad for TreeTokenizer {}

#[cfg(test)]
mod tests {

use crate::common::models::{Region, RegionSet};
use crate::tokenizers::traits::SpecialTokens;
use std::path::Path;

use super::*;
use pretty_assertions::assert_eq;
use rstest::*;

#[fixture]
fn path_to_bed_file() -> &'static str {
"tests/data/peaks.bed"
}

#[fixture]
fn path_to_config_file() -> &'static str {
"tests/data/tokenizer.toml"
}

#[fixture]
fn path_to_bad_config_file() -> &'static str {
"tests/data/tokenizer_bad.toml"
}

#[fixture]
fn path_to_tokenize_bed_file() -> &'static str {
"tests/data/to_tokenize.bed"
}

#[rstest]
fn test_create_tokenizer_from_bed(path_to_bed_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
assert_eq!(tokenizer.vocab_size(), 32); // 25 regions + 7 special tokens
}

#[rstest]
fn test_create_tokenizer_from_config(path_to_config_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
assert_eq!(tokenizer.vocab_size(), 56); // 25 regions in main universe + 24 in hierarchical + 7 special tokens
}

#[rstest]
#[should_panic]
fn test_bad_config_file(path_to_bad_config_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bad_config_file));
let _tokenizer = tokenizer.unwrap();
}

#[rstest]
fn test_get_special_token_ids(path_to_bed_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
let unk_id = tokenizer.unknown_token_id();
let pad_id = tokenizer.padding_token_id();
let mask_id = tokenizer.mask_token_id();
let eos_id = tokenizer.eos_token_id();
let bos_id = tokenizer.bos_token_id();
let cls_id = tokenizer.cls_token_id();
let sep_id = tokenizer.sep_token_id();

assert_eq!(unk_id, 25);
assert_eq!(pad_id, 26);
assert_eq!(mask_id, 27);
assert_eq!(eos_id, 28);
assert_eq!(bos_id, 29);
assert_eq!(cls_id, 30);
assert_eq!(sep_id, 31);
}

#[rstest]
fn test_tokenize_bed_file(path_to_bed_file: &str, path_to_tokenize_bed_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_bed_file)).unwrap();
let rs = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap();
let tokenized_regions = tokenizer.tokenize_region_set(&rs);

println!("{}", tokenized_regions.len());
assert_eq!(tokenized_regions.len(), 4);

// last should be the unknown token
let unknown_token = tokenizer
.universe
.convert_id_to_region(tokenized_regions[3])
.unwrap();
assert!(unknown_token.chr == "chrUNK");
}

#[rstest]
fn test_hierarchical_universe_hit(path_to_config_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
let res = tokenizer.tokenize_region(&Region {
chr: "chr1".to_string(),
start: 100,
end: 200,
});
assert_eq!(res.len(), 1);

// check the id, it should be len(primary_universe) + 1 (since its chr1)
assert_eq!(res.ids, vec![25]);

let res = res.into_region_vec();
let region = &res[0];

assert_eq!(region.chr, "chr1");
assert_eq!(region.start, 0);
assert_eq!(region.end, 248_956_422);
}

#[rstest]
fn test_hierarchical_universe_no_hit(path_to_config_file: &str) {
let tokenizer = TreeTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
let res = tokenizer.tokenize_region(&Region {
chr: "chrFOO".to_string(),
start: 100,
end: 200,
});
assert_eq!(res.len(), 1);

// check the id, it should be the id of the UNK token
assert_eq!(res.ids, vec![49]);

let res = res.into_region_vec();
let region = &res[0];

assert_eq!(region.chr, "chrUNK");
assert_eq!(region.start, 0);
assert_eq!(region.end, 0);
}
}
24 changes: 24 additions & 0 deletions gtars/tests/data/chroms.meta.bed
@@ -0,0 +1,24 @@
chr1 0 248956422 1
chr2 0 242193529 1
chr3 0 198295559 2
chr4 0 190214555 2
chr5 0 181538259 3
chr6 0 170805979 3
chr7 0 159345973 4
chr8 0 145138636 4
chr9 0 138394717 5
chr10 0 133797422 5
chr11 0 135086622 6
chr12 0 133275309 6
chr13 0 114364328 6
chr14 0 107043718 7
chr15 0 101991189 7
chr16 0 90338345 8
chr17 0 83257441 8
chr18 0 80373285 8
chr19 0 58617616 8
chr20 0 64444167 9
chr21 0 46709983 9
chr22 0 50818468 10
chrX 0 156040895 11
chrY 0 57227415 11
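In this fixture the fourth BED column appears to assign each chromosome to a meta-token group (chr1 and chr2 share group 1, chr3 and chr4 share group 2, and so on), presumably the grouping from which the region_to_metatoken map in meta_tokenizer.rs is built. A hedged sketch of how one such line could be read (the field meanings are an assumption based on this fixture; the actual parsing lives in MetaTokenizer::try_from):

// Hypothetical parser for one line of the 4-column meta-universe BED:
// (chrom, start, end, meta-token group). The group column is assumed to
// identify which meta-token the region collapses into.
fn parse_meta_bed_line(line: &str) -> Option<(String, u32, u32, u32)> {
    let mut fields = line.split_whitespace();
    let chr = fields.next()?.to_string();
    let start: u32 = fields.next()?.parse().ok()?;
    let end: u32 = fields.next()?.parse().ok()?;
    let group: u32 = fields.next()?.parse().ok()?;
    Some((chr, start, end, group))
}

fn main() {
    let parsed = parse_meta_bed_line("chr1\t0\t248956422\t1");
    assert_eq!(parsed, Some(("chr1".to_string(), 0, 248_956_422, 1)));
}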
1 change: 1 addition & 0 deletions gtars/tests/data/tokenizer.meta.toml
@@ -0,0 +1 @@
universes = ["peaks.meta.bed", "chroms.meta.bed"]
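This one-line config lists the universes the meta tokenizer is built from; judging by the code above, the first entry is treated as the primary universe, the remaining entries are layered on top of it, and all paths are resolved relative to the config file. A minimal usage sketch based on the new test (the import path is assumed, and it assumes the tokenizer's universe field is reachable from outside the module, which the in-module test does not require):

use std::path::Path;

use gtars::tokenizers::meta_tokenizer::MetaTokenizer; // assumed module path

fn main() {
    // Build the tokenizer from the TOML config added in this commit;
    // "peaks.meta.bed" is the primary universe, "chroms.meta.bed" a secondary one.
    let tokenizer = MetaTokenizer::try_from(Path::new("tests/data/tokenizer.meta.toml"))
        .expect("failed to build meta tokenizer");

    // The new test expects 27 tokens in the resulting universe.
    assert_eq!(tokenizer.universe.len(), 27);
}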
