flush out the meta tokenizer
nleroy917 committed Jun 16, 2024
1 parent 8ba4294 commit 797b204
Showing 3 changed files with 280 additions and 6 deletions.
4 changes: 4 additions & 0 deletions gtars/src/common/models/universe.rs
@@ -48,6 +48,10 @@ impl Universe {
pub fn is_empty(&self) -> bool {
self.region_to_id.len() == 0
}

pub fn contains_region(&self, region: &Region) -> bool {
self.region_to_id.contains_key(region)
}
}

impl From<Vec<Region>> for Universe {
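
The new contains_region helper is what backs the dedup guard added in the tokenizer below. A minimal sketch of the pattern, assuming Universe::from(Vec<Region>) and insert_token are reachable from the caller (the gtars crate path and the coordinates here are illustrative):

use gtars::common::models::{Region, Universe};

fn main() {
    // Build a universe holding a single region.
    let mut universe = Universe::from(vec![Region {
        chr: "chr1".to_string(),
        start: 100,
        end: 200,
    }]);

    // A candidate that duplicates the existing region.
    let candidate = Region {
        chr: "chr1".to_string(),
        start: 100,
        end: 200,
    };

    // Only assign a new id when the region is not already in the
    // vocabulary; this mirrors the guard wrapped around
    // universe.insert_token in the tokenizer below.
    if !universe.contains_region(&candidate) {
        universe.insert_token(&candidate);
    }

    assert!(universe.contains_region(&candidate));
}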
279 changes: 275 additions & 4 deletions gtars/src/tokenizers/meta_tokenizer.rs
@@ -6,9 +6,11 @@ use anyhow::{Context, Result};
use rust_lapper::{Interval, Lapper};

use crate::common::consts::special_tokens::*;
-use crate::common::models::{Region, Universe};
+use crate::common::models::{Region, RegionSet, Universe, TokenizedRegionSet};
use crate::common::utils::get_dynamic_reader;
-use crate::tokenizers::TokenizerConfig;
+use crate::tokenizers::{TokenizerConfig, Tokenizer};
+
+use super::traits::SpecialTokens;

///
/// The MetaTokenizer is a TreeTokenizer that implements the concept of meta-tokens. Meta
@@ -113,7 +115,9 @@ impl TryFrom<&Path> for MetaTokenizer {
};

// update the universe with the metatoken
-universe.insert_token(&meta_region);
+if !universe.contains_region(&meta_region) {
+    universe.insert_token(&meta_region);
+}

// insert a region into the appropriate list
let ilist = intervals.entry(region.chr.clone()).or_default();
@@ -195,7 +199,9 @@ impl TryFrom<&Path> for MetaTokenizer {
};

// update the universe with the metatoken
-universe.insert_token(&meta_region);
+if !universe.contains_region(&meta_region) {
+    universe.insert_token(&meta_region);
+}

// insert a region into the appropriate list
let ilist = intervals.entry(region.chr.clone()).or_default();
@@ -282,11 +288,219 @@ impl TryFrom<&Path> for MetaTokenizer {
}
}

impl SpecialTokens for MetaTokenizer {
fn unknown_token(&self) -> Region {
Region {
chr: UNKNOWN_CHR.to_string(),
start: UNKNOWN_START as u32,
end: UNKNOWN_END as u32,
}
}

fn padding_token(&self) -> Region {
Region {
chr: PAD_CHR.to_string(),
start: PAD_START as u32,
end: PAD_END as u32,
}
}

fn mask_token(&self) -> Region {
Region {
chr: MASK_CHR.to_string(),
start: MASK_START as u32,
end: MASK_END as u32,
}
}

fn cls_token(&self) -> Region {
Region {
chr: CLS_CHR.to_string(),
start: CLS_START as u32,
end: CLS_END as u32,
}
}

fn bos_token(&self) -> Region {
Region {
chr: BOS_CHR.to_string(),
start: BOS_START as u32,
end: BOS_END as u32,
}
}

fn eos_token(&self) -> Region {
Region {
chr: EOS_CHR.to_string(),
start: EOS_START as u32,
end: EOS_END as u32,
}
}

fn sep_token(&self) -> Region {
Region {
chr: SEP_CHR.to_string(),
start: SEP_START as u32,
end: SEP_END as u32,
}
}

fn unknown_token_id(&self) -> u32 {
self.universe
.convert_region_to_id(&self.unknown_token())
.unwrap()
}

fn padding_token_id(&self) -> u32 {
self.universe
.convert_region_to_id(&self.padding_token())
.unwrap()
}

fn mask_token_id(&self) -> u32 {
self.universe
.convert_region_to_id(&self.mask_token())
.unwrap()
}

fn cls_token_id(&self) -> u32 {
self.universe
.convert_region_to_id(&self.cls_token())
.unwrap()
}

fn bos_token_id(&self) -> u32 {
self.universe
.convert_region_to_id(&self.bos_token())
.unwrap()
}

fn eos_token_id(&self) -> u32 {
self.universe
.convert_region_to_id(&self.eos_token())
.unwrap()
}

fn sep_token_id(&self) -> u32 {
self.universe
.convert_region_to_id(&self.sep_token())
.unwrap()
}
}

impl Tokenizer for MetaTokenizer {

fn vocab_size(&self) -> usize {
self.universe.len()
}

fn get_universe(&self) -> &Universe {
&self.universe
}

fn tokenize_region(&self, region: &Region) -> TokenizedRegionSet {
let lapper = self.tree.get(&region.chr);

match lapper {
Some(lapper) => {
let intervals = lapper.find(region.start, region.end);
let mut ids: Vec<u32> = intervals.map(|interval| interval.val).collect();

// tokenized to nothing... check secondary trees
if ids.is_empty() {
// oh, we have no secondary trees, just return the unknown token
if self.secondary_trees.is_none() {
ids = vec![self.unknown_token_id()];
// iterate over secondary trees and check if the region is in any of them
} else {
for s_tree in self.secondary_trees.as_ref().unwrap() {
// default to unknown token
ids = vec![self.unknown_token_id()];

let s_lapper = s_tree.get(&region.chr);
if s_lapper.is_none() {
continue;
}
// get overlapped intervals -- map to regions
let intervals = s_lapper.unwrap().find(region.start, region.end);
let regions: Vec<u32> =
intervals.map(|interval| interval.val).collect();

// a hit
if !regions.is_empty() {
ids = regions;
break;
}
}
}
}

TokenizedRegionSet {
ids,
universe: &self.universe,
}
}
// primary universe didn't have that chromosome/contig/seqname
// so, check secondary trees
None => {
let mut ids = Vec::new();
// oh, we have no secondary trees, just return the unknown token
if self.secondary_trees.is_none() {
ids = vec![self.unknown_token_id()];
// iterate over secondary trees and check if the region is in any of them
} else {
for s_tree in self.secondary_trees.as_ref().unwrap() {
// default to unknown token
ids = vec![self.unknown_token_id()];

let s_lapper = s_tree.get(&region.chr);
if s_lapper.is_none() {
continue;
}

// get overlapped intervals -- map to regions
let intervals = s_lapper.unwrap().find(region.start, region.end);
let regions: Vec<u32> = intervals.map(|interval| interval.val).collect();

// a hit
if !regions.is_empty() {
ids = regions;
break;
} else {
ids = vec![self.unknown_token_id()];
}
}
}

TokenizedRegionSet {
ids,
universe: &self.universe,
}
}
}
}

fn tokenize_region_set(&self, region_set: &RegionSet) -> TokenizedRegionSet {
let mut tokenized_regions: Vec<u32> = Vec::new();

for region in region_set {
let tokenized_region = self.tokenize_region(region);
tokenized_regions.extend(tokenized_region.ids);
}

TokenizedRegionSet {
ids: tokenized_regions,
universe: &self.universe,
}
}
}

// tests
#[cfg(test)]
mod tests {

use crate::common::models::RegionSet;

use super::*;
use pretty_assertions::assert_eq;
use rstest::*;
@@ -306,4 +520,61 @@ mod tests {
let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
assert_eq!(tokenizer.universe.len(), 27);
}

#[rstest]
fn test_does_tokenize(path_to_config_file: &str, path_to_tokenize_bed_file: &str) {
let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap();
let region_set = RegionSet::try_from(Path::new(path_to_tokenize_bed_file)).unwrap();

let tokens = tokenizer.tokenize_region_set(&region_set);

assert_eq!(tokens.len(), 4);
}

#[rstest]
fn test_tokenize_to_first_second_unk(path_to_config_file: &str) {
let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap();

let r1 = Region { // tokenize to id 1
chr: "chr4".to_string(),
start: 16270184,
end: 16270240
};

let r2 = Region { // drops through to the secondary and tokenizes to id 13
chr: "chr10".to_string(),
start: 705762,
end: 705762
};

let r3 = Region { // unknown token, so should be id 20
chr: "chrY".to_string(),
start: 1000000,
end: 1000000
};

assert_eq!(tokenizer.tokenize_region(&r1).ids, vec![1]);
assert_eq!(tokenizer.tokenize_region(&r2).ids, vec![13]);
assert_eq!(tokenizer.tokenize_region(&r3).ids, vec![20]);
}

#[rstest]
fn test_multiple_regions_to_one_meta_id(path_to_config_file: &str) {
let tokenizer = MetaTokenizer::try_from(Path::new(path_to_config_file)).unwrap();

let r1 = Region { // tokenize to id 2
chr: "chr10".to_string(),
start: 70576220,
end: 70576251
};

let r2 = Region { // tokenize to id 2
chr: "chr2".to_string(),
start: 203871487,
end: 203871688
};

assert_eq!(tokenizer.tokenize_region(&r1).ids, vec![2]);
assert_eq!(tokenizer.tokenize_region(&r2).ids, vec![2]);
}
}
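
With the Tokenizer impl in place, end-to-end usage mirrors the tests above. A sketch under the assumption that MetaTokenizer and the Tokenizer trait are exported from gtars::tokenizers; the file paths are placeholders, not files shipped with the repo:

use std::path::Path;

use gtars::common::models::RegionSet;
use gtars::tokenizers::{MetaTokenizer, Tokenizer};

fn main() -> anyhow::Result<()> {
    // Placeholder paths; point these at a real meta-tokenizer config and BED file.
    let tokenizer = MetaTokenizer::try_from(Path::new("tokenizer.meta.toml"))?;
    let regions = RegionSet::try_from(Path::new("to_tokenize.bed"))?;

    // tokenize_region_set concatenates the ids produced per region, falling
    // back to the secondary trees and then the unknown token on a miss.
    let tokens = tokenizer.tokenize_region_set(&regions);
    println!("{} ids from a vocabulary of {}", tokens.len(), tokenizer.vocab_size());

    Ok(())
}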
3 changes: 1 addition & 2 deletions gtars/tests/data/chroms.meta.bed
@@ -20,5 +20,4 @@ chr19 0 58617616 cluster_8
chr20 0 64444167 cluster_9
chr21 0 46709983 cluster_9
chr22 0 50818468 cluster_10
-chrX 0 156040895 cluster_11
-chrY 0 57227415 cluster_11
+chrX 0 156040895 cluster_11
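
For reference, the fourth column of this universe file is the cluster label that defines the meta-tokens: regions sharing a label collapse to one vocabulary id (which is how the test above maps regions on chr10 and chr2 to the same token). A hypothetical two-cluster universe:

chr1 0 1000 cluster_0
chr2 0 1000 cluster_0
chr3 0 1000 cluster_1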
