Merge pull request #24 from databio/dev
Release `v0.0.13` -- Add fragment file tokenizer
nleroy917 authored Jun 3, 2024
2 parents dab4beb + 9760d04 commit f40fc8c
Showing 16 changed files with 456 additions and 17 deletions.
2 changes: 1 addition & 1 deletion bindings/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "genimtools-py"
-version = "0.0.12"
+version = "0.0.13"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
1 change: 1 addition & 0 deletions bindings/genimtools/models/__init__.py
@@ -0,0 +1 @@
from .genimtools.models import * # noqa: F403
9 changes: 9 additions & 0 deletions bindings/genimtools/models/__init__.pyi
@@ -0,0 +1,9 @@
from typing import List

class Region:
    chr: str
    start: int
    end: int

class RegionSet:
    regions: List[Region]
17 changes: 17 additions & 0 deletions bindings/genimtools/tokenizers/__init__.pyi
@@ -366,4 +366,21 @@ class TreeTokenizer:
    def __repr__(self) -> str:
        """
        Get a string representation of the tokenizer.
        """

class FragmentTokenizer:
    def __new__(cls, path: str) -> FragmentTokenizer:
        """
        Construct a new FragmentTokenizer from a universe file.
        :param path: The path to the universe file. This should be a BED file.
        """

    def tokenize_fragments(self, file_path: str, out_path: str = None, filter: List[str] = None) -> None:
        """
        Tokenize a file containing fragments.
        :param file_path: The path to the file containing fragments.
        :param out_path: The path to the output file. If None, the output is written to the standard output.
        :param filter: A list of chromosomes to filter. If None, all chromosomes are included.
        """
4 changes: 2 additions & 2 deletions bindings/src/lib.rs
@@ -15,13 +15,13 @@ fn genimtools(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
    let tokenize_module = pyo3::wrap_pymodule!(tokenizers::tokenizers);
    let ailist_module = pyo3::wrap_pymodule!(ailist::ailist);
    let utils_module = pyo3::wrap_pymodule!(utils::utils);
-    let models_modeule = pyo3::wrap_pymodule!(models::models);
+    let models_module = pyo3::wrap_pymodule!(models::models);

    m.add_wrapped(vocab_module)?;
    m.add_wrapped(tokenize_module)?;
    m.add_wrapped(ailist_module)?;
    m.add_wrapped(utils_module)?;
-    m.add_wrapped(models_modeule)?;
+    m.add_wrapped(models_module)?;

    let sys = PyModule::import_bound(py, "sys")?;
    let binding = sys.getattr("modules")?;
34 changes: 34 additions & 0 deletions bindings/src/tokenizers/fragments_tokenizer.rs
@@ -0,0 +1,34 @@
use pyo3::prelude::*;

#[pyclass(name = "FragmentTokenizer")]
pub struct PyFragmentTokenizer {
    pub tokenizer: genimtools::tokenizers::FragmentTokenizer,
}

#[pymethods]
impl PyFragmentTokenizer {
    #[new]
    pub fn new(path: String) -> PyResult<Self> {
        let path = std::path::Path::new(&path);
        let tokenizer = genimtools::tokenizers::FragmentTokenizer::try_from(path)?;
        Ok(PyFragmentTokenizer { tokenizer })
    }

    pub fn tokenize_fragments(
        &self,
        file: String,
        out_path: Option<String>,
        filter: Option<Vec<String>>,
    ) -> PyResult<()> {
        let path = std::path::Path::new(&file);
        let out_path = out_path.unwrap_or("".to_string());
        let out_path = std::path::Path::new(&out_path);
        match filter {
            Some(filter) => self
                .tokenizer
                .tokenize_fragments_with_filter(path, out_path, filter),
            None => self.tokenizer.tokenize_fragments(path, out_path),
        }?;
        Ok(())
    }
}
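For context on what this new binding wraps, here is a minimal sketch of driving the underlying Rust `FragmentTokenizer` from the `genimtools` crate directly. The universe, fragment, and output paths are hypothetical, and the chromosome filter simply illustrates the `tokenize_fragments_with_filter` branch taken above when `filter` is `Some`.

```rust
use std::path::Path;

use genimtools::tokenizers::FragmentTokenizer;

fn main() {
    // Build the tokenizer from a universe BED file (hypothetical path).
    let tokenizer = FragmentTokenizer::try_from(Path::new("universe.bed"))
        .expect("failed to build tokenizer from universe file");

    // Tokenize a fragments file, keeping only the listed chromosomes
    // (hypothetical input and output paths).
    let filter = vec!["chr1".to_string(), "chr2".to_string()];
    tokenizer
        .tokenize_fragments_with_filter(
            Path::new("fragments.tsv.gz"),
            Path::new("fragments.gtok"),
            filter,
        )
        .expect("failed to tokenize fragments");
}
```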
3 changes: 3 additions & 0 deletions bindings/src/tokenizers/mod.rs
@@ -1,7 +1,9 @@
mod fragments_tokenizer;
mod tree_tokenizer;

use pyo3::prelude::*;

pub use self::fragments_tokenizer::PyFragmentTokenizer;
pub use self::tree_tokenizer::PyTreeTokenizer;
pub use crate::models::{
    PyRegion, PyRegionSet, PyTokenizedRegion, PyTokenizedRegionSet, PyUniverse,
@@ -10,6 +12,7 @@ pub use crate::models::{
#[pymodule]
pub fn tokenizers(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<PyTreeTokenizer>()?;
    m.add_class::<PyFragmentTokenizer>()?;
    m.add_class::<PyRegion>()?;
    m.add_class::<PyTokenizedRegionSet>()?;
    m.add_class::<PyTokenizedRegion>()?;
1 change: 0 additions & 1 deletion bindings/src/tokenizers/tree_tokenizer.rs
@@ -155,7 +155,6 @@ impl PyTreeTokenizer {

            Ok(py_tokenized_region_set)
        })
-
    }

    // encode returns a list of ids
2 changes: 1 addition & 1 deletion genimtools/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "genimtools"
-version = "0.0.12"
+version = "0.0.13"
edition = "2021"
description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package."
license = "MIT"
4 changes: 4 additions & 0 deletions genimtools/docs/changelog.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.0.13]
- implemented a fragment file tokenizer that generates `.gtok` files directly from `fragments.tsv.gz` files.
- fixed an off-by-one error in the `region_to_id` maps in the `Universe` structs. This was leading to critical bugs in our models.

## [0.0.12]
- optimize creation of `PyRegionSet` to reduce expensive cloning of `Universe` structs.

4 changes: 2 additions & 2 deletions genimtools/src/common/models/universe.rs
@@ -17,7 +17,7 @@ pub struct Universe {

impl Universe {
    pub fn insert_token(&mut self, region: &Region) {
-        let new_id = self.region_to_id.len() + 1;
+        let new_id = self.region_to_id.len();
        self.region_to_id.insert(region.to_owned(), new_id as u32);
        self.id_to_region.insert(new_id as u32, region.to_owned());
    }
@@ -82,4 +82,4 @@ impl TryFrom<&Path> for Universe {
            id_to_region,
        })
    }
-}
+}
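To make the off-by-one fix concrete: `insert_token` previously assigned the first region the ID `1`, leaving ID `0` unused, while the rest of the crate (e.g. `generate_region_to_id_map`) starts counting at `0`. With the fix, IDs form the contiguous range `0..len`. A rough sketch follows, assuming a `Universe` built from a hypothetical universe BED via the `TryFrom<&Path>` impl shown above, and assuming its `region_to_id`/`id_to_region` maps and the import path below are accessible from calling code (both are assumptions):

```rust
use std::path::Path;

// Assumed import path for the Universe model.
use genimtools::common::models::Universe;

fn main() {
    // Hypothetical universe BED file.
    let universe = Universe::try_from(Path::new("universe.bed"))
        .expect("failed to load universe");

    // With IDs assigned from `len()` before insertion, the first region gets
    // ID 0 and the ID space is exactly 0..n, with no unused slot at 0.
    let n = universe.region_to_id.len() as u32;
    assert!(universe.id_to_region.contains_key(&0));
    assert!(!universe.id_to_region.contains_key(&n));
}
```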
26 changes: 16 additions & 10 deletions genimtools/src/common/utils.rs
@@ -10,6 +10,20 @@ use flate2::read::GzDecoder;

use crate::common::models::region::Region;

pub fn get_dynamic_reader(path: &Path) -> Result<BufReader<Box<dyn Read>>> {
    let is_gzipped = path.extension() == Some(OsStr::new("gz"));
    let file = File::open(path).with_context(|| "Failed to open bed file.")?;

    let file: Box<dyn Read> = match is_gzipped {
        true => Box::new(GzDecoder::new(file)),
        false => Box::new(file),
    };

    let reader = BufReader::new(file);

    Ok(reader)
}

pub fn generate_region_to_id_map(regions: &[Region]) -> HashMap<Region, u32> {
    let mut current_id = 0;
    let mut region_to_id: HashMap<Region, u32> = HashMap::new();
@@ -39,17 +53,9 @@ pub fn generate_id_to_region_map(regions: &[Region]) -> HashMap<u32, Region> {
}

pub fn extract_regions_from_bed_file(path: &Path) -> Result<Vec<Region>> {
-    let mut regions = Vec::new();
-
-    let is_gzipped = path.extension() == Some(OsStr::new("gz"));
-    let file = File::open(path).with_context(|| "Failed to open bed file.")?;
+    let reader = get_dynamic_reader(path)?;

-    let file: Box<dyn Read> = match is_gzipped {
-        true => Box::new(GzDecoder::new(file)),
-        false => Box::new(file),
-    };
-
-    let reader = BufReader::new(file);
+    let mut regions = Vec::new();

    for line in reader.lines() {
        let line = line.with_context(|| "Failed parsing line in BED file")?;
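A quick sketch of how the new `get_dynamic_reader` helper can be used on its own, assuming it is reachable at the module path below (an assumption) and given a hypothetical BED file; the `.gz` extension check selects a `GzDecoder`, otherwise the file is read as plain text.

```rust
use std::io::BufRead;
use std::path::Path;

// Assumed import path; the helper is defined in genimtools's common utils module.
use genimtools::common::utils::get_dynamic_reader;

fn main() -> anyhow::Result<()> {
    // Hypothetical gzipped BED file; a plain "regions.bed" would work the same way.
    let reader = get_dynamic_reader(Path::new("regions.bed.gz"))?;

    for line in reader.lines() {
        println!("{}", line?);
    }

    Ok(())
}
```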
81 changes: 81 additions & 0 deletions genimtools/src/io/mod.rs
@@ -1,4 +1,5 @@
use std::fs::File;
use std::fs::OpenOptions;
use std::io::{BufReader, BufWriter, Read, Write};

use anyhow::{Context, Result};
@@ -101,3 +102,83 @@ pub fn read_tokens_from_gtok(filename: &str) -> Result<Vec<u32>> {

    Ok(tokens)
}

///
/// Initialize a `.gtok` file with a header and size flag.
/// # Arguments
/// - filename: the file to initialize
///
/// # Returns
/// - Result<(), anyhow::Error>
pub fn init_gtok_file(filename: &str) -> Result<()> {
    // make sure the path exists
    let path = std::path::Path::new(filename);

    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)?;
    } else {
        anyhow::bail!("Failed to create parent directories for gtok file!")
    }

    let file = File::create(filename).with_context(|| "Failed to create gtok file!")?;
    let mut writer = BufWriter::new(file);

    writer
        .write_all(GTOK_HEADER)
        .with_context(|| "Failed to write GTOK header to file!")?;

    // assume large and write u32 flag
    writer
        .write_all(&GTOK_U32_FLAG.to_le_bytes())
        .with_context(|| "Failed to write GTOK size flag to file!")?;

    Ok(())
}

pub fn append_tokens_to_gtok_file(filename: &str, tokens: &[u32]) -> Result<()> {
    let file = File::open(filename).with_context(|| "Failed to open gtok file!")?;

    let mut reader = BufReader::new(file);

    // check the header
    let mut header = [0; 4];
    reader.read_exact(&mut header)?;

    if &header != GTOK_HEADER {
        anyhow::bail!("File doesn't appear to be a valid .gtok file.")
    }

    // detect the size flag
    let mut size_flag = [0; 1];
    reader.read_exact(&mut size_flag)?;

    // start appending to the open file
    // must reopen because `BufReader` takes ownership of `file`.
    let file = OpenOptions::new()
        .append(true)
        .open(filename)
        .with_context(|| "Failed to open gtok file for appending")?;
    let mut writer = BufWriter::new(file);

    match size_flag {
        [GTOK_U16_FLAG] => {
            for token in tokens {
                writer
                    .write_all(&(*token as u16).to_le_bytes())
                    .with_context(|| "Failed to write bytes to file!")?;
            }
        }
        [GTOK_U32_FLAG] => {
            for token in tokens {
                writer
                    .write_all(&token.to_le_bytes())
                    .with_context(|| "Failed to write bytes to file!")?;
            }
        }
        _ => {
            anyhow::bail!("Invalid data format flag found in gtok file")
        }
    }

    Ok(())
}
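A minimal sketch of how these `.gtok` helpers might fit together, along with the existing `read_tokens_from_gtok` referenced in the hunk header above. The import path and output file name are assumptions:

```rust
// Assumed import path for the .gtok helpers.
use genimtools::io::{append_tokens_to_gtok_file, init_gtok_file, read_tokens_from_gtok};

fn main() -> anyhow::Result<()> {
    // Hypothetical output path; init_gtok_file creates parent directories as needed.
    let path = "out/example.gtok";

    // Write the GTOK header plus the u32 size flag, then append token IDs.
    init_gtok_file(path)?;
    append_tokens_to_gtok_file(path, &[1, 2, 3, 42])?;

    // Read the IDs back; the size flag written at init determines whether
    // values are decoded as u16 or u32.
    let tokens = read_tokens_from_gtok(path)?;
    assert_eq!(tokens, vec![1, 2, 3, 42]);

    Ok(())
}
```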