Merge pull request #24 from databio/dev
Release `v0.0.13` -- Add fragment file tokenizer
nleroy917 authored Jun 3, 2024
2 parents dab4beb + 9760d04 commit f40fc8c
Showing 16 changed files with 456 additions and 17 deletions.
2 changes: 1 addition & 1 deletion bindings/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "genimtools-py"
-version = "0.0.12"
+version = "0.0.13"
edition = "2021"

# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
1 change: 1 addition & 0 deletions bindings/genimtools/models/__init__.py
@@ -0,0 +1 @@
from .genimtools.models import * # noqa: F403
9 changes: 9 additions & 0 deletions bindings/genimtools/models/__init__.pyi
@@ -0,0 +1,9 @@
from typing import List

class Region:
    chr: str
    start: int
    end: int

class RegionSet:
    regions: List[Region]
17 changes: 17 additions & 0 deletions bindings/genimtools/tokenizers/__init__.pyi
@@ -366,4 +366,21 @@ class TreeTokenizer:
    def __repr__(self) -> str:
        """
        Get a string representation of the tokenizer.
        """

class FragmentTokenizer:
    def __new__(cls, path: str) -> FragmentTokenizer:
        """
        Construct a new FragmentTokenizer from a universe file.
        :param path: The path to the universe file. This should be a BED file.
        """

    def tokenize_fragments(self, file_path: str, out_path: str = None, filter: List[str] = None) -> None:
        """
        Tokenize a file containing fragments.
        :param file_path: The path to the file containing fragments.
        :param out_path: The path to the output file. If None, the output is written to the standard output.
        :param filter: A list of chromosomes to filter. If None, all chromosomes are included.
        """
4 changes: 2 additions & 2 deletions bindings/src/lib.rs
@@ -15,13 +15,13 @@ fn genimtools(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
    let tokenize_module = pyo3::wrap_pymodule!(tokenizers::tokenizers);
    let ailist_module = pyo3::wrap_pymodule!(ailist::ailist);
    let utils_module = pyo3::wrap_pymodule!(utils::utils);
-    let models_modeule = pyo3::wrap_pymodule!(models::models);
+    let models_module = pyo3::wrap_pymodule!(models::models);

    m.add_wrapped(vocab_module)?;
    m.add_wrapped(tokenize_module)?;
    m.add_wrapped(ailist_module)?;
    m.add_wrapped(utils_module)?;
-    m.add_wrapped(models_modeule)?;
+    m.add_wrapped(models_module)?;

    let sys = PyModule::import_bound(py, "sys")?;
    let binding = sys.getattr("modules")?;
34 changes: 34 additions & 0 deletions bindings/src/tokenizers/fragments_tokenizer.rs
@@ -0,0 +1,34 @@
use pyo3::prelude::*;

#[pyclass(name = "FragmentTokenizer")]
pub struct PyFragmentTokenizer {
    pub tokenizer: genimtools::tokenizers::FragmentTokenizer,
}

#[pymethods]
impl PyFragmentTokenizer {
    #[new]
    pub fn new(path: String) -> PyResult<Self> {
        let path = std::path::Path::new(&path);
        let tokenizer = genimtools::tokenizers::FragmentTokenizer::try_from(path)?;
        Ok(PyFragmentTokenizer { tokenizer })
    }

    pub fn tokenize_fragments(
        &self,
        file: String,
        out_path: Option<String>,
        filter: Option<Vec<String>>,
    ) -> PyResult<()> {
        let path = std::path::Path::new(&file);
        let out_path = out_path.unwrap_or("".to_string());
        let out_path = std::path::Path::new(&out_path);
        match filter {
            Some(filter) => self
                .tokenizer
                .tokenize_fragments_with_filter(path, out_path, filter),
            None => self.tokenizer.tokenize_fragments(path, out_path),
        }?;
        Ok(())
    }
}
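For context on what this new binding wraps, here is a minimal sketch of driving the underlying Rust `FragmentTokenizer` from the `genimtools` crate directly. The universe, fragment, and output paths are hypothetical, and the chromosome filter simply illustrates the `tokenize_fragments_with_filter` branch taken above when `filter` is `Some`.

```rust
use std::path::Path;

use genimtools::tokenizers::FragmentTokenizer;

fn main() {
    // Build the tokenizer from a universe BED file (hypothetical path).
    let tokenizer = FragmentTokenizer::try_from(Path::new("universe.bed"))
        .expect("failed to build tokenizer from universe file");

    // Tokenize a fragments file, keeping only the listed chromosomes
    // (hypothetical input and output paths).
    let filter = vec!["chr1".to_string(), "chr2".to_string()];
    tokenizer
        .tokenize_fragments_with_filter(
            Path::new("fragments.tsv.gz"),
            Path::new("fragments.gtok"),
            filter,
        )
        .expect("failed to tokenize fragments");
}
```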
3 changes: 3 additions & 0 deletions bindings/src/tokenizers/mod.rs
@@ -1,7 +1,9 @@
mod fragments_tokenizer;
mod tree_tokenizer;

use pyo3::prelude::*;

pub use self::fragments_tokenizer::PyFragmentTokenizer;
pub use self::tree_tokenizer::PyTreeTokenizer;
pub use crate::models::{
    PyRegion, PyRegionSet, PyTokenizedRegion, PyTokenizedRegionSet, PyUniverse,
@@ -10,6 +12,7 @@ pub use crate::models::{
#[pymodule]
pub fn tokenizers(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> {
    m.add_class::<PyTreeTokenizer>()?;
    m.add_class::<PyFragmentTokenizer>()?;
    m.add_class::<PyRegion>()?;
    m.add_class::<PyTokenizedRegionSet>()?;
    m.add_class::<PyTokenizedRegion>()?;
1 change: 0 additions & 1 deletion bindings/src/tokenizers/tree_tokenizer.rs
@@ -155,7 +155,6 @@ impl PyTreeTokenizer {

            Ok(py_tokenized_region_set)
        })
-
    }

    // encode returns a list of ids
2 changes: 1 addition & 1 deletion genimtools/Cargo.toml
@@ -1,6 +1,6 @@
[package]
name = "genimtools"
-version = "0.0.12"
+version = "0.0.13"
edition = "2021"
description = "Performance-critical tools to manipulate, analyze, and process genomic interval data. Primarily focused on building tools for geniml - our genomic machine learning python package."
license = "MIT"
4 changes: 4 additions & 0 deletions genimtools/docs/changelog.md
@@ -4,6 +4,10 @@ All notable changes to this project will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [0.0.13]
- implemented a fragment file tokenizer that generates `.gtok` files directly from `fragments.tsv.gz` files.
- fixed an off-by-one error in the `region_to_id` maps in the `Universe` structs. This was leading to critical bugs in our models.

## [0.0.12]
- optimize creation of `PyRegionSet` to reduce expensive cloning of `Universe` structs.

4 changes: 2 additions & 2 deletions genimtools/src/common/models/universe.rs
@@ -17,7 +17,7 @@ pub struct Universe {

impl Universe {
    pub fn insert_token(&mut self, region: &Region) {
-        let new_id = self.region_to_id.len() + 1;
+        let new_id = self.region_to_id.len();
        self.region_to_id.insert(region.to_owned(), new_id as u32);
        self.id_to_region.insert(new_id as u32, region.to_owned());
    }
@@ -82,4 +82,4 @@ impl TryFrom<&Path> for Universe {
            id_to_region,
        })
    }
-}
+}
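To make the off-by-one fix concrete: `insert_token` previously assigned the first region the ID `1`, leaving ID `0` unused, while the rest of the crate (e.g. `generate_region_to_id_map`) starts counting at `0`. With the fix, IDs form the contiguous range `0..len`. A rough sketch follows, assuming a `Universe` built from a hypothetical universe BED via the `TryFrom<&Path>` impl shown above, and assuming its `region_to_id`/`id_to_region` maps and the import path below are accessible from calling code (both are assumptions):

```rust
use std::path::Path;

// Assumed import path for the Universe model.
use genimtools::common::models::Universe;

fn main() {
    // Hypothetical universe BED file.
    let universe = Universe::try_from(Path::new("universe.bed"))
        .expect("failed to load universe");

    // With IDs assigned from `len()` before insertion, the first region gets
    // ID 0 and the ID space is exactly 0..n, with no unused slot at 0.
    let n = universe.region_to_id.len() as u32;
    assert!(universe.id_to_region.contains_key(&0));
    assert!(!universe.id_to_region.contains_key(&n));
}
```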
26 changes: 16 additions & 10 deletions genimtools/src/common/utils.rs
@@ -10,6 +10,20 @@ use flate2::read::GzDecoder;

use crate::common::models::region::Region;

pub fn get_dynamic_reader(path: &Path) -> Result<BufReader<Box<dyn Read>>> {
    let is_gzipped = path.extension() == Some(OsStr::new("gz"));
    let file = File::open(path).with_context(|| "Failed to open bed file.")?;

    let file: Box<dyn Read> = match is_gzipped {
        true => Box::new(GzDecoder::new(file)),
        false => Box::new(file),
    };

    let reader = BufReader::new(file);

    Ok(reader)
}

pub fn generate_region_to_id_map(regions: &[Region]) -> HashMap<Region, u32> {
    let mut current_id = 0;
    let mut region_to_id: HashMap<Region, u32> = HashMap::new();
@@ -39,17 +53,9 @@ pub fn generate_id_to_region_map(regions: &[Region]) -> HashMap<u32, Region> {
}

pub fn extract_regions_from_bed_file(path: &Path) -> Result<Vec<Region>> {
-    let mut regions = Vec::new();
-
-    let is_gzipped = path.extension() == Some(OsStr::new("gz"));
-    let file = File::open(path).with_context(|| "Failed to open bed file.")?;
+    let reader = get_dynamic_reader(path)?;

-    let file: Box<dyn Read> = match is_gzipped {
-        true => Box::new(GzDecoder::new(file)),
-        false => Box::new(file),
-    };
-
-    let reader = BufReader::new(file);
+    let mut regions = Vec::new();

    for line in reader.lines() {
        let line = line.with_context(|| "Failed parsing line in BED file")?;
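A quick sketch of how the new `get_dynamic_reader` helper can be used on its own, assuming it is reachable at the module path below (an assumption) and given a hypothetical BED file; the `.gz` extension check selects a `GzDecoder`, otherwise the file is read as plain text.

```rust
use std::io::BufRead;
use std::path::Path;

// Assumed import path; the helper is defined in genimtools's common utils module.
use genimtools::common::utils::get_dynamic_reader;

fn main() -> anyhow::Result<()> {
    // Hypothetical gzipped BED file; a plain "regions.bed" would work the same way.
    let reader = get_dynamic_reader(Path::new("regions.bed.gz"))?;

    for line in reader.lines() {
        println!("{}", line?);
    }

    Ok(())
}
```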
81 changes: 81 additions & 0 deletions genimtools/src/io/mod.rs
@@ -1,4 +1,5 @@
use std::fs::File;
use std::fs::OpenOptions;
use std::io::{BufReader, BufWriter, Read, Write};

use anyhow::{Context, Result};
@@ -101,3 +102,83 @@ pub fn read_tokens_from_gtok(filename: &str) -> Result<Vec<u32>> {

    Ok(tokens)
}

///
/// Initialize a `.gtok` file with a header and size flag.
/// # Arguments
/// - filename: the file to initialize
///
/// # Returns
/// - Result<(), anyhow::Error>
pub fn init_gtok_file(filename: &str) -> Result<()> {
    // make sure the path exists
    let path = std::path::Path::new(filename);

    if let Some(parent) = path.parent() {
        std::fs::create_dir_all(parent)?;
    } else {
        anyhow::bail!("Failed to create parent directories for gtok file!")
    }

    let file = File::create(filename).with_context(|| "Failed to create gtok file!")?;
    let mut writer = BufWriter::new(file);

    writer
        .write_all(GTOK_HEADER)
        .with_context(|| "Failed to write GTOK header to file!")?;

    // assume large and write u32 flag
    writer
        .write_all(&GTOK_U32_FLAG.to_le_bytes())
        .with_context(|| "Failed to write GTOK size flag to file!")?;

    Ok(())
}

pub fn append_tokens_to_gtok_file(filename: &str, tokens: &[u32]) -> Result<()> {
    let file = File::open(filename).with_context(|| "Failed to open gtok file!")?;

    let mut reader = BufReader::new(file);

    // check the header
    let mut header = [0; 4];
    reader.read_exact(&mut header)?;

    if &header != GTOK_HEADER {
        anyhow::bail!("File doesn't appear to be a valid .gtok file.")
    }

    // detect the size flag
    let mut size_flag = [0; 1];
    reader.read_exact(&mut size_flag)?;

    // start appending to the open file
    // must reopen because `BufReader` takes ownership of `file`.
    let file = OpenOptions::new()
        .append(true)
        .open(filename)
        .with_context(|| "Failed to open gtok file for appending")?;
    let mut writer = BufWriter::new(file);

    match size_flag {
        [GTOK_U16_FLAG] => {
            for token in tokens {
                writer
                    .write_all(&(*token as u16).to_le_bytes())
                    .with_context(|| "Failed to write bytes to file!")?;
            }
        }
        [GTOK_U32_FLAG] => {
            for token in tokens {
                writer
                    .write_all(&token.to_le_bytes())
                    .with_context(|| "Failed to write bytes to file!")?;
            }
        }
        _ => {
            anyhow::bail!("Invalid data format flag found in gtok file")
        }
    }

    Ok(())
}
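A minimal sketch of how these `.gtok` helpers might fit together, along with the existing `read_tokens_from_gtok` referenced in the hunk header above. The import path and output file name are assumptions:

```rust
// Assumed import path for the .gtok helpers.
use genimtools::io::{append_tokens_to_gtok_file, init_gtok_file, read_tokens_from_gtok};

fn main() -> anyhow::Result<()> {
    // Hypothetical output path; init_gtok_file creates parent directories as needed.
    let path = "out/example.gtok";

    // Write the GTOK header plus the u32 size flag, then append token IDs.
    init_gtok_file(path)?;
    append_tokens_to_gtok_file(path, &[1, 2, 3, 42])?;

    // Read the IDs back; the size flag written at init determines whether
    // values are decoded as u16 or u32.
    let tokens = read_tokens_from_gtok(path)?;
    assert_eq!(tokens, vec![1, 2, 3, 42]);

    Ok(())
}
```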