diff --git a/README.md b/README.md index 3a71fc27..717436d8 100644 --- a/README.md +++ b/README.md @@ -17,9 +17,11 @@ This repo is organized like so: -1. A rust library crate (`/gtars/lib.rs`) that provides functions, traits, and structs for working with genomic interval data. -2. A rust binary crate (in `/gtars/main.rs`), a small, wrapper command-line interface for the library crate. -3. A rust crate (in `/bindings`) that provides Python bindings, and a resulting Python package, so that it can be used within Python. +1. The main gtars rust package in `/gtars`, which contains two crates: + 1a. A rust library crate (`/gtars/lib.rs`) that provides functions, traits, and structs for working with genomic interval data. + 1b. A rust binary crate (in `/gtars/main.rs`), a small, wrapper command-line interface for the library crate. +2. Python bindings (in `/bindings/python`), which consist of a rust package with a library crate (no binary crate) and a Python package. +3. R bindings (in `/bindings/r`), which consist of an R package. This repository is a work in progress, and still in early development. diff --git a/bindings/python/README.md b/bindings/python/README.md index 267eab85..52e025c2 100644 --- a/bindings/python/README.md +++ b/bindings/python/README.md @@ -1,13 +1,17 @@ # gtars + This is a python wrapper around the `gtars` crate. It provides an easy interface for using `gtars` in python. It is currently in early development, and as such, it does not have a lot of functionality yet, but new tools are being worked on right now. ## Installation + You can get `gtars` from PyPI: + ```bash pip install gtars ``` ## Usage + Import the package, and use the tools: ```python import gtars as gt @@ -15,4 +19,17 @@ import gtars as gt gt.prune_universe(...) ``` ## Developer docs -Write the develop docs here... 
\ No newline at end of file + +To build for development: + +```bash +cd bindings/python +maturin build --release +``` + +Then install the local wheel that was just built: + +``` +version=`grep '^version =' Cargo.toml | cut -d '"' -f 2` +pip install --force-reinstall target/wheels/gtars-${version}-cp312-cp312-manylinux_2_38_x86_64.whl +``` diff --git a/bindings/python/gtars/digests/__init__.py b/bindings/python/gtars/digests/__init__.py new file mode 100644 index 00000000..d21d6228 --- /dev/null +++ b/bindings/python/gtars/digests/__init__.py @@ -0,0 +1 @@ +from .gtars.digests import * # noqa: F403 diff --git a/bindings/python/src/digests/mod.rs b/bindings/python/src/digests/mod.rs new file mode 100644 index 00000000..f51ef963 --- /dev/null +++ b/bindings/python/src/digests/mod.rs @@ -0,0 +1,71 @@ +// This is intended to provide minimal Python bindings to functions in the `digests` module of the `gtars` crate. + +use pyo3::prelude::*; +use gtars::digests::{sha512t24u, md5, DigestResult}; + +#[pyfunction] +pub fn sha512t24u_digest(readable: &str) -> String { + return sha512t24u(readable); +} + +#[pyfunction] +pub fn md5_digest(readable: &str) -> String { + return md5(readable); +} + +#[pyfunction] +pub fn digest_fasta(fasta: &str) -> PyResult> { + match gtars::digests::digest_fasta(fasta) { + Ok(digest_results) => { + let py_digest_results: Vec = digest_results.into_iter().map(PyDigestResult::from).collect(); + Ok(py_digest_results) + }, + Err(e) => Err(PyErr::new::(format!("Error processing FASTA file: {}", e))), + } +} + +#[pyclass] +#[pyo3(name="DigestResult")] +pub struct PyDigestResult { + #[pyo3(get,set)] + pub id: String, + #[pyo3(get,set)] + pub length: usize, + #[pyo3(get,set)] + pub sha512t24u: String, + #[pyo3(get,set)] + pub md5: String +} + +#[pymethods] +impl PyDigestResult { + fn __repr__(&self) -> String { + format!("", self.id) + } + + fn __str__(&self) -> PyResult { + Ok(format!("DigestResult for sequence {}\n length: {}\n sha512t24u: {}\n md5: {}", 
self.id, self.length, self.sha512t24u, self.md5)) + } +} + +impl From for PyDigestResult { + fn from(value: DigestResult) -> Self { + PyDigestResult { + id: value.id, + length: value.length, + sha512t24u: value.sha512t24u, + md5: value.md5 + } + } +} + +// This represents the Python module to be created +#[pymodule] +pub fn digests(_py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { + m.add_function(wrap_pyfunction!(sha512t24u_digest, m)?)?; + m.add_function(wrap_pyfunction!(md5_digest, m)?)?; + m.add_function(wrap_pyfunction!(digest_fasta, m)?)?; + m.add_class::()?; + Ok(()) +} + diff --git a/bindings/python/src/lib.rs b/bindings/python/src/lib.rs index 207ab55b..52d0e790 100644 --- a/bindings/python/src/lib.rs +++ b/bindings/python/src/lib.rs @@ -5,6 +5,7 @@ mod ailist; mod models; mod tokenizers; mod utils; +mod digests; pub const VERSION: &str = env!("CARGO_PKG_VERSION"); @@ -14,11 +15,13 @@ fn gtars(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { let ailist_module = pyo3::wrap_pymodule!(ailist::ailist); let utils_module = pyo3::wrap_pymodule!(utils::utils); let models_module = pyo3::wrap_pymodule!(models::models); + let digests_module = pyo3::wrap_pymodule!(digests::digests); m.add_wrapped(tokenize_module)?; m.add_wrapped(ailist_module)?; m.add_wrapped(utils_module)?; m.add_wrapped(models_module)?; + m.add_wrapped(digests_module)?; let sys = PyModule::import_bound(py, "sys")?; let binding = sys.getattr("modules")?; @@ -29,6 +32,7 @@ fn gtars(py: Python, m: &Bound<'_, PyModule>) -> PyResult<()> { sys_modules.set_item("gtars.ailist", m.getattr("ailist")?)?; sys_modules.set_item("gtars.utils", m.getattr("utils")?)?; sys_modules.set_item("gtars.models", m.getattr("models")?)?; + sys_modules.set_item("gtars.digests", m.getattr("digests")?)?; // add constants m.add("__version__", VERSION)?; diff --git a/gtars/Cargo.toml b/gtars/Cargo.toml index be23b212..462af9a1 100644 --- a/gtars/Cargo.toml +++ b/gtars/Cargo.toml @@ -28,7 +28,10 @@ bigtools = "0.5.4" 
tokio = "1.40.0" os_pipe = "1.2.1" glob = "0.3.1" - +base64-url = "2.0.0" +sha2 = "0.10.7" +md-5 = "0.10.5" +seq_io = "0.3.2" [dev-dependencies] diff --git a/gtars/src/common/utils.rs b/gtars/src/common/utils.rs index 93b8c837..f1d5fc1e 100644 --- a/gtars/src/common/utils.rs +++ b/gtars/src/common/utils.rs @@ -6,16 +6,17 @@ use std::io::{BufRead, BufReader}; use std::path::Path; use anyhow::{Context, Result}; -use flate2::read::GzDecoder; +use flate2::read::MultiGzDecoder; use rust_lapper::{Interval, Lapper}; use crate::common::models::region::Region; use crate::common::models::universe::Universe; /// -/// Function to return a reader for either a gzip'd or non-gzip'd file. +/// Get a reader for either a gzip'd or non-gzip'd file. /// /// # Arguments +/// /// - path: path to the file to read /// pub fn get_dynamic_reader(path: &Path) -> Result>> { @@ -23,7 +24,7 @@ pub fn get_dynamic_reader(path: &Path) -> Result>> { let file = File::open(path).with_context(|| "Failed to open bed file.")?; let file: Box = match is_gzipped { - true => Box::new(GzDecoder::new(file)), + true => Box::new(MultiGzDecoder::new(file)), false => Box::new(file), }; @@ -32,6 +33,25 @@ pub fn get_dynamic_reader(path: &Path) -> Result>> { Ok(reader) } +/// Get a reader for either a gzipped, non-gzipped file, or stdin +/// +/// # Arguments +/// +/// - file_path: path to the file to read, or '-' for stdin +/// +/// # Returns +/// +/// A `BufReader` object for a given file path or stdin. +pub fn get_dynamic_reader_w_stdin(file_path_str: &str) -> Result>> { + if file_path_str == "-" { + Ok(BufReader::new(Box::new(std::io::stdin()) as Box)) + } else { + let file_path = Path::new(file_path_str); + return get_dynamic_reader(&file_path); + } +} + + /// /// Create a region-to-id hash-map from a list of regions /// diff --git a/gtars/src/digests/mod.rs b/gtars/src/digests/mod.rs new file mode 100644 index 00000000..a8374a5b --- /dev/null +++ b/gtars/src/digests/mod.rs @@ -0,0 +1,178 @@ +//! 
# Fast digest computations for genomic sequences
+//!
+//! This module provides functions for computing digests of strings.
+//!
+//! # Functions
+//!
+//! The following functions are available:
+//!
+//! * `sha512t24u` - Processes a given string to compute its GA4GH sha512t24u checksum.
+//! * `md5` - Processes a given string to compute its MD5 checksum.
+//! * `digest_fasta` - Processes a FASTA file to compute the digests of each sequence in the file.
+//!
+//! # Usage
+//!
+//! The `sha512t24u` function can be used to compute the GA4GH sha512t24u checksum of a string.
+//!
+//! ```rust
+//! use gtars::digests::sha512t24u;
+//!
+//! let digest = sha512t24u("hello world");
+//! ```
+use std::io::prelude::{Read, Write};
+use std::io;
+use std::fs::File;
+use std::path::Path;
+
+use anyhow::Result;
+use md5::Md5;
+use sha2::{Digest, Sha512};
+use seq_io::fasta::{Reader, RefRecord, Record};
+
+use crate::common::utils::get_dynamic_reader;
+
+/// The digests computed for a single named sequence.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct DigestResult {
+    /// Identifier of the sequence the digests belong to.
+    pub id: String,
+    /// Length of the sequence in bytes.
+    pub length: usize,
+    /// GA4GH sha512t24u digest of the sequence.
+    pub sha512t24u: String,
+    /// MD5 digest of the sequence, as a hexadecimal string.
+    pub md5: String,
+}
+
+/// Processes a given string to compute its GA4GH sha512t24u digest.
+///
+/// The digest is the SHA-512 hash of the input, truncated to its first
+/// 24 bytes and then base64url-encoded.
+///
+/// # Arguments
+///
+/// * `string` - The input string to be processed.
+///
+/// # Returns
+///
+/// The base64url encoding of the first 24 bytes of the SHA-512 digest of the input string.
+pub fn sha512t24u(string: &str) -> String {
+    // One-shot hashing: the hasher needs neither a heap allocation nor manual chunking.
+    let digest = Sha512::digest(string.as_bytes());
+    // Keep only the first 24 bytes of the 64-byte digest before encoding.
+    base64_url::encode(&digest[..24])
+}
+
+/// Process a string to compute its md5 digest
+///
+/// # Arguments
+///
+/// * `string` - The input string to be processed.
+///
+/// # Returns
+///
+/// A string MD5 digest of the input string.
+pub fn md5(string: &str) -> String { + let mut hasher = Md5::new(); + for s in string.as_bytes().chunks(800) { + hasher.update(s); + } + let result = hasher.finalize(); + format!("{:x}", result) +} + + +/// Processes a FASTA file to compute the digests of each sequence in the file. +/// +/// This function reads a FASTA file, computes the SHA-512 and MD5 digests for each sequence, +/// and returns a vector of `DigestResult` structs containing the results. It can also handle +// gzipped FASTA files (ending in `.gz`). +/// +/// # Arguments +/// +/// * `file_path` - A string slice that holds the path to the FASTA file to be processed. +/// +/// # Returns +/// +/// A vector of `DigestResult` structs, each containing the length, SHA-512 digest, and MD5 digest +/// of a sequence in the FASTA file. +/// +/// # Panics +/// +/// This function will panic if the file cannot be opened or if there is an error reading the file. +/// +/// # Examples +/// +/// +pub fn digest_fasta(file_path: &str) -> Result> { + let path = Path::new(&file_path); + let file_reader = get_dynamic_reader(&path)?; + let mut fasta_reader = Reader::new(file_reader); + let mut results = Vec::new(); + while let Some(record) = fasta_reader.next() { // returns a RefRecord object + let record = record.expect("Error found when retrieving next record."); + let id = record.id().expect("No ID found for the FASTA record"); + let mut sha512_hasher = Sha512::new(); + let mut md5_hasher = Md5::new(); + let mut length = 0; + // let result = process_sequence(record, verbose); + for seq_line in record.seq_lines() { + // let seq_line = seq_line.expect("Error found when retrieving next sequence line."); + sha512_hasher.update(seq_line.to_ascii_uppercase()); + md5_hasher.update(seq_line.to_ascii_uppercase()); + length += seq_line.len(); + } + // let result = sha512_hasher.finalize(); + let sha512 = base64_url::encode(&sha512_hasher.finalize_reset()[0..24]); + let md5 = format!("{:x}", md5_hasher.finalize_reset()); + 
results.push(DigestResult { + id: id.to_string(), + length: length, + sha512t24u: sha512, + md5: md5 + }); + } + Ok(results) +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_sha512t24u() { + let digest = sha512t24u("hello world"); + assert_eq!(digest, "MJ7MSJwS1utMxA9QyQLytNDtd-5RGnx6"); + } + + #[test] + fn test_md5() { + let digest = md5("hello world"); + assert_eq!(digest, "5eb63bbbe01eeed093cb22bb8f5acdc3"); + } + + #[test] + fn test_digest_fasta() { + let results = digest_fasta("tests/data/base.fa").expect("Can't open test fasta file"); + println!("{:?}", results); + assert_eq!(results.len(), 3); + assert_eq!(results[0].length, 8); + assert_eq!(results[0].sha512t24u, "iYtREV555dUFKg2_agSJW6suquUyPpMw"); + assert_eq!(results[0].md5, "5f63cfaa3ef61f88c9635fb9d18ec945"); + assert_eq!(results[1].length, 4); + assert_eq!(results[1].sha512t24u, "YBbVX0dLKG1ieEDCiMmkrTZFt_Z5Vdaj"); + assert_eq!(results[1].md5, "31fc6ca291a32fb9df82b85e5f077e31"); + assert_eq!(results[2].length, 4); + assert_eq!(results[2].sha512t24u, "AcLxtBuKEPk_7PGE_H4dGElwZHCujwH6"); + assert_eq!(results[2].md5, "92c6a56c9e9459d8a42b96f7884710bc"); + } + + #[test] + fn test_digest_gzipped_fasta() { + let results = digest_fasta("tests/data/base.fa.gz").expect("Can't open test fasta file"); + println!("{:?}", results); + assert_eq!(results[0].length, 8); + assert_eq!(results[0].sha512t24u, "iYtREV555dUFKg2_agSJW6suquUyPpMw"); + assert_eq!(results[0].md5, "5f63cfaa3ef61f88c9635fb9d18ec945"); + } + + #[test] + fn bogus_fasta_file() { + let result = digest_fasta("tests/data/bogus.fa"); + assert!(result.is_err(), "Expected an error for a bogus fasta file"); + } +} \ No newline at end of file diff --git a/gtars/src/lib.rs b/gtars/src/lib.rs index f7bb97fc..822a4d8c 100644 --- a/gtars/src/lib.rs +++ b/gtars/src/lib.rs @@ -35,6 +35,7 @@ //! 
``` pub mod ailist; pub mod common; +pub mod digests; pub mod fragsplit; pub mod igd; pub mod io; diff --git a/gtars/tests/data/base.fa b/gtars/tests/data/base.fa new file mode 100644 index 00000000..dd08063d --- /dev/null +++ b/gtars/tests/data/base.fa @@ -0,0 +1,6 @@ +>chrX +TTGGGGAA +>chr1 +GGAA +>chr2 +GCGC diff --git a/gtars/tests/data/base.fa.gz b/gtars/tests/data/base.fa.gz new file mode 100644 index 00000000..343e91af Binary files /dev/null and b/gtars/tests/data/base.fa.gz differ