Skip to content

Commit

Permalink
Merge pull request #38 from databio/region_scoring
Browse files Browse the repository at this point in the history
Add fragment file region scoring matrix creation
  • Loading branch information
nleroy917 authored Dec 3, 2024
2 parents 7596b24 + 28f05d8 commit 0e145ec
Showing 25 changed files with 540 additions and 12 deletions.
52 changes: 52 additions & 0 deletions gtars/src/common/models/fragments.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
use std::str::FromStr;

use anyhow::Result;

use crate::common::models::Region;

#[allow(unused)]
pub struct Fragment {
pub chr: String,
pub start: u32,
pub end: u32,
pub barcode: String,
pub read_support: u32,
}

impl FromStr for Fragment {
type Err = anyhow::Error;

fn from_str(s: &str) -> Result<Self> {
let parts: Vec<&str> = s.split_whitespace().collect();
// dont check file integrity right now
// if parts.len() != 6 {
// anyhow::bail!(
// "Error parsing fragment file line: {}. Is your fragment file malformed? Found {} parts.",
// s,
// parts.len()
// )
// }

let start = parts[1].parse::<u32>()?;
let end = parts[2].parse::<u32>()?;
let read_support = parts[4].parse::<u32>()?;

Ok(Fragment {
chr: parts[0].to_string(),
start,
end,
barcode: parts[3].to_string(),
read_support,
})
}
}

impl From<Fragment> for Region {
fn from(val: Fragment) -> Self {
Region {
chr: val.chr,
start: val.start,
end: val.end,
}
}
}
2 changes: 2 additions & 0 deletions gtars/src/common/models/mod.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
pub mod fragments;
pub mod region;
pub mod region_set;
pub mod tokenized_region;
pub mod tokenized_regionset;
pub mod universe;

// re-export for cleaner imports
pub use self::fragments::Fragment;
pub use self::region::Region;
pub use self::region_set::RegionSet;
pub use self::tokenized_region::TokenizedRegion;
6 changes: 3 additions & 3 deletions gtars/src/fragsplit/split.rs
Original file line number Diff line number Diff line change
@@ -159,17 +159,17 @@ mod tests {

#[fixture]
fn barcode_cluster_map_file() -> &'static str {
"tests/data/scatlas_leiden.csv"
"tests/data/barcode_cluster_map.tsv"
}

#[fixture]
fn path_to_fragment_files() -> &'static str {
"tests/data/fragments-test"
"tests/data/fragments/fragsplit"
}

#[fixture]
fn path_to_output() -> &'static str {
"tests/data/out-test"
"tests/data/out"
}

#[fixture]
5 changes: 5 additions & 0 deletions gtars/src/main.rs
Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@ use clap::Command;
// go through the library crate to get the interfaces
use gtars::fragsplit;
use gtars::igd;
use gtars::scoring;
use gtars::tokenizers;
use gtars::uniwig;

@@ -25,6 +26,7 @@ fn build_parser() -> Command {
.subcommand(fragsplit::cli::make_fragsplit_cli())
.subcommand(uniwig::cli::create_uniwig_cli())
.subcommand(igd::cli::create_igd_cli())
.subcommand(scoring::cli::make_fscoring_cli())
}

fn main() -> Result<()> {
@@ -50,6 +52,9 @@ fn main() -> Result<()> {
}
_ => unreachable!("IGD Subcommand not found"),
},
Some((scoring::consts::FSCORING_CMD, matches)) => {
scoring::cli::handlers::region_fragment_scoring(matches)?;
}
_ => unreachable!("Subcommand not found"),
};

67 changes: 67 additions & 0 deletions gtars/src/scoring/cli.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
use std::collections::HashSet;
use std::io::BufRead;
use std::path::PathBuf;

use anyhow::Result;
use clap::{arg, Arg, ArgMatches, Command};

use super::*;
use crate::scoring::{region_scoring_from_fragments, ConsensusSet, FragmentFileGlob};

pub fn make_fscoring_cli() -> Command {
Command::new(consts::FSCORING_CMD)
.author("Nathan LeRoy")
.about("Create a scoring matrix for a set of fragment files over a consensus peak set.")
.arg(Arg::new("fragments"))
.arg(Arg::new("consensus"))
.arg(arg!(--mode <mode>))
.arg(arg!(--output <output>))
}

pub mod handlers {

use std::str::FromStr;

use consts::DEFAULT_SCORING_MODE;

use super::*;

pub fn region_fragment_scoring(matches: &ArgMatches) -> Result<()> {
// get arguments from CLI
let fragments = matches
.get_one::<String>("fragments")
.expect("A path to fragment files is required.");

let consensus = matches
.get_one::<String>("consensus")
.expect("A path to a mapping file is required.");

let default_out = consts::DEFAULT_OUT.to_string();
let output = matches.get_one::<String>("output").unwrap_or(&default_out);
let mode = match matches.get_one::<String>("mode") {
Some(mode) => {
let supplied_mode = ScoringMode::from_str(mode);
match supplied_mode {
Ok(mode) => mode,
Err(_err) => anyhow::bail!("Unknown scoring mode supplied: {}", mode)
}
},
None => DEFAULT_SCORING_MODE,
};

// coerce arguments to types
let mut fragments = FragmentFileGlob::new(fragments)?;
let consensus = PathBuf::from(consensus);
let consensus = ConsensusSet::new(consensus)?;

let count_mat = region_scoring_from_fragments(
&mut fragments,
&consensus,
mode,
)?;

count_mat.write_to_file(output)?;

Ok(())
}
}
7 changes: 7 additions & 0 deletions gtars/src/scoring/consts.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
use crate::scoring::ScoringMode;

pub const FSCORING_CMD: &str = "fscoring";
pub const DEFAULT_OUT: &str = "fscoring.csv.gz";
pub const DEFAULT_SCORING_MODE: ScoringMode = ScoringMode::Atac;
pub const START_SHIFT: i8 = 4;
pub const END_SHIFT: i8 = 5;
105 changes: 105 additions & 0 deletions gtars/src/scoring/counts.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
use std::fs::File;
use std::io::{BufWriter, Write};
use std::ops::{Add, AddAssign};

use anyhow::Result;
use flate2::write::GzEncoder;
use flate2::Compression;

pub struct CountMatrix<T> {
data: Vec<T>,
pub rows: usize,
pub cols: usize,
}

pub struct RowIterator<'a, T> {
matrix: &'a CountMatrix<T>,
current_row: usize,
}

impl<T> CountMatrix<T>
where
T: Copy + Default + Add<Output = T> + AddAssign + From<u8>,
{
pub fn new(rows: usize, cols: usize) -> Self {
Self {
data: vec![T::default(); rows * cols],
rows,
cols,
}
}

pub fn get(&self, row: usize, col: usize) -> Option<&T> {
self.data.get(row * self.cols + col)
}

pub fn set(&mut self, row: usize, col: usize, value: T) -> Result<(), String> {
if row < self.rows && col < self.cols {
self.data[row * self.cols + col] = value;
Ok(())
} else {
Err(format!("Index out of bounds: row {}, col {}", row, col))
}
}

pub fn increment(&mut self, row: usize, col: usize) {
if row < self.rows && col < self.cols {
let index = row * self.cols + col;
if let Some(value) = self.data.get_mut(index) {
*value += 1.into();
}
}
}
}

impl<'a, T> Iterator for RowIterator<'a, T>
where
T: Copy + Default,
{
type Item = &'a [T];

fn next(&mut self) -> Option<Self::Item> {
if self.current_row < self.matrix.rows {
let start = self.current_row * self.matrix.cols;
let end = start + self.matrix.cols;
self.current_row += 1;
Some(&self.matrix.data[start..end])
} else {
None
}
}
}

impl<T> CountMatrix<T>
where
T: Copy + Default,
{
pub fn iter_rows(&self) -> RowIterator<T> {
RowIterator {
matrix: self,
current_row: 0,
}
}
}

impl<T> CountMatrix<T>
where
T: Copy + Default + ToString,
{
pub fn write_to_file(&self, filename: &str) -> std::io::Result<()> {
let file = File::create(filename)?;
let mut buf_writer = BufWriter::new(GzEncoder::new(file, Compression::default()));

for row in self.iter_rows() {
let row_str: String = row
.iter()
.map(|v| v.to_string())
.collect::<Vec<String>>()
.join(",");
buf_writer.write_all(row_str.as_bytes())?;
buf_writer.write_all(b"\n")?; // Add a newline after each row
}

Ok(())
}
}
Loading

0 comments on commit 0e145ec

Please sign in to comment.