Skip to content

Commit

Permalink
Adjust subcommand, output writing for basic BED, reporting API
Browse files Browse the repository at this point in the history
 - granges_adjust and adjust subcommand
 - random granges
 - dev-commands feature & RandomBed
 - to_bed3()
 - empty range iterator
 - cleaner output to standard out
 - reporting API
 - TSV trait and impls
  • Loading branch information
vsbuffalo committed Feb 16, 2024
1 parent 548fbd0 commit 828d9e2
Show file tree
Hide file tree
Showing 18 changed files with 508 additions and 78 deletions.
6 changes: 4 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,10 @@ noodles = { version = "0.63.0", features = ["core", "bed"] }
rand = "0.8.5"
thiserror = "1.0.57"

# [features]
# cli = [ "clap" ]
[features]
# cli = [ "clap" ] // TODO make feature
dev-commands = [ ]


[[bin]]
name = "granges"
Expand Down
2 changes: 1 addition & 1 deletion src/error.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::num::{ParseIntError, ParseFloatError};
use std::num::{ParseFloatError, ParseIntError};

use genomap::GenomeMapError;
use thiserror::Error;
Expand Down
50 changes: 45 additions & 5 deletions src/granges.rs
Original file line number Diff line number Diff line change
@@ -1,15 +1,19 @@
use std::path::PathBuf;

use genomap::GenomeMap;
use indexmap::IndexMap;

use crate::{
io::OutputFile,
iterators::{GRangesEmptyIterator, GRangesIterator},
prelude::GRangesError,
ranges::{
coitrees::{COITrees, COITreesIndexed},
vec::{VecRanges, VecRangesEmpty, VecRangesIndexed},
RangeEmpty, RangeIndexed, RangeRecord,
},
traits::{RangeContainer, RangesIterable},
Position, iterators::GRangesIterator,
traits::{RangeContainer, RangesIterable, TsvSerialize},
Position,
};

#[derive(Clone, Debug)]
Expand Down Expand Up @@ -145,6 +149,27 @@ impl GRanges<VecRangesEmpty, ()> {
}
}

impl<R> GRanges<R, ()>
where
    R: RangeContainer + RangesIterable<RangeEmpty>,
{
    /// Write every range of this data-less [`GRanges`] object as one
    /// tab-separated line (the record's `to_tsv()` form), in iteration order.
    ///
    /// # Arguments
    ///
    /// * `output` - the path of the file to write to, or `None` to write to
    ///   standard output.
    ///
    /// # Errors
    ///
    /// Returns a [`GRangesError`] if the output writer cannot be created, or
    /// if writing or flushing any line fails.
    // TODO: candidate for a trait
    pub fn to_bed3(&self, output: Option<impl Into<PathBuf>>) -> Result<(), GRangesError> {
        // output stream -- header is None for now (TODO)
        let output = output.map_or_else(
            || OutputFile::new_stdout(None),
            |file| OutputFile::new(file, None),
        );
        let mut writer = output.writer()?;

        let seqnames = self.seqnames();
        for range in self.iter_ranges_empty() {
            let record = range.to_record(&seqnames);
            writeln!(writer, "{}", record.to_tsv())?;
        }

        // Flush explicitly: a buffered writer that is only flushed when it is
        // dropped silently discards any I/O error from that final flush.
        writer.flush()?;
        Ok(())
    }
}

impl<T> GRanges<VecRangesIndexed, T> {
/// Convert this [`VecRangesIndexed`] range container to a cache-oblivious interval tree
/// range container, [`COITreesIndexed`]. This is done using the [`coitrees`] library
Expand All @@ -163,10 +188,25 @@ impl<T> GRanges<VecRangesIndexed, T> {
}
}

impl<R, T> GRanges<R, T>
where R: RangesIterable<RangeIndexed> {
impl<R, T> GRanges<R, T>
where
R: RangesIterable<RangeEmpty>,
{
/// Create a new [`GRangesEmptyIterator`] to iterate through all
/// the ranges in this [`GRanges`] object. These ranges carry
/// no data index, unlike those yielded by [`GRanges::iter_ranges()`],
/// which is available for range types that index into a data container.
pub fn iter_ranges_empty(&self) -> GRangesEmptyIterator<'_, R> {
GRangesEmptyIterator::new(&self.ranges)
}
}

impl<R, T> GRanges<R, T>
where
R: RangesIterable<RangeIndexed>,
{
/// Create a new [`GRangesIterator`] to iterate through all the ranges in this [`GRanges`] object.
pub fn iter_ranges<'a>(&'a self) -> GRangesIterator<'a, R> {
pub fn iter_ranges(&self) -> GRangesIterator<'_, R> {
GRangesIterator::new(&self.ranges)
}
}
Expand Down
45 changes: 32 additions & 13 deletions src/io/io.rs → src/io/file.rs
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,13 @@ use std::io::{self, BufWriter};
use std::io::{BufRead, BufReader, Read};
use std::path::PathBuf;

use crate::Position;
use crate::error::GRangesError;
use crate::Position;

/// Read a tab-delimited *genome file* of sequence (i.e. chromosome) names and their lengths.
pub fn read_seqlens(filepath: impl Into<PathBuf>) -> Result<IndexMap<String, Position>, GRangesError> {
pub fn read_seqlens(
filepath: impl Into<PathBuf>,
) -> Result<IndexMap<String, Position>, GRangesError> {
let input_file = InputFile::new(filepath);
let reader = input_file.reader()?;

Expand Down Expand Up @@ -136,12 +138,17 @@ impl InputFile {
}
}

/// The destination an [`OutputFile`] writes to.
enum OutputDestination {
/// Write to the file at this path.
File(PathBuf),
/// Write to standard output.
Stdout,
}

/// Represents an output file.
///
/// This struct is used to handle operations on an output file, such as writing to the file.
/// This abstracts writing both plaintext and gzip-compressed files.
pub struct OutputFile {
pub filepath: PathBuf,
destination: OutputDestination,
pub header: Option<Vec<String>>,
}

Expand All @@ -155,7 +162,15 @@ impl OutputFile {
/// * `header` - An optional vector of strings representing commented header lines to be written to the file.
pub fn new(filepath: impl Into<PathBuf>, header: Option<Vec<String>>) -> Self {
Self {
filepath: filepath.into(),
destination: OutputDestination::File(filepath.into()),
header,
}
}

/// Builds an [`OutputFile`] whose destination is standard output instead of a file path.
///
/// # Arguments
///
/// * `header` - An optional vector of strings representing commented header lines to be written.
pub fn new_stdout(header: Option<Vec<String>>) -> Self {
    let destination = OutputDestination::Stdout;
    Self {
        destination,
        header,
    }
}
Expand All @@ -170,15 +185,19 @@ impl OutputFile {
///
/// A result containing a `Box<dyn Write>` on success, or an `io::Error` on failure.
pub fn writer(&self) -> io::Result<Box<dyn Write>> {
let outfile = &self.filepath;
let is_gzip = outfile.ends_with(".gz");
let mut writer: Box<dyn Write> = if is_gzip {
Box::new(BufWriter::new(GzEncoder::new(
File::create(outfile)?,
Compression::default(),
)))
} else {
Box::new(BufWriter::new(File::create(outfile)?))
let mut writer: Box<dyn Write> = match &self.destination {
OutputDestination::File(path) => {
let is_gzip = path.ends_with(".gz");
if is_gzip {
Box::new(BufWriter::new(GzEncoder::new(
File::create(path)?,
Compression::default(),
)))
} else {
Box::new(BufWriter::new(File::create(path)?))
}
}
OutputDestination::Stdout => Box::new(io::stdout()),
};
// write header if one is set
if let Some(entries) = &self.header {
Expand Down
6 changes: 3 additions & 3 deletions src/io/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
//! Input/Output
//!
pub mod io;
pub mod file;
pub mod noodles;
pub mod parsers;

pub use io::{InputFile, OutputFile};
pub use parsers::{Bed3RecordIterator, TsvRecordIterator, BedlikeIterator};
pub use file::{InputFile, OutputFile};
pub use parsers::{Bed3RecordIterator, BedlikeIterator, TsvRecordIterator};
22 changes: 11 additions & 11 deletions src/io/parsers.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,11 @@ use std::collections::HashSet;
use std::io::{BufRead, BufReader, Read};
use std::path::PathBuf;

use crate::Position;
use crate::error::GRangesError;
use crate::io::io::InputFile;
use crate::io::file::InputFile;
use crate::ranges::RangeRecord;
use crate::traits::GeneralRangeRecordIterator;
use crate::Position;

use super::noodles::convert_noodles_position;

Expand Down Expand Up @@ -101,6 +101,7 @@ where

/// A BED-like file parser. This works by lazy-parsing the first three
/// columns, which are standard to all BED files.
// The fn-pointer type parameter on the field below is long enough to trip
// clippy's `type_complexity` lint, hence the allow.
#[allow(clippy::type_complexity)]
pub struct BedlikeIterator {
// The wrapped TSV record iterator; its parser is a plain fn pointer that
// turns one line into a `RangeRecord<String>` or a `GRangesError`.
iter: TsvRecordIterator<fn(&str) -> Result<RangeRecord<String>, GRangesError>, String>,
}
Expand All @@ -109,7 +110,7 @@ impl BedlikeIterator {
pub fn new(filepath: impl Into<PathBuf>) -> Result<Self, GRangesError> {
    // `parse_bed_lazy` is handed over as a plain fn pointer so that the parser
    // type matches the one named in this struct's `iter` field.
    let line_parser = parse_bed_lazy as fn(&str) -> Result<RangeRecord<String>, GRangesError>;
    TsvRecordIterator::new(filepath, line_parser).map(|iter| Self { iter })
}
Expand All @@ -123,8 +124,6 @@ impl Iterator for BedlikeIterator {
}
}



/// An iterator over [`IntervalRecord`] items that filters based on sequence name.
///
/// Note that the exclude filter is prioritized over the retain filter. So,
Expand Down Expand Up @@ -282,7 +281,11 @@ pub fn parse_bed_lazy(line: &str) -> Result<RangeRecord<String>, GRangesError> {
let start: Position = parse_column(columns[1], line)?;
let end: Position = parse_column(columns[2], line)?;

let data = columns[3].to_string();
let data = if columns.len() > 3 {
columns[3].to_string()
} else {
String::new()
};

Ok(RangeRecord {
seqname,
Expand Down Expand Up @@ -312,7 +315,7 @@ mod tests {
// let first_interval = gr_iter.next().unwrap();
// assert_eq!(first_interval.first, 7);
// assert_eq!(first_interval.last, 12);
//
//
// let second_interval = gr_iter.next().unwrap();
// assert_eq!(second_interval.first, 20);
// assert_eq!(second_interval.last, 33);
Expand All @@ -336,9 +339,6 @@ mod tests {
// note: the Rust LSP thinks this isn't used for some reason, so prefaced with _
// to silence warnings.
let _msg = "column '-1' in 'chr1\t-1\t20'".to_string();
assert!(matches!(
result,
Err(GRangesError::InvalidColumnType(_msg))
));
assert!(matches!(result, Err(GRangesError::InvalidColumnType(_msg))));
}
}
57 changes: 55 additions & 2 deletions src/iterators.rs
Original file line number Diff line number Diff line change
@@ -1,11 +1,11 @@
use genomap::GenomeMap;

use crate::{
ranges::{RangeIndexed, RangeIndexedRecord},
ranges::{RangeEmpty, RangeEmptyRecord, RangeIndexed, RangeIndexedRecord},
traits::{RangeContainer, RangesIterable},
};

/// An iterator over [`RangeIndexedRecord`], which store
/// An iterator yielding [`RangeIndexedRecord`], which store
/// indices to the sequence names and data container.
///
/// # Developer Notes
Expand Down Expand Up @@ -63,6 +63,59 @@ where
}
}

/// An iterator yielding [`RangeEmptyRecord`] items, each of which stores an
/// index to its sequence name (but carries no data index).
pub struct GRangesEmptyIterator<'a, R> {
// The per-sequence range containers being iterated over.
ranges: &'a GenomeMap<R>,
// Index of the sequence whose ranges are currently being yielded.
current_seqname_index: usize,
// Iterator over the ranges of the current sequence.
current_range_iter: Box<dyn Iterator<Item = RangeEmpty> + 'a>,
}

impl<'a, R> GRangesEmptyIterator<'a, R>
where
    R: RangesIterable<RangeEmpty>,
{
    /// Create an iterator positioned at the first range of the first
    /// sequence in `ranges`.
    ///
    /// If `ranges` holds no sequences at all, the returned iterator simply
    /// yields nothing (rather than panicking on the missing first entry).
    pub fn new(ranges: &'a GenomeMap<R>) -> Self {
        let current_range_iter: Box<dyn Iterator<Item = RangeEmpty> + 'a> = match ranges.len() {
            // No sequences: start from an iterator that is already exhausted.
            0 => Box::new(std::iter::empty::<RangeEmpty>()),
            _ => ranges
                .get_by_index(0)
                .expect("a non-empty GenomeMap has an entry at index 0")
                .iter_ranges(),
        };
        Self {
            ranges,
            current_seqname_index: 0,
            current_range_iter,
        }
    }
}

impl<'a, R> Iterator for GRangesEmptyIterator<'a, R>
where
    R: RangeContainer + RangesIterable<RangeEmpty>,
{
    type Item = RangeEmptyRecord;

    /// Yield the next range, tagged with the index of the sequence it belongs to.
    ///
    /// When the current sequence's ranges run out, this moves on to the next
    /// sequence; it returns `None` once there are no more sequences.
    fn next(&mut self) -> Option<Self::Item> {
        loop {
            match self.current_range_iter.next() {
                Some(range) => {
                    let record = RangeEmptyRecord {
                        seqname_index: self.current_seqname_index,
                        start: range.start,
                        end: range.end,
                    };
                    return Some(record);
                }
                None => {
                    // The current sequence is exhausted: advance to the next one.
                    self.current_seqname_index += 1;
                    let next_index = self.current_seqname_index;
                    if next_index >= self.ranges.len() {
                        // No range containers are left to iterate over.
                        return None;
                    }
                    let container = self.ranges.get_by_index(next_index).unwrap();
                    self.current_range_iter = container.iter_ranges();
                }
            }
        }
    }
}

#[cfg(test)]
mod tests {
use crate::{ranges::RangeIndexedRecord, test_utilities::granges_test_case_01};
Expand Down
7 changes: 5 additions & 2 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,10 +19,13 @@ pub type PositionOffset = i32; // signed variant
pub mod prelude {
pub use crate::error::GRangesError;
pub use crate::granges::GRanges;
pub use crate::io::{Bed3RecordIterator, TsvRecordIterator, BedlikeIterator};
pub use crate::io::file::read_seqlens;
pub use crate::io::{Bed3RecordIterator, BedlikeIterator, TsvRecordIterator};

pub use crate::ranges::vec::{VecRangesEmpty, VecRangesIndexed};
pub use crate::traits::{GeneralRangeRecordIterator, RangesIntoIterable, RangesIterable};
pub use crate::traits::{
GeneralRangeRecordIterator, RangesIntoIterable, RangesIterable, TsvSerialize,
};

pub use crate::seqlens;
}
Expand Down
Loading

0 comments on commit 828d9e2

Please sign in to comment.