From 08ccdb672916150e52bada9637bd5cb98c17f247 Mon Sep 17 00:00:00 2001 From: Andrea Cracco Date: Mon, 13 Jan 2025 18:23:51 +0100 Subject: [PATCH] Fix fast eulertigs bug --- crates/assembler/src/lib.rs | 6 +- .../src/pipeline/compute_matchtigs.rs | 8 +++ crates/assembler/src/pipeline/eulertigs.rs | 4 +- .../src/pipeline/reorganize_reads.rs | 67 ++++++++++++++----- libs-crates/instrumenter-rs | 2 +- 5 files changed, 66 insertions(+), 21 deletions(-) diff --git a/crates/assembler/src/lib.rs b/crates/assembler/src/lib.rs index 5bb94f2..c245f1f 100644 --- a/crates/assembler/src/lib.rs +++ b/crates/assembler/src/lib.rs @@ -427,20 +427,24 @@ pub fn run_assembler< let (reorganized_reads, _final_unitigs_bucket) = if step <= AssemblerStartingStep::ReorganizeReads { - if generate_maximal_unitigs_links || compute_tigs_mode.needs_matchtigs_library() { + if generate_maximal_unitigs_links || compute_tigs_mode.needs_temporary_tigs() { reorganize_reads::>( + k, sequences, reads_map, temp_dir.as_path(), compressed_temp_unitigs_file.as_ref().unwrap(), + circular_temp_unitigs_file.as_ref(), buckets_count, ) } else { reorganize_reads::>( + k, sequences, reads_map, temp_dir.as_path(), &final_unitigs_file, + None, buckets_count, ) } diff --git a/crates/assembler/src/pipeline/compute_matchtigs.rs b/crates/assembler/src/pipeline/compute_matchtigs.rs index d02dba3..affd4d0 100644 --- a/crates/assembler/src/pipeline/compute_matchtigs.rs +++ b/crates/assembler/src/pipeline/compute_matchtigs.rs @@ -317,6 +317,7 @@ pub enum MatchtigMode { pub trait MatchtigHelperTrait { fn needs_simplitigs(&self) -> bool; + fn needs_temporary_tigs(&self) -> bool; fn needs_matchtigs_library(&self) -> bool; fn get_matchtigs_mode(&self) -> Self; } @@ -326,6 +327,13 @@ impl MatchtigHelperTrait for Option { *self == Some(MatchtigMode::FastSimpliTigs) || *self == Some(MatchtigMode::FastEulerTigs) } + fn needs_temporary_tigs(&self) -> bool { + *self == Some(MatchtigMode::EulerTigs) + || *self == Some(MatchtigMode::GreedyTigs) + || *self == Some(MatchtigMode::PathTigs) + || *self == Some(MatchtigMode::FastEulerTigs) + } + fn needs_matchtigs_library(&self) -> bool { *self == Some(MatchtigMode::EulerTigs) || *self == Some(MatchtigMode::GreedyTigs) diff --git a/crates/assembler/src/pipeline/eulertigs.rs b/crates/assembler/src/pipeline/eulertigs.rs index a115e76..11140ca 100644 --- a/crates/assembler/src/pipeline/eulertigs.rs +++ b/crates/assembler/src/pipeline/eulertigs.rs @@ -162,6 +162,7 @@ impl CircularUnitig { last.rc ^= self.rc; let end_offset = if !last.rc && !write_full { 0 } else { k - 1 }; + let start_offset = if last.rc && !write_full { k - 1 } else { 0 }; let last_part_entry = unitigs.get(&last.orig_index).unwrap(); @@ -176,7 +177,8 @@ impl CircularUnitig { }; } - let last_part_slice = last.start_pos..last.start_pos + last.length + end_offset; + let last_part_slice = + last.start_pos + start_offset..last.start_pos + last.length + end_offset; let last_part = last_part_entry .0 .as_reference(unitigs_kmers) diff --git a/crates/assembler/src/pipeline/reorganize_reads.rs b/crates/assembler/src/pipeline/reorganize_reads.rs index 1dd4315..77df588 100644 --- a/crates/assembler/src/pipeline/reorganize_reads.rs +++ b/crates/assembler/src/pipeline/reorganize_reads.rs @@ -156,10 +156,12 @@ pub fn reorganize_reads< CX: ColorsManager, BK: StructuredSequenceBackend, ()>, >( + k: usize, mut reads: Vec, mut mapping_files: Vec, temp_path: &Path, out_file: &StructuredSequenceWriter, (), BK>, + circular_out_file: Option<&StructuredSequenceWriter, (), BK>>, buckets_count: usize, ) -> (Vec, PathBuf) { PHASES_TIMES_MONITOR @@ -199,10 +201,13 @@ pub fn reorganize_reads< NoMultiplicity, >, >::new(&buckets, buffers.take()); - let mut tmp_lonely_unitigs_buffer = FastaWriterConcurrentBuffer::new(out_file, DEFAULT_OUTPUT_BUFFER_SIZE, true); + let mut tmp_circular_unitigs_buffer = circular_out_file.map(|out_file| { + FastaWriterConcurrentBuffer::new(out_file, DEFAULT_OUTPUT_BUFFER_SIZE, true) + }); + let mut mappings = Vec::new(); assert_eq!(read_file.index, mapping_file.index); @@ -268,23 +273,49 @@ pub fn reorganize_reads< ); map_index += 1; } else { - // No mapping, write unitig to file - - tmp_lonely_unitigs_buffer.add_read( - seq, - None, - extra_data.colors, - color_buffer, - (), - &(), - #[cfg(feature = "support_kmer_counters")] - SequenceAbundance { - first: extra_data.counters.first, - sum: extra_data.counters.sum, - last: extra_data.counters.last, - }, - ); - + // Loop to allow skipping code parts with break + 'skip_writing: loop { + let first_kmer_node = &seq[0..k - 1]; + let last_kmer_node = &seq[seq.len() - k + 1..]; + if let Some(circular_unitigs_buffer) = &mut tmp_circular_unitigs_buffer { + // Check if unitig is circular + if first_kmer_node == last_kmer_node { + circular_unitigs_buffer.add_read( + seq, + None, + extra_data.colors, + color_buffer, + (), + &(), + #[cfg(feature = "support_kmer_counters")] + SequenceAbundance { + first: extra_data.counters.first, + sum: extra_data.counters.sum, + last: extra_data.counters.last, + }, + ); + break 'skip_writing; + } + } + + // No mapping, write unitig to file + tmp_lonely_unitigs_buffer.add_read( + seq, + None, + extra_data.colors, + color_buffer, + (), + &(), + #[cfg(feature = "support_kmer_counters")] + SequenceAbundance { + first: extra_data.counters.first, + sum: extra_data.counters.sum, + last: extra_data.counters.last, + }, + ); + + break; + } // write_fasta_entry::( // &mut fasta_temp_buffer, // &mut tmp_lonely_unitigs_buffer, diff --git a/libs-crates/instrumenter-rs b/libs-crates/instrumenter-rs index 08675e3..911ea9c 160000 --- a/libs-crates/instrumenter-rs +++ b/libs-crates/instrumenter-rs @@ -1 +1 @@ -Subproject commit 08675e3df3815435906b0f45dea18f2c01f8d460 +Subproject commit 911ea9c70122a6308f02b73740da2e20f851d100