From 2994b3e7c639417c974bdc41e35e92f732e86a21 Mon Sep 17 00:00:00 2001 From: edolzhenko Date: Fri, 7 Apr 2017 09:38:19 -0700 Subject: [PATCH 1/3] Do not recover nearby mates --- src/irr_counting.cc | 85 ++++++++++++++++++++++++--------------------- 1 file changed, 46 insertions(+), 39 deletions(-) diff --git a/src/irr_counting.cc b/src/irr_counting.cc index 15102bf..d576587 100644 --- a/src/irr_counting.cc +++ b/src/irr_counting.cc @@ -35,6 +35,7 @@ using std::vector; using std::map; #include using std::array; +#include #include "purity/purity.h" #include "rep_align/rep_align.h" @@ -42,15 +43,15 @@ using std::array; /*****************************************************************************/ // Check if two alignments are same. -static bool SameAlign(Align& al1, Align& al2) { +static bool SameAlign(Align &al1, Align &al2) { return (al1.name == al2.name && al1.mate_pos == al2.mate_pos && al1.flag == al2.flag && al1.bases == al2.bases); } -void CacheReadsFromRegion(const Region& region, const WhatToCache whatToCache, - const vector>& units_shifts, - double min_wp_score, BamFile* bam_file, - AlignPairs* align_pairs) { +void CacheReadsFromRegion(const Region ®ion, const WhatToCache whatToCache, + const vector> &units_shifts, + double min_wp_score, BamFile *bam_file, + AlignPairs *align_pairs) { // Jump to the target region or the unaligned reads if the chromosome name // is "*" then jump to unaligned reads. if (region.chrom() == "*") { @@ -84,7 +85,7 @@ void CacheReadsFromRegion(const Region& region, const WhatToCache whatToCache, // permitted. align.region = region.AsString(); - AlignPair& frag = (*align_pairs)[align.name]; + AlignPair &frag = (*align_pairs)[align.name]; if (align.IsFirstMate()) { if (frag[0].name.empty()) { @@ -126,11 +127,11 @@ void CacheReadsFromRegion(const Region& region, const WhatToCache whatToCache, } } -bool CheckAnchoredIrrs(const BamFile& bam_file, const Parameters& parameters, - const Region& target_neighborhood, - const RepeatSpec& repeat_spec, const Align& read_align, - const Align& mate_align, - const vector>& units_shifts) { +bool CheckAnchoredIrrs(const BamFile &bam_file, const Parameters ¶meters, + const Region &target_neighborhood, + const RepeatSpec &repeat_spec, const Align &read_align, + const Align &mate_align, + const vector> &units_shifts) { const size_t min_mapq = parameters.min_anchor_mapq(); // Check if the read has low mapping quality and it is an off-target // anchor; such reads are reported but not included into the calculation. @@ -172,10 +173,10 @@ bool CheckAnchoredIrrs(const BamFile& bam_file, const Parameters& parameters, /*****************************************************************************/ -void FillinMates(BamFile& bam_file, AlignPairs& align_pairs) { +void FillinMates(BamFile &bam_file, AlignPairs &align_pairs) { for (AlignPairs::iterator it = align_pairs.begin(); it != align_pairs.end(); ++it) { - AlignPair& frag = it->second; + AlignPair &frag = it->second; // At least one read must always be filled out. assert(!frag[0].name.empty() || !frag[1].name.empty()); @@ -184,14 +185,20 @@ void FillinMates(BamFile& bam_file, AlignPairs& align_pairs) { // Get references for exisitng Align and the one that // needs to be filled in and then process them below. This // is done to avoid code duplication. - Align& existing_al = frag[0].name.empty() ? frag[1] : frag[0]; - Align& missing_al = frag[0].name.empty() ? frag[0] : frag[1]; + Align &existing_al = frag[0].name.empty() ? frag[1] : frag[0]; + Align &missing_al = frag[0].name.empty() ? frag[0] : frag[1]; + + // Do not recover nearby mates. + if ((existing_al.chrom_id == existing_al.mate_chrom_id) && + (std::abs(existing_al.pos - existing_al.mate_pos) < 1000)) { + continue; + } if (bam_file.GetAlignedMate(existing_al, missing_al)) { // region typically stores the position of the region from // which a read was cached. Since this read was not cached // from anywhere, we store its position in the region field. - const vector& refVec = bam_file.ref_vec(); + const vector &refVec = bam_file.ref_vec(); assert(missing_al.chrom_id < refVec.size()); missing_al.region = Region(refVec[missing_al.chrom_id], missing_al.pos + 1, missing_al.pos + 2) @@ -205,10 +212,10 @@ void FillinMates(BamFile& bam_file, AlignPairs& align_pairs) { } // TODO(edolzhenko): Move cache creation out of the function. -bool CountUnalignedIrrs(BamFile& bam_file, const Parameters& parameters, - size_t& numUnalignedIRRs, - const vector>& units_shifts, - vector* irr_rep_aligns) { +bool CountUnalignedIrrs(BamFile &bam_file, const Parameters ¶meters, + size_t &numUnalignedIRRs, + const vector> &units_shifts, + vector *irr_rep_aligns) { numUnalignedIRRs = 0; AlignPairs align_pairs; @@ -223,7 +230,7 @@ bool CountUnalignedIrrs(BamFile& bam_file, const Parameters& parameters, for (AlignPairs::const_iterator it = align_pairs.begin(); it != align_pairs.end(); ++it) { - const AlignPair& frag = it->second; + const AlignPair &frag = it->second; double topScore1 = MatchRepeatRc(units_shifts, frag[0].bases, frag[0].quals); @@ -259,8 +266,8 @@ bool CountUnalignedIrrs(BamFile& bam_file, const Parameters& parameters, irr_rep_aligns->push_back(rep_align); } else if (is_irr1 || is_irr2) { ++numUnalignedIRRs; - const Align& irr = is_irr1 ? frag[0] : frag[1]; - const Align& mate = is_irr1 ? frag[1] : frag[0]; + const Align &irr = is_irr1 ? frag[0] : frag[1]; + const Align &mate = is_irr1 ? frag[1] : frag[0]; RepeatAlign rep_align; rep_align.name = it->first; @@ -278,16 +285,16 @@ bool CountUnalignedIrrs(BamFile& bam_file, const Parameters& parameters, } } -size_t CountAlignedIrr(const BamFile& bam_file, const Parameters& parameters, - const AlignPairs& align_pairs, - map& numIrrConfRegion, - const vector>& units_shifts, - vector* irr_rep_aligns) { +size_t CountAlignedIrr(const BamFile &bam_file, const Parameters ¶meters, + const AlignPairs &align_pairs, + map &numIrrConfRegion, + const vector> &units_shifts, + vector *irr_rep_aligns) { size_t irr_count = 0; bool isFwdKmer = false; for (AlignPairs::const_iterator it = align_pairs.begin(); it != align_pairs.end(); ++it) { - const AlignPair& frag = it->second; + const AlignPair &frag = it->second; double first_top_score = MatchRepeatRc(units_shifts, frag[0].bases, frag[0].quals); if (!frag[0].bases.empty()) { @@ -328,20 +335,20 @@ size_t CountAlignedIrr(const BamFile& bam_file, const Parameters& parameters, return irr_count; } -void CountAnchoredIrrs(const BamFile& bam_file, const Parameters& parameters, - const Region& targetNhood, const RepeatSpec& repeat_spec, - const unordered_set& ontarget_frag_names, - AlignPairs& align_pairs, size_t& numAnchoredIrrs, - const vector>& units_shifts, - vector* anchored_irrs) { +void CountAnchoredIrrs(const BamFile &bam_file, const Parameters ¶meters, + const Region &targetNhood, const RepeatSpec &repeat_spec, + const unordered_set &ontarget_frag_names, + AlignPairs &align_pairs, size_t &numAnchoredIrrs, + const vector> &units_shifts, + vector *anchored_irrs) { numAnchoredIrrs = 0; // Check fragments from the target locus. for (unordered_set::const_iterator it = ontarget_frag_names.begin(); it != ontarget_frag_names.end(); ++it) { const AlignPair frag = align_pairs[*it]; - const Align& al1 = frag[0]; - const Align& al2 = frag[1]; + const Align &al1 = frag[0]; + const Align &al2 = frag[1]; // Counts fragments for which both mates are present. if (!al1.name.empty() && !al2.name.empty()) { @@ -358,8 +365,8 @@ void CountAnchoredIrrs(const BamFile& bam_file, const Parameters& parameters, al1, units_shifts); if (is_mate1_anchored_irr || is_mate2_anchored_irr) { - const Align& irr = is_mate1_anchored_irr ? al1 : al2; - const Align& anchor = is_mate1_anchored_irr ? al2 : al1; + const Align &irr = is_mate1_anchored_irr ? al1 : al2; + const Align &anchor = is_mate1_anchored_irr ? al2 : al1; ++numAnchoredIrrs; RepeatAlign rep_align; From 51b9178fcd4430a74ca936eb2ff5e97c2c50d7c0 Mon Sep 17 00:00:00 2001 From: edolzhenko Date: Fri, 7 Apr 2017 10:42:05 -0700 Subject: [PATCH 2/3] Set status of all reads to kNoCheck --- src/read_alignment.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/read_alignment.cc b/src/read_alignment.cc index 140897a..b740601 100644 --- a/src/read_alignment.cc +++ b/src/read_alignment.cc @@ -36,13 +36,13 @@ bool GetAlignFromHtsAlign(bam1_t* hts_align_ptr, Align& align, align.name = bam_get_qname(hts_align_ptr); align.flag = hts_align_ptr->core.flag; + align.status = kNoCheck; if (assumeUnaligned) { align.chrom_id = -1; // since unaligned align.pos = -1; // since unaligned align.mapq = 0; // since unaligned align.mate_chrom_id = -1; // since unaligned align.mate_pos = -1; // since unaligned - align.status = kNoCheck; } else { align.chrom_id = hts_align_ptr->core.tid; align.pos = hts_align_ptr->core.pos; From 035d597edca34cc8c405fc08e4e920597bc075fe Mon Sep 17 00:00:00 2001 From: edolzhenko Date: Fri, 7 Apr 2017 10:43:12 -0700 Subject: [PATCH 3/3] Bump version number --- include/version.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/version.h b/include/version.h index da22b58..584cf5d 100644 --- a/include/version.h +++ b/include/version.h @@ -25,6 +25,6 @@ #include -const std::string kProgramVersion = "Expansion Hunter v2.0.8"; +const std::string kProgramVersion = "Expansion Hunter v2.0.9"; #endif // VERSION_H_