From 4823b9eac5aa5a83d22302df3c4d734ce51de349 Mon Sep 17 00:00:00 2001 From: Sergei L Kosakovsky Pond Date: Mon, 2 Apr 2018 10:31:56 -0400 Subject: [PATCH 1/3] Adding JSON output for sequence counts in files --- src/TN93.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/TN93.cpp b/src/TN93.cpp index 645d949..1650fcd 100644 --- a/src/TN93.cpp +++ b/src/TN93.cpp @@ -523,6 +523,14 @@ int main(int argc, const char *argv[]) { << endl; (*outStream) << "\t\"Links found\" : " << foundLinks << ',' << endl; (*outStream) << "\t\"Maximum distance\" : " << max[0] << ',' << endl; + + if (args.input2 == NULL) { + (*outStream) << "\t\"Sequences\" : " << sequenceCount << ',' << endl; + } else { + (*outStream) << "\t\"Sequences in first file\" : " << seqLengthInFile1 << ',' << endl; + (*outStream) << "\t\"Sequences in second file\" : " << seqLengthInFile2 << ',' << endl; + } + if (do_fst) { const char *keys[4] = {"File 1", "File 2", "Between", "Combined"}; for (unsigned long k = 0; k < 3; k++) { From bb79f2327881422ca2be071d696099fbf9cfea3d Mon Sep 17 00:00:00 2001 From: Sergei L Kosakovsky Pond Date: Wed, 4 Apr 2018 16:18:51 -0400 Subject: [PATCH 2/3] Resolving the segfault issue on very gappy data --- src/TN93.cpp | 11 ++++++++--- src/tn93_shared.cc | 20 ++++++++++++++++---- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/src/TN93.cpp b/src/TN93.cpp index 1650fcd..564f83d 100644 --- a/src/TN93.cpp +++ b/src/TN93.cpp @@ -323,6 +323,7 @@ int main(int argc, const char *argv[]) { */ bool cross_comparison_only = (args.input2 && !do_fst); + unsigned long system_CPU_count = omp_get_max_threads(); #pragma omp parallel shared( \ skipped_comparisons, sequence_descriptors, resolutionOption, foundLinks, \ @@ -330,7 +331,7 @@ int main(int argc, const char *argv[]) { args, nameLengths, names, pairwise, percentDone, cerr, max, randFlag, \ distanceMatrix, upperBound, seqLengthInFile1, seqLengthInFile2, mean, \ randSeqs, weighted_counts, do_fst, randomized_fst, randomized_idx, \ - recounts, cross_comparison_only, report_self) + recounts, cross_comparison_only, report_self) { @@ -341,8 +342,9 @@ int main(int argc, const char *argv[]) { } } -#pragma omp for schedule(dynamic) +#pragma omp for schedule(guided) for (long seq1 = 0; seq1 < upperBound; seq1++) { + long mapped_id = randomized_fst ? randomized_idx.value(seq1) : (randSeqs ? randSeqs[seq1] : seq1); @@ -402,6 +404,8 @@ int main(int argc, const char *argv[]) { } } + + double thisD = sequence_descriptors ? computeTN93(s1, stringText(sequences, seqLengths, mapped_id2), @@ -415,6 +419,7 @@ int main(int argc, const char *argv[]) { args.overlap, &(histogram_counts[which_bin][0]), HISTOGRAM_SLICE, HISTOGRAM_BINS, weighted_count); + if (thisD >= -1.e-10 && thisD <= args.distance) { local_links_found += weighted_count; // char *s2 = stringText(sequences, seqLengths, seq1); @@ -488,7 +493,7 @@ int main(int argc, const char *argv[]) { } } } - } + } // end of the parallel loop if (args.do_count == false && args.format == hyphy) { fprintf(args.output, "{"); diff --git a/src/tn93_shared.cc b/src/tn93_shared.cc index 6107e00..696569b 100644 --- a/src/tn93_shared.cc +++ b/src/tn93_shared.cc @@ -360,8 +360,11 @@ long perfect_match (const char* source, char* target, const long sequence_length struct sequence_gap_structure describe_sequence (const char* source, const unsigned long sequence_length, const unsigned long char_count) { sequence_gap_structure result; - unsigned long start_run = sequence_length + 1UL, - end_run = 0L; + long start_run = sequence_length + 1UL, + end_run = 0L; + + result.resolved_end = end_run; + result.resolved_start = start_run; for (unsigned long char_idx = 0UL; char_idx < sequence_length; char_idx++) { unsigned long this_char = source[char_idx]; @@ -397,6 +400,10 @@ struct sequence_gap_structure describe_sequence (const char* source, const unsig result.resolved_end = end_run; } } + + //cout << sequence_length << "/" << char_count << endl << result.first_nongap << "," + // << result.last_nongap << " : " << result.resolved_start << "," << result.resolved_end << endl; + return result; } @@ -432,12 +439,17 @@ double computeTN93 (const char * s1, const char * s2, const unsigned long L, c if (sequence_descriptor1 && sequence_descriptor2 && matchMode != GAPMM) { + +//#pragma omp critical +// cout << "HERE" << endl; + + unsigned long first_nongap = MAX (sequence_descriptor1->first_nongap, sequence_descriptor2->first_nongap), last_nongap = MIN (sequence_descriptor1->last_nongap, sequence_descriptor2->last_nongap), span_start = MAX (sequence_descriptor1->resolved_start, sequence_descriptor2->resolved_start), span_end = MIN (sequence_descriptor1->resolved_end, sequence_descriptor2->resolved_end); - - //cout << first_nongap << " " << last_nongap << " " << span_start << " " << span_end << endl; +//#pragma omp critical +// cout << first_nongap << " " << last_nongap << " " << span_start << " " << span_end << endl; if (span_start > span_end) { From 563d20f6bf87f92728ace7782b9d62c15233ad4c Mon Sep 17 00:00:00 2001 From: Steven Weaver Date: Wed, 11 Apr 2018 12:40:00 -0400 Subject: [PATCH 3/3] removing leftover debugging variable --- src/TN93.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/src/TN93.cpp b/src/TN93.cpp index 564f83d..947b90d 100644 --- a/src/TN93.cpp +++ b/src/TN93.cpp @@ -323,7 +323,6 @@ int main(int argc, const char *argv[]) { */ bool cross_comparison_only = (args.input2 && !do_fst); - unsigned long system_CPU_count = omp_get_max_threads(); #pragma omp parallel shared( \ skipped_comparisons, sequence_descriptors, resolutionOption, foundLinks, \