diff --git a/QoRTs-vignette.pdf b/QoRTs-vignette.pdf index 855c108..4bc25d4 100644 Binary files a/QoRTs-vignette.pdf and b/QoRTs-vignette.pdf differ diff --git a/QoRTs.jar b/QoRTs.jar index a74ccb6..225620e 100644 Binary files a/QoRTs.jar and b/QoRTs.jar differ diff --git a/QoRTs_1.2.26.tar.gz b/QoRTs_1.2.26.tar.gz new file mode 100644 index 0000000..7559adb Binary files /dev/null and b/QoRTs_1.2.26.tar.gz differ diff --git a/README.md b/README.md index 32ed18e..8acd3d0 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ -# QoRTs v1.2.25 -(Compiled Sat Mar 4 13:22:59 EST 2017) +# QoRTs v1.2.26 +(Compiled Tue Mar 7 14:22:03 EST 2017) The [QoRTs software package](http://hartleys.github.io/QoRTs/) is a fast, efficient, and portable multifunction toolkit designed to assist in diff --git a/example-walkthrough.pdf b/example-walkthrough.pdf index 3aec341..bdba04e 100644 Binary files a/example-walkthrough.pdf and b/example-walkthrough.pdf differ diff --git a/src/HartleyUtils/src/main/scala/internalUtils/Reporter.scala b/src/HartleyUtils/src/main/scala/internalUtils/Reporter.scala index 77b9c69..60eaf5d 100644 --- a/src/HartleyUtils/src/main/scala/internalUtils/Reporter.scala +++ b/src/HartleyUtils/src/main/scala/internalUtils/Reporter.scala @@ -275,6 +275,17 @@ object Reporter { loggers.map((logger) => logger.startReport(str,verb)) } + + val warningCount = scala.collection.mutable.Map[String,Int]().withDefault((x : String) => 0); + def warning(str : String, warnType : String = "default", limit : Int = -1){ + if(limit < 0 || warningCount(warnType) < limit){ + reportln(str,"warn"); + } else if(limit > 0 && warningCount(warnType) == limit){ + reportln("(("+limit+"+ warnings of type "+warnType+". Further warnings of this type will be silent.))","warn"); + } + warningCount(warnType) += 1; + } + def error(str : String){ reportln("<====== FATAL ERROR! ======>","error"); reportln("----------------------------","error"); @@ -284,9 +295,7 @@ object Reporter { stackTrace.map((ste) => reportln(" " + ste.toString, "error")) reportln("<==========================>","error"); - closeLogs; - throw new Exception(str); } def error(e : Exception){ @@ -299,21 +308,19 @@ object Reporter { reportln("<==========================>","error"); closeLogs; - throw e; } def closeLogs() { + if(! warningCount.keys.isEmpty){ + reportln("<------->","warn"); + reportln(" Note: "+warningCount.keySet.map(warningCount(_)).sum+" Warnings Thrown:","warn"); + warningCount.keySet.foreach(x => { + reportln(" "+warningCount(x)+"\t"+x,"warn"); + }) + reportln("<------->","warn"); + } loggers.map((logger) => logger.close); loggers = List(); - - inProgressFile match { - case Some(pf) => { - pf.delete(); - } - case None => { - //do nothing! - } - } } } diff --git a/src/HartleyUtils/src/main/scala/internalUtils/genomicAnnoUtils.scala b/src/HartleyUtils/src/main/scala/internalUtils/genomicAnnoUtils.scala index ab4ac5a..a578be0 100644 --- a/src/HartleyUtils/src/main/scala/internalUtils/genomicAnnoUtils.scala +++ b/src/HartleyUtils/src/main/scala/internalUtils/genomicAnnoUtils.scala @@ -16,95 +16,32 @@ object genomicAnnoUtils { /* * Useful classes: */ + + /* - class EfficientGenomeSeqContainerOLD(infiles : Seq[String]){ - val seqIter = new GenomeSeqIterator(infiles); - - def getCurrChrom = seqIter.getCurrChrom; - - var currChrom = seqIter.getCurrChrom; - var bufferStart = 0; - var buffer : String = ""; - var maxBuffer : Int = buffer.length; - - def bufferEnd = bufferStart + buffer.length; - - def getSeqForInterval(chrom : String, start : Int, end : Int) : String = { - if(start < bufferStart){ - error("Illegal backwards-read from genome fasta buffer! Are reads coordinate-sorted?"); - } - if(end > bufferEnd){ - extendBufferTo(end); - } - if(end > bufferEnd){ - error("Illegal attempt to pull from beyond end of chromosome!"); - } - maxBuffer = math.max(maxBuffer,buffer.length); - return buffer.substring(start - bufferStart,end - bufferStart); - } - def shiftBufferTo(chrom : String, start : Int){ - if(chrom == currChrom){ - if(start+1 > bufferEnd){ - extendBufferTo(start+1); - } - if(bufferStart < start){ - buffer = buffer.substring(start - bufferStart); - bufferStart = start; - } //else do nothing. - } else { - val oldChrom = currChrom; - seqIter.goToNextChrom(chrom); - bufferStart = 0; - buffer = seqIter.next._2; - currChrom = seqIter.getCurrChrom; - if(currChrom != chrom) error("Chromosomes mis-ordered in fasta file(s)! Must be given in the same order as found in the BAM file!\n"+ - " Looking for: \""+chrom+"\", found \""+oldChrom+"\" and then "+"\""+currChrom+"\""); - } - maxBuffer = math.max(maxBuffer,buffer.length); - } - def extendBufferTo(end : Int) : Boolean = { - //val initEnd = bufferStartPosition + buffer.length; - while(seqIter.hasNext && bufferEnd <= end){ - val (chr,nxt) = seqIter.next; - if(nxt == ""){ - return false; - } - buffer = buffer + nxt; - maxBuffer = math.max(maxBuffer,buffer.length); - } - return true; - } - - def reportBufferStatus { - reportln("EfficientGenomeSeqContainer Status: ","debug"); - reportln(" "+currChrom+":"+bufferStart+"-"+bufferEnd,"debug"); - reportln(" Buffer size: "+buffer.length,"debug"); - reportln(" Max Buffer size: "+maxBuffer,"debug"); - } - }*/ - - - class EfficientGenomeSeqContainer(infiles : Seq[String]){ - var initialReader = internalUtils.fileUtils.getLinesSmartUnzipFromSeq(infiles); - - var currChrom = initialReader.next().substring(1); + * Useful classes: + */ + abstract class EfficientGenomeSeqContainer { + def currIter : Iterator[String]; + //var remainderIter : Iterator[String]; + def switchToChrom(chrom : String); + var currChrom = ""; var bufferStart = 0; var buffer : Vector[String] = Vector[String](); val blockSize = 1000; var maxBuffer : Int = buffer.length; - var chromList : List[String] = List[String](currChrom); - - var (currIter,remainderIter) = initialReader.span(line => line.charAt(0) != '>'); - currIter = currIter.map(_.toUpperCase()); + var chromList : List[String] = List[String](); def bufferEnd = bufferStart + buffer.length * blockSize; - //def bufferEnd = bufferStart + buffer.map(_.length).sum; - //def getBlockIdxForPosition(pos : Int) : Int = { - // return (pos - bufferStart) / blockSize; - //} + + def clearBuffer(){ + residualBuffer = ""; + bufferStart = 0; + buffer = Vector[String](); + } def getSeqForInterval(chrom : String, start : Int, end : Int, truncate : Boolean = false) : String = { if(chrom != currChrom) { @@ -136,16 +73,10 @@ object genomicAnnoUtils { } else { return buffer(firstBlockIdx).substring(startOffset,blockSize) + buffer.slice(firstBlockIdx+1,lastBlockIdx).mkString("") + buffer(lastBlockIdx).substring(0,endOffset); } - //buffer.substring(start - bufferStart,end - bufferStart); } var residualBuffer = ""; def getNextBlock() : String = { - /*if(residualBuffer.length > blockSize){ - val nextBlock = residualBuffer.substring(0,blockSize); - residualBuffer = residualBuffer.substring(blockSize); - return(nextBlock); - }*/ var nextBlock = residualBuffer; while(currIter.hasNext && nextBlock.length < blockSize){ nextBlock = nextBlock + currIter.next; @@ -176,52 +107,10 @@ object genomicAnnoUtils { buffer = buffer.drop(blockIdx); } //else do nothing. } else { - if(remainderIter.hasNext()){ - currChrom = remainderIter.next.substring(1); - val iterPair = remainderIter.span(line => line.charAt(0) != '>'); - currIter = iterPair._1.map(_.toUpperCase()); - remainderIter = iterPair._2; - } else { - if(allowRecycle){ - reportln("Returning to start of genome FASTA file. NOTE: for optimal performance, sort the FASTA file so that the chromosomes appear in the same order as in the BAM files.","note"); - initialReader = internalUtils.fileUtils.getLinesSmartUnzipFromSeq(infiles); - currChrom = initialReader.next().substring(1); - val iterPair = remainderIter.span(line => line.charAt(0) != '>'); - currIter = iterPair._1.map(_.toUpperCase()); - remainderIter = iterPair._2; - - shiftBufferTo(chrom, start, allowRecycle = false); - } else { - error("FATAL ERROR: Cannot find chromosome \""+chrom+"\" in genome FASTA file!") - } - } - - residualBuffer = ""; - bufferStart = 0; - buffer = Vector[String](); - - if(currChrom != chrom){ - reportln("SKIPPING FASTA SEQUENCE FOR CHROMOSOME "+currChrom+" (probable cause: 0 reads found on chromosome, or bam/fasta mis-sorted)!","note"); - //if(chromList.contains(chrom)){ - // error("FATAL ERROR: BAM and BED files do not have the same chromosome ordering or BAM file is not properly sorted. Encountered BAM read from chromosome \""+chrom+"\", but have already passed this chromosome in the BED file."); - //} - } else { - reportln("Shifting to chromosome: "+currChrom+"!","note"); - } - chromList = chromList ++ List[String](currChrom); - + switchToChrom(chrom); shiftBufferTo(chrom,start); } } - /*var bufferLengthAlertLimit = 10500; - val bufferLengthAlertUnit = 1000; - def checkBufferStat { - maxBuffer = math.max(maxBuffer,buffer.length); - if(maxBuffer > bufferLengthAlertLimit){ - reportBufferStatus; - bufferLengthAlertLimit += bufferLengthAlertUnit; - } - }*/ def extendBufferTo(end : Int) { while(currIter.hasNext && bufferEnd < end){ @@ -256,135 +145,68 @@ object genomicAnnoUtils { } } - - class EfficientGenomeSeqContainer_OLD(infiles : Seq[String]){ - val initialReader = internalUtils.fileUtils.getLinesSmartUnzipFromSeq(infiles); - - var currChrom = initialReader.next().substring(1); - var bufferStart = 0; - var buffer : String = ""; - var maxBuffer : Int = buffer.length; - var chromList : List[String] = List[String](); - - - var (currIter,remainderIter) = initialReader.span(line => line.charAt(0) != '>'); - currIter = currIter.map(_.toUpperCase()); - - def bufferEnd = bufferStart + buffer.length; - - def getSeqForInterval(chrom : String, start : Int, end : Int) : String = { - if(chrom != currChrom) { - error("ERROR: EfficientGenomeSeqContainer: requested sequence for chromosome other than current chromosome!"); - } - - if(start < bufferStart){ - reportln("ERROR: EfficientGenomeSeqContainer: Illegal request for sequence prior to current buffer limits!\n"+ - " request: "+chrom+":"+start+"-"+end,"note"); - reportBufferStatus; - error("ERROR: EfficientGenomeSeqContainer: Illegal request for sequence prior to current buffer limits!"); - } - if(bufferEnd < end){ - extendBufferTo(end); - } - - buffer.substring(start - bufferStart,end - bufferStart); + def buildEfficientGenomeSeqContainer(infiles : Seq[String]) : EfficientGenomeSeqContainer = { + if(infiles.length == 1){ + new EfficientGenomeSeqContainer_MFA(infiles.head); + } else { + new EfficientGenomeSeqContainer_FAList(infiles); } - def shiftBufferTo(chrom : String, start : Int){ - if(chrom == currChrom){ - if(bufferEnd < start){ - //quickly skip through a region without compiling a sequence as you go: - bufferStart = bufferEnd; - buffer = ""; - while(currIter.hasNext && bufferEnd < start){ - bufferStart = bufferEnd; - buffer = currIter.next; - } - maxBuffer = math.max(maxBuffer,buffer.length); - } else if(bufferStart < start){ - buffer = buffer.substring(start - bufferStart); - bufferStart = start; - maxBuffer = math.max(maxBuffer,buffer.length); - } //else do nothing. - } else { - currChrom = remainderIter.next.substring(1); - bufferStart = 0; - buffer = ""; - val iterPair = remainderIter.span(line => line.charAt(0) != '>'); - currIter = iterPair._1.map(_.toUpperCase()); - remainderIter = iterPair._2; - - if(currChrom != chrom){ - reportln("SKIPPING FASTA SEQUENCE FOR CHROMOSOME "+currChrom+" (probable cause: 0 reads found on chromosome, or bam/fasta mis-sorted)!","note"); - if(chromList.contains(chrom)){ - error("FATAL ERROR: BAM and BED files do not have the same chromosome ordering or BAM file is not properly sorted. Encountered BAM read from chromosome \""+chrom+"\", but have already passed this chromosome in the BED file."); - } + } + + class EfficientGenomeSeqContainer_MFA(infile : String) extends EfficientGenomeSeqContainer { + def switchToChrom(chrom : String){ + var iterPair = remainderIter.span(line => line != (">"+chrom)); + if(iterPair._2.hasNext){ + iterPair = iterPair._2.drop(1).span(line => line.charAt(0) != '>'); } else { - reportln("Shifting to chromosome: "+currChrom+"!","note"); + iterPair = internalUtils.fileUtils.getLinesSmartUnzip(infile).span(line => line != (">"+chrom)); + if(iterPair._2.hasNext){ + reportln("Returning to start of genome FASTA file. NOTE: for optimal performance, sort the FASTA file so that the chromosomes appear in the same order as in the BAM files.","note"); + iterPair = iterPair._2.drop(1).span(line => line.charAt(0) != '>'); + } else { + error("FATAL ERROR: Cannot find chromosome \""+chrom+"\" in genome FASTA file!") + } } - chromList = chromList ++ List[String](currChrom); - - shiftBufferTo(chrom,start); - } - } - var bufferLengthAlertLimit = 10500; - val bufferLengthAlertUnit = 1000; - def checkBufferStat { - maxBuffer = math.max(maxBuffer,buffer.length); - if(maxBuffer > bufferLengthAlertLimit){ - reportBufferStatus; - bufferLengthAlertLimit += bufferLengthAlertUnit; - } + reportln("Switching to Chromosome: " + chrom,"debug"); + currentIter = iterPair._1.map(_.toUpperCase()); + remainderIter = iterPair._2; + clearBuffer(); + currChrom = chrom; } - def extendBufferTo(end : Int) { - while(currIter.hasNext && bufferEnd < end){ - buffer = buffer + currIter.next; - } - if(bufferEnd < end){ - error("ERROR: EfficientGenomeSeqContainer: Attempted to extend buffer beyond chromosome span!"); - } - maxBuffer = math.max(maxBuffer,buffer.length); - } + var initialReader = internalUtils.fileUtils.getLinesSmartUnzip(infile); + currChrom = initialReader.next().substring(1); + chromList = List[String](currChrom); - def reportBufferStatus { - reportln(" [GenomeSeqContainer Status: "+"buf:("+currChrom+":"+bufferStart+"-"+bufferEnd+") "+"n="+buffer.length+", "+"MaxSoFar="+maxBuffer+"]","debug"); - } + var (currentIter,remainderIter) = initialReader.span(line => line.charAt(0) != '>'); + currentIter = currentIter.map(_.toUpperCase()); + def currIter = currentIter; } - class GenomeSeqIterator(infiles : Seq[String]) extends Iterator[(String,String)]{ - var reader = internalUtils.fileUtils.getLinesSmartUnzipFromSeq(infiles); - var curr = reader.next(); - var chrom = curr.substring(1); - - def getCurrChrom : String = chrom; - def hasNext : Boolean = reader.hasNext; - def next : (String,String) = { - curr = reader.next(); - if(curr.charAt(0) != '>'){ - return (chrom,curr.toUpperCase()); - } else { - val oldChrom = chrom; - chrom = curr.substring(1); - return (oldChrom,""); - } - } - def goToNextChrom(chr : String) : Boolean = { - if(chr == chrom){ - return true; - } else { - while(hasNext){ - curr = reader.next(); - if(curr.charAt(0) == '>'){ - chrom = curr.substring(1); - return true; - } + class EfficientGenomeSeqContainer_FAList(infiles : Seq[String]) extends EfficientGenomeSeqContainer { + val fastaMap : Map[String,String] = infiles.map(infile => { + (internalUtils.fileUtils.getLinesSmartUnzip(infile).next().tail,infile) + }).toMap; + var currentIter : Iterator[String] = Iterator() + + def currIter = currentIter; + def switchToChrom(chrom : String){ + if(fastaMap.containsKey(chrom)){ + currentIter = internalUtils.fileUtils.getLinesSmartUnzip(fastaMap(chrom)).drop(1).map(_.toUpperCase()); + clearBuffer(); + currChrom = chrom; + } else { + error("FATAL ERROR: Cannot find chromosome \""+chrom+"\" in genome FASTA list!") } - return false; - } - } } + /* + * + */ + + + object GenomicArrayOfSets { def apply[B](isStranded : Boolean) : GenomicArrayOfSets[B] = { diff --git a/src/HartleyUtils/src/main/scala/internalUtils/stdUtils.scala b/src/HartleyUtils/src/main/scala/internalUtils/stdUtils.scala index 1c4ff2c..5d2b279 100644 --- a/src/HartleyUtils/src/main/scala/internalUtils/stdUtils.scala +++ b/src/HartleyUtils/src/main/scala/internalUtils/stdUtils.scala @@ -14,8 +14,62 @@ object stdUtils { * Utility Classes: */ - - + + /* + * Untested: + */ + def getOrdering[A](x : Seq[A])(implicit ord : math.Ordering[A]) : Seq[Int] = { + x.zipWithIndex.sorted(new Ordering[(A,Int)]{ + def compare(x : (A,Int), y : (A,Int)) : Int = ord.compare(x._1,y._1); + }).map(_._2); + } + + def getQuantileCutoffs[A](x : Seq[A], quantiles : Seq[Double])(implicit ord : math.Ordering[A]) : Seq[Option[A]] = { + val xsort = x.sorted(ord); + val qsort = quantiles.sorted; + return qsort.zipWithIndex.map{case (q,i) => { + val cutoffInt = (q * xsort.length.toDouble).toInt; + if(cutoffInt < xsort.length){ + Some(xsort(cutoffInt)); + } else { + None; + } + }} + } + def splitByQuantile[A](x : Seq[A], quantiles : Seq[Double])(implicit ord : math.Ordering[A]) : Seq[Seq[A]] = { + val xsort = x.sorted(ord); + val len = xsort.length.toDouble; + val qsort = quantiles.sorted; + val ividx = qsort.init.zip(qsort.tail).map{case (s,e) => { + ((s * len).toInt,(e*len).toInt) + }}.map{case (s,e) => { + (if(s < xsort.length){ xsort.indexOf(xsort(s)) }else{ xsort.length }, + if(e < xsort.length){ xsort.indexOf(xsort(e)) }else{ xsort.length }) + }} + ividx.map{case(s,e) => { + xsort.slice(s,e); + }} + } + def splitListByQuantileX[A,B](items : Seq[B], x : Seq[A], quantiles : Seq[Double])(implicit ord : math.Ordering[A]) : Seq[Set[B]] = { + val ordering = getOrdering(x); + val xsort = ordering.map(x(_)); + val itemSort = ordering.map(items(_)); + val len = xsort.length.toDouble; + val qsort = quantiles.sorted; + val ividx = qsort.init.zip(qsort.tail).map{case (s,e) => { + ((s * len).toInt,(e*len).toInt) + }}.map{case (s,e) => { + (if(s < xsort.length){ xsort.indexOf(xsort(s)) }else{ xsort.length }, + if(e < xsort.length){ xsort.indexOf(xsort(e)) }else{ xsort.length }) + }} + ividx.map{ case(s,e) => { + itemSort.slice(s,e).toSet + }} + } + /* + * END untested + */ + def generalizedTo(f : Int, t : Int) : Seq[Int] = { if(f == t) return (f until t); else if(f < t) return (f to t); diff --git a/src/HartleyUtils/src/main/scala/qcUtils/qcGetGeneCounts.scala b/src/HartleyUtils/src/main/scala/qcUtils/qcGetGeneCounts.scala index 83e077d..5d212d7 100644 --- a/src/HartleyUtils/src/main/scala/qcUtils/qcGetGeneCounts.scala +++ b/src/HartleyUtils/src/main/scala/qcUtils/qcGetGeneCounts.scala @@ -115,11 +115,11 @@ object qcGetGeneCounts { /* * FIX ME FOR STRANDEDNESS!!!! */ - def makeGeneIntervalMap(intervalBreaks : Seq[Double], stdGeneArray : GenomicArrayOfSets[String], strandedGeneArray : GenomicArrayOfSets[String]) : Map[String, Vector[TreeSet[GenomicInterval]]] = { + def makeGeneIntervalMap(intervalBreaks : Seq[Double], stdGeneArray : GenomicArrayOfSets[String], strandedGeneArray : GenomicArrayOfSets[String], geneMapStrict : Map[String,TreeSet[GenomicInterval]]) : Map[String, Vector[TreeSet[GenomicInterval]]] = { //val initialMap = geneSet.foldLeft( new scala.collection.immutable.HashMap[String, TreeSet[GenomicInterval] ]() )((soFar,curr) =>{ // soFar + ((curr, new TreeSet[GenomicInterval]() )); //}) - val geneMap = helper_calculateGeneAssignmentMap_strict(stdGeneArray, strandedGeneArray); + val geneMap = geneMapStrict // helper_calculateGeneAssignmentMap_strict(stdGeneArray, strandedGeneArray); reportln("making makeGeneIntervalMap for geneBody calculations. Found: " + geneMap.size + " acceptable genes for gene-body analysis.","debug"); val geneLengths = geneMap.map((cg) => { @@ -155,51 +155,7 @@ object qcGetGeneCounts { // (geneID, new Array[Int](intervalCount)); //}).toMap; } - - private def helper_calculateGeneAssignmentMap_strict(stdGeneArray : GenomicArrayOfSets[String], strandedGeneArray : GenomicArrayOfSets[String]) : Map[String, TreeSet[GenomicInterval]] = { - val badGeneSet = stdGeneArray.getSteps.foldLeft(Set[String]())( (soFar, curr) =>{ - val (iv, geneSet) = curr; - if(geneSet.size > 1){ - soFar ++ geneSet.toSet; - } else { - soFar; - } - }); - //val allGeneSet = geneArray.getSteps.foldLeft(Set[String]())( (soFar, curr) =>{ - // soFar ++ curr._2.toSet; - //}); - - reportln("helper_calculateGeneAssignmentMap_strict. Found: " + strandedGeneArray.getValueSet.size + " genes in the supplied annotation.","debug"); - reportln("helper_calculateGeneAssignmentMap_strict. Found: " + badGeneSet.size + " genes with ambiguous segments.","debug"); - - val buf = strandedGeneArray.getSteps.foldLeft( Map[String, TreeSet[GenomicInterval] ]() )( (soFar, curr) => { - val currIv = curr._1; - val currGeneSet = curr._2; - if(currGeneSet.size == 1){ - val currGene = currGeneSet.head; - if(! badGeneSet.contains(currGene)){ - soFar.get(currGene) match { - case Some(ts : TreeSet[GenomicInterval]) => soFar + ((currGene, ts + currIv )); - case None => soFar + ((currGene, TreeSet[GenomicInterval](currIv) )); - } - } else { - soFar; - } - } else { - soFar; - } - }) - reportln("helper_calculateGeneAssignmentMap_strict. Found: " + buf.size + " genes after first-pass filtering","debug"); - - return buf.filter( (curr) => { - val (geneID, geneTree) = curr; - if(geneTree.size == 0) false; - else { - geneTree.forall( (c) => geneTree.head.chromName == c.chromName && geneTree.head.strand == c.strand ); - } - }); - } /* * Old note: FIX ME FOR STRANDEDNESS!!!! @@ -271,7 +227,11 @@ object qcGetGeneCounts { val default_coverageLevelThresholds = Seq(("1.bottomHalf",0.5),("2.upperMidQuartile",0.75),("3.75to90",0.9),("4.high",1.0)); //UNFINISHED? - def geneBody_calculateGeneBodyCoverage_summaries(writer : WriterUtil, writer2 : WriterUtil, geneBody_intervalBreaks : Seq[Double], coverageLevelThresholds : Seq[(String,Double)], geneBody_CoverageCountArrays : GenMap[String,Array[Int]], geneCounts : scala.collection.mutable.Map[String,Int]){ + def geneBody_calculateGeneBodyCoverage_summaries(writer : WriterUtil, writer2 : WriterUtil, + geneBody_intervalBreaks : Seq[Double], + coverageLevelThresholds : Seq[(String,Double)], + geneBody_CoverageCountArrays : GenMap[String,Array[Int]], + geneCounts : scala.collection.mutable.Map[String,Int]){ val geneBody_IntervalCount = geneBody_intervalBreaks.length - 1; val totalGeneBodyCoverage_simple = geneBody_CoverageCountArrays.foldLeft(repToSeq(0,geneBody_IntervalCount))((sofar, currPair) =>{ (0 until sofar.length).map(i => { @@ -408,61 +368,74 @@ class qcGetGeneCounts( stranded : Boolean, val geneArray : GenomicArrayOfSets[String] = anno_holder.geneArray; val strandedGeneArray : GenomicArrayOfSets[String] = anno_holder.strandedGeneArray; - val geneBody_IntervalCount : Int = geneBodyIntervalCount; - val geneBody_intervalBreaks = (0 to geneBody_IntervalCount).map(_.toDouble / geneBody_IntervalCount.toDouble).toSeq + lazy val geneBody_IntervalCount : Int = geneBodyIntervalCount; + lazy val geneBody_intervalBreaks = (0 to geneBody_IntervalCount).map(_.toDouble / geneBody_IntervalCount.toDouble).toSeq //Major change: fix genebody data for unstranded data: - val geneBody_CoverageIntervalMap = qcGetGeneCounts.makeGeneIntervalMap(geneBody_intervalBreaks, geneArray, strandedGeneArray); + lazy val geneBody_CoverageIntervalMap = qcGetGeneCounts.makeGeneIntervalMap(geneBody_intervalBreaks, geneArray, strandedGeneArray, anno_holder.geneMapStrict); //val geneBody_CoverageIntervalMap = qcGetGeneCounts.makeGeneIntervalMap(geneBody_intervalBreaks, strandedGeneArray); ////reportln("making geneBody_CoverageIntervalMap for geneBody calculations. Found: " + geneBody_CoverageIntervalMap.size + " acceptable genes.","debug"); - val geneBody_CoverageCountArrays : scala.collection.mutable.Map[String,Array[Int]] = qcGetGeneCounts.makeGeneBodyCoverageCountArrays(geneBody_IntervalCount, geneBody_CoverageIntervalMap.keys); + lazy val geneBody_CoverageCountArrays : scala.collection.mutable.Map[String,Array[Int]] = qcGetGeneCounts.makeGeneBodyCoverageCountArrays(geneBody_IntervalCount, geneBody_CoverageIntervalMap.keys); + + lazy val geneLengthMap = anno_holder.geneMapStrict.map((cg) => { + val currGene : String = cg._1; + val currIvSet : TreeSet[GenomicInterval] = cg._2; + (currGene, currIvSet.foldLeft(0)((sum,curr) => sum + (curr.end - curr.start) )); + }); + + val geneSizeQuantiles : Vector[Double] = Vector(0,0.25,0.5,0.75,1.0); + + lazy val geneBody_geneSizeQuartiles : Vector[Set[String]] = { + val geneList = geneLengthMap.keySet.toVector; + splitListByQuantileX(geneList,geneList.map(geneLengthMap(_)),geneSizeQuantiles).toVector + } //val mapLocation_CDS : GenomicArrayOfSets[String] - val geneArea_cdsArray = anno_holder.qcGetGeneCounts_cdsArray; - val geneArea_intronsArray = anno_holder.qcGetGeneCounts_intronArray; + lazy val geneArea_cdsArray = anno_holder.qcGetGeneCounts_cdsArray; + lazy val geneArea_intronsArray = anno_holder.qcGetGeneCounts_intronArray; //INITIALIZE COUNTERS: - val geneCounts : scala.collection.mutable.Map[String,Int] = qcGtfAnnotationBuilder.initializeCounter[String](geneArray.getValueSet); + lazy val geneCounts : scala.collection.mutable.Map[String,Int] = qcGtfAnnotationBuilder.initializeCounter[String](geneArray.getValueSet); //val utilCounts : scala.collection.mutable.Map[String,Int] = qcGtfAnnotationBuilder.initializeCounter[String](Set("_no_feature","_ambiguous")); - val geneArea_cdsCounts : scala.collection.mutable.Map[String,Int] = qcGtfAnnotationBuilder.initializeCounter[String](geneArray.getValueSet); + lazy val geneArea_cdsCounts : scala.collection.mutable.Map[String,Int] = qcGtfAnnotationBuilder.initializeCounter[String](geneArray.getValueSet); - val spanArray : GenomicArrayOfSets[String] = if(calcDetailedGeneCounts){ + lazy val spanArray : GenomicArrayOfSets[String] = if(calcDetailedGeneCounts){ anno_holder.qcGetGeneCounts_spanArray; } else { GenomicArrayOfSets[String](stranded); } //Advanced gene counts: - val geneArea_intronsCounts : scala.collection.mutable.Map[String,Int] = if(calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); + lazy val geneArea_intronsCounts : scala.collection.mutable.Map[String,Int] = if(calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); else scala.collection.mutable.Map[String,Int](); - val geneArea_intronsAmbig : scala.collection.mutable.Map[String,Int] = if(calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); + lazy val geneArea_intronsAmbig : scala.collection.mutable.Map[String,Int] = if(calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); else scala.collection.mutable.Map[String,Int](); - val geneArea_nearCounts : scala.collection.mutable.Map[String,Int] = if(calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); + lazy val geneArea_nearCounts : scala.collection.mutable.Map[String,Int] = if(calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); else scala.collection.mutable.Map[String,Int](); - val geneArea_nearAmbig : scala.collection.mutable.Map[String,Int] = if(calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); + lazy val geneArea_nearAmbig : scala.collection.mutable.Map[String,Int] = if(calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); else scala.collection.mutable.Map[String,Int](); - val geneArea_farCounts : scala.collection.mutable.Map[String,Int] = if(calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); + lazy val geneArea_farCounts : scala.collection.mutable.Map[String,Int] = if(calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); else scala.collection.mutable.Map[String,Int](); - val geneArea_farAmbig : scala.collection.mutable.Map[String,Int] = if(calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); + lazy val geneArea_farAmbig : scala.collection.mutable.Map[String,Int] = if(calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); else scala.collection.mutable.Map[String,Int](); - val geneArea_wrongStrandCts : scala.collection.mutable.Map[String,Int] = if(stranded && calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](geneArray.getValueSet); + lazy val geneArea_wrongStrandCts : scala.collection.mutable.Map[String,Int] = if(stranded && calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](geneArray.getValueSet); else scala.collection.mutable.Map[String,Int](); - val geneArea_wrongStrandAmbig : scala.collection.mutable.Map[String,Int] = if(stranded && calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](geneArray.getValueSet); + lazy val geneArea_wrongStrandAmbig : scala.collection.mutable.Map[String,Int] = if(stranded && calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](geneArray.getValueSet); else scala.collection.mutable.Map[String,Int](); - val geneArea_wrongStrandAndIntron : scala.collection.mutable.Map[String,Int] = if(stranded && calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); + lazy val geneArea_wrongStrandAndIntron : scala.collection.mutable.Map[String,Int] = if(stranded && calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); else scala.collection.mutable.Map[String,Int](); - val geneArea_wrongStrandAndIntronAmbig : scala.collection.mutable.Map[String,Int] = if(stranded && calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); + lazy val geneArea_wrongStrandAndIntronAmbig : scala.collection.mutable.Map[String,Int] = if(stranded && calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](spanArray.getValueSet); else scala.collection.mutable.Map[String,Int](); //val geneArea_wrongStrandOrIntron : scala.collection.mutable.Map[String,Int] = if(stranded && calcDetailedGeneCounts) qcGtfAnnotationBuilder.initializeCounter[String](geneArea_intronsArray.getValueSet); // else scala.collection.mutable.Map[String,Int](); - val geneCounts_ambig : scala.collection.mutable.Map[String,Int] = qcGtfAnnotationBuilder.initializeCounter[String](geneArray.getValueSet); - val geneCounts_utr : scala.collection.mutable.Map[String,Int] = qcGtfAnnotationBuilder.initializeCounter[String](geneArray.getValueSet); + lazy val geneCounts_ambig : scala.collection.mutable.Map[String,Int] = qcGtfAnnotationBuilder.initializeCounter[String](geneArray.getValueSet); + lazy val geneCounts_utr : scala.collection.mutable.Map[String,Int] = qcGtfAnnotationBuilder.initializeCounter[String](geneArray.getValueSet); var readNoFeature : Int = 0; var readAmbiguous : Int = 0; diff --git a/src/HartleyUtils/src/main/scala/qcUtils/qcGtfAnnotationBuilder.scala b/src/HartleyUtils/src/main/scala/qcUtils/qcGtfAnnotationBuilder.scala index f4e86b6..87a0294 100644 --- a/src/HartleyUtils/src/main/scala/qcUtils/qcGtfAnnotationBuilder.scala +++ b/src/HartleyUtils/src/main/scala/qcUtils/qcGtfAnnotationBuilder.scala @@ -43,6 +43,8 @@ class qcGtfAnnotationBuilder(gtffile : String, flatgtffile : Option[String], str lazy val qcGetGeneCounts_spanArray : GenomicArrayOfSets[String] = qcGtfAnnotationBuilder.qcGetGeneCounts_geneArea_regions(stranded,gtffile, stdCodes).finalizeStepVectors; lazy val flatFeatureList : IndexedSeq[String] = qcGtfAnnotationBuilder.getFlatFeatureList(makeFlatReader, stranded, flatCodes); + lazy val geneMapStrict : Map[String,TreeSet[GenomicInterval]] = qcGtfAnnotationBuilder.helper_calculateGeneAssignmentMap_strict(geneArray, strandedGeneArray); + lazy val geneLengthMap : GenMap[String,Int] = qcGtfAnnotationBuilder.getGeneLengthMap(geneArray); lazy val flatExonArray : GenomicArrayOfSets[String] = qcGtfAnnotationBuilder.makeFlatExonMap(stranded,makeFlatReader, flatCodes).finalizeStepVectors; lazy val flatGeneSet : Set[String] = qcGtfAnnotationBuilder.makeFlatGeneSet(flatExonArray); @@ -121,6 +123,51 @@ object qcGtfAnnotationBuilder { /* * Misc other utils: */ + + private def helper_calculateGeneAssignmentMap_strict(stdGeneArray : GenomicArrayOfSets[String], strandedGeneArray : GenomicArrayOfSets[String]) : Map[String, TreeSet[GenomicInterval]] = { + + val badGeneSet = stdGeneArray.getSteps.foldLeft(Set[String]())( (soFar, curr) =>{ + val (iv, geneSet) = curr; + if(geneSet.size > 1){ + soFar ++ geneSet.toSet; + } else { + soFar; + } + }); + //val allGeneSet = geneArray.getSteps.foldLeft(Set[String]())( (soFar, curr) =>{ + // soFar ++ curr._2.toSet; + //}); + + reportln("helper_calculateGeneAssignmentMap_strict. Found: " + strandedGeneArray.getValueSet.size + " genes in the supplied annotation.","debug"); + reportln("helper_calculateGeneAssignmentMap_strict. Found: " + badGeneSet.size + " genes with ambiguous segments.","debug"); + + val buf = strandedGeneArray.getSteps.foldLeft( Map[String, TreeSet[GenomicInterval] ]() )( (soFar, curr) => { + val currIv = curr._1; + val currGeneSet = curr._2; + if(currGeneSet.size == 1){ + val currGene = currGeneSet.head; + if(! badGeneSet.contains(currGene)){ + soFar.get(currGene) match { + case Some(ts : TreeSet[GenomicInterval]) => soFar + ((currGene, ts + currIv )); + case None => soFar + ((currGene, TreeSet[GenomicInterval](currIv) )); + } + } else { + soFar; + } + } else { + soFar; + } + }) + reportln("helper_calculateGeneAssignmentMap_strict. Found: " + buf.size + " genes after first-pass filtering","debug"); + + return buf.filter( (curr) => { + val (geneID, geneTree) = curr; + if(geneTree.size == 0) false; + else { + geneTree.forall( (c) => geneTree.head.chromName == c.chromName && geneTree.head.strand == c.strand ); + } + }); + } private def getFlatFeatureList(makeFlatReader : (() => Iterator[FlatGtfLine]), stranded : Boolean, codes : GtfCodes) : IndexedSeq[String] = { makeFlatReader().foldLeft( IndexedSeq[String]() )( (soFar,gtfLine) =>{ diff --git a/src/HartleyUtils/src/main/scala/qcUtils/qcInnerDistance.scala b/src/HartleyUtils/src/main/scala/qcUtils/qcInnerDistance.scala index 45dad5f..856c315 100644 --- a/src/HartleyUtils/src/main/scala/qcUtils/qcInnerDistance.scala +++ b/src/HartleyUtils/src/main/scala/qcUtils/qcInnerDistance.scala @@ -480,6 +480,11 @@ class qcInnerDistance(annoHolder : qcGtfAnnotationBuilder, stranded : Boolean, f writer3.write("DROPPED_RevRead_Starts_Before_Fwd_And_Too_Many_Adaptor_Bases_Align_Limit_Is_"+qcInnerDistance.STAGGERED_ADAPTOR_ALIGNMENT_LIMIT+" " +insertSizeMap_noOverlap(i) + " "+insertSizeMap_staggeredOverlap(i) + " "+insertSizeMap_partialOverlap(i)+" "+insertSizeMap(i)+"\n"); //i = qcInnerDistance.STAGGERED_FULL_BLOCK_OF_ADAPTOR_ALIGNED; //writer3.write("RevRead_Starts_Before_Fwd_And_RevRead_Splices_Before_Overlapping "+" " +insertSizeMap_noOverlap(i) + " "+insertSizeMap_staggeredOverlap(i) + " "+insertSizeMap_partialOverlap(i)+" "+insertSizeMap(i)+"\n"); + + val negativeSizeSeq = (sizeSeqBad.toSet -- Set(qcInnerDistance.STAGGERED_NO_OVERLAP,qcInnerDistance.OVERLAP_CIGAR_MISMATCH_PARTIAL_OVERLAP, qcInnerDistance.STAGGERED_TOO_MUCH_ADAPTOR_ALIGNED)).toSeq.sorted.reverse; + for(i <- negativeSizeSeq){ + writer3.write(i+"\t" +insertSizeMap_noOverlap(i) + "\t"+insertSizeMap_staggeredOverlap(i) + "\t"+insertSizeMap_partialOverlap(i)+"\t"+insertSizeMap(i)+"\n"); + } close(writer3); val numGood = sizeSeq.map(i => insertSizeMap(i)).sum; diff --git a/src/HartleyUtils/src/main/scala/qcUtils/qcOverlapMatch.scala b/src/HartleyUtils/src/main/scala/qcUtils/qcOverlapMatch.scala index bbd7847..b983d0a 100644 --- a/src/HartleyUtils/src/main/scala/qcUtils/qcOverlapMatch.scala +++ b/src/HartleyUtils/src/main/scala/qcUtils/qcOverlapMatch.scala @@ -95,8 +95,8 @@ class qcOverlapMatch(readLen : Int, mismatchSamFile : Option[String], reportln("> Init OverlapMatch Utility","debug"); val samwriter : Option[WriterUtil] = if(mismatchSamFile.isEmpty) None else Some(openWriterSmart(mismatchSamFile.get)); - - val genomeSeq = if(genomeFa.isEmpty) null else new EfficientGenomeSeqContainer(genomeFa.get); + + val genomeSeq = if(genomeFa.isEmpty) null else buildEfficientGenomeSeqContainer(genomeFa.get); val bufferSize = genomeBufferSize; var ppoLengthMismatch = 0; diff --git a/src/HartleyUtils/src/main/scala/qcUtils/runAllQC.scala b/src/HartleyUtils/src/main/scala/qcUtils/runAllQC.scala index 660400a..1b21f85 100644 --- a/src/HartleyUtils/src/main/scala/qcUtils/runAllQC.scala +++ b/src/HartleyUtils/src/main/scala/qcUtils/runAllQC.scala @@ -464,7 +464,7 @@ object runAllQC { ) :: new UnaryArgument( name = "isRNASeq", arg = List("--RNA"), // name of value - argDesc = "Indicates that the data is RNA-Seq (the default: flag does nothing)."+ + argDesc = "Indicates that the data is RNA-Seq (this is the default: flag does nothing)."+ "" // description ) :: @@ -472,24 +472,27 @@ object runAllQC { name = "genomeFA", arg = List("--genomeFA"), valueName = "chr.fa.gz[,chr2.fa,...]", - argDesc = "Provides a genome fasta (or multiple fasta's) for the genome. "+ - "This is used by certain experimental sub-utilities. "+ + argDesc = "Reference genome sequence. This can either be a single FASTA file with all the chromosomes included, or a comma-delimited list of fasta files with 1 chromosome each. "+ + "Note: IF multiple fasta files are specificed, each must contain ONLY ONE chromosome. "+ + "If a single multi-chromosome fasta file is specificed, performance will be improved if the chromosomes are in the same order as they are found in the BAM file, "+ + "however, this is not required. "+ + "The genomic sequence is used by certain experimental sub-utilities (currently only the referenceMatch utility). "+ "Comma delimited, no spaces. "+ - "Fasta files can be gzipped or zipped."+ + "Fasta files can be in plaintext, gzipped or zipped."+ "" ) :: new BinaryArgument[Int]( name = "genomeBufferSize", arg = List("--genomeBufferSize"), valueName = "val", - argDesc = "The size of the genome fasta buffer. If this is too small, then errors may randomly occur."+ + argDesc = "The size of the genome fasta buffer. Tuning this parameter may improve performance."+ ""+ ""+ ""+ "", defaultValue = Some(10000) ) :: - + new BinaryArgument[String]( name = "outfilePrefix", arg = List("--outfilePrefix"), diff --git a/src/HartleyUtils/src/main/scala/runner/runner.scala b/src/HartleyUtils/src/main/scala/runner/runner.scala index 39d102c..d98a49d 100644 --- a/src/HartleyUtils/src/main/scala/runner/runner.scala +++ b/src/HartleyUtils/src/main/scala/runner/runner.scala @@ -9,9 +9,9 @@ import internalUtils.commandLineUI._; object runner { - final val QORTS_VERSION = "1.2.25"; // REPLACE_THIS_QORTS_VERSION_VARIABLE_WITH_VERSION_NUMBER (note this exact text is used in a search-and-replace. Do not change it.) - final val QORTS_COMPILE_DATE = "Sat Mar 4 13:22:59 EST 2017"; // REPLACE_THIS_QORTS_DATE_VARIABLE_WITH_DATE (note this exact text is used in a search-and-replace. Do not change it.) - final val QORTS_COMPILE_TIME : Long = 1488651779; // REPLACE_THIS_QORTS_DATE_VARIABLE_WITH_TIME (note this exact text is used in a search-and-replace. Do not change it.) + final val QORTS_VERSION = "1.2.26"; // REPLACE_THIS_QORTS_VERSION_VARIABLE_WITH_VERSION_NUMBER (note this exact text is used in a search-and-replace. Do not change it.) + final val QORTS_COMPILE_DATE = "Tue Mar 7 14:22:03 EST 2017"; // REPLACE_THIS_QORTS_DATE_VARIABLE_WITH_DATE (note this exact text is used in a search-and-replace. Do not change it.) + final val QORTS_COMPILE_TIME : Long = 1488914523; // REPLACE_THIS_QORTS_DATE_VARIABLE_WITH_TIME (note this exact text is used in a search-and-replace. Do not change it.) final val QORTS_MAJOR_VERSION = QORTS_VERSION.split("\\.")(0); final val QORTS_MINOR_VERSION = QORTS_VERSION.split("\\.")(1); diff --git a/src/QoRTs/DESCRIPTION b/src/QoRTs/DESCRIPTION index 9c3cec5..c81f51c 100644 --- a/src/QoRTs/DESCRIPTION +++ b/src/QoRTs/DESCRIPTION @@ -1,6 +1,6 @@ Package: QoRTs -Version: 1.2.25 -Date: 2017-03-04 +Version: 1.2.26 +Date: 2017-03-07 Title: Quality of RNA-seq Tool Authors@R: c(person("Stephen Hartley, PhD", "Developer", role = c("aut", "cre"), email = "QoRTs-contact@list.nih.gov")) diff --git a/src/QoRTs/vignettes/QoRTs-vignette.Rnw b/src/QoRTs/vignettes/QoRTs-vignette.Rnw index 7491ad4..3c78944 100644 --- a/src/QoRTs/vignettes/QoRTs-vignette.Rnw +++ b/src/QoRTs/vignettes/QoRTs-vignette.Rnw @@ -136,7 +136,7 @@ A UCSC browser session produced in this pipeline is also \href{https://genome.uc \emph{Optional:} \begin{itemize} - \item \emph{Reference Genome FASTA file}: In order to calculate reference mismatch rates, a genomic FASTA file is required. The chromosomes must be listed in the same order that they appear in the BAM file. In addition, the BAM file must be coordinate-sorted. + \item \emph{Reference Genome FASTA file}: In order to calculate reference mismatch rates, a genomic FASTA file is required. A list of single-chromosome FASTA files can also be provided. In addition, the BAM file must be coordinate-sorted. \item \emph{On-Target BED file}: For WES or similar, an on-target bed file can be supplied so that on-target stats can be calculated. \item \emph{Un-aligned FASTQ files}: In addition to processing the aligned BAM data, QoRTs can optionally be passed the raw unaligned FASTQ files as well. QoRTs will then calculate a number of QC metrics based on the raw fastq data. \end{itemize} @@ -206,7 +206,7 @@ Running in the default mode, QoRTs will accept both name-sorted and position-sor QoRTs also has a separate mode designed only for name-sorted samples, which can be activated using the "\verb|--nameSorted|" option. Under certain conditions this may improve speed and reduce memory usage. Under typical conditions any improvement is trivial. -\emph{In order to calculate the reference mismatch rate, the BAM files MUST be sorted by coordinate.} In addition, it is recommended that the genome fasta file be sorted in the same chromosome ordering as the BAM files. +\emph{In order to calculate the reference mismatch rate, the BAM files MUST be sorted by coordinate.} %----------------------------------------------------------- \section{Quick Start} \label{sec:quickstart} @@ -222,7 +222,7 @@ java -Xmx4G -jar /path/to/jarfile/QoRTs.jar QC \ /qc/data/dir/path/ \end{verbatim} -Note: This command must be executed as a single line. Additional options may be required depending on the nature of the dataset. For single-ended data, the \verb|--singleEnded| option should be included. For strand-specific data, the \verb|--stranded| option should be included, and possibly also the \verb|--fr_secondStrand| option depending on the stranded library type. For position-sorted data, the \verb|--coordSorted| option should be used. See \ref{sec:initialProcessing}, or the QC utility documentation\footnote{Found \href{http://hartleys.github.io/QoRTs/jarHtml/QC.html}{here}.} for more information on the available options. +Note: This command must be executed as a single line. Additional options may be required depending on the nature of the dataset. For single-ended data, the \verb|--singleEnded| option should be included. For strand-specific data, the \verb|--stranded| option should be included, and possibly also the \verb|--fr_secondStrand| option depending on the stranded library type. In order to calculate the reference mismatch rate the BAM file must be coordinate sorted, otherwise it can be sorted by either coordinate or name. See \ref{sec:initialProcessing}, or the QC utility documentation\footnote{Found \href{http://hartleys.github.io/QoRTs/jarHtml/QC.html}{here}.} for more information on the available options. If the genome fasta file and the raw unaligned fastq files are available, use the additional parameters: