From 8d6d7fcda6f1c32bc1add047489195c85985b492 Mon Sep 17 00:00:00 2001 From: Derek Barnett Date: Thu, 30 Jun 2016 11:36:08 -0500 Subject: [PATCH 1/2] added list-tag support to bamtools split --- src/toolkit/bamtools_split.cpp | 137 ++++++++++++++++++++++++++++----- 1 file changed, 118 insertions(+), 19 deletions(-) diff --git a/src/toolkit/bamtools_split.cpp b/src/toolkit/bamtools_split.cpp index 2e25627c..5dc8f122 100644 --- a/src/toolkit/bamtools_split.cpp +++ b/src/toolkit/bamtools_split.cpp @@ -72,6 +72,7 @@ struct SplitTool::SplitSettings { bool HasCustomOutputStub; bool HasCustomRefPrefix; bool HasCustomTagPrefix; + bool HasListTagDelimiter; bool IsSplittingMapped; bool IsSplittingPaired; bool IsSplittingReference; @@ -83,6 +84,7 @@ struct SplitTool::SplitSettings { string CustomTagPrefix; string InputFilename; string TagToSplit; + string ListTagDelimiter; // constructor SplitSettings(void) @@ -90,6 +92,7 @@ struct SplitTool::SplitSettings { , HasCustomOutputStub(false) , HasCustomRefPrefix(false) , HasCustomTagPrefix(false) + , HasListTagDelimiter(false) , IsSplittingMapped(false) , IsSplittingPaired(false) , IsSplittingReference(false) @@ -99,6 +102,7 @@ struct SplitTool::SplitSettings { , CustomTagPrefix("") , InputFilename(Options::StandardIn()) , TagToSplit("") + , ListTagDelimiter("--") { } }; @@ -139,8 +143,14 @@ class SplitTool::SplitToolPrivate { // finds first alignment and calls corresponding SplitTagImpl<> // depending on tag type bool SplitTag(void); - // templated split tag implementation - // handle the various types that are possible for tags + + public: + + // handles list-type tags + template + bool SplitListTagImpl(BamAlignment& al); + + // handles single-value tags template bool SplitTagImpl(BamAlignment& al); @@ -383,27 +393,37 @@ bool SplitTool::SplitToolPrivate::SplitTag(void) { // pass it the current alignment found switch ( tagType ) { - case (Constants::BAM_TAG_TYPE_INT8) : - case (Constants::BAM_TAG_TYPE_INT16) : - case (Constants::BAM_TAG_TYPE_INT32) : - return SplitTagImpl(al); - - case (Constants::BAM_TAG_TYPE_UINT8) : - case (Constants::BAM_TAG_TYPE_UINT16) : - case (Constants::BAM_TAG_TYPE_UINT32) : - return SplitTagImpl(al); - - case (Constants::BAM_TAG_TYPE_FLOAT) : - return SplitTagImpl(al); + case (Constants::BAM_TAG_TYPE_INT8) : return SplitTagImpl(al); + case (Constants::BAM_TAG_TYPE_INT16) : return SplitTagImpl(al); + case (Constants::BAM_TAG_TYPE_INT32) : return SplitTagImpl(al); + case (Constants::BAM_TAG_TYPE_UINT8) : return SplitTagImpl(al); + case (Constants::BAM_TAG_TYPE_UINT16) : return SplitTagImpl(al); + case (Constants::BAM_TAG_TYPE_UINT32) : return SplitTagImpl(al); + case (Constants::BAM_TAG_TYPE_FLOAT) : return SplitTagImpl(al); case (Constants::BAM_TAG_TYPE_ASCII) : case (Constants::BAM_TAG_TYPE_STRING) : case (Constants::BAM_TAG_TYPE_HEX) : return SplitTagImpl(al); - case (Constants::BAM_TAG_TYPE_ARRAY) : - cerr << "bamtools split ERROR: array tag types are not supported" << endl; - return false; + case (Constants::BAM_TAG_TYPE_ARRAY) : { + + char arrayTagType(0); + if (!al.GetArrayTagType(m_settings->TagToSplit, arrayTagType)) + continue; + switch(arrayTagType) { + case (Constants::BAM_TAG_TYPE_INT8) : return SplitListTagImpl(al); + case (Constants::BAM_TAG_TYPE_INT16) : return SplitListTagImpl(al); + case (Constants::BAM_TAG_TYPE_INT32) : return SplitListTagImpl(al); + case (Constants::BAM_TAG_TYPE_UINT8) : return SplitListTagImpl(al); + case (Constants::BAM_TAG_TYPE_UINT16) : return SplitListTagImpl(al); + case (Constants::BAM_TAG_TYPE_UINT32) : return SplitListTagImpl(al); + case (Constants::BAM_TAG_TYPE_FLOAT) : return SplitListTagImpl(al); + default: + cerr << "bamtools split ERROR: array tag has unsupported element type: " << arrayTagType << endl; + return false; + } + } default: cerr << "bamtools split ERROR: unknown tag type encountered: " << tagType << endl; @@ -447,7 +467,83 @@ void SplitTool::SplitToolPrivate::CloseWriters(map& writers) { writers.clear(); } -// handle the various types that are possible for tags +// handle list-type tags +template +bool SplitTool::SplitToolPrivate::SplitListTagImpl(BamAlignment& al) { + + typedef T TagElementType; + typedef vector TagValueType; + typedef map WriterMap; + typedef typename WriterMap::iterator WriterMapIterator; + + // set up splitting data structure + WriterMap outputFiles; + WriterMapIterator writerIter; + + // determine tag prefix + string tagPrefix = SPLIT_TAG_TOKEN; + if ( m_settings->HasCustomTagPrefix ) + tagPrefix = m_settings->CustomTagPrefix; + + // make sure prefix starts with '.' + const size_t dotFound = tagPrefix.find('.'); + if ( dotFound != 0 ) + tagPrefix = string(".") + tagPrefix; + + const string tag = m_settings->TagToSplit; + BamWriter* writer; + TagValueType currentValue; + while (m_reader.GetNextAlignment(al)) { + + string listTagLabel; + if (!al.GetTag(tag, currentValue)) + listTagLabel = "none"; + else { + // make list label from tag data + stringstream listTagLabelStream; + typename TagValueType::const_iterator tagValueIter = currentValue.cbegin(); + typename TagValueType::const_iterator tagValueEnd = currentValue.cend(); + for (; tagValueIter != tagValueEnd; ++tagValueIter) + listTagLabelStream << (*tagValueIter) << m_settings->ListTagDelimiter; + listTagLabel = listTagLabelStream.str(); + if (!listTagLabel.empty()) + listTagLabel = listTagLabel.substr(0, listTagLabel.size() - m_settings->ListTagDelimiter.size()); // pop last delimiter + } + + // lookup writer for label + writerIter = outputFiles.find(listTagLabel); + + // if not found, create one + if (writerIter == outputFiles.cend()) { + + // open new BamWriter, save first alignment + stringstream outputFilenameStream; + outputFilenameStream << m_outputFilenameStub << tagPrefix << tag << "_" << listTagLabel << ".bam"; + writer = new BamWriter; + if ( !writer->Open(outputFilenameStream.str(), m_header, m_references) ) { + cerr << "bamtools split ERROR: could not open " << outputFilenameStream.str() + << " for writing." << endl; + return false; + } + + // store in map + outputFiles.insert( make_pair(listTagLabel, writer) ); + } + + // else grab existing writer + else writer = (*writerIter).second; + + // store alignment in proper BAM output file + if ( writer ) + writer->SaveAlignment(al); + } + + // clean up & return success + CloseWriters(outputFiles); + return true; +} + +// handle the single-value tags template bool SplitTool::SplitToolPrivate::SplitTagImpl(BamAlignment& al) { @@ -554,13 +650,16 @@ SplitTool::SplitTool(void) // set up options OptionGroup* IO_Opts = Options::CreateOptionGroup("Input & Output"); - Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn()); + Options::AddValueOption("-in", "BAM filename", "the input BAM file", "", + m_settings->HasInputFilename, m_settings->InputFilename, IO_Opts, Options::StandardIn()); Options::AddValueOption("-refPrefix", "string", "custom prefix for splitting by references. Currently files end with REF_.bam. This option allows you to replace \"REF_\" with a prefix of your choosing.", "", m_settings->HasCustomRefPrefix, m_settings->CustomRefPrefix, IO_Opts); Options::AddValueOption("-tagPrefix", "string", "custom prefix for splitting by tags. Current files end with TAG__.bam. This option allows you to replace \"TAG_\" with a prefix of your choosing.", "", m_settings->HasCustomTagPrefix, m_settings->CustomTagPrefix, IO_Opts); Options::AddValueOption("-stub", "filename stub", "prefix stub for output BAM files (default behavior is to use input filename, without .bam extension, as stub). If input is stdin and no stub provided, a timestamp is generated as the stub.", "", m_settings->HasCustomOutputStub, m_settings->CustomOutputStub, IO_Opts); + Options::AddValueOption("-tagListDelim", "string", "delimiter used to separate values in the filenames generated from splitting on list-type tags [--]", "", + m_settings->HasListTagDelimiter, m_settings->ListTagDelimiter, IO_Opts); OptionGroup* SplitOpts = Options::CreateOptionGroup("Split Options"); Options::AddOption("-mapped", "split mapped/unmapped alignments", m_settings->IsSplittingMapped, SplitOpts); From 51ba207654e4d298321163976bcbbbfdc3e27788 Mon Sep 17 00:00:00 2001 From: Derek Barnett Date: Thu, 30 Jun 2016 11:43:47 -0500 Subject: [PATCH 2/2] version bump -> 2.4.1 --- CMakeLists.txt | 2 +- docs/Doxyfile | 2 +- src/api/CMakeLists.txt | 2 +- src/toolkit/CMakeLists.txt | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8e115356..e1f2aa74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ ensure_out_of_source_build( " # set BamTools version information set( BamTools_VERSION_MAJOR 2 ) set( BamTools_VERSION_MINOR 4 ) -set( BamTools_VERSION_BUILD 0 ) +set( BamTools_VERSION_BUILD 1 ) # set our library and executable destination dirs set( EXECUTABLE_OUTPUT_PATH "${CMAKE_SOURCE_DIR}/bin" ) diff --git a/docs/Doxyfile b/docs/Doxyfile index c89c5a58..6c8f51be 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -31,7 +31,7 @@ PROJECT_NAME = BamTools # This could be handy for archiving the generated documentation or # if some version control system is used. -PROJECT_NUMBER = 2.4.0 +PROJECT_NUMBER = 2.4.1 # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) # base path where the generated documentation will be put. diff --git a/src/api/CMakeLists.txt b/src/api/CMakeLists.txt index 32edcff1..4d5838b8 100644 --- a/src/api/CMakeLists.txt +++ b/src/api/CMakeLists.txt @@ -34,7 +34,7 @@ set( BamToolsAPISources # create main BamTools API shared library add_library( BamTools SHARED ${BamToolsAPISources} ) set_target_properties( BamTools PROPERTIES - SOVERSION "2.4.0" + SOVERSION "2.4.1" OUTPUT_NAME "bamtools" ) # create main BamTools API static library diff --git a/src/toolkit/CMakeLists.txt b/src/toolkit/CMakeLists.txt index c781148e..cc76b35b 100644 --- a/src/toolkit/CMakeLists.txt +++ b/src/toolkit/CMakeLists.txt @@ -31,7 +31,7 @@ add_executable( bamtools_cmd # set BamTools application properties set_target_properties( bamtools_cmd PROPERTIES - VERSION 2.4.0 + VERSION 2.4.1 OUTPUT_NAME "bamtools" ) # make version info available in application