Merge pull request #24 from N-Hoffmann/stringtie

Add gffcompare class codes to final gtf; reformat code
IGDRion · Aug 19, 2024 · 2d66137 · 2d66137
2 parents 9557f8d + c4f93cc
commit 2d66137
Show file tree

Hide file tree

Showing 18 changed files with 162 additions and 96 deletions.
diff --git a/bin/class_code.R b/bin/class_code.R
@@ -7,13 +7,14 @@ library(rtracklayer)
 args = commandArgs(trailingOnly=TRUE)
 class_code_gtf = args[1]
 extended_annotation = args[2]
+output = args[3]
 
 #############################################################################
 # Read both GTF and assign gffcompare class_code to extended annotation
 #############################################################################
-file1 <- readGFF(class_code_gtf)
-file2 <- readGFF(extended_annotation)
+cc_gtf <- readGFF(class_code_gtf)
+ext_anno <- readGFF(extended_annotation)
 
-file2$class_code <- file1$class_code[match(paste(file2$transcript_id, file2$type), paste(file1$transcript_id, file1$type))]
+ext_anno$class_code <- cc_gtf$class_code[match(paste(ext_anno$transcript_id, ext_anno$type), paste(cc_gtf$transcript_id, cc_gtf$type))]
 
-export(file2, 'extended_annotation_class_code.gtf')
+export(ext_anno, output)
diff --git a/bin/validate_gtf.py b/bin/validate_gtf.py
@@ -16,6 +16,11 @@
     #######################################################
 
     for record in GTF.parse_by_line(args.gtf):
+        if record.feature =="transcript":
+            if "gene_biotype" in record:
+                g_biotype = record["gene_biotype"]
+            elif "gene_type" in record:
+                g_biotype = record["gene_type"]
         if record.feature == "gene" or record.feature == "transcript":
             continue
 
@@ -39,9 +44,8 @@
 
         #######################################################
         # Check if gene_biotype in each transcripts and exons
-        if not "gene_biotype" in record:
-            record["gene_biotype"] = "NA"
-        g_biotype = record["gene_biotype"]
+        if not "gene_biotype" in record and "gene_type" not in record:
+            record["gene_biotype"] = g_biotype
 
         # Check for RefSeq gene_biotype format
         if g_biotype == "mRNA":

diff --git a/main.nf b/main.nf
@@ -36,7 +36,6 @@ include { INDEX_BAM                      } from './modules/index_bam.nf'
 include { BAMBU                          } from './modules/bambu/bambu.nf'
 include { STRINGTIE                      } from './modules/stringtie/stringtie_workflow.nf'
 include { GFFCOMPARE                     } from './modules/gffcompare/gffcompare.nf'
-include { ADD_CLASS_CODE                 } from './modules/add_class_code.nf'
 include { SPLIT_EXTENDED_ANNOTATION      } from './modules/split_extended_annotation.nf'
 include { FEELNC_CODPOT                  } from './modules/feelnc/codpot.nf'
 include { FEELNC_FORMAT                  } from './modules/feelnc/format.nf'
@@ -45,6 +44,7 @@ include { MERGE_NOVEL                    } from './modules/merge_novel.nf'
 include { TRANSDECODER                   } from './modules/transdecoder/transdecoder_workflow.nf'
 include { TFKMERS                        } from './modules/transforkmers/workflow.nf'
 include { QC as QC_FULL; QC as QC_FILTER } from './modules/qc/workflow.nf'
+include { ADD_CLASS_CODE                 } from './modules/add_class_code.nf'
 
 workflow {
   ///////////////////////////////////////////////////////////////////////////
@@ -66,17 +66,16 @@ workflow {
   ///////////////////////////////////////////////////////////////////////////
   if(params.tx_discovery == "bambu") {
     BAMBU(samples.collect(), VALIDATE_INPUT_GTF.out, ref_fa)
-    GFFCOMPARE(input_gtf, ref_fa, BAMBU.out.bambu_gtf)
-    ADD_CLASS_CODE(GFFCOMPARE.out.class_code_gtf, BAMBU.out.bambu_gtf)
+    GFFCOMPARE(VALIDATE_INPUT_GTF.out, ref_fa, BAMBU.out.bambu_gtf)
+    SPLIT_EXTENDED_ANNOTATION(BAMBU.out.bambu_gtf)
   }
   else if (params.tx_discovery == "stringtie2") {
     STRINGTIE(samples, VALIDATE_INPUT_GTF.out, ref_fa)
-    ADD_CLASS_CODE(STRINGTIE.out.class_code_gtf, STRINGTIE.out.stringtie_gtf)
+    SPLIT_EXTENDED_ANNOTATION(STRINGTIE.out.stringtie_gtf)
   }
 
-  SPLIT_EXTENDED_ANNOTATION(ADD_CLASS_CODE.out.extended_annotation_class_code)
   ///////////////////////////////////////////////////////////////////////////
-  // EXTRACT AND CLASSIFY NEW TRANSCRIPTS, AND PERFORM QC
+  // EXTRACT AND CLASSIFY NEW TRANSCRIPTS
   ///////////////////////////////////////////////////////////////////////////
   FEELNC_CODPOT(VALIDATE_INPUT_GTF.out, ref_fa, SPLIT_EXTENDED_ANNOTATION.out.novel_genes)
   FEELNC_FORMAT(FEELNC_CODPOT.out.mRNA, FEELNC_CODPOT.out.lncRNA)
@@ -87,30 +86,35 @@ workflow {
     ch_gene_counts = BAMBU.out.gene_counts
     ch_tx_counts = BAMBU.out.tx_counts
     ch_ndr = BAMBU.out.ndr
+    class_code = GFFCOMPARE.out.class_code_gtf
   }
   else if (params.tx_discovery == "stringtie2") {
     ch_gene_counts = STRINGTIE.out.gene_counts
     ch_tx_counts = STRINGTIE.out.tx_counts
     ch_ndr = STRINGTIE.out.ndr
+    class_code = STRINGTIE.out.class_code_gtf
   }
 
+  ///////////////////////////////////////////////////////////////////////////
+  // PREDICT CDS ON NOVEL TRANSCRIPTS
+  ///////////////////////////////////////////////////////////////////////////
+  TRANSDECODER(MERGE_NOVEL.out.novel_full_gtf, ref_fa)
+
+  ///////////////////////////////////////////////////////////////////////////
+  // PERFORM QC ON FULL ANNOTATION
+  ///////////////////////////////////////////////////////////////////////////
   QC_FULL(samples, 
           INDEX_BAM.out, 
-          MERGE_NOVEL.out, 
+          TRANSDECODER.out, 
           VALIDATE_INPUT_GTF.out, 
           ch_gene_counts,
           "full")
 
-  ///////////////////////////////////////////////////////////////////////////
-  // PREDICT CDS ON NOVEL TRANSCRIPTS AND MERGE WITH NOVEL ANNOTATION
-  ///////////////////////////////////////////////////////////////////////////
-  TRANSDECODER(MERGE_NOVEL.out.novel_full_gtf, ref_fa)
-
   ///////////////////////////////////////////////////////////////////////////
   // FILTER NEW TRANSCRIPTS, AND QC ON FILTERED ANNOTATION
   ///////////////////////////////////////////////////////////////////////////
   if(params.filter) {
-    TFKMERS(TRANSDECODER.out.cds_gtf, 
+    TFKMERS(TRANSDECODER.out, 
             ref_fa, 
             ch_ndr, 
             tokenizer, 
@@ -120,7 +124,16 @@ workflow {
               INDEX_BAM.out, 
               TFKMERS.out.gtf, 
               VALIDATE_INPUT_GTF.out, 
-              ch_gene_counts, 
+              ch_gene_counts,
               "filter")
   }
+
+  ///////////////////////////////////////////////////////////////////////////
+  // ADD GFFCOMPARE CLASS CODES TO FINAL GTFS
+  ///////////////////////////////////////////////////////////////////////////
+  final_gtf = TRANSDECODER.out.gtf.mix(QC_FULL.out.gtf)
+  if (params.filter){
+  final_gtf = TRANSDECODER.out.gtf.mix(QC_FULL.out.gtf,TFKMERS.out.gtf,QC_FILTER.out.gtf)
+  }
+  ADD_CLASS_CODE(class_code, final_gtf)
 }
diff --git a/modules/add_class_code.nf b/modules/add_class_code.nf
@@ -1,16 +1,20 @@
 process ADD_CLASS_CODE {
   conda (params.enable_conda ? "$baseDir/environment.yml" : null)
   container "ghcr.io/igdrion/annexa:${workflow.revision? workflow.revision: "main"}"
+  publishDir "$params.outdir/final", mode: 'copy', saveAs: {filename -> "${gtf}"}, overwrite: true
+  tag "$gtf"
 
   input:
   file class_code_gtf
-  file extended_annotation
+  file gtf
 
   output:
-  path 'extended_annotation_class_code.gtf', emit: extended_annotation_class_code
+  path "class_code.${gtf}"
 
   script:
-  """
-  class_code.R ${class_code_gtf} ${extended_annotation}
+  """  
+  class_code.R ${class_code_gtf} ${gtf} "class_code.${gtf}"
+  ## Remove header created by gtfsort
+  sed -i 1,3d "class_code.${gtf}"
   """
 }
diff --git a/modules/gffcompare/gffcompare.nf b/modules/gffcompare/gffcompare.nf
@@ -3,64 +3,22 @@ process GFFCOMPARE {
   container "${ workflow.containerEngine == 'singularity' ?
       'https://depot.galaxyproject.org/singularity/gffcompare:0.12.6--h9f5acd7_0' :
       'biocontainers/gffcompare:0.12.6--h9f5acd7_0' }"
-  publishDir "$params.outdir/stringtie2", mode: 'copy', pattern: 'extended_annotations.gtf'
   cpus params.maxCpu
 
-  input:
     input:
     path reference_gtf
     path fasta
     path merged_gtf
 
     output:
     path("*.annotated.gtf"), emit: class_code_gtf
-    path("*.loci")         , emit: loci
-    path("*.stats")        , emit: stats
-    path("*.tracking")     , emit: tracking
-    path("extended_annotations.gtf"), emit: stringtie_gtf, optional: true
-
-    shell:
-    if (params.tx_discovery == "bambu")
-    '''
-    gffcompare \
-    -r !{reference_gtf} \
-    -s !{fasta} \
-    !{merged_gtf}
-    '''
-
-    else if (params.tx_discovery == "stringtie2")
-    '''
+    path("gffcmp.tracking"), emit: tracking_file
+
+    script:
+    """
     gffcompare \
-    -r !{reference_gtf} \
-    -s !{fasta} \
-    !{merged_gtf}
-
-    # Add information for stringtie2 process
-    # Reformat the output of gffcompare to correctly match novel isoforms to known genes
-    # Takes the transcript_id identified by Stringtie and assigns it to reference gene_id
-
-    awk 'BEGIN{
-        while(getline<"gffcmp.tracking">0){
-            if ($4 !="u" && $4 !="r"){
-                split($3,gn,"|");
-                split($5,tx,"|"); 
-                final["\\""tx[2]"\\";"]="\\""gn[1]"\\";"
-                }
-            }
-    } {
-        if ($12 in final){
-            $10=final[$12]; print $0} else {print $0}
-    }' !{merged_gtf} | gtf2gtf_cleanall.sh  > extended_annotations_preaclean.gtf
-
-    # Match correct ref_gene_id to gene_id to some overlapping genes in the reference annotation
-
-    awk '{if ($3 == "transcript" && $13=="ref_gene_id" && $10!=$14) {
-        $10 = $14;
-        print $0
-    } else if ($3 == "exon" && $15=="ref_gene_id" && $10!=$16) {
-        $10 = $16;
-        print $0
-    } else {print $0}
-    }' extended_annotations_preaclean.gtf | gtf2gtf_cleanall.sh > extended_annotations.gtf
-    '''
+    -r ${reference_gtf} \
+    -s ${fasta} \
+    ${merged_gtf}
+    """
 }
diff --git a/modules/index_bam.nf b/modules/index_bam.nf
@@ -3,6 +3,7 @@ process INDEX_BAM {
   container "${ workflow.containerEngine == 'singularity' ? 
                 'https://depot.galaxyproject.org/singularity/samtools%3A1.16.1--h6899075_0' : 
                 'quay.io/biocontainers/samtools:1.16.1--h1170115_0' }"
+  tag "$bam"
 
   input:
   file bam

diff --git a/modules/qc/merge_known_novel.nf b/modules/qc/merge_known_novel.nf
@@ -11,7 +11,7 @@ process MERGE_ANNOTATIONS {
   val origin
 
   output:
-  path "extended_annotations.${origin}.gtf"
+  path "extended_annotations.${origin}.gtf", emit: gtf
 
   """
   cat ${novel} ${ref} | GTF.py format > extended_annotations.${origin}.gtf

diff --git a/modules/qc/workflow.nf b/modules/qc/workflow.nf
@@ -30,4 +30,7 @@ workflow QC {
         origin
       )
     }
+
+    emit:
+      gtf = MERGE_ANNOTATIONS.out.gtf
 }
diff --git a/modules/stringtie/stringtie_format_gffcompare.nf b/modules/stringtie/stringtie_format_gffcompare.nf
@@ -0,0 +1,44 @@
+process FORMAT_GFFCOMPARE {
+  publishDir "$params.outdir/stringtie2", pattern: 'extended_annotations.gtf', mode: 'copy'
+
+  input:
+  path merged_gtf
+  path tracking_file
+
+  output:
+  path("extended_annotations.gtf"), emit: stringtie_gtf
+
+  shell:
+  '''
+  # Add information for stringtie2 process
+  # Reformat the output of gffcompare to correctly match novel isoforms to known genes
+  # Takes the transcript_id identified by Stringtie and assigns it to reference gene_id
+
+  awk 'BEGIN{
+      while(getline<"!{tracking_file}">0){
+          if ($4 !="u" && $4 !="r"){
+              split($3,gn,"|");
+              split($5,tx,"|"); 
+              final["\\""tx[2]"\\";"]="\\""gn[1]"\\";"
+              }
+          }
+  } {
+      if ($12 in final){
+          $10=final[$12]; print $0} else {print $0}
+  }' !{merged_gtf} | gtf2gtf_cleanall.sh  > extended_annotations_preaclean.gtf
+
+  # Match correct ref_gene_id to gene_id to some overlapping genes in the reference annotation
+
+  awk '{if ($3 == "transcript" && $13=="ref_gene_id" && $10!=$14) {
+      $10 = $14;
+      print $0
+  } else if ($3 == "exon" && $15=="ref_gene_id" && $10!=$16) {
+      $10 = $16;
+      print $0
+  } else {print $0}
+  }' extended_annotations_preaclean.gtf | gtf2gtf_cleanall.sh > extended_annotations.gtf
+
+  # Remove header lines (command and version)
+  sed -i 1,2d extended_annotations.gtf
+  '''
+}
diff --git a/modules/stringtie/stringtie_merge_counts.nf b/modules/stringtie/stringtie_merge_counts.nf
@@ -1,5 +1,8 @@
 process MERGE_COUNTS {
   publishDir "$params.outdir/stringtie2", mode: 'copy', pattern: '*.txt'
+  if (params.filter == false){
+    publishDir "$params.outdir/final", mode: 'copy', pattern: 'counts_transcript.txt', saveAs: {filename -> 'counts_transcript.full.gtf'}
+  }
 
   input:
   path gene_counts
@@ -12,20 +15,34 @@ process MERGE_COUNTS {
 
   shell:
   '''
+  # Sort list of input files alphanumerically
+  gene_counts=$(echo !{gene_counts} | tr ' ' '\\n' | sort | xargs)
+  tx_counts=$(echo !{tx_counts} | tr ' ' '\\n' | sort | xargs)
+  
   # Merge the individual outputs of featurecount of each .bam into a single file
 
-  paste !{gene_counts} \
+  paste \${gene_counts} \
   | awk '{printf("%s ",$1); for (i=2;i<=NF;i+=2){printf("%s ",$i)}print "\\n"}' \
   | grep -v -e "^$"  \
   | awk -v OFS='\\t' '{$1=$1}1' > counts_gene.txt
 
-  paste !{tx_counts} \
+  paste \${tx_counts} \
   | awk '{printf("\\n%s %s ",$1,$2); for (i=3;i<=NF;i+=3){printf("%s ",$i)}}' \
   | grep -v -e "^$"  \
   | awk -v OFS='\\t' '{$1=$1}1' > counts_transcript.txt
 
+  # Rename first line
+  sed -i '1s/^Geneid/transcript_id/' counts_transcript.txt
+
+  # Sort genes and tx rows alphanumerically
+  (head -n 1 counts_transcript.txt && tail -n +2 counts_transcript.txt | sort -k1,1) > counts_transcript.txt.new \
+  && mv counts_transcript.txt.new counts_transcript.txt
+
+  (head -n 1 counts_gene.txt && tail -n +2 counts_gene.txt | sort -k1,1) > counts_gene.txt.new \
+  && mv counts_gene.txt.new counts_gene.txt
+
   # Create empty NDR file for TFKMERS to have same workflow as bambu in filtering step 
   # (placeholder, not used)
   touch empty.ndr
   '''
-}
+}
diff --git a/modules/stringtie/stringtie_quant.nf b/modules/stringtie/stringtie_quant.nf
@@ -4,18 +4,14 @@ process STRINGTIE_QUANTIFY {
                 'https://depot.galaxyproject.org/singularity/stringtie%3A2.2.3--h43eeafb_0' :
                 'quay.io/biocontainers/stringtie:2.2.3--h43eeafb_0'}"
   cpus params.maxCpu
+  tag 'bam'
 
   input:
-  path '*'
+  path bam
   path merged_gtf
 
   output:
   path 'stringtie_quant.gtf', emit: stringtie_quant_qtf
-  path 'e_data.ctab'
-  path 'e2t.ctab'
-  path 'i_data.ctab'
-  path 'i2t.ctab'
-  path 't_data.ctab'
 
   script:
   """
@@ -25,6 +21,6 @@ process STRINGTIE_QUANTIFY {
     -p ${params.maxCpu} \
     -G ${merged_gtf} \
     -o stringtie_quant.gtf \
-    *.bam
+    ${bam}
   """
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -30,4 +30,7 @@ workflow QC { @@
             origin
           )
         }
+        emit:
+          gtf = MERGE_ANNOTATIONS.out.gtf
     }