Merge pull request #28 from N-Hoffmann/stringtie

Add restranding process
IGDRion · Oct 31, 2024 · 314a64e · 314a64e
2 parents c6cfde2 + 7212235
commit 314a64e
Show file tree

Hide file tree

Showing 22 changed files with 548 additions and 10 deletions.
diff --git a/assets/metro_map.drawio b/assets/metro_map.drawio
diff --git a/bin/class_code.R b/bin/class_code.R
@@ -1,5 +1,6 @@
 #! /usr/bin/env Rscript
 library(rtracklayer)
+library(dplyr)
 
 #############################################################################
 # CLI Args
@@ -15,6 +16,10 @@ output = args[3]
 cc_gtf <- readGFF(class_code_gtf)
 ext_anno <- readGFF(extended_annotation)
 
+# Add class_code
 ext_anno$class_code <- cc_gtf$class_code[match(paste(ext_anno$transcript_id, ext_anno$type), paste(cc_gtf$transcript_id, cc_gtf$type))]
 
+# Change order of transcript_id column
+ext_anno <- ext_anno %>% relocate(transcript_id, .after = gene_id)
+
 export(ext_anno, output)
diff --git a/bin/qc.R b/bin/qc.R
@@ -8,6 +8,7 @@ library(grid)
 library(gridExtra)
 library(viridis)
 library(ggpattern)
+library(stringr)
 
 brew = c("#008387", "#a0d3db", "#ad8500", "#d9cb98")
 palette = c("#00AFBB", "#FC4E07", "#E7B800")
@@ -395,6 +396,14 @@ cover <- textGrob("ANNEXA report",
                   gp = gpar(fontsize = 40,
                             col = "black"))
 
+sub_cover_text <- paste(
+  str_wrap(paste("Reference genome:", genome_ref), width = 40),
+  str_wrap(paste("Reference annotation:", gtf_ref), width = 40),
+  "\n",
+  print(Sys.Date()),
+  paste("ANNEXA version:", ANNEXA_version),
+  sep = "\n")
+
 sub_cover_text <- paste(
   print(Sys.Date()), 
   paste("ANNEXA version:", ANNEXA_version), sep = "\n")

diff --git a/bin/restrand_isoforms.R b/bin/restrand_isoforms.R
@@ -0,0 +1,72 @@
+#! /usr/bin/env Rscript
+
+library(rtracklayer)
+library(dplyr)
+library(stringr)
+
+#############################################
+# CLI Args
+args = commandArgs(trailingOnly=TRUE)
+gtf_file = args[1]
+tx_tool = args[2]
+output = args[3]
+
+# Define prefix to identify novel transcripts
+if (tx_tool == "stringtie2") {
+  prefix = "MSTRG"
+} else{
+    prefix = "Bambu"
+}
+
+gtf <- rtracklayer::readGFF(gtf_file)
+
+# Restrand isoforms of known genes ----------------------------------------
+
+# Check if gtf already has ref_gene_id feature (Stringtie), if not (Bambu), create it based on gene_id column
+if (!"ref_gene_id" %in% colnames(gtf)){
+  gtf$ref_gene_id <- ifelse(!grepl("Bambu|MSTRG", gtf$gene_id), 
+                                  gtf$gene_id, 
+                                  NA)
+}
+
+# Get strand of reference genes
+ref_strand <- unique(gtf %>% filter(!is.na(ref_gene_id)) %>% select(ref_gene_id, strand))
+# Find novel isoforms
+to_change <- gtf %>% filter(str_detect(transcript_id, prefix) & !str_detect(gene_id, prefix))
+# Match and apply new strand (if different) to novel isoforms
+matches <- match(to_change$gene_id, ref_strand$ref_gene_id)
+to_change$strand <- ref_strand$strand[matches]
+
+# Change strand of novel isoforms in gtf dataframe
+gtf <- gtf %>%
+  mutate(strand = case_when(
+    transcript_id %in% to_change$transcript_id ~ to_change$strand[match(transcript_id, to_change$transcript_id)],
+    TRUE ~ strand
+  ))
+
+# Restrand novel tx of novel genes ----------------------------------------
+
+# Look for novel tx of novel genes
+novel <- gtf %>% filter(str_detect(transcript_id, prefix) & str_detect(gene_id, prefix))
+
+# Restrand novel transcripts
+# If all isoforms are unstraded, keep unstranded
+# If some are stranded, use strand as new strand
+# If all three strands are there, use majority rule
+new_gene_strands <- novel %>%
+  group_by(gene_id) %>%
+  summarize(gene_strand = if(all(strand == "*")) "*" 
+            else {
+              strands <- table(strand[strand != "*"])
+              if(length(strands) > 1 && max(strands) == min(strands)) "*"
+              else names(which.max(strands))
+            })
+
+# Change strand in gtf
+gtf <- gtf %>%
+  left_join(new_gene_strands, by="gene_id") %>%
+  mutate(strand = coalesce(gene_strand, strand)) %>%
+  select(-gene_strand)
+
+# Export gtf to new restranded file
+export(gtf, output)
diff --git a/main.nf b/main.nf
@@ -53,12 +53,14 @@ include { INDEX_BAM                      } from './modules/index_bam.nf'
 include { BAMBU                          } from './modules/bambu/bambu.nf'
 include { STRINGTIE                      } from './modules/stringtie/stringtie_workflow.nf'
 include { GFFCOMPARE                     } from './modules/gffcompare/gffcompare.nf'
+include { RESTRAND_ISOFORMS              } from './modules/restrand_isoforms.nf'
 include { SPLIT_EXTENDED_ANNOTATION      } from './modules/split_extended_annotation.nf'
 include { FEELNC_CODPOT                  } from './modules/feelnc/codpot.nf'
 include { FEELNC_FORMAT                  } from './modules/feelnc/format.nf'
 include { RESTORE_BIOTYPE                } from './modules/restore_biotypes.nf'
 include { MERGE_NOVEL                    } from './modules/merge_novel.nf'
 include { TRANSDECODER                   } from './modules/transdecoder/transdecoder_workflow.nf'
+include { RESTRAND_NOVEL                 } from './modules/restrand_novel.nf'
 include { TFKMERS                        } from './modules/transforkmers/workflow.nf'
 include { QC as QC_FULL; QC as QC_FILTER } from './modules/qc/workflow.nf'
 include { ADD_CLASS_CODE                 } from './modules/add_class_code.nf'
@@ -84,11 +86,13 @@ workflow {
   if(params.tx_discovery == "bambu") {
     BAMBU(samples.collect(), VALIDATE_INPUT_GTF.out, ref_fa)
     GFFCOMPARE(VALIDATE_INPUT_GTF.out, ref_fa, BAMBU.out.bambu_gtf)
-    SPLIT_EXTENDED_ANNOTATION(BAMBU.out.bambu_gtf)
+    RESTRAND_ISOFORMS(BAMBU.out.bambu_gtf)
+    SPLIT_EXTENDED_ANNOTATION(RESTRAND_ISOFORMS.out)
   }
   else if (params.tx_discovery == "stringtie2") {
     STRINGTIE(samples, VALIDATE_INPUT_GTF.out, ref_fa)
-    SPLIT_EXTENDED_ANNOTATION(STRINGTIE.out.stringtie_gtf)
+    RESTRAND_ISOFORMS(STRINGTIE.out.stringtie_gtf)
+    SPLIT_EXTENDED_ANNOTATION(RESTRAND_ISOFORMS.out)
   }
 
   ///////////////////////////////////////////////////////////////////////////
@@ -116,13 +120,13 @@ workflow {
   // PREDICT CDS ON NOVEL TRANSCRIPTS
   ///////////////////////////////////////////////////////////////////////////
   TRANSDECODER(MERGE_NOVEL.out.novel_full_gtf, ref_fa)
-
+  RESTRAND_NOVEL(TRANSDECODER.out)
   ///////////////////////////////////////////////////////////////////////////
   // PERFORM QC ON FULL ANNOTATION
   ///////////////////////////////////////////////////////////////////////////
   QC_FULL(samples, 
           INDEX_BAM.out, 
-          TRANSDECODER.out, 
+          RESTRAND_NOVEL.out, 
           VALIDATE_INPUT_GTF.out, 
           ch_gene_counts,
           "full")
@@ -131,7 +135,7 @@ workflow {
   // FILTER NEW TRANSCRIPTS, AND QC ON FILTERED ANNOTATION
   ///////////////////////////////////////////////////////////////////////////
   if(params.filter) {
-    TFKMERS(TRANSDECODER.out, 
+    TFKMERS(RESTRAND_NOVEL.out, 
             ref_fa, 
             ch_ndr, 
             tokenizer, 
@@ -148,9 +152,9 @@ workflow {
   ///////////////////////////////////////////////////////////////////////////
   // ADD GFFCOMPARE CLASS CODES TO FINAL GTFS
   ///////////////////////////////////////////////////////////////////////////
-  final_gtf = TRANSDECODER.out.gtf.mix(QC_FULL.out.gtf)
+  final_gtf = RESTRAND_NOVEL.out.mix(QC_FULL.out.gtf)
   if (params.filter){
-  final_gtf = TRANSDECODER.out.gtf.mix(QC_FULL.out.gtf,TFKMERS.out.gtf,QC_FILTER.out.gtf)
+  final_gtf = RESTRAND_NOVEL.out.mix(QC_FULL.out.gtf,TFKMERS.out.gtf,QC_FILTER.out.gtf)
   }
   ADD_CLASS_CODE(class_code, final_gtf)
 }
diff --git a/modules/add_class_code.nf b/modules/add_class_code.nf
@@ -12,13 +12,13 @@ process ADD_CLASS_CODE {
   path "class_code.${gtf}"
 
   script:
-  """  
+  """
   class_code.R ${class_code_gtf} ${gtf} "class_code.${gtf}"
 
   # Remove header created by gtfsort
   sed -i 1,3d "class_code.${gtf}"
 
-  # Add semicolon at end of tx lines
-  sed -i '/\\ttranscript\\t/s/\$/;/' "class_code.${gtf}"
+  # Add semicolon if missing during rtracklayer export
+  sed -i 's/[^;]\$/&;/' "class_code.${gtf}"
   """
 }
diff --git a/modules/feelnc/codpot.nf b/modules/feelnc/codpot.nf
@@ -14,6 +14,7 @@ process FEELNC_CODPOT {
   path "feelnc_codpot_out/new.lncRNA.gtf", emit: lncRNA
   path "feelnc_codpot_out/new.mRNA.gtf", emit: mRNA
 
+  script:
   """
   grep "protein_coding" ${ref} > known_mRNA.gtf
   grep -v "protein_coding" ${ref} > known_lncRNA.gtf

diff --git a/modules/feelnc/format.nf b/modules/feelnc/format.nf
@@ -11,6 +11,7 @@ process FEELNC_FORMAT {
   output:
   path "novel.genes.gtf"
 
+  script:
   """
   merge_feelnc.py --lncRNA ${lncRNA} --mRNA ${mRNA} > novel.genes.gtf
   """

diff --git a/modules/index_bam.nf b/modules/index_bam.nf
@@ -11,6 +11,7 @@ process INDEX_BAM {
   output:
   file("*.bai")
 
+  script:
   """
   samtools index $bam
   """

diff --git a/modules/input/validate.nf b/modules/input/validate.nf
@@ -10,6 +10,7 @@ process VALIDATE_INPUT_GTF {
   output:
   file 'input.formatted.gtf'
 
+  script:
   """
   validate_gtf.py ${gtf} > input.formatted.gtf
   """

diff --git a/modules/merge_novel.nf b/modules/merge_novel.nf
@@ -12,6 +12,7 @@ process MERGE_NOVEL {
   output:
   path "novel.full.gtf", emit: novel_full_gtf
 
+  script:
   """
   cat ${novel_genes} ${novel_isoforms} | GTF.py format > novel.full.gtf
   """

diff --git a/modules/restore_biotypes.nf b/modules/restore_biotypes.nf
@@ -11,6 +11,7 @@ process RESTORE_BIOTYPE {
   output:
   path "novel.isoforms.gtf"
 
+  script:
   """
   restore_ref_attributes.py -gtf ${novel_isoforms} -ref $ref > novel.isoforms.gtf
   """

diff --git a/modules/restrand_isoforms.nf b/modules/restrand_isoforms.nf
@@ -0,0 +1,16 @@
+process RESTRAND_ISOFORMS {
+  conda (params.enable_conda ? "$baseDir/environment.yml" : null)
+  container "ghcr.io/igdrion/annexa:${workflow.revision? workflow.revision: "main"}"
+
+  input:
+  file gtf
+
+  output:
+  path "restranded.${gtf}"
+
+  shell:
+  '''
+  restrand_isoforms.R !{gtf} !{params.tx_discovery} "restranded.!{gtf}"
+  sed -i '/;\s*$/!s/\s*$/;/' "restranded.!{gtf}"
+  '''
+}
diff --git a/modules/restrand_novel.nf b/modules/restrand_novel.nf
@@ -0,0 +1,18 @@
+process RESTRAND_NOVEL {
+  conda (params.enable_conda ? "$baseDir/environment.yml" : null)
+  container "ghcr.io/igdrion/annexa:${workflow.revision? workflow.revision: "main"}"
+  publishDir "$params.outdir/final", mode: 'copy', saveAs: {filename -> 'novel.full.gtf'}, overwrite: true
+
+  input:
+  file gtf
+
+  output:
+  path "${gtf}"
+
+  shell:
+  '''
+  restrand_isoforms.R !{gtf} !{params.tx_discovery} "restranded.!{gtf}"
+  mv restranded.!{gtf} !{gtf}
+  sed -i '/;\s*$/!s/\s*$/;/' !{gtf}
+  '''
+}
diff --git a/modules/rseqc/gene_body_coverage.nf b/modules/rseqc/gene_body_coverage.nf
@@ -15,6 +15,7 @@ process GENEBODY_COVERAGE {
   file "${prefix}.${bed.simpleName}.geneBodyCoverage.curves.pdf"
   file "${prefix}.${bed.simpleName}.geneBodyCoverage.txt"
 
+  script:
   """
   geneBody_coverage.py \
     -i ./ \

diff --git a/modules/rseqc/genepredtobed.nf b/modules/rseqc/genepredtobed.nf
@@ -10,6 +10,7 @@ process GENEPRED_TO_BED {
   output:
   path "${genePred.simpleName}.bed12"
 
+  script:
   """
   genePredToBed ${genePred} ${genePred.simpleName}.bed12
   """

diff --git a/modules/rseqc/gtftogenepred.nf b/modules/rseqc/gtftogenepred.nf
@@ -10,6 +10,7 @@ process GTF_TO_GENEPRED {
   output:
   path "${gtf.simpleName}.genePred"
 
+  script:
   """
   gtfToGenePred -genePredExt ${gtf} ${gtf.simpleName}.genePred
   """

diff --git a/modules/rseqc/prepare.nf b/modules/rseqc/prepare.nf
@@ -6,6 +6,7 @@ process PREPARE_RSEQC {
   output:
   path "*.rseqc.gtf"
 
+  script:
   """
   grep "protein_coding" ${novel} > novel_mRNA.rseqc.gtf
   grep "lncRNA" ${novel} > novel_lncRNA.rseqc.gtf

diff --git a/modules/transforkmers/extract_regions.nf b/modules/transforkmers/extract_regions.nf
@@ -10,6 +10,7 @@ process EXTRACT_TSS_REGIONS {
   output:
   path "tss_regions.bed6", emit: tss_regions
 
+  script:
   """
   cat ${novel_gtf} | extract_tss.py -l 512 > tss_regions.bed6
   """

diff --git a/modules/transforkmers/extract_sequences.nf b/modules/transforkmers/extract_sequences.nf
@@ -11,6 +11,7 @@ process EXTRACT_TSS_SEQUENCES {
   output:
   path "tss.fa", emit: tss_sequences
 
+  script:
   """
   bedtools getfasta -nameOnly -fi ${fa} -bed ${tss_regions} | tr a-z A-Z > tss.fa
   """

diff --git a/modules/transforkmers/filter.nf b/modules/transforkmers/filter.nf
@@ -16,6 +16,7 @@ process FILTER {
   path "counts_transcript.filter.txt"
   path "counts_transcript.full.txt"
 
+  script:
   """
   cp ${counts_tx} counts_transcript.full.txt
 

diff --git a/modules/transforkmers/predict.nf b/modules/transforkmers/predict.nf
@@ -14,6 +14,7 @@ process PREDICT {
   output:
   path "output.csv", emit: tss_prob
 
+  script:
   """
   transforkmers predict \
     --model_path_or_name ${model} \
-Original file line number
+Diff line change
@@ Expand Up / @@ -11,6 +11,7 @@ process FEELNC_FORMAT { @@
       output:
       path "novel.genes.gtf"
+      script:
       """
       merge_feelnc.py --lncRNA ${lncRNA} --mRNA ${mRNA} > novel.genes.gtf
       """
@@ Expand Down @@