Skip to content

Commit

Permalink
Merge pull request #28 from N-Hoffmann/stringtie
Browse files Browse the repository at this point in the history
Add restranding process
  • Loading branch information
N-Hoffmann authored Oct 31, 2024
2 parents c6cfde2 + 7212235 commit 314a64e
Show file tree
Hide file tree
Showing 22 changed files with 548 additions and 10 deletions.
400 changes: 400 additions & 0 deletions assets/metro_map.drawio

Large diffs are not rendered by default.

5 changes: 5 additions & 0 deletions bin/class_code.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#! /usr/bin/env Rscript
library(rtracklayer)
library(dplyr)

#############################################################################
# CLI Args
Expand All @@ -15,6 +16,10 @@ output = args[3]
cc_gtf <- readGFF(class_code_gtf)
ext_anno <- readGFF(extended_annotation)

# Add class_code
ext_anno$class_code <- cc_gtf$class_code[match(paste(ext_anno$transcript_id, ext_anno$type), paste(cc_gtf$transcript_id, cc_gtf$type))]

# Change order of transcript_id column
ext_anno <- ext_anno %>% relocate(transcript_id, .after = gene_id)

export(ext_anno, output)
9 changes: 9 additions & 0 deletions bin/qc.R
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ library(grid)
library(gridExtra)
library(viridis)
library(ggpattern)
library(stringr)

brew = c("#008387", "#a0d3db", "#ad8500", "#d9cb98")
palette = c("#00AFBB", "#FC4E07", "#E7B800")
Expand Down Expand Up @@ -395,6 +396,14 @@ cover <- textGrob("ANNEXA report",
gp = gpar(fontsize = 40,
col = "black"))

sub_cover_text <- paste(
str_wrap(paste("Reference genome:", genome_ref), width = 40),
str_wrap(paste("Reference annotation:", gtf_ref), width = 40),
"\n",
print(Sys.Date()),
paste("ANNEXA version:", ANNEXA_version),
sep = "\n")

sub_cover_text <- paste(
print(Sys.Date()),
paste("ANNEXA version:", ANNEXA_version), sep = "\n")
Expand Down
72 changes: 72 additions & 0 deletions bin/restrand_isoforms.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
#! /usr/bin/env Rscript

library(rtracklayer)
library(dplyr)
library(stringr)

#############################################
# CLI Args
args = commandArgs(trailingOnly=TRUE)
gtf_file = args[1]
tx_tool = args[2]
output = args[3]

# Define prefix to identify novel transcripts
if (tx_tool == "stringtie2") {
prefix = "MSTRG"
} else{
prefix = "Bambu"
}

gtf <- rtracklayer::readGFF(gtf_file)

# Restrand isoforms of known genes ----------------------------------------

# Check if gtf already has ref_gene_id feature (Stringtie), if not (Bambu), create it based on gene_id column
if (!"ref_gene_id" %in% colnames(gtf)){
gtf$ref_gene_id <- ifelse(!grepl("Bambu|MSTRG", gtf$gene_id),
gtf$gene_id,
NA)
}

# Get strand of reference genes
ref_strand <- unique(gtf %>% filter(!is.na(ref_gene_id)) %>% select(ref_gene_id, strand))
# Find novel isoforms
to_change <- gtf %>% filter(str_detect(transcript_id, prefix) & !str_detect(gene_id, prefix))
# Match and apply new strand (if different) to novel isoforms
matches <- match(to_change$gene_id, ref_strand$ref_gene_id)
to_change$strand <- ref_strand$strand[matches]

# Change strand of novel isoforms in gtf dataframe
gtf <- gtf %>%
mutate(strand = case_when(
transcript_id %in% to_change$transcript_id ~ to_change$strand[match(transcript_id, to_change$transcript_id)],
TRUE ~ strand
))

# Restrand novel tx of novel genes ----------------------------------------

# Look for novel tx of novel genes
novel <- gtf %>% filter(str_detect(transcript_id, prefix) & str_detect(gene_id, prefix))

# Restrand novel transcripts
# If all isoforms are unstraded, keep unstranded
# If some are stranded, use strand as new strand
# If all three strands are there, use majority rule
new_gene_strands <- novel %>%
group_by(gene_id) %>%
summarize(gene_strand = if(all(strand == "*")) "*"
else {
strands <- table(strand[strand != "*"])
if(length(strands) > 1 && max(strands) == min(strands)) "*"
else names(which.max(strands))
})

# Change strand in gtf
gtf <- gtf %>%
left_join(new_gene_strands, by="gene_id") %>%
mutate(strand = coalesce(gene_strand, strand)) %>%
select(-gene_strand)

# Export gtf to new restranded file
export(gtf, output)
18 changes: 11 additions & 7 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -53,12 +53,14 @@ include { INDEX_BAM } from './modules/index_bam.nf'
include { BAMBU } from './modules/bambu/bambu.nf'
include { STRINGTIE } from './modules/stringtie/stringtie_workflow.nf'
include { GFFCOMPARE } from './modules/gffcompare/gffcompare.nf'
include { RESTRAND_ISOFORMS } from './modules/restrand_isoforms.nf'
include { SPLIT_EXTENDED_ANNOTATION } from './modules/split_extended_annotation.nf'
include { FEELNC_CODPOT } from './modules/feelnc/codpot.nf'
include { FEELNC_FORMAT } from './modules/feelnc/format.nf'
include { RESTORE_BIOTYPE } from './modules/restore_biotypes.nf'
include { MERGE_NOVEL } from './modules/merge_novel.nf'
include { TRANSDECODER } from './modules/transdecoder/transdecoder_workflow.nf'
include { RESTRAND_NOVEL } from './modules/restrand_novel.nf'
include { TFKMERS } from './modules/transforkmers/workflow.nf'
include { QC as QC_FULL; QC as QC_FILTER } from './modules/qc/workflow.nf'
include { ADD_CLASS_CODE } from './modules/add_class_code.nf'
Expand All @@ -84,11 +86,13 @@ workflow {
if(params.tx_discovery == "bambu") {
BAMBU(samples.collect(), VALIDATE_INPUT_GTF.out, ref_fa)
GFFCOMPARE(VALIDATE_INPUT_GTF.out, ref_fa, BAMBU.out.bambu_gtf)
SPLIT_EXTENDED_ANNOTATION(BAMBU.out.bambu_gtf)
RESTRAND_ISOFORMS(BAMBU.out.bambu_gtf)
SPLIT_EXTENDED_ANNOTATION(RESTRAND_ISOFORMS.out)
}
else if (params.tx_discovery == "stringtie2") {
STRINGTIE(samples, VALIDATE_INPUT_GTF.out, ref_fa)
SPLIT_EXTENDED_ANNOTATION(STRINGTIE.out.stringtie_gtf)
RESTRAND_ISOFORMS(STRINGTIE.out.stringtie_gtf)
SPLIT_EXTENDED_ANNOTATION(RESTRAND_ISOFORMS.out)
}

///////////////////////////////////////////////////////////////////////////
Expand Down Expand Up @@ -116,13 +120,13 @@ workflow {
// PREDICT CDS ON NOVEL TRANSCRIPTS
///////////////////////////////////////////////////////////////////////////
TRANSDECODER(MERGE_NOVEL.out.novel_full_gtf, ref_fa)

RESTRAND_NOVEL(TRANSDECODER.out)
///////////////////////////////////////////////////////////////////////////
// PERFORM QC ON FULL ANNOTATION
///////////////////////////////////////////////////////////////////////////
QC_FULL(samples,
INDEX_BAM.out,
TRANSDECODER.out,
RESTRAND_NOVEL.out,
VALIDATE_INPUT_GTF.out,
ch_gene_counts,
"full")
Expand All @@ -131,7 +135,7 @@ workflow {
// FILTER NEW TRANSCRIPTS, AND QC ON FILTERED ANNOTATION
///////////////////////////////////////////////////////////////////////////
if(params.filter) {
TFKMERS(TRANSDECODER.out,
TFKMERS(RESTRAND_NOVEL.out,
ref_fa,
ch_ndr,
tokenizer,
Expand All @@ -148,9 +152,9 @@ workflow {
///////////////////////////////////////////////////////////////////////////
// ADD GFFCOMPARE CLASS CODES TO FINAL GTFS
///////////////////////////////////////////////////////////////////////////
final_gtf = TRANSDECODER.out.gtf.mix(QC_FULL.out.gtf)
final_gtf = RESTRAND_NOVEL.out.mix(QC_FULL.out.gtf)
if (params.filter){
final_gtf = TRANSDECODER.out.gtf.mix(QC_FULL.out.gtf,TFKMERS.out.gtf,QC_FILTER.out.gtf)
final_gtf = RESTRAND_NOVEL.out.mix(QC_FULL.out.gtf,TFKMERS.out.gtf,QC_FILTER.out.gtf)
}
ADD_CLASS_CODE(class_code, final_gtf)
}
6 changes: 3 additions & 3 deletions modules/add_class_code.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ process ADD_CLASS_CODE {
path "class_code.${gtf}"

script:
"""
"""
class_code.R ${class_code_gtf} ${gtf} "class_code.${gtf}"
# Remove header created by gtfsort
sed -i 1,3d "class_code.${gtf}"
# Add semicolon at end of tx lines
sed -i '/\\ttranscript\\t/s/\$/;/' "class_code.${gtf}"
# Add semicolon if missing during rtracklayer export
sed -i 's/[^;]\$/&;/' "class_code.${gtf}"
"""
}
1 change: 1 addition & 0 deletions modules/feelnc/codpot.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ process FEELNC_CODPOT {
path "feelnc_codpot_out/new.lncRNA.gtf", emit: lncRNA
path "feelnc_codpot_out/new.mRNA.gtf", emit: mRNA

script:
"""
grep "protein_coding" ${ref} > known_mRNA.gtf
grep -v "protein_coding" ${ref} > known_lncRNA.gtf
Expand Down
1 change: 1 addition & 0 deletions modules/feelnc/format.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ process FEELNC_FORMAT {
output:
path "novel.genes.gtf"

script:
"""
merge_feelnc.py --lncRNA ${lncRNA} --mRNA ${mRNA} > novel.genes.gtf
"""
Expand Down
1 change: 1 addition & 0 deletions modules/index_bam.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ process INDEX_BAM {
output:
file("*.bai")

script:
"""
samtools index $bam
"""
Expand Down
1 change: 1 addition & 0 deletions modules/input/validate.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ process VALIDATE_INPUT_GTF {
output:
file 'input.formatted.gtf'

script:
"""
validate_gtf.py ${gtf} > input.formatted.gtf
"""
Expand Down
1 change: 1 addition & 0 deletions modules/merge_novel.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ process MERGE_NOVEL {
output:
path "novel.full.gtf", emit: novel_full_gtf

script:
"""
cat ${novel_genes} ${novel_isoforms} | GTF.py format > novel.full.gtf
"""
Expand Down
1 change: 1 addition & 0 deletions modules/restore_biotypes.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ process RESTORE_BIOTYPE {
output:
path "novel.isoforms.gtf"

script:
"""
restore_ref_attributes.py -gtf ${novel_isoforms} -ref $ref > novel.isoforms.gtf
"""
Expand Down
16 changes: 16 additions & 0 deletions modules/restrand_isoforms.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
process RESTRAND_ISOFORMS {
conda (params.enable_conda ? "$baseDir/environment.yml" : null)
container "ghcr.io/igdrion/annexa:${workflow.revision? workflow.revision: "main"}"

input:
file gtf

output:
path "restranded.${gtf}"

shell:
'''
restrand_isoforms.R !{gtf} !{params.tx_discovery} "restranded.!{gtf}"
sed -i '/;\s*$/!s/\s*$/;/' "restranded.!{gtf}"
'''
}
18 changes: 18 additions & 0 deletions modules/restrand_novel.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
process RESTRAND_NOVEL {
conda (params.enable_conda ? "$baseDir/environment.yml" : null)
container "ghcr.io/igdrion/annexa:${workflow.revision? workflow.revision: "main"}"
publishDir "$params.outdir/final", mode: 'copy', saveAs: {filename -> 'novel.full.gtf'}, overwrite: true

input:
file gtf

output:
path "${gtf}"

shell:
'''
restrand_isoforms.R !{gtf} !{params.tx_discovery} "restranded.!{gtf}"
mv restranded.!{gtf} !{gtf}
sed -i '/;\s*$/!s/\s*$/;/' !{gtf}
'''
}
1 change: 1 addition & 0 deletions modules/rseqc/gene_body_coverage.nf
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ process GENEBODY_COVERAGE {
file "${prefix}.${bed.simpleName}.geneBodyCoverage.curves.pdf"
file "${prefix}.${bed.simpleName}.geneBodyCoverage.txt"

script:
"""
geneBody_coverage.py \
-i ./ \
Expand Down
1 change: 1 addition & 0 deletions modules/rseqc/genepredtobed.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ process GENEPRED_TO_BED {
output:
path "${genePred.simpleName}.bed12"

script:
"""
genePredToBed ${genePred} ${genePred.simpleName}.bed12
"""
Expand Down
1 change: 1 addition & 0 deletions modules/rseqc/gtftogenepred.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ process GTF_TO_GENEPRED {
output:
path "${gtf.simpleName}.genePred"

script:
"""
gtfToGenePred -genePredExt ${gtf} ${gtf.simpleName}.genePred
"""
Expand Down
1 change: 1 addition & 0 deletions modules/rseqc/prepare.nf
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ process PREPARE_RSEQC {
output:
path "*.rseqc.gtf"

script:
"""
grep "protein_coding" ${novel} > novel_mRNA.rseqc.gtf
grep "lncRNA" ${novel} > novel_lncRNA.rseqc.gtf
Expand Down
1 change: 1 addition & 0 deletions modules/transforkmers/extract_regions.nf
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ process EXTRACT_TSS_REGIONS {
output:
path "tss_regions.bed6", emit: tss_regions

script:
"""
cat ${novel_gtf} | extract_tss.py -l 512 > tss_regions.bed6
"""
Expand Down
1 change: 1 addition & 0 deletions modules/transforkmers/extract_sequences.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ process EXTRACT_TSS_SEQUENCES {
output:
path "tss.fa", emit: tss_sequences

script:
"""
bedtools getfasta -nameOnly -fi ${fa} -bed ${tss_regions} | tr a-z A-Z > tss.fa
"""
Expand Down
1 change: 1 addition & 0 deletions modules/transforkmers/filter.nf
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ process FILTER {
path "counts_transcript.filter.txt"
path "counts_transcript.full.txt"

script:
"""
cp ${counts_tx} counts_transcript.full.txt
Expand Down
1 change: 1 addition & 0 deletions modules/transforkmers/predict.nf
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ process PREDICT {
output:
path "output.csv", emit: tss_prob

script:
"""
transforkmers predict \
--model_path_or_name ${model} \
Expand Down

0 comments on commit 314a64e

Please sign in to comment.