Skip to content

Commit

Permalink
Merge pull request #24 from N-Hoffmann/stringtie
Browse files Browse the repository at this point in the history
Add gffcompare class codes to final gtf; reformat code
  • Loading branch information
N-Hoffmann authored Aug 19, 2024
2 parents 9557f8d + c4f93cc commit 2d66137
Show file tree
Hide file tree
Showing 18 changed files with 162 additions and 96 deletions.
9 changes: 5 additions & 4 deletions bin/class_code.R
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,14 @@ library(rtracklayer)
args = commandArgs(trailingOnly=TRUE)
class_code_gtf = args[1]
extended_annotation = args[2]
output = args[3]

#############################################################################
# Read both GTF and assign gffcompare class_code to extended annotation
#############################################################################
file1 <- readGFF(class_code_gtf)
file2 <- readGFF(extended_annotation)
cc_gtf <- readGFF(class_code_gtf)
ext_anno <- readGFF(extended_annotation)

file2$class_code <- file1$class_code[match(paste(file2$transcript_id, file2$type), paste(file1$transcript_id, file1$type))]
ext_anno$class_code <- cc_gtf$class_code[match(paste(ext_anno$transcript_id, ext_anno$type), paste(cc_gtf$transcript_id, cc_gtf$type))]

export(file2, 'extended_annotation_class_code.gtf')
export(ext_anno, output)
10 changes: 7 additions & 3 deletions bin/validate_gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@
#######################################################

for record in GTF.parse_by_line(args.gtf):
if record.feature =="transcript":
if "gene_biotype" in record:
g_biotype = record["gene_biotype"]
elif "gene_type" in record:
g_biotype = record["gene_type"]
if record.feature == "gene" or record.feature == "transcript":
continue

Expand All @@ -39,9 +44,8 @@

#######################################################
# Check if gene_biotype in each transcripts and exons
if not "gene_biotype" in record:
record["gene_biotype"] = "NA"
g_biotype = record["gene_biotype"]
if not "gene_biotype" in record and "gene_type" not in record:
record["gene_biotype"] = g_biotype

# Check for RefSeq gene_biotype format
if g_biotype == "mRNA":
Expand Down
41 changes: 27 additions & 14 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,6 @@ include { INDEX_BAM } from './modules/index_bam.nf'
include { BAMBU } from './modules/bambu/bambu.nf'
include { STRINGTIE } from './modules/stringtie/stringtie_workflow.nf'
include { GFFCOMPARE } from './modules/gffcompare/gffcompare.nf'
include { ADD_CLASS_CODE } from './modules/add_class_code.nf'
include { SPLIT_EXTENDED_ANNOTATION } from './modules/split_extended_annotation.nf'
include { FEELNC_CODPOT } from './modules/feelnc/codpot.nf'
include { FEELNC_FORMAT } from './modules/feelnc/format.nf'
Expand All @@ -45,6 +44,7 @@ include { MERGE_NOVEL } from './modules/merge_novel.nf'
include { TRANSDECODER } from './modules/transdecoder/transdecoder_workflow.nf'
include { TFKMERS } from './modules/transforkmers/workflow.nf'
include { QC as QC_FULL; QC as QC_FILTER } from './modules/qc/workflow.nf'
include { ADD_CLASS_CODE } from './modules/add_class_code.nf'

workflow {
///////////////////////////////////////////////////////////////////////////
Expand All @@ -66,17 +66,16 @@ workflow {
///////////////////////////////////////////////////////////////////////////
if(params.tx_discovery == "bambu") {
BAMBU(samples.collect(), VALIDATE_INPUT_GTF.out, ref_fa)
GFFCOMPARE(input_gtf, ref_fa, BAMBU.out.bambu_gtf)
ADD_CLASS_CODE(GFFCOMPARE.out.class_code_gtf, BAMBU.out.bambu_gtf)
GFFCOMPARE(VALIDATE_INPUT_GTF.out, ref_fa, BAMBU.out.bambu_gtf)
SPLIT_EXTENDED_ANNOTATION(BAMBU.out.bambu_gtf)
}
else if (params.tx_discovery == "stringtie2") {
STRINGTIE(samples, VALIDATE_INPUT_GTF.out, ref_fa)
ADD_CLASS_CODE(STRINGTIE.out.class_code_gtf, STRINGTIE.out.stringtie_gtf)
SPLIT_EXTENDED_ANNOTATION(STRINGTIE.out.stringtie_gtf)
}

SPLIT_EXTENDED_ANNOTATION(ADD_CLASS_CODE.out.extended_annotation_class_code)
///////////////////////////////////////////////////////////////////////////
// EXTRACT AND CLASSIFY NEW TRANSCRIPTS, AND PERFORM QC
// EXTRACT AND CLASSIFY NEW TRANSCRIPTS
///////////////////////////////////////////////////////////////////////////
FEELNC_CODPOT(VALIDATE_INPUT_GTF.out, ref_fa, SPLIT_EXTENDED_ANNOTATION.out.novel_genes)
FEELNC_FORMAT(FEELNC_CODPOT.out.mRNA, FEELNC_CODPOT.out.lncRNA)
Expand All @@ -87,30 +86,35 @@ workflow {
ch_gene_counts = BAMBU.out.gene_counts
ch_tx_counts = BAMBU.out.tx_counts
ch_ndr = BAMBU.out.ndr
class_code = GFFCOMPARE.out.class_code_gtf
}
else if (params.tx_discovery == "stringtie2") {
ch_gene_counts = STRINGTIE.out.gene_counts
ch_tx_counts = STRINGTIE.out.tx_counts
ch_ndr = STRINGTIE.out.ndr
class_code = STRINGTIE.out.class_code_gtf
}

///////////////////////////////////////////////////////////////////////////
// PREDICT CDS ON NOVEL TRANSCRIPTS
///////////////////////////////////////////////////////////////////////////
TRANSDECODER(MERGE_NOVEL.out.novel_full_gtf, ref_fa)

///////////////////////////////////////////////////////////////////////////
// PERFORM QC ON FULL ANNOTATION
///////////////////////////////////////////////////////////////////////////
QC_FULL(samples,
INDEX_BAM.out,
MERGE_NOVEL.out,
TRANSDECODER.out,
VALIDATE_INPUT_GTF.out,
ch_gene_counts,
"full")

///////////////////////////////////////////////////////////////////////////
// PREDICT CDS ON NOVEL TRANSCRIPTS AND MERGE WITH NOVEL ANNOTATION
///////////////////////////////////////////////////////////////////////////
TRANSDECODER(MERGE_NOVEL.out.novel_full_gtf, ref_fa)

///////////////////////////////////////////////////////////////////////////
// FILTER NEW TRANSCRIPTS, AND QC ON FILTERED ANNOTATION
///////////////////////////////////////////////////////////////////////////
if(params.filter) {
TFKMERS(TRANSDECODER.out.cds_gtf,
TFKMERS(TRANSDECODER.out,
ref_fa,
ch_ndr,
tokenizer,
Expand All @@ -120,7 +124,16 @@ workflow {
INDEX_BAM.out,
TFKMERS.out.gtf,
VALIDATE_INPUT_GTF.out,
ch_gene_counts,
ch_gene_counts,
"filter")
}

///////////////////////////////////////////////////////////////////////////
// ADD GFFCOMPARE CLASS CODES TO FINAL GTFS
///////////////////////////////////////////////////////////////////////////
final_gtf = TRANSDECODER.out.gtf.mix(QC_FULL.out.gtf)
if (params.filter){
final_gtf = TRANSDECODER.out.gtf.mix(QC_FULL.out.gtf,TFKMERS.out.gtf,QC_FILTER.out.gtf)
}
ADD_CLASS_CODE(class_code, final_gtf)
}
12 changes: 8 additions & 4 deletions modules/add_class_code.nf
Original file line number Diff line number Diff line change
@@ -1,16 +1,20 @@
process ADD_CLASS_CODE {
conda (params.enable_conda ? "$baseDir/environment.yml" : null)
container "ghcr.io/igdrion/annexa:${workflow.revision? workflow.revision: "main"}"
publishDir "$params.outdir/final", mode: 'copy', saveAs: {filename -> "${gtf}"}, overwrite: true
tag "$gtf"

input:
file class_code_gtf
file extended_annotation
file gtf

output:
path 'extended_annotation_class_code.gtf', emit: extended_annotation_class_code
path "class_code.${gtf}"

script:
"""
class_code.R ${class_code_gtf} ${extended_annotation}
"""
class_code.R ${class_code_gtf} ${gtf} "class_code.${gtf}"
## Remove header created by gtfsort
sed -i 1,3d "class_code.${gtf}"
"""
}
58 changes: 8 additions & 50 deletions modules/gffcompare/gffcompare.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,64 +3,22 @@ process GFFCOMPARE {
container "${ workflow.containerEngine == 'singularity' ?
'https://depot.galaxyproject.org/singularity/gffcompare:0.12.6--h9f5acd7_0' :
'biocontainers/gffcompare:0.12.6--h9f5acd7_0' }"
publishDir "$params.outdir/stringtie2", mode: 'copy', pattern: 'extended_annotations.gtf'
cpus params.maxCpu

input:
input:
path reference_gtf
path fasta
path merged_gtf

output:
path("*.annotated.gtf"), emit: class_code_gtf
path("*.loci") , emit: loci
path("*.stats") , emit: stats
path("*.tracking") , emit: tracking
path("extended_annotations.gtf"), emit: stringtie_gtf, optional: true

shell:
if (params.tx_discovery == "bambu")
'''
gffcompare \
-r !{reference_gtf} \
-s !{fasta} \
!{merged_gtf}
'''

else if (params.tx_discovery == "stringtie2")
'''
path("gffcmp.tracking"), emit: tracking_file

script:
"""
gffcompare \
-r !{reference_gtf} \
-s !{fasta} \
!{merged_gtf}
# Add information for stringtie2 process
# Reformat the output of gffcompare to correctly match novel isoforms to known genes
# Takes the transcript_id identified by Stringtie and assigns it to reference gene_id
awk 'BEGIN{
while(getline<"gffcmp.tracking">0){
if ($4 !="u" && $4 !="r"){
split($3,gn,"|");
split($5,tx,"|");
final["\\""tx[2]"\\";"]="\\""gn[1]"\\";"
}
}
} {
if ($12 in final){
$10=final[$12]; print $0} else {print $0}
}' !{merged_gtf} | gtf2gtf_cleanall.sh > extended_annotations_preaclean.gtf
# Match correct ref_gene_id to gene_id to some overlapping genes in the reference annotation
awk '{if ($3 == "transcript" && $13=="ref_gene_id" && $10!=$14) {
$10 = $14;
print $0
} else if ($3 == "exon" && $15=="ref_gene_id" && $10!=$16) {
$10 = $16;
print $0
} else {print $0}
}' extended_annotations_preaclean.gtf | gtf2gtf_cleanall.sh > extended_annotations.gtf
'''
-r ${reference_gtf} \
-s ${fasta} \
${merged_gtf}
"""
}
1 change: 1 addition & 0 deletions modules/index_bam.nf
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ process INDEX_BAM {
container "${ workflow.containerEngine == 'singularity' ?
'https://depot.galaxyproject.org/singularity/samtools%3A1.16.1--h6899075_0' :
'quay.io/biocontainers/samtools:1.16.1--h1170115_0' }"
tag "$bam"

input:
file bam
Expand Down
2 changes: 1 addition & 1 deletion modules/qc/merge_known_novel.nf
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ process MERGE_ANNOTATIONS {
val origin

output:
path "extended_annotations.${origin}.gtf"
path "extended_annotations.${origin}.gtf", emit: gtf

"""
cat ${novel} ${ref} | GTF.py format > extended_annotations.${origin}.gtf
Expand Down
3 changes: 3 additions & 0 deletions modules/qc/workflow.nf
Original file line number Diff line number Diff line change
Expand Up @@ -30,4 +30,7 @@ workflow QC {
origin
)
}

emit:
gtf = MERGE_ANNOTATIONS.out.gtf
}
44 changes: 44 additions & 0 deletions modules/stringtie/stringtie_format_gffcompare.nf
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
process FORMAT_GFFCOMPARE {
publishDir "$params.outdir/stringtie2", pattern: 'extended_annotations.gtf', mode: 'copy'

input:
path merged_gtf
path tracking_file

output:
path("extended_annotations.gtf"), emit: stringtie_gtf

shell:
'''
# Add information for stringtie2 process
# Reformat the output of gffcompare to correctly match novel isoforms to known genes
# Takes the transcript_id identified by Stringtie and assigns it to reference gene_id
awk 'BEGIN{
while(getline<"!{tracking_file}">0){
if ($4 !="u" && $4 !="r"){
split($3,gn,"|");
split($5,tx,"|");
final["\\""tx[2]"\\";"]="\\""gn[1]"\\";"
}
}
} {
if ($12 in final){
$10=final[$12]; print $0} else {print $0}
}' !{merged_gtf} | gtf2gtf_cleanall.sh > extended_annotations_preaclean.gtf
# Match correct ref_gene_id to gene_id to some overlapping genes in the reference annotation
awk '{if ($3 == "transcript" && $13=="ref_gene_id" && $10!=$14) {
$10 = $14;
print $0
} else if ($3 == "exon" && $15=="ref_gene_id" && $10!=$16) {
$10 = $16;
print $0
} else {print $0}
}' extended_annotations_preaclean.gtf | gtf2gtf_cleanall.sh > extended_annotations.gtf
# Remove header lines (command and version)
sed -i 1,2d extended_annotations.gtf
'''
}
23 changes: 20 additions & 3 deletions modules/stringtie/stringtie_merge_counts.nf
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
process MERGE_COUNTS {
publishDir "$params.outdir/stringtie2", mode: 'copy', pattern: '*.txt'
if (params.filter == false){
publishDir "$params.outdir/final", mode: 'copy', pattern: 'counts_transcript.txt', saveAs: {filename -> 'counts_transcript.full.gtf'}
}

input:
path gene_counts
Expand All @@ -12,20 +15,34 @@ process MERGE_COUNTS {

shell:
'''
# Sort list of input files alphanumerically
gene_counts=$(echo !{gene_counts} | tr ' ' '\\n' | sort | xargs)
tx_counts=$(echo !{tx_counts} | tr ' ' '\\n' | sort | xargs)
# Merge the individual outputs of featurecount of each .bam into a single file
paste !{gene_counts} \
paste \${gene_counts} \
| awk '{printf("%s ",$1); for (i=2;i<=NF;i+=2){printf("%s ",$i)}print "\\n"}' \
| grep -v -e "^$" \
| awk -v OFS='\\t' '{$1=$1}1' > counts_gene.txt
paste !{tx_counts} \
paste \${tx_counts} \
| awk '{printf("\\n%s %s ",$1,$2); for (i=3;i<=NF;i+=3){printf("%s ",$i)}}' \
| grep -v -e "^$" \
| awk -v OFS='\\t' '{$1=$1}1' > counts_transcript.txt
# Rename first line
sed -i '1s/^Geneid/transcript_id/' counts_transcript.txt
# Sort genes and tx rows alphanumerically
(head -n 1 counts_transcript.txt && tail -n +2 counts_transcript.txt | sort -k1,1) > counts_transcript.txt.new \
&& mv counts_transcript.txt.new counts_transcript.txt
(head -n 1 counts_gene.txt && tail -n +2 counts_gene.txt | sort -k1,1) > counts_gene.txt.new \
&& mv counts_gene.txt.new counts_gene.txt
# Create empty NDR file for TFKMERS to have same workflow as bambu in filtering step
# (placeholder, not used)
touch empty.ndr
'''
}
}
10 changes: 3 additions & 7 deletions modules/stringtie/stringtie_quant.nf
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,14 @@ process STRINGTIE_QUANTIFY {
'https://depot.galaxyproject.org/singularity/stringtie%3A2.2.3--h43eeafb_0' :
'quay.io/biocontainers/stringtie:2.2.3--h43eeafb_0'}"
cpus params.maxCpu
tag 'bam'

input:
path '*'
path bam
path merged_gtf

output:
path 'stringtie_quant.gtf', emit: stringtie_quant_qtf
path 'e_data.ctab'
path 'e2t.ctab'
path 'i_data.ctab'
path 'i2t.ctab'
path 't_data.ctab'

script:
"""
Expand All @@ -25,6 +21,6 @@ process STRINGTIE_QUANTIFY {
-p ${params.maxCpu} \
-G ${merged_gtf} \
-o stringtie_quant.gtf \
*.bam
${bam}
"""
}
Loading

0 comments on commit 2d66137

Please sign in to comment.