-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #28 from N-Hoffmann/stringtie
Add restranding process
- Loading branch information
Showing
22 changed files
with
548 additions
and
10 deletions.
There are no files selected for viewing
Large diffs are not rendered by default.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,72 @@ | ||
#! /usr/bin/env Rscript | ||
|
||
library(rtracklayer) | ||
library(dplyr) | ||
library(stringr) | ||
|
||
#############################################
# CLI Args
# Usage: restrand_isoforms.R <gtf_file> <tx_tool> <output>
#   gtf_file: path of the GTF annotation to read
#   tx_tool:  transcript discovery tool; "stringtie2" selects the "MSTRG"
#             prefix, any other value falls back to "Bambu"
#   output:   third positional argument, kept for the export step
args <- commandArgs(trailingOnly = TRUE)

# Fail early with a readable message instead of propagating NA arguments
if (length(args) < 3) {
  stop(
    "Usage: restrand_isoforms.R <gtf_file> <tx_tool> <output>",
    call. = FALSE
  )
}

gtf_file <- args[1]
tx_tool <- args[2]
output <- args[3]

# Define prefix to identify novel transcripts
if (tx_tool == "stringtie2") {
  prefix <- "MSTRG"
} else {
  prefix <- "Bambu"
}

gtf <- rtracklayer::readGFF(gtf_file)
|
||
# Restrand isoforms of known genes ----------------------------------------

# Check if gtf already has ref_gene_id feature (Stringtie), if not (Bambu),
# create it based on gene_id column
if (!"ref_gene_id" %in% colnames(gtf)) {
  gtf$ref_gene_id <- ifelse(
    !grepl("Bambu|MSTRG", gtf$gene_id),
    gtf$gene_id,
    NA
  )
}

# Get strand of reference genes
ref_strand <- unique(
  gtf %>%
    filter(!is.na(ref_gene_id)) %>%
    select(ref_gene_id, strand)
)

# Find novel isoforms (novel transcript id attached to a known gene id)
to_change <- gtf %>%
  filter(str_detect(transcript_id, prefix) & !str_detect(gene_id, prefix))

# Look up the reference strand of each novel isoform's gene.
# The lookup yields NA when the gene has no entry in ref_strand.
matches <- match(to_change$gene_id, ref_strand$ref_gene_id)
to_change$strand <- ref_strand$strand[matches]

# Change strand of novel isoforms in gtf dataframe.
# coalesce() falls back to the current strand for rows that are not novel
# isoforms and for isoforms whose gene has no reference strand, so a failed
# lookup never overwrites an existing strand with NA.
gtf <- gtf %>%
  mutate(strand = coalesce(
    to_change$strand[match(transcript_id, to_change$transcript_id)],
    strand
  ))
|
||
# Restrand novel tx of novel genes ----------------------------------------

# Look for novel tx of novel genes
novel <- gtf %>%
  filter(str_detect(transcript_id, prefix) & str_detect(gene_id, prefix))

# Choose one strand for a gene from the strands of all its rows.
#   - no stranded row at all            -> "*" (stay unstranded)
#   - a single strand among the rest    -> that strand
#   - several strands                   -> majority rule, "*" on a tie
# Converting to character first keeps table() from counting unused factor
# levels (which would make min() zero and hide ties); NA strands are ignored.
pick_gene_strand <- function(strand) {
  strand <- as.character(strand)
  stranded <- strand[!is.na(strand) & strand != "*"]
  if (length(stranded) == 0) {
    return("*")
  }
  counts <- table(stranded)
  if (length(counts) > 1 && max(counts) == min(counts)) {
    "*"
  } else {
    names(which.max(counts))
  }
}

# One consensus strand per novel gene
new_gene_strands <- novel %>%
  group_by(gene_id) %>%
  summarize(gene_strand = pick_gene_strand(strand))

# Change strand in gtf: rows of novel genes take the consensus strand, all
# other rows (no match in the join) keep their own strand.
# as.character() makes both coalesce() inputs the same type.
gtf <- gtf %>%
  left_join(new_gene_strands, by = "gene_id") %>%
  mutate(strand = coalesce(gene_strand, as.character(strand))) %>%
  select(-gene_strand)

# Export gtf to new restranded file
export(gtf, output)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -11,6 +11,7 @@ process INDEX_BAM { | |
output: | ||
file("*.bai") | ||
|
||
script: | ||
""" | ||
samtools index $bam | ||
""" | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
// Runs restrand_isoforms.R on the input GTF and emits the result as
// "restranded.<input name>".
process RESTRAND_ISOFORMS {
  conda (params.enable_conda ? "$baseDir/environment.yml" : null)
  container "ghcr.io/igdrion/annexa:${workflow.revision? workflow.revision: "main"}"

  input:
  // `path` matches the qualifier already used for the output below
  path gtf

  output:
  path "restranded.${gtf}"

  // Arguments are quoted so file names survive shell word splitting.
  // The sed expression appends ';' to every line that does not already end
  // with ';' (optionally followed by whitespace).
  shell:
  '''
  restrand_isoforms.R "!{gtf}" "!{params.tx_discovery}" "restranded.!{gtf}"
  sed -i '/;\s*$/!s/\s*$/;/' "restranded.!{gtf}"
  '''
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,18 @@ | ||
// Runs restrand_isoforms.R on the input GTF and emits the restranded file
// under the input's own file name; the result is copied to
// "<params.outdir>/final" as "novel.full.gtf".
process RESTRAND_NOVEL {
  conda (params.enable_conda ? "$baseDir/environment.yml" : null)
  container "ghcr.io/igdrion/annexa:${workflow.revision? workflow.revision: "main"}"
  publishDir "$params.outdir/final", mode: 'copy', saveAs: {filename -> 'novel.full.gtf'}, overwrite: true

  input:
  // `path` matches the qualifier already used for the output below
  path gtf

  output:
  path "${gtf}"

  // The script writes to a temporary "restranded." name, which is then moved
  // over the input name so the declared output exists.
  // Arguments are quoted so file names survive shell word splitting.
  // The sed expression appends ';' to every line that does not already end
  // with ';' (optionally followed by whitespace).
  shell:
  '''
  restrand_isoforms.R "!{gtf}" "!{params.tx_discovery}" "restranded.!{gtf}"
  mv "restranded.!{gtf}" "!{gtf}"
  sed -i '/;\s*$/!s/\s*$/;/' "!{gtf}"
  '''
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters