From ed7c3007d1ee6d1d882f311fc790374f0828fef5 Mon Sep 17 00:00:00 2001 From: Fengyuan Hu Date: Wed, 6 Jan 2016 16:09:33 +0000 Subject: [PATCH] add all scripts --- commands/ribotaper_Gao.q | 9 + commands/ribotaper_Zebr.q | 9 + commands/ribotaper_Zebr.q~ | 9 + commands/ribotaper_this_study.q | 9 + commands_executed | 91 ++ scripts/CCDS_orf_finder.R | 1070 +++++++++++++++ scripts/NONCCDS_orf_finder.R | 983 ++++++++++++++ scripts/ORF_final_results.R | 208 +++ scripts/P_sites_RNA_sites_calc.bash | 75 ++ scripts/Ribotaper.sh | 191 +++ scripts/Ribotaper.sh~ | 191 +++ scripts/Ribotaper_ORF_find.sh | 129 ++ scripts/analyze_multi_clust.bash | 54 + scripts/annotate_exons.R | 118 ++ scripts/bowrrna_star.q | 34 + scripts/create_annotations_files.bash | 231 ++++ scripts/create_metaplots.bash | 76 ++ scripts/create_protein_db.R | 310 +++++ scripts/create_tracks.bash | 75 ++ scripts/functions.R | 1734 +++++++++++++++++++++++++ scripts/genes_coor.R | 14 + scripts/gtf_to_start_stop_tr.R | 93 ++ scripts/include_multi_nomerge.R | 79 ++ scripts/metag.R | 133 ++ scripts/quality_check.R | 265 ++++ scripts/tracks_analysis.R | 127 ++ 26 files changed, 6317 insertions(+) create mode 100644 commands/ribotaper_Gao.q create mode 100644 commands/ribotaper_Zebr.q create mode 100644 commands/ribotaper_Zebr.q~ create mode 100644 commands/ribotaper_this_study.q create mode 100644 commands_executed create mode 100755 scripts/CCDS_orf_finder.R create mode 100755 scripts/NONCCDS_orf_finder.R create mode 100755 scripts/ORF_final_results.R create mode 100755 scripts/P_sites_RNA_sites_calc.bash create mode 100755 scripts/Ribotaper.sh create mode 100644 scripts/Ribotaper.sh~ create mode 100755 scripts/Ribotaper_ORF_find.sh create mode 100755 scripts/analyze_multi_clust.bash create mode 100755 scripts/annotate_exons.R create mode 100644 scripts/bowrrna_star.q create mode 100755 scripts/create_annotations_files.bash create mode 100755 scripts/create_metaplots.bash create mode 100755 
scripts/create_protein_db.R create mode 100755 scripts/create_tracks.bash create mode 100755 scripts/functions.R create mode 100755 scripts/genes_coor.R create mode 100755 scripts/gtf_to_start_stop_tr.R create mode 100755 scripts/include_multi_nomerge.R create mode 100755 scripts/metag.R create mode 100755 scripts/quality_check.R create mode 100755 scripts/tracks_analysis.R diff --git a/commands/ribotaper_Gao.q b/commands/ribotaper_Gao.q new file mode 100644 index 0000000..196b9d6 --- /dev/null +++ b/commands/ribotaper_Gao.q @@ -0,0 +1,9 @@ +#!/bin/bash +#$ -pe smp 7 +#$ -l h_vmem=8G +#$ -e "error_ribot_new" +#$ -o "out_ribot_new" +#$ -cwd + + +../scripts/Ribotaper.sh ../alignment_files/HEK_293_Ribo_Gao_etal_Aligned.out.sorted.bam ../alignment_files/HEK_293_RNA_Gao_etal_Aligned.out.sorted.bam ../annotation_dir_human/ 26,27,28,29 12,12,12,12 ../scripts/ ../bedtools_dir/ 7 diff --git a/commands/ribotaper_Zebr.q b/commands/ribotaper_Zebr.q new file mode 100644 index 0000000..7ace1d3 --- /dev/null +++ b/commands/ribotaper_Zebr.q @@ -0,0 +1,9 @@ +#!/bin/bash +#$ -pe smp 7 +#$ -l h_vmem=8G +#$ -e "error_ribot_new" +#$ -o "out_ribot_new" +#$ -cwd + + +../scripts/Ribotaper.sh ../alignment_files/Danio_rerio_Bazzini_5hPF_Ribo_Aligned.out.sorted.bam ../alignment_files/Danio_rerio_Bazzini_5hPF_RNA_Aligned.out.sorted.bam ../annotation_dir_zebr/ 28,29 12,12 ../scripts/ ../bedtools_dir/ 7 diff --git a/commands/ribotaper_Zebr.q~ b/commands/ribotaper_Zebr.q~ new file mode 100644 index 0000000..ba9f1b5 --- /dev/null +++ b/commands/ribotaper_Zebr.q~ @@ -0,0 +1,9 @@ +#!/bin/bash +#$ -pe smp 7 +#$ -l h_vmem=6G +#$ -e "error_ribot_new" +#$ -o "out_ribot_new" +#$ -cwd + + +../scripts/Ribotaper.sh ../alignment_files/Danio_rerio_Bazzini_5hPF_Ribo_Aligned.out.sorted.bam ../alignment_files/Danio_rerio_Bazzini_5hPF_RNA_Aligned.out.sorted.bam ../annotation_dir_zebr/ 28,29 12,12 ../scripts/ ../bedtools_dir/ 7 diff --git a/commands/ribotaper_this_study.q b/commands/ribotaper_this_study.q new 
file mode 100644 index 0000000..670329e --- /dev/null +++ b/commands/ribotaper_this_study.q @@ -0,0 +1,9 @@ +#!/bin/bash +#$ -pe smp 7 +#$ -l h_vmem=8G +#$ -e "error_ribot_new" +#$ -o "out_ribot_new" +#$ -cwd + + +../scripts/Ribotaper.sh ../alignment_files/HEK_293_Ribo_This_study_Aligned.out.sorted.bam ../alignment_files/HEK_293_RNA_This_study_Aligned.out.sorted.bam ../annotation_dir_human/ 26,28,29 9,12,12 ../scripts/ ../bedtools_dir/ 7 diff --git a/commands_executed b/commands_executed new file mode 100644 index 0000000..fa88f02 --- /dev/null +++ b/commands_executed @@ -0,0 +1,91 @@ + +####----------------Here is a list of commands executed for the RiboTaper analysis (version 1.2, November 2015)------------------ + + +################################################################### +# This file is part of RiboTaper. +# RiboTaper is a method for defining translated ORFs using +# Ribosome Profiling data. +# +# Copyright (C) 2015 Lorenzo Calviello +# +# RiboTaper is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# RiboTaper is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with RiboTaper. If not, see <http://www.gnu.org/licenses/>. 
+# +# Contact: Lorenzo.Calviello@mdc-berlin.de +####################################################################### + + + +####Unpack the provided files on our website (https://ohlerlab.mdc-berlin.de/software/): + +#alignment_files.tar.gz +#annotation_human_daniorerio.tar.gz +#RiboTaper_v1.0.tar.gz + + +tar -zxvf alignment_files.tar.gz +tar -zxvf annotation_human_daniorerio.tar.gz +tar -zxvf RiboTaper_v1.0.tar.gz + + +#### create annotation files for human using ccds and appris tags (Gencode 19 + hg19 genome): + +scripts/create_annotations_files.bash annotation_human_daniorerio/gencode.v19.annotation.gtf annotation_human_daniorerio/hg19_genome.fa true true annotation_dir_human bedtools_dir/ scripts/ + + +#### create annotation files Danio rerio created without using any tags (no ccds, no appris): + + +scripts/create_annotations_files.bash annotation_human_daniorerio/Danio_rerio.Zv9.76_noscaff.gtf annotation_human_daniorerio/Danio_rerio.Zv9.dna.toplevel_noscaff.fa false false annotation_dir_zebr bedtools_dir/ scripts/ + + +#### create new directories for the provided experiments + +mkdir HEK_this_study HEK_Gao Zebrafish_Bazzini_5hPF + + + +### go inside the first directory + +cd HEK_this_study + + +### submit bash script for SGE computing, using 7 cores and 8 Gigabyte of RAM per core +### HEK293 data for this study, Ribo-seq newly generated + RNA seq from http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE49831 +### annotation files from human + +qsub ../commands/ribotaper_this_study.q + + +### go inside the second directory + +cd ../HEK_Gao + +### submit bash script for SGE computing, using 7 cores and 8 Gigabyte of RAM per core +### HEK293 data for Gao et al, cycloheximide (Ribo) and RNA-seq experiments for the "control" sample: http://www.ncbi.nlm.nih.gov/sra/SRX740748%5Baccn%5D http://www.ncbi.nlm.nih.gov/sra/SRX740751%5Baccn%5D + +qsub ../commands/ribotaper_Gao.q + + + +### go inside the third directory + +cd ../Zebrafish_Bazzini_5hPF + + +### Danio Rerio 
data for 5h_PF, from http://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE53693 +### submit bash script for SGE computing, using 7 cores and 8 Gigabyte of RAM per core + +qsub ../commands/ribotaper_Zebr.q + diff --git a/scripts/CCDS_orf_finder.R b/scripts/CCDS_orf_finder.R new file mode 100755 index 0000000..70d8d80 --- /dev/null +++ b/scripts/CCDS_orf_finder.R @@ -0,0 +1,1070 @@ +#!/usr/bin/Rscript + +################################################################### +# This file is part of RiboTaper. +# RiboTaper is a method for defining translated ORFs using +# Ribosome Profiling data. +# +# Copyright (C) 2015 Lorenzo Calviello +# +# RiboTaper is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# RiboTaper is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with RiboTaper. If not, see <http://www.gnu.org/licenses/>. 
+# +# Contact: Lorenzo.Calviello@mdc-berlin.de +####################################################################### + + +###script for CCDS genes ORF Finding, takes as arguments annotation dir, RiboTaper scripts dir, bedtools dir, n of cores + + + +args <- commandArgs(trailingOnly = TRUE) + +print(paste("--- CCDS ORF finding","---",date(),sep=" ")) +###loads functions + +suppressMessages(source(paste(args[2],"functions.R",sep = "/"))) + +###takes n of cores + +registerDoMC(args[4]) + +###loads annotation files + +annot<-paste(args[1],"cds_coords_transcripts",sep = "/") +cdss_transcripts<-read.table(annot,stringsAsFactors=F,header=F) +colnames(cdss_transcripts)<-c("transcript_id","annotated_start","annotated_stop") + +###loads exonic results + + +results_ccds_ORFs<-read.table("all_calculations_ccdsgenes_annot_new",sep="\t",quote = "",stringsAsFactors=F,header=T) +all_annot_notCCDS<-results_ccds_ORFs[results_ccds_ORFs[,"type"]!="ccds",] + +###calculates coordinates for sequence search + + + +fives_threes<-all_annot_notCCDS[all_annot_notCCDS[,"type"]%in%c("3_utrs_ex","3_utrs_st","5_utrs_ex","5_utrs_st"),] +fives_threes_nonov<-fives_threes[is.na(fives_threes["overlapping_ccds_start"]),] +fives_threes_ov<-fives_threes[!is.na(fives_threes["overlapping_ccds_start"]),] + +fives_threes_ok<-rbind(fives_threes_nonov,fives_threes_ov[fives_threes_ov[,"type"]%in%c("3_utrs_st","5_utrs_st"),]) + + +###reads data tracks + + +all_tracks_ccds<-readBigText("data_tracks/Psit_Ribo_Rna_Cent_tracks_ccds") + +index_ccds<-read.table("data_tracks/index_tracks_ccds",stringsAsFactors=F,header=F) +colnames(index_ccds)<-"exon_id" + + +all_tracks_exonsccds<-readBigText("data_tracks/Psit_Ribo_Rna_Cent_tracks_exonsccds") + +index_exonsccds<-read.table("data_tracks/index_tracks_exonsccds",stringsAsFactors=F,header=F) +colnames(index_exonsccds)<-"exon_id" + +tr_ex<-paste(args[1],"transcr_exons_ccds.bed",sep = "/") + +transcr_ccds<-read.table(tr_ex,stringsAsFactors=F,header=F) 
+colnames(transcr_ccds)<-c("chr","start","end","transcript_id","gene_id","strand") +transcr_ccds$coords_id<-paste(transcr_ccds[,1],transcr_ccds[,2],transcr_ccds[,3],sep="_") +transcr_ccds$coords_ok<-paste(transcr_ccds[,1],transcr_ccds[,2],transcr_ccds[,3],transcr_ccds[,6],sep="_") +results_ccds_ORFs$coords_ok<-paste(results_ccds_ORFs$coords,results_ccds_ORFs$strand.x,sep="_") + +###selects transcript with >2 reads + +transcr_ccds<-unique(merge(transcr_ccds,results_ccds_ORFs[,c("coords_ok","P_sites_sum")],by="coords_ok",all.x=T)) +transcr_sites<-aggregate(transcr_ccds$P_sites_sum,by=list(transcr_ccds$transcript_id),FUN=sum) +colnames(transcr_sites)<-c("transcript_id","n_P_sites") +transcr_sites<-transcr_sites[transcr_sites[,"n_P_sites"]>2,] +transcr_sites<-unique(transcr_sites$transcript_id) +###checks for CCDS transcripts (if available) + + +if(sum(list.files(path =args[1])=="transcr_exons_ccds_ccdsid.bed")>0){ + tr_cc_app_ccdsid<-paste(args[1],"transcr_exons_ccds_ccdsid.bed",sep = "/") + transcr_ccds_ccdsid<-read.table(tr_cc_app_ccdsid,stringsAsFactors=F,header=F) + colnames(transcr_ccds_ccdsid)<-c("chr","start","end","transcript_id","gene_id","strand") + transcr_ccds_ccdsid$coords_id<-paste(transcr_ccds_ccdsid[,1],transcr_ccds_ccdsid[,2],transcr_ccds_ccdsid[,3],sep="_") + transcr_ccds_ccdsid$exon_id<-paste(transcr_ccds_ccdsid$coords_id,"EXONCCDS",transcr_ccds_ccdsid[,5],sep="_") + transcript_ccds_transl_uORF<-unique(transcr_ccds_ccdsid[transcr_ccds_ccdsid[,"exon_id"]%in%fives_threes_ok[,"exon_id"],"transcript_id"]) + transcript_ccds_transl_uORF<-transcript_ccds_transl_uORF[!is.na(transcript_ccds_transl_uORF)] + if(sum(list.files(path =args[1])=="transcr_exons_ccds_appris.bed")==0){ + transcr_sites<-unique(c(transcript_ccds_transl_uORF)) + + } + +} + +###checks for APPRIS transcripts (if available) + + +if(sum(list.files(path =args[1])=="transcr_exons_ccds_appris.bed")>0){ + tr_cc_app<-paste(args[1],"transcr_exons_ccds_appris.bed",sep = "/") + 
transcr_ccds_appr<-read.table(tr_cc_app,stringsAsFactors=F,header=F) + colnames(transcr_ccds_appr)<-c("chr","start","end","transcript_id","gene_id","strand") + transcr_ccds_appr$coords_id<-paste(transcr_ccds_appr[,1],transcr_ccds_appr[,2],transcr_ccds_appr[,3],sep="_") + transcr_ccds_appr$exon_id<-paste(transcr_ccds_appr$coords_id,"EXONnonCCDS",transcr_ccds_appr[,5],sep="_") + transcr_ccds_appr$coords2<-paste(transcr_ccds_appr$chr,":",transcr_ccds_appr$start,"-",transcr_ccds_appr$end,"(",transcr_ccds_appr$strand,")",sep="") + #see prev versions to change this + #transcript_ccds_transl<-results_ccds_ORFs[results_ccds_ORFs[,"pval_multit_3nt_ribo"]<0.05 & results_ccds_ORFs[,"P_sites_sum"]>5 ,] + transcript_ccds_transl<-transcr_ccds_appr[!is.na(transcr_ccds_appr[,"gene_id"]),] + transcr_sites<-unique(transcript_ccds_transl$transcript_id) + if(sum(list.files(path =args[1])=="transcr_exons_ccds_ccdsid.bed")>0){ + transcript_ccds_transl<-results_ccds_ORFs[results_ccds_ORFs[,"pval_multit_3nt_ribo"]<0.05 & results_ccds_ORFs[,"P_sites_sum"]>5 ,] + transcript_ccds_transl<-transcript_ccds_transl[!is.na(transcript_ccds_transl[,"gene_id"]),] + transcript_ccds_transl<-unique(transcr_ccds_appr[transcr_ccds_appr[,"coords_id"]%in%transcript_ccds_transl[,"coords"],"transcript_id"]) + transcript_ccds_transl<-transcript_ccds_transl[!is.na(transcript_ccds_transl)] + transcr_sites<-unique(c(transcript_ccds_transl,transcript_ccds_transl_uORF)) + + } + +} + + +#reduce the search space to enhance speed + + + +index_coords_ccds<-sapply(strsplit(index_ccds$exon_id,split="_"),function(x){paste(x[1],x[2],x[3],sep="_")}) +index_coords_exonsccds<-sapply(strsplit(index_exonsccds$exon_id,split="_"),function(x){paste(x[1],x[2],x[3],sep="_")}) + + +#index_coords_ccds<-index_coords_ccds[index_coords_ccds%in%transcr_ccds_fin_ids] + +#index_coords_exonsccds<-index_coords_exonsccds[index_coords_exonsccds%in%transcr_ccds_fin_ids] + +if(sum(list.files(path =args[1])=="transcr_exons_ccds_ccdsid.bed")==1){ + 
transcr_sites<-unique(c(transcript_ccds_transl_uORF,transcr_sites)) +} + +transcr_sites<-transcr_sites[!is.na(transcr_sites)] + +transcr_ccds_fin<-transcr_ccds[transcr_ccds[,"transcript_id"]%in%transcr_sites,] +transcr_ccds_fin_ids<-unique(unlist(transcr_ccds_fin[,"coords_id"])) +# +# all_tracks_ccds<-all_tracks_ccds[(index_coords_ccds%in%transcr_ccds_fin_ids)] +# index_ccds<-subset(index_ccds,index_coords_ccds%in%transcr_ccds_fin_ids) +# +# all_tracks_exonsccds<-all_tracks_exonsccds[(index_coords_exonsccds%in%transcr_ccds_fin_ids)] +# index_exonsccds<-subset(index_exonsccds,index_coords_exonsccds%in%transcr_ccds_fin_ids) +# + +st_st_NA<-data.frame(start_pos=NA,st2vect=NA) +st_st_NA$ORF_frame<-NA +st_st_NA$ORF_length<-NA +st_st_NA$ORF_P_sites<-NA +st_st_NA$ORF_Psit_pct_in_frame<-NA +st_st_NA$ORF_RNA_sites<-NA +st_st_NA$ORF_RNAsit_pct_in_frame<-NA +st_st_NA$ORF_freq_multi_ribo<-NA +st_st_NA$ORF_pval_multi_ribo<-NA +st_st_NA$ORF_spec_multi_ribo<-NA +st_st_NA$ORF_freq_multi_rna<-NA +st_st_NA$ORF_pval_multi_rna<-NA +st_st_NA$ORF_spec_multi_rna<-NA +st_st_NA$ORF_freq3_fft_ribo<-NA +st_st_NA$ORF_spec3_fft_ribo<-NA +st_st_NA$ORF_freq3_spec_ribo<-NA +st_st_NA$ORF_spec3_spec_ribo<-NA +st_st_NA$ORF_freq3_fft_rna<-NA +st_st_NA$ORF_spec3_fft_rna<-NA +st_st_NA$ORF_freq3_spec_rna<-NA +st_st_NA$ORF_spec3_spec_rna<-NA +st_st_NA$ORF_ORF_score_ribo<-NA +st_st_NA$ORF_ORF_score_rna<-NA +st_st_NA$ORF_chisq_ribo<-NA +st_st_NA$ORF_chisq_rna<-NA +st_st_NA$ORF_Ribo_cov_aver<-NA +st_st_NA$ORF_RNA_cov_aver<-NA +st_st_NA$ORF_pept<-NA +st_st_NA$nt_tocheck_next_start<-0 +st_st_NA$pval_next_start<-1 +st_st_NA$P_sites_next_start<-0 +st_st_NA$pct_P_sites_inframe_next_start<-0 +st_st_NA$Method<-NA +st_st_NA$to_check<-NA +st_st_NA$to_check_rem<-NA +st_st_NA$ORF_id_tr<-NA +st_st_NA$ORF_id_gen<-NA +st_st_NA$to_check_ALL<-NA + +CCDS_orfs<-foreach(j=1:length(transcr_sites),.combine=rbind,.multicombine=T) %dopar%{ + transcript<-transcr_sites[j] + + ###assembles transcript + + 
exons_in_transcr<-transcr_ccds[transcr_ccds[,"transcript_id"]==transcript,] + #order exons + exons_in_transcr<-exons_in_transcr[order(exons_in_transcr$start,decreasing=F),] + list_exons_transcr<-list() + list_exons_seqs<-list() + + for(k in seq(1,dim(exons_in_transcr)[1])){ + exon_track<-c() + subs_ccds<-index_coords_ccds==exons_in_transcr[k,"coords_id"] + if(sum(subs_ccds)>0){ + if(sum(subs_ccds)==5){ + exon_track<-all_tracks_ccds[subs_ccds] + } + if(sum(subs_ccds)>5){ + exon_track<-all_tracks_ccds[which(subs_ccds)[1:5]] + } + } + if(length(exon_track)==0){ + subs_exonsccds<-index_coords_exonsccds==exons_in_transcr[k,"coords_id"] + if(sum(subs_exonsccds)==5){ + + exon_track<-all_tracks_exonsccds[subs_exonsccds] + } + if(sum(subs_exonsccds)>5){ + exon_track<-all_tracks_exonsccds[which(subs_exonsccds)[1:5]] + + } + + + } + + withsep<-strsplit(exon_track,split=" ") + x<-t(data.frame(withsep)) + + strand<-x[1,2] + tracks<-t(x[,-c(1:2)]) + + colnames(tracks)<-c("Psites","RiboCov","RNACov","RNAcent","Seq") + seq<-tracks[,5] + tracks<-tracks[,1:4] + mode(tracks)<-"numeric" + length<-dim(tracks)[1] + list_exons_transcr[[k]]<-tracks + list_exons_seqs[[k]]<-seq + + } + + merged_tracks<-do.call(what=rbind,list_exons_transcr) + + if(strand=="-"){ + merged_tracks<-cbind(rev(merged_tracks[,1]),rev(merged_tracks[,2]),rev(merged_tracks[,3]),rev(merged_tracks[,4])) + } + + tracks<-merged_tracks + length<-dim(tracks)[1] + + if(strand=="+"){ + seq_transcr<-unlist(list_exons_seqs) + } + if(strand=="-"){ + + seq_transcr<-unlist(list_exons_seqs) + seq_transcr<-comp(rev((seq_transcr)),forceToLower=F) + } + transcr_data<-data.frame(transcript_id=transcript,stringsAsFactors=F) + transcr_data$gene_id<-unique(transcr_ccds[transcr_ccds[,"transcript_id"]==transcript,"gene_id"])[1] + transcr_data$annotation<-unique(results_ccds_ORFs[results_ccds_ORFs[,"gene_id"]==transcr_data$gene_id,"annotation",])[1] + 
transcr_data$gene_symbol<-unique(results_ccds_ORFs[results_ccds_ORFs[,"gene_id"]==transcr_data$gene_id,"gene_symbol",])[1] + + P_sites_sum<-sum(tracks[,1]) + RNA_sites_sum<-sum(tracks[,4]) + transcr_data$strand<-strand + transcr_data$length<-length + transcr_data$n_exons<-dim(exons_in_transcr)[1] + transcr_data$P_sites_sum<-P_sites_sum + transcr_data$RNA_sites<-RNA_sites_sum + transcr_data$Ribo_cov_aver<-mean(tracks[,2]) + transcr_data$RNA_cov_aver<-mean(tracks[,3]) + + transcr_data$freq_multit_3nt<-NA + transcr_data$pval_multit_3nt<-NA + transcr_data$spec_multit_3nt<-NA + if(P_sites_sum>2 & length>5){ + if(length<25){slepians<-dpss(n=length+(50-length),k=24,nw=12)} + if(length>=25){slepians<-dpss(n=length,k=24,nw=12)} + vals_mtm<-take_freqs_Fvalues_all_around_3nt_spec(n_tapers=24,time_bw=12,tracks[,1],slepians_values=slepians)[c(1,6,7)] + transcr_data$freq_multit_3nt<-vals_mtm[1] + transcr_data$pval_multit_3nt<-vals_mtm[2] + transcr_data$spec_multit_3nt<-vals_mtm[3] + + } + + Phase_P_sites_frame<-sum(tracks[seq(1,length,by=3),1]) + Phase_P_sites_frame_1<-sum(tracks[seq(2,length,by=3),1]) + Phase_P_sites_frame_2<-sum(tracks[seq(3,length,by=3),1]) + + transcr_data$chisq_noccds_psit<-NA + if(P_sites_sum>15){ + transcr_data$chisq_noccds_psit<-chisq.test(as.table(c(Phase_P_sites_frame,Phase_P_sites_frame_1,Phase_P_sites_frame_2)))$p.value} + if(P_sites_sum<16 & P_sites_sum>0){ + transcr_data$chisq_noccds_psit<-xmulti(obs=c(Phase_P_sites_frame,Phase_P_sites_frame_1,Phase_P_sites_frame_2),expr=c(1,1,1),statName="Prob",detail=0)$pProb + } + pctPhase_frame<-Phase_P_sites_frame/P_sites_sum + pctPhase_frame_1<-Phase_P_sites_frame_1/P_sites_sum + pctPhase_frame_2<-Phase_P_sites_frame_2/P_sites_sum + + Centered_sites_sum<-round(sum(tracks[,4]),digits=6) + + Phase_Centered_sites_frame<-sum(tracks[seq(1,length,by=3),4]) + Phase_Centered_sites_frame_1<-sum(tracks[seq(2,length,by=3),4]) + Phase_Centered_sites_frame_2<-sum(tracks[seq(3,length,by=3),4]) + + 
pctPhaseCentered_frame<-Phase_Centered_sites_frame/Centered_sites_sum + pctPhaseCentered_frame_1<-Phase_Centered_sites_frame_1/Centered_sites_sum + pctPhaseCentered_frame_2<-Phase_Centered_sites_frame_2/Centered_sites_sum + + transcr_data$chisq_noccds_rna<-NA + if(Centered_sites_sum>15){ + transcr_data$chisq_noccds_rna<-chisq_rna<-chisq.test(as.table(c(Phase_Centered_sites_frame,Phase_Centered_sites_frame_1,Phase_Centered_sites_frame_2)))$p.value} #store the RNA phase p-value in the output column (it was computed but never saved, so the column stayed NA) + if(Centered_sites_sum<16 & Centered_sites_sum>0){ + transcr_data$chisq_noccds_rna<-chisq_rna<-xmulti(obs=c(Phase_Centered_sites_frame,Phase_Centered_sites_frame_1,Phase_Centered_sites_frame_2),expr=c(1,1,1),statName="Prob",detail=0)$pProb #same column, exact multinomial test for low counts + } + + + MAXPhase_frame<-max(c(pctPhase_frame,pctPhase_frame_1,pctPhase_frame_2)) + FRAME_MAX_phase<-max.col(t(c(pctPhase_frame,pctPhase_frame_1,pctPhase_frame_2)))-1 + + MAXPhaseCentered_frame<-max(c(pctPhaseCentered_frame,pctPhaseCentered_frame_1,pctPhaseCentered_frame_2)) + FRAME_MAX_phaseCentered<-max.col(t(c(pctPhaseCentered_frame,pctPhaseCentered_frame_1,pctPhaseCentered_frame_2)))-1 + + frame_start_pred<-FRAME_MAX_phase + frame_end_pred<-(length-(FRAME_MAX_phase+1))%%3 + + ###Finds ORFs on the 3 different frames + + all_sign_frames<-list() + for(u in 0:2){ + + pept<-NA + pept<-unlist(getTrans(seq_transcr,sens="F",frame=u)) + + starts<-pept=="M" + + stops<-pept=="*" + transcr_data$orf_position<-"undetected" + + start_pos<-((1:length(pept))[starts])*3 + if(length(start_pos)>0){ + start_pos<-start_pos+u-2 + } else {start_pos<-NA} + + stop_pos<-((1:length(pept))[stops])*3 + if(length(stop_pos)>0){ + stop_pos<-stop_pos+u-2 + } else {stop_pos<-NA} + + #NAs + if(sum(!is.na(start_pos))==0 | sum(!is.na(stop_pos))==0){ + st_st<-st_st_NA + transcr_data_fr_sORFs<-cbind(transcr_data,st_st_NA) + } + + if(sum(!is.na(start_pos))>0 & sum(!is.na(stop_pos))>0){ + st2vect<-c() + for(h in 1:length(start_pos)){ + st1<-start_pos[h] + diff<-stop_pos-st1 + diff<-diff[diff>0] + if(length(diff)>0){st2<-st1+min(diff)} + if(length(diff)==0){st2<-NA} + st2vect[h]<-st2 
+ + } + st_st<-data.frame(cbind(start_pos,st2vect)) + + st_st<-st_st[!is.na(st_st[,"st2vect"]),] + if(dim(st_st)[1]>0){ + if(dim(st_st)[1]==1){ + list_coords=list() + list_coords[[1]]<-st_st[,1]:st_st[,2] + } + if(dim(st_st)[1]>1){ + list_coords<-apply(st_st,FUN=function(x){x[1]:x[2]},1) + } + + max_period<-NA + start_pos<-NA + stop_pos<-NA + pval_max_period<-NA + } + if(dim(st_st)[1]>0){ + st_st$ORF_frame<-u + st_st$ORF_length<-NA + st_st$ORF_P_sites<-NA + st_st$ORF_Psit_pct_in_frame<-NA + st_st$ORF_RNA_sites<-NA + st_st$ORF_RNAsit_pct_in_frame<-NA + st_st$ORF_freq_multi_ribo<-NA + st_st$ORF_pval_multi_ribo<-NA + st_st$ORF_spec_multi_ribo<-NA + st_st$ORF_freq_multi_rna<-NA + st_st$ORF_pval_multi_rna<-NA + st_st$ORF_spec_multi_rna<-NA + + st_st$ORF_freq3_fft_ribo<-NA + st_st$ORF_spec3_fft_ribo<-NA + st_st$ORF_freq3_spec_ribo<-NA + st_st$ORF_spec3_spec_ribo<-NA + st_st$ORF_freq3_fft_rna<-NA + st_st$ORF_spec3_fft_rna<-NA + st_st$ORF_freq3_spec_rna<-NA + st_st$ORF_spec3_spec_rna<-NA + st_st$ORF_ORF_score_ribo<-NA + st_st$ORF_ORF_score_rna<-NA + st_st$ORF_chisq_ribo<-NA + st_st$ORF_chisq_rna<-NA + st_st$ORF_Ribo_cov_aver<-NA + st_st$ORF_RNA_cov_aver<-NA + st_st$ORF_pept<-NA + st_st$Method<-NA + st_st$to_check<-NA + st_st$to_check_rem<-NA + st_st$ORF_id_tr<-NA + st_st$ORF_id_gen<-NA + st_st$to_check_ALL<-NA + for(r in 1:dim(st_st)[1]){ + tracks_stst<-tracks[st_st[r,1]:st_st[r,2],] + length<-dim(tracks_stst)[1] + P_sites_sum<-sum(tracks_stst[,1]) + RNA_sites_sum<-sum(tracks_stst[,4]) + st_st[r,"ORF_length"]<-length-1 + st_st[r,"ORF_P_sites"]<-P_sites_sum + st_st[r,"ORF_RNA_sites"]<-RNA_sites_sum + st_st[r,"ORF_Ribo_cov_aver"]<-mean(tracks_stst[,2]) + st_st[r,"ORF_RNA_cov_aver"]<-mean(tracks_stst[,3]) + if(P_sites_sum>5 & length>5){ + Phase_P_sites_frame<-sum(tracks_stst[seq(1,length,by=3),1]) + Phase_P_sites_frame_1<-sum(tracks_stst[seq(2,length,by=3),1]) + Phase_P_sites_frame_2<-sum(tracks_stst[seq(3,length,by=3),1]) + 
st_st[r,"ORF_Psit_pct_in_frame"]<-Phase_P_sites_frame/P_sites_sum + if((Phase_P_sites_frame/P_sites_sum)>0.5){ + score1<-((Phase_P_sites_frame-P_sites_sum/3)^2)/(P_sites_sum/3) + score2<-((Phase_P_sites_frame_1-P_sites_sum/3)^2)/(P_sites_sum/3) + score3<-((Phase_P_sites_frame_2-P_sites_sum/3)^2)/(P_sites_sum/3) + + orfsc<-log2(score1+score2+score3+1) + st_st[r,"ORF_ORF_score_ribo"]<-orfsc + if(Phase_P_sites_frame<=Phase_P_sites_frame_1 | Phase_P_sites_frame<=Phase_P_sites_frame_2){ + st_st[r,"ORF_ORF_score_ribo"]<--orfsc + } + + if(max(tracks_stst[,1])>(P_sites_sum*.7)){ + new_track<-tracks_stst + new_track[which(new_track[,1]==max(new_track[,1])),1]<-0 #zero the P-site column explicitly at the peak position(s) + st_st[r,"ORF_ORF_score_ribo"]<-NA + if(sum(new_track[,1])>2){ + Phase_P_sites_frame_corr<-sum(new_track[seq(1,length,by=3),1]) + Phase_P_sites_frame_1_corr<-sum(new_track[seq(2,length,by=3),1]) + Phase_P_sites_frame_2_corr<-sum(new_track[seq(3,length,by=3),1]) + score1<-((Phase_P_sites_frame_corr-sum(new_track[,1])/3)^2)/(sum(new_track[,1])/3) + score2<-((Phase_P_sites_frame_1_corr-sum(new_track[,1])/3)^2)/(sum(new_track[,1])/3) + score3<-((Phase_P_sites_frame_2_corr-sum(new_track[,1])/3)^2)/(sum(new_track[,1])/3) + st_st[r,"ORF_ORF_score_ribo"]<-log2(score1+score2+score3+1) + if(Phase_P_sites_frame_corr<=Phase_P_sites_frame_1_corr | Phase_P_sites_frame_corr<=Phase_P_sites_frame_2_corr){ #both comparisons use the peak-corrected frame-0 count + st_st[r,"ORF_ORF_score_ribo"]<--log2(score1+score2+score3+1) + } } + } + + if(P_sites_sum>15){ + st_st[r,"ORF_chisq_ribo"]<-chisq.test(as.table(c(Phase_P_sites_frame,Phase_P_sites_frame_1,Phase_P_sites_frame_2)))$p.value} + if(P_sites_sum<16 & P_sites_sum>0){ + st_st[r,"ORF_chisq_ribo"]<-xmulti(obs=c(Phase_P_sites_frame,Phase_P_sites_frame_1,Phase_P_sites_frame_2),expr=c(1,1,1),statName="Prob",detail=0)$pProb + } + if(length<25){slepians<-dpss(n=length+(50-length),k=24,nw=12)} + if(length>=25){slepians<-dpss(n=length,k=24,nw=12)} + 
values_mtm_orf<-take_freqs_Fvalues_all_around_3nt_spec(x=tracks_stst[,1],n_tapers=24,time_bw=12,slepians_values=slepians)[c(1,6,7)] + st_st[r,"ORF_freq_multi_ribo"]<-values_mtm_orf[1] + + st_st[r,"ORF_pval_multi_ribo"]<-values_mtm_orf[2] + st_st[r,"ORF_spec_multi_ribo"]<-values_mtm_orf[3] + fft_sp<-take_maxfreq_and_power_FFT_Spec(tracks_stst[,1]) + st_st[r,"ORF_freq3_fft_ribo"]<-fft_sp[1] #row r only: these values belong to the current ORF, not to every row + st_st[r,"ORF_spec3_fft_ribo"]<-fft_sp[2] + st_st[r,"ORF_freq3_spec_ribo"]<-fft_sp[3] + st_st[r,"ORF_spec3_spec_ribo"]<-fft_sp[4] + + pept<-unlist(getTrans(seq_transcr[st_st[r,1]:st_st[r,2]],sens="F")) + st_st[r,"ORF_pept"]<-paste(pept,sep="",collapse="") + } + if(RNA_sites_sum>5 & (Phase_P_sites_frame/P_sites_sum)>0.5){ + if(length<25){slepians<-dpss(n=length+(50-length),k=24,nw=12)} + if(length>=25){slepians<-dpss(n=length,k=24,nw=12)} + values_mtm_orf_rna<-take_freqs_Fvalues_all_around_3nt_spec(x=tracks_stst[,4],n_tapers=24,time_bw=12,slepians_values=slepians)[c(1,6,7)] + + st_st[r,"ORF_freq_multi_rna"]<-values_mtm_orf_rna[1] + st_st[r,"ORF_pval_multi_rna"]<-values_mtm_orf_rna[2] + st_st[r,"ORF_spec_multi_rna"]<-values_mtm_orf_rna[3] + + fft_sp<-take_maxfreq_and_power_FFT_Spec(tracks_stst[,4]) + st_st[r,"ORF_freq3_fft_rna"]<-fft_sp[1] #row r only, as for the ribo values + st_st[r,"ORF_spec3_fft_rna"]<-fft_sp[2] + st_st[r,"ORF_freq3_spec_rna"]<-fft_sp[3] + st_st[r,"ORF_spec3_spec_rna"]<-fft_sp[4] + + Phase_Centered_sites_frame<-sum(tracks_stst[seq(1,length,by=3),4]) + Phase_Centered_sites_frame_1<-sum(tracks_stst[seq(2,length,by=3),4]) + Phase_Centered_sites_frame_2<-sum(tracks_stst[seq(3,length,by=3),4]) + st_st[r,"ORF_RNAsit_pct_in_frame"]<-Phase_Centered_sites_frame/RNA_sites_sum + score1<-((Phase_Centered_sites_frame-RNA_sites_sum/3)^2)/(RNA_sites_sum/3) #expected count per frame is a third of the RNA sites, not of the P-sites + score2<-((Phase_Centered_sites_frame_1-RNA_sites_sum/3)^2)/(RNA_sites_sum/3) + score3<-((Phase_Centered_sites_frame_2-RNA_sites_sum/3)^2)/(RNA_sites_sum/3) + + orfsc<-log2(score1+score2+score3+1) + st_st[r,"ORF_ORF_score_rna"]<-orfsc + 
if(Phase_Centered_sites_frame<=Phase_Centered_sites_frame_1 | Phase_Centered_sites_frame<=Phase_Centered_sites_frame_2){ + st_st[r,"ORF_ORF_score_rna"]<--orfsc + } + + if(max(tracks_stst[,4])>(RNA_sites_sum*.7)){ + new_track<-tracks_stst + new_track[which(new_track[,4]==max(new_track[,4])),4]<-0 #zero the RNA-site column at the peak; a single index on a matrix would hit column 1 instead + st_st[r,"ORF_ORF_score_rna"]<-NA + if(sum(new_track[,4])>2){ + Phase_Centered_sites_frame_corr<-sum(new_track[seq(1,length,by=3),4]) + Phase_Centered_sites_frame_1_corr<-sum(new_track[seq(2,length,by=3),4]) + Phase_Centered_sites_frame_2_corr<-sum(new_track[seq(3,length,by=3),4]) + score1<-((Phase_Centered_sites_frame_corr-sum(new_track[,4])/3)^2)/(sum(new_track[,4])/3) + score2<-((Phase_Centered_sites_frame_1_corr-sum(new_track[,4])/3)^2)/(sum(new_track[,4])/3) + score3<-((Phase_Centered_sites_frame_2_corr-sum(new_track[,4])/3)^2)/(sum(new_track[,4])/3) + st_st[r,"ORF_ORF_score_rna"]<-log2(score1+score2+score3+1) + if(Phase_Centered_sites_frame_corr<=Phase_Centered_sites_frame_1_corr | Phase_Centered_sites_frame_corr<=Phase_Centered_sites_frame_2_corr){ + st_st[r,"ORF_ORF_score_rna"]<--log2(score1+score2+score3+1) + } + } + } + + if(RNA_sites_sum>15){ + st_st[r,"ORF_chisq_rna"]<-chisq.test(as.table(c(Phase_Centered_sites_frame,Phase_Centered_sites_frame_1,Phase_Centered_sites_frame_2)))$p.value} + if(RNA_sites_sum<16 & RNA_sites_sum>0){ + st_st[r,"ORF_chisq_rna"]<-xmulti(obs=c(Phase_Centered_sites_frame,Phase_Centered_sites_frame_1,Phase_Centered_sites_frame_2),expr=c(1,1,1),statName="Prob",detail=0)$pProb + } + + } + } + } + if(dim(st_st)[1]>0){st_st<-st_st[!is.na(st_st[,"ORF_pval_multi_ribo"]),]} + if(dim(st_st)[1]>0){st_st<-st_st[st_st[,"ORF_Psit_pct_in_frame"]>0.5,]} + if(dim(st_st)[1]>0){ + st_st$nt_tocheck_next_start<-0 + st_st$pval_next_start<-1 + st_st$P_sites_next_start<-0 + st_st$pct_P_sites_inframe_next_start<-0 + #find starts per each stop codon + list_stopsorfs<-split.data.frame(x=st_st,f=st_st[,2],drop=T) + + transcr_data_fr<-transcr_data + + 
list_sORFs_frame_moretap<-list() + list_sORFs_frame_bestperiod<-list() + list_sORFs_frame_maxsit<-list() + + for(g in 1:length(list_stopsorfs)){ + + stoplist<-list_stopsorfs[[g]] + max_period<-stoplist[stoplist[,"ORF_pval_multi_ribo"]==min(stoplist[,"ORF_pval_multi_ribo"]),] + list_sORFs_frame_bestperiod[[g]]<-max_period + stoplists_period<-stoplist[stoplist[,"ORF_pval_multi_ribo"]<0.05,] + if(dim(stoplists_period)[1]>0){ + stoplists_period<-stoplists_period[!is.na(stoplists_period[,"ORF_pval_multi_ribo"]),] + } + if(dim(stoplists_period)[1]>1){ + + for(b in 1:(dim(stoplists_period)[1]-1)){ + stoplists_period[b,"nt_tocheck_next_start"]<-stoplists_period[b+1,"start_pos"]-stoplists_period[b,"start_pos"] #both starts come from the filtered table; row b of the unfiltered stoplist can be a different ORF + tracks_stst<-tracks[stoplists_period[b,"start_pos"]:stoplists_period[b+1,"start_pos"],] + length<-dim(tracks_stst)[1] + P_sites_sum<-sum(tracks_stst[,1]) + pval_to_next<-1 + + Phase_P_sites_frame<-sum(tracks_stst[seq(1,length,by=3),1]) + Phase_P_sites_frame_1<-sum(tracks_stst[seq(2,length,by=3),1]) + Phase_P_sites_frame_2<-sum(tracks_stst[seq(3,length,by=3),1]) + + pctPhase_frame<-Phase_P_sites_frame/P_sites_sum + pctPhase_frame_1<-Phase_P_sites_frame_1/P_sites_sum + pctPhase_frame_2<-Phase_P_sites_frame_2/P_sites_sum + + if(P_sites_sum>5){ + if(length<25){slepians<-dpss(n=length+(50-length),k=24,nw=12)} + if(length>=25){slepians<-dpss(n=length,k=24,nw=12)} + + pval_to_next<-take_freqs_Fvalues_all_around_3nt_spec(x=tracks_stst[,1],n_tapers=24,time_bw=12,slepians_values=slepians)[6] + } + stoplists_period[b,"P_sites_next_start"]<-P_sites_sum + + stoplists_period[b,"pct_P_sites_inframe_next_start"]<-pctPhase_frame + + stoplists_period[b,"pval_next_start"]<-pval_to_next + } + + max_sit<-stoplists_period[which(stoplists_period[,"P_sites_next_start"]>5 & stoplists_period[,"pct_P_sites_inframe_next_start"]>0.5)[1],] + max_sit<-max_sit[!is.na(max_sit[,"ORF_length"]),] + + if(dim(max_sit)[1]==0){ + max_sit<-max_period + } + list_sORFs_frame_maxsit[[g]]<-max_sit + + 
more_tap<-stoplists_period[which(stoplists_period[,"pval_next_start"]<0.05)[1],] + more_tap<-more_tap[!is.na(more_tap[,"ORF_length"]),] + if(dim(more_tap)[1]==0){ + more_tap<-max_period + } + list_sORFs_frame_moretap[[g]]<-more_tap + + } + if(dim(stoplists_period)[1]<2){ + list_sORFs_frame_maxsit[[g]]<-max_period + list_sORFs_frame_moretap[[g]]<-max_period + + } + + + } + sORFs_frame_moretap<-do.call(what=rbind.data.frame,args=list_sORFs_frame_moretap) + sORFs_frame_moretap$Method<-"more_tapers" + sORFs_frame_maxsit<-do.call(what=rbind.data.frame,args=list_sORFs_frame_maxsit) + sORFs_frame_maxsit$Method<-"max_P_sites" + sORFs_frame_bestperiod<-do.call(what=rbind.data.frame,args=list_sORFs_frame_bestperiod) + sORFs_frame_bestperiod$Method<-"best_periodicity" + sORFs_frames<-rbind(sORFs_frame_moretap,sORFs_frame_maxsit,sORFs_frame_bestperiod) + + for(w in 1:dim(sORFs_frames)[1]){ + transcr_data_fr[w,]<-transcr_data_fr[1,] + } + + transcr_data_fr_sORFs<-cbind(transcr_data_fr,sORFs_frames) + transcr_data_fr_sORFs$orf_position<-"detected" + } + } + + if(dim(st_st)[1]==0){ + st_st<-st_st_NA + transcr_data_fr_sORFs<-cbind(transcr_data,st_st_NA) + } + } + + + all_sign_frames[[u+1]]<-transcr_data_fr_sORFs + } + all_sign_frames<-do.call(what=rbind.data.frame,args=all_sign_frames) + transcr_all_frames_res<-unique(all_sign_frames) + transcr_all_frames_res$ORF_id_tr<-paste(transcr_all_frames_res$transcript_id,transcr_all_frames_res$start_pos,transcr_all_frames_res$st2vect,sep="_") + transcr_all_frames_ok<-transcr_all_frames_res[!is.na(transcr_all_frames_res$ORF_pept),] + if(dim(transcr_all_frames_ok)[1]>0){ + all_orfs<-unique(transcr_all_frames_ok[,c("transcript_id","length","strand","start_pos","st2vect","ORF_length","gene_id")]) + transcr<-all_orfs$transcript_id[1] + trascr_length<-all_orfs$length[1] + orf_strand<-all_orfs$strand[1] + ex_intr_coords<-exons_in_transcr$coords_id + if(orf_strand=="-"){ex_intr_coords<-rev(ex_intr_coords)} + + 
exons_in_transcr_data<-results_ccds_ORFs[results_ccds_ORFs[,"coords"]%in%ex_intr_coords,] + exons_in_transcr_data<-exons_in_transcr_data[match(ex_intr_coords,exons_in_transcr_data$coords),] + cumsumexons<-cumsum(exons_in_transcr_data$length.x) + + list_orfas<-list() + for(z in 1:dim(all_orfs)[1]){ + orfa<-all_orfs[z,] + + transcr_data<-data.frame(transcript_id=transcr) + + orf_start<-orfa$start_pos + orf_end<-orfa$st2vect + + st_ex<-which((cumsumexons-orf_start)==min(cumsumexons[cumsumexons>orf_start]-orf_start)) + end_ex<-which((cumsumexons-orf_end)==min(cumsumexons[cumsumexons>=orf_end]-orf_end)) + in_betw_ex<-st_ex:end_ex + in_betw_ex<-in_betw_ex[!in_betw_ex%in%c(st_ex,end_ex)>0] + exon_inbetween_data<-exons_in_transcr_data[in_betw_ex,] + + + coord_start<-NA + coord_end<-NA + nt_to_rem<-NA + rem_len<-0 + if(st_ex>1){rem_len<-cumsumexons[st_ex-1]} + if(orfa$strand=="+"){coord_start<-exons_in_transcr_data[st_ex,"start"] + (orf_start-rem_len)} + if(orfa$strand=="-"){coord_start<-exons_in_transcr_data[st_ex,"end"] - (orf_start-rem_len)} + + if(length(in_betw_ex)==0){ + if(st_ex==end_ex){nt_to_rem<-0} + if(st_ex!=end_ex){if(orfa$strand=="+"){ + nt_to_rem<-exons_in_transcr_data[st_ex,"end"]-coord_start + } + if(orfa$strand=="-"){ + nt_to_rem<-coord_start-exons_in_transcr_data[st_ex,"start"] + } + } + } + + if(length(in_betw_ex)>0){ + nt_in_betw<-sum(exons_in_transcr_data[in_betw_ex,"length.x"]) + if(orfa$strand=="+"){ + nt_to_rem<-exons_in_transcr_data[st_ex,"end"]-coord_start + } + if(orfa$strand=="-"){ + nt_to_rem<-coord_start-exons_in_transcr_data[st_ex,"start"] + } + nt_to_rem<-nt_to_rem+nt_in_betw + } + + if(st_ex==end_ex & orfa$strand=="+"){coord_end<-coord_start+orfa$ORF_length+1} + if(st_ex==end_ex & orfa$strand=="-"){coord_end<-coord_start-orfa$ORF_length+1} + + if(st_ex!=end_ex & orfa$strand=="+"){coord_end<-exons_in_transcr_data[end_ex,"start"] + (orfa$ORF_length-nt_to_rem)+1} + if(st_ex!=end_ex & 
orfa$strand=="-"){coord_end<-exons_in_transcr_data[end_ex,"end"] - (orfa$ORF_length-nt_to_rem)+1} + + if(orfa$strand=="-"){ + coord_start2<-coord_start + coord_start<-coord_end + coord_end<-coord_start2 + } + + + if(st_ex!=end_ex & orfa$strand=="+"){to_check_st<-paste(exons_in_transcr_data[st_ex,"chr"],coord_start,exons_in_transcr_data[st_ex,"end"],"CCDS",orfa$gene_id,orfa$strand,sep="_") + to_check_end<-paste(exons_in_transcr_data[end_ex,"chr"],exons_in_transcr_data[end_ex,"start"],coord_end,"CCDS",orfa$gene_id,orfa$strand,sep="_") + to_check<-paste(to_check_st,to_check_end,sep=";") + + } + if(st_ex!=end_ex & orfa$strand=="-"){to_check_st<-paste(exons_in_transcr_data[st_ex,"chr"],exons_in_transcr_data[st_ex,"start"],coord_end,"CCDS",orfa$gene_id,orfa$strand,sep="_") + to_check_end<-paste(exons_in_transcr_data[end_ex,"chr"],coord_start,exons_in_transcr_data[end_ex,"end"],"CCDS",orfa$gene_id,orfa$strand,sep="_") + to_check<-paste(to_check_st,to_check_end,sep=";") + } + + if(st_ex==end_ex){to_check<-paste(exons_in_transcr_data[st_ex,"chr"],coord_start,coord_end,"CCDS",orfa$gene_id,orfa$strand,sep="_")} + orfa$to_check<-to_check + orfa$to_check_rem<-NA + if(length(in_betw_ex)>0){ + orfa$to_check_rem<-paste(exon_inbetween_data$exon_id,collapse=";") + + } + orfa$ORF_id_tr<-paste(transcr_data$transcript_id,orf_start,orf_end,sep="_") + orfa$ORF_id_gen<-paste(exons_in_transcr_data[st_ex,"chr"],coord_start,coord_end,sep="_") + orfa$to_check_ALL<-paste(orfa$to_check,orfa$to_check_rem,sep=";") + list_orfas[[z]]<-orfa + + + } + list_orfas<-do.call(rbind.data.frame,args=list_orfas) + transcr_all_frames_ok$ORF_id_gen<-NULL + transcr_all_frames_ok$to_check<-NULL + transcr_all_frames_ok$to_check_rem<-NULL + transcr_all_frames_ok$to_check_ALL<-NULL + + transcr_all_frames_ok<-merge(transcr_all_frames_ok,list_orfas[,c("ORF_id_tr","ORF_id_gen","to_check","to_check_rem","to_check_ALL")],by="ORF_id_tr") + #reconcile and maybe add the rest + return(transcr_all_frames_ok) + } + 
if(dim(transcr_all_frames_ok)[1]==0){return(transcr_all_frames_res)} + + +} +CCDS_orfs_found<-CCDS_orfs[!is.na(CCDS_orfs[,"ORF_pept"]),] + + +CCDS_orfs<-merge(CCDS_orfs_found,cdss_transcripts,by="transcript_id",all.x=T) + +write.table(CCDS_orfs,file="orfs_found",quote=F,row.names=F,sep="\t",col.names=T) + +options(scipen=999) + +CCDS_orfs$ORF_id_tr_minus2<-paste(CCDS_orfs$transcript_id,CCDS_orfs$start_pos,CCDS_orfs$st2vect+2,sep="_") +CCDS_orfs$ORF_id_tr_annotated<-paste(CCDS_orfs$transcript_id,CCDS_orfs$annotated_start,CCDS_orfs$annotated_stop,sep="_") + + +#nonccds_res<-results_ccds_ORFs +CCDS_orfs_uniq<-CCDS_orfs + +print(paste("--- checking CCDS ORF coverage and multi-mapping ratio,",date(),sep=" ")) + +all_sORFs_CCDS_multi<-CCDS_orfs_uniq + + +ex_to_check<-strsplit(all_sORFs_CCDS_multi$to_check,split=";") + +ex_to_check<-unique(unlist(ex_to_check)) + +ex_to_check_spl<-strsplit(ex_to_check,split="_") + +bedfiles_to_check<-data.frame(chr=NA,start=NA,end=NA,type=NA,gene_id=NA,strand=NA) +for(h in 1:length(ex_to_check_spl)){ + to_bed<-ex_to_check_spl[[h]] + bedfiles_to_check[h,"chr"]<-to_bed[1] + bedfiles_to_check[h,"start"]<-to_bed[2] + bedfiles_to_check[h,"end"]<-to_bed[3] + bedfiles_to_check[h,"type"]<-to_bed[4] + bedfiles_to_check[h,"gene_id"]<-to_bed[5] + bedfiles_to_check[h,"strand"]<-to_bed[6] + +} + + + +write.table(bedfiles_to_check,file="bed_tocheck_ccds.bed",quote=F,row.names=F,sep="\t",col.names=F) + +scr<-paste(args[2],"analyze_multi_clust.bash",sep="/") +syst_scr<-paste(scr,"bed_tocheck_ccds.bed bed_tocheck_ccds",args[3],sep = " ") +system(syst_scr) + +scr<-paste(args[2],"include_multi_nomerge.R",sep="/") +syst_scr<-paste(scr,"bed_tocheck_ccds",sep = " ") + +system(syst_scr) + +res_to_check<-read.table(file="multi_table_bed_tocheck_ccds",header=T,stringsAsFactors=F) + +dir.create("tmp_ccds", showWarnings = FALSE) + +system("mv *tocheck_ccds* tmp_ccds/") + +setwd("tmp_ccds") + + 
### Post-processing of the detected CCDS ORFs.
### Steps: (1) aggregate read counts / coverage / multi-mapping statistics over
### all regions of each ORF, (2) keep the best transcript per ORF, (3) test the
### ORF regions against annotated CDS exons with intersectBed, (4) label ORFs
### that do not touch CDS exons as uORF/dORF/Overl_uORF/Overl_dORF, (5) apply
### the multi-mapping filters and write the first "best_periodicity" tables.
### The working directory is tmp_ccds/ until the setwd("../") further down.

## (1) statistics for exons lying between the first and last exon of an ORF

ex_rem <- strsplit(as.character(all_sORFs_CCDS_multi$to_check_rem), split = ";")
ex_rem <- unique(unlist(ex_rem))
ex_rem <- ex_rem[!is.na(ex_rem)]

res_ex_rem <- results_ccds_ORFs[
  results_ccds_ORFs[, "exon_id"] %in% ex_rem,
  c("exon_id", "strand.x", "length.y", "reads_ribo", "reads_multi_ribo",
    "pct_region_covered_ribo", "pct_covered_onlymulti_ribo", "reads_rna",
    "reads_multi_rna", "pct_region_covered_rna", "pct_covered_onlymulti_rna")
]
# same column names as the re-counted regions so both tables can be stacked
names(res_ex_rem) <- names(res_to_check)

res_all_multi <- rbind.data.frame(res_ex_rem, res_to_check)
res_all_multi$exon_id_2 <- paste(res_all_multi$exon_id, res_all_multi$strand, sep = "_")

# One row per ORF: reads are summed over its regions, covered fractions are
# averaged weighted by region length; the first aggregated row is appended.
all_sORFs_CCDS_multi_final <- foreach(
  g = seq_len(nrow(all_sORFs_CCDS_multi)),
  .combine = rbind, .multicombine = TRUE
) %dopar% {
  s <- all_sORFs_CCDS_multi[g, ]
  list_ex <- strsplit(s$to_check_ALL, split = ";")[[1]]
  # regions are matched either with or without the strand suffix
  with_exon2 <- which(res_all_multi[, "exon_id_2"] %in% list_ex)
  with_exon1 <- which(res_all_multi[, "exon_id"] %in% list_ex)
  to_take <- unique(c(with_exon2, with_exon1))
  res_multi <- res_all_multi[to_take, ]
  total_len <- sum(res_multi$length.y)

  res_multi$reads_ribo <- sum(res_multi$reads_ribo)
  res_multi$reads_multi_ribo <- sum(res_multi$reads_multi_ribo)
  res_multi$pct_region_covered_ribo_ALL <- res_multi$pct_region_covered_ribo * res_multi$length.y
  res_multi$pct_covered_onlymulti_ribo_ALL <- res_multi$pct_covered_onlymulti_ribo * res_multi$length.y
  res_multi$pct_region_covered_ribo <- sum(res_multi$pct_region_covered_ribo_ALL) / total_len
  res_multi$pct_covered_onlymulti_ribo <- sum(res_multi$pct_covered_onlymulti_ribo_ALL) / total_len

  res_multi$reads_rna <- sum(res_multi$reads_rna)
  res_multi$reads_multi_rna <- sum(res_multi$reads_multi_rna)
  res_multi$pct_region_covered_rna_ALL <- res_multi$pct_region_covered_rna * res_multi$length.y
  res_multi$pct_covered_onlymulti_rna_ALL <- res_multi$pct_covered_onlymulti_rna * res_multi$length.y
  res_multi$pct_region_covered_rna <- sum(res_multi$pct_region_covered_rna_ALL) / total_len
  res_multi$pct_covered_onlymulti_rna <- sum(res_multi$pct_covered_onlymulti_rna_ALL) / total_len

  s <- cbind(s, res_multi[1, ])
  s
}

## (2) best transcript per ORF

print(paste("--- Selecting best transcript per CCDS ORF,", date(), sep = " "))

write.table(all_sORFs_CCDS_multi_final, file = "orfs_bef_ag",
            quote = FALSE, row.names = FALSE, sep = "\t", col.names = TRUE)

best_tr_cols <- c("ORF_id_tr_minus2", "length", "gene_id", "ORF_pept", "Method", "RNA_sites")

# per gene / peptide / method keep the transcript(s) with most RNA sites ...
agg <- aggregate(
  x = all_sORFs_CCDS_multi_final[, "RNA_sites"],
  by = list(all_sORFs_CCDS_multi_final[, "gene_id"],
            all_sORFs_CCDS_multi_final[, "ORF_pept"],
            all_sORFs_CCDS_multi_final[, "Method"]),
  FUN = max
)
names(agg) <- c("gene_id", "ORF_pept", "Method", "RNA_sites")
agg2 <- merge(x = all_sORFs_CCDS_multi_final[, best_tr_cols], agg,
              by = c("gene_id", "ORF_pept", "Method", "RNA_sites"))

# ... and among those the longest one(s)
agg3 <- aggregate(
  x = agg2[, "length"],
  by = list(agg2[, "gene_id"], agg2[, "ORF_pept"], agg2[, "Method"], agg2[, "RNA_sites"]),
  FUN = max
)
names(agg3) <- c("gene_id", "ORF_pept", "Method", "RNA_sites", "length")
agg4 <- merge(x = all_sORFs_CCDS_multi_final[, best_tr_cols], agg3,
              by = c("gene_id", "ORF_pept", "Method", "length", "RNA_sites"))
all_sORFs_CCDS_multi_final <- all_sORFs_CCDS_multi_final[
  all_sORFs_CCDS_multi_final[, "ORF_id_tr_minus2"] %in% agg4[, "ORF_id_tr_minus2"],
]

# Keep significantly periodic ORFs, then drop the all-NA rows created by NA
# p-values. The second step used to re-subset all_sORFs_CCDS_multi_final,
# which silently discarded the p-value filter.
all_sORFs_CCDS_periodic <- all_sORFs_CCDS_multi_final[
  all_sORFs_CCDS_multi_final[, "ORF_pval_multi_ribo"] < 0.05,
]
all_sORFs_CCDS_periodic <- all_sORFs_CCDS_periodic[
  !is.na(all_sORFs_CCDS_periodic[, "transcript_id"]),
]

# number of regions of the ORF (the literal "NA" placeholder is not counted)
all_sORFs_CCDS_periodic$n_exons_ORF <- sapply(
  strsplit(all_sORFs_CCDS_periodic$to_check_ALL, split = ";"),
  FUN = function(x) {
    sum(x != "NA")
  }
)

## (3) intersection with annotated CDS exons

print(paste("--- Checking CCDS ORFs intersections with annotated CDS regions,", date(), sep = " "))

ex_to_check <- strsplit(all_sORFs_CCDS_periodic$to_check_ALL, split = ";")
ex_to_check_spl <- unique(unlist(ex_to_check))
ex_to_check_spl <- strsplit(ex_to_check_spl, split = "_")

# region ids are "<chr>_<start>_<end>_<type>_<gene_id>_<strand>"
bedfiles_to_check <- data.frame(chr = NA, start = NA, end = NA, type = NA, gene_id = NA, strand = NA)
for (h in seq_along(ex_to_check_spl)) {
  to_bed <- ex_to_check_spl[[h]]
  bedfiles_to_check[h, "chr"] <- to_bed[1]
  bedfiles_to_check[h, "start"] <- to_bed[2]
  bedfiles_to_check[h, "end"] <- to_bed[3]
  bedfiles_to_check[h, "type"] <- to_bed[4]
  bedfiles_to_check[h, "gene_id"] <- to_bed[5]
  bedfiles_to_check[h, "strand"] <- to_bed[6]
}
bedfiles_to_check <- bedfiles_to_check[!is.na(bedfiles_to_check[, "chr"]), ]
bedfiles_to_check <- bedfiles_to_check[bedfiles_to_check[, "chr"] != "NA", ]

write.table(bedfiles_to_check, file = "sORFs_totest",
            quote = FALSE, row.names = FALSE, sep = "\t", col.names = FALSE)

system("sort -k1,1 -k2,2n sORFs_totest > sORFs_totest.bed")

bedfiles_to_check <- read.table("sORFs_totest.bed", stringsAsFactors = FALSE, header = FALSE)
colnames(bedfiles_to_check) <- c("chr", "start", "end", "type", "gene_id", "strand")
bedfiles_to_check <- bedfiles_to_check[!is.na(bedfiles_to_check[, "chr"]), ]

# NOTE(review): ids taken from to_check_rem appear to carry no strand field, so
# they receive "+" here and a "_+" suffix in exon_id below; the NA-strand branch
# then looks unreachable and such ids may never match in the overlap test.
# Confirm against the exon_id format of results_ccds_ORFs.
bedfiles_to_check[is.na(bedfiles_to_check[, "strand"]), "strand"] <- "+"

write.table(bedfiles_to_check, file = "sORFs_totest.bed",
            quote = FALSE, row.names = FALSE, sep = "\t", col.names = FALSE)

# intersectBed -v reports the regions that do NOT overlap any annotated CDS
fhalf_scr <- paste(args[3], "intersectBed -v -a sORFs_totest.bed -b", sep = "/")
shalf_scr <- paste(args[1], "all_cds.bed > sORFs_totest_nocds.bed", sep = "/")
system(paste(fhalf_scr, shalf_scr, sep = " "))

# Count lines in R instead of parsing `wc -l`, whose output may be left-padded
# with blanks on some systems.
lines_in_file <- length(readLines("sORFs_totest_nocds.bed"))

if (lines_in_file > 0) {
  results_nonoverlapcdss <- read.table("sORFs_totest_nocds.bed", stringsAsFactors = FALSE, header = FALSE)
  names(results_nonoverlapcdss) <- names(bedfiles_to_check)
  results_nonoverlapcdss[, "exon_id"] <- paste(
    results_nonoverlapcdss[, "chr"], results_nonoverlapcdss[, "start"],
    results_nonoverlapcdss[, "end"], results_nonoverlapcdss[, "type"],
    results_nonoverlapcdss[, "gene_id"], results_nonoverlapcdss[, "strand"],
    sep = "_"
  )
  NA_str <- which(is.na(results_nonoverlapcdss[, "strand"]))
  if (length(NA_str) > 0) {
    for (o in NA_str) {
      results_nonoverlapcdss[o, "exon_id"] <- paste(
        results_nonoverlapcdss[o, "chr"], results_nonoverlapcdss[o, "start"],
        results_nonoverlapcdss[o, "end"], results_nonoverlapcdss[o, "type"],
        results_nonoverlapcdss[o, "gene_id"],
        sep = "_"
      )
    }
  }
}

if (lines_in_file == 0) {
  results_nonoverlapcdss <- data.frame(exon_id = NA, stringsAsFactors = FALSE)
}

# an ORF overlaps the CDS as soon as one of its regions is missing from the
# non-overlapping set
overl_cds <- vapply(
  ex_to_check,
  function(a) {
    a <- a[a != "NA"]
    sum(!a %in% results_nonoverlapcdss$exon_id) > 0
  },
  logical(1)
)

## (4) ORFs outside annotated CDS exons: uORF / dORF labelling

all_sORFs_CCDS_periodic_nocds <- all_sORFs_CCDS_periodic[!overl_cds, ]

all_sORFs_CCDS_periodic_nocds <- all_sORFs_CCDS_periodic_nocds[
  all_sORFs_CCDS_periodic_nocds[, "ORF_pval_multi_ribo"] < 0.05,
]
all_sORFs_CCDS_periodic_nocds <- all_sORFs_CCDS_periodic_nocds[
  !is.na(all_sORFs_CCDS_periodic_nocds[, "transcript_id"]),
]
write.table(all_sORFs_CCDS_periodic_nocds, file = "orfs_before_u_dorfs",
            quote = FALSE, row.names = FALSE, sep = "\t", col.names = TRUE)

if (dim(all_sORFs_CCDS_periodic_nocds)[1] > 0) {
  all_sORFs_CCDS_periodic_nocds$type <- NA

  for (r in seq_len(nrow(all_sORFs_CCDS_periodic_nocds))) {
    x <- all_sORFs_CCDS_periodic_nocds[r, ]
    type <- NA
    ann_start <- as.numeric(x[, "annotated_start"])
    ann_stop <- as.numeric(x[, "annotated_stop"])

    if (!is.na(ann_start) && !is.na(ann_stop)) {
      orf_start <- as.numeric(x[, "start_pos"])
      orf_stop <- as.numeric(x[, "st2vect"])

      # NOTE(review): the comparisons between '<' and '>' were missing from the
      # reviewed text and are reconstructed from the surviving operands and
      # the type labels; confirm against the upstream script.
      # entirely upstream of the annotated CDS
      if (orf_start < ann_start && orf_stop < ann_start) {
        type <- "uORF"
      }
      # entirely downstream of the annotated CDS
      if (orf_start > ann_stop && orf_stop > ann_stop) {
        type <- "dORF"
      }
      # starts inside the annotated CDS and ends after its stop
      if (orf_start > ann_start && orf_start < ann_stop && orf_stop > ann_stop) {
        type <- "Overl_dORF"
      }
      # starts upstream and ends inside the annotated CDS
      if (orf_start < ann_start && orf_stop < ann_stop && orf_stop > ann_start) {
        type <- "Overl_uORF"
      }
    }
    all_sORFs_CCDS_periodic_nocds[r, "type"] <- type
  }
}

if (dim(all_sORFs_CCDS_periodic_nocds)[1] == 0) {
  print("Warning! No u/dORFs found ! all ORFs overlap annotated CDS exons")
  # keep one all-NA row so the tables below still carry their header
  all_sORFs_CCDS_periodic_nocds[1, ] <- NA
  all_sORFs_CCDS_periodic_nocds$type <- NA
}

## (5) multi-mapping filters

# u/dORFs: < 30% of the covered positions supported by multi-mappers only, and
# > 30% of the region covered
all_sORFs_CCDS_periodic_nocds_filtered_multi <- all_sORFs_CCDS_periodic_nocds[
  (all_sORFs_CCDS_periodic_nocds$pct_covered_onlymulti_ribo /
     all_sORFs_CCDS_periodic_nocds$pct_region_covered_ribo) < 0.3,
]
all_sORFs_CCDS_periodic_nocds_filtered_multi <- all_sORFs_CCDS_periodic_nocds_filtered_multi[
  all_sORFs_CCDS_periodic_nocds_filtered_multi$pct_region_covered_ribo > 0.3,
]
all_sORFs_CCDS_periodic_nocds_filtered_multi <- all_sORFs_CCDS_periodic_nocds_filtered_multi[
  !is.na(all_sORFs_CCDS_periodic_nocds_filtered_multi[, "transcript_id"]),
]

# ORFs overlapping annotated CDS exons, before and after the multi-mapping filter
all_sORFs_CCDS_periodic <- all_sORFs_CCDS_periodic[overl_cds, ]
all_sORFs_CCDS_periodic_nofilt <- all_sORFs_CCDS_periodic
all_sORFs_CCDS_periodic <- all_sORFs_CCDS_periodic[
  (all_sORFs_CCDS_periodic$pct_covered_onlymulti_ribo /
     all_sORFs_CCDS_periodic$pct_region_covered_ribo) < 0.3,
]
all_sORFs_CCDS_periodic <- all_sORFs_CCDS_periodic[
  !is.na(all_sORFs_CCDS_periodic[, "transcript_id"]),
]

setwd("../")

dir.create("ORFs_CCDS", showWarnings = FALSE)
dir.create("ORFs_CCDS/best_periodicity", showWarnings = FALSE)
dir.create("ORFs_CCDS/max_P_sites", showWarnings = FALSE)
dir.create("ORFs_CCDS/more_tapers", showWarnings = FALSE)

## output tables, start-selection method "best_periodicity"

sORFs_sign_filtered_cds <- all_sORFs_CCDS_periodic_nocds[
  all_sORFs_CCDS_periodic_nocds[, "Method"] == "best_periodicity",
]
write.table(sORFs_sign_filtered_cds, file = "ORFs_CCDS/best_periodicity/sORFs_sign_filtered_cds",
            quote = FALSE, row.names = FALSE, sep = "\t", col.names = TRUE)
sORFs_sign_filtered_cds_multi <- all_sORFs_CCDS_periodic_nocds_filtered_multi[
  all_sORFs_CCDS_periodic_nocds_filtered_multi[, "Method"] == "best_periodicity",
]
write.table(sORFs_sign_filtered_cds_multi, file = "ORFs_CCDS/best_periodicity/sORFs_sign_filtered_cds_multi",
            quote = FALSE, row.names = FALSE, sep = "\t", col.names = TRUE)
ORFs_sign_filtered_multi <- all_sORFs_CCDS_periodic[
  all_sORFs_CCDS_periodic[, "Method"] == "best_periodicity",
]
### Remaining "best_periodicity" tables, then the full set of "max_P_sites"
### tables. Every table is a tab-separated file with a header line, written
### below ORFs_CCDS/<method>/.

## best_periodicity

ORFs_sign_notfiltered_multi <- all_sORFs_CCDS_periodic_nofilt[
  all_sORFs_CCDS_periodic_nofilt$Method == "best_periodicity",
]

write.table(
  ORFs_sign_filtered_multi,
  file = "ORFs_CCDS/best_periodicity/ORFs_sign_filtered_multi",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)
write.table(
  ORFs_sign_notfiltered_multi,
  file = "ORFs_CCDS/best_periodicity/ORFs_sign_notfiltered_multi",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

# every ORF found with this method, without the all-NA rows
ORFs_all <- CCDS_orfs[CCDS_orfs$Method == "best_periodicity", ]
ORFs_all <- ORFs_all[!is.na(ORFs_all$transcript_id), ]
write.table(
  ORFs_all,
  file = "ORFs_CCDS/best_periodicity/ORFs_all",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

## max_P_sites

# ORFs outside annotated CDS exons
sORFs_sign_filtered_cds <- all_sORFs_CCDS_periodic_nocds[
  all_sORFs_CCDS_periodic_nocds$Method == "max_P_sites",
]
write.table(
  sORFs_sign_filtered_cds,
  file = "ORFs_CCDS/max_P_sites/sORFs_sign_filtered_cds",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

# ... additionally filtered for multi-mapping
sORFs_sign_filtered_cds_multi <- all_sORFs_CCDS_periodic_nocds_filtered_multi[
  all_sORFs_CCDS_periodic_nocds_filtered_multi$Method == "max_P_sites",
]
write.table(
  sORFs_sign_filtered_cds_multi,
  file = "ORFs_CCDS/max_P_sites/sORFs_sign_filtered_cds_multi",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

# ORFs overlapping annotated CDS exons, with and without multi-mapping filter
ORFs_sign_filtered_multi <- all_sORFs_CCDS_periodic[
  all_sORFs_CCDS_periodic$Method == "max_P_sites",
]
ORFs_sign_notfiltered_multi <- all_sORFs_CCDS_periodic_nofilt[
  all_sORFs_CCDS_periodic_nofilt$Method == "max_P_sites",
]
write.table(
  ORFs_sign_notfiltered_multi,
  file = "ORFs_CCDS/max_P_sites/ORFs_sign_notfiltered_multi",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)
write.table(
  ORFs_sign_filtered_multi,
  file = "ORFs_CCDS/max_P_sites/ORFs_sign_filtered_multi",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)

# every ORF found with this method, without the all-NA rows
ORFs_all <- CCDS_orfs[CCDS_orfs$Method == "max_P_sites", ]
ORFs_all <- ORFs_all[!is.na(ORFs_all$transcript_id), ]
write.table(
  ORFs_all,
  file = "ORFs_CCDS/max_P_sites/ORFs_all",
  sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE
)
+sORFs_sign_filtered_cds<-all_sORFs_CCDS_periodic_nocds[all_sORFs_CCDS_periodic_nocds[,"Method"]=="more_tapers",] +write.table(sORFs_sign_filtered_cds,file="ORFs_CCDS/more_tapers/sORFs_sign_filtered_cds",quote=F,row.names=F,sep="\t",col.names=T) +sORFs_sign_filtered_cds_multi<-all_sORFs_CCDS_periodic_nocds_filtered_multi[all_sORFs_CCDS_periodic_nocds_filtered_multi[,"Method"]=="more_tapers",] +write.table(sORFs_sign_filtered_cds_multi,file="ORFs_CCDS/more_tapers/sORFs_sign_filtered_cds_multi",quote=F,row.names=F,sep="\t",col.names=T) +ORFs_sign_filtered_multi<-all_sORFs_CCDS_periodic[all_sORFs_CCDS_periodic[,"Method"]=="more_tapers",] + +ORFs_sign_notfiltered_multi<-all_sORFs_CCDS_periodic_nofilt[all_sORFs_CCDS_periodic_nofilt[,"Method"]=="more_tapers",] + +write.table(ORFs_sign_notfiltered_multi,file="ORFs_CCDS/more_tapers/ORFs_sign_notfiltered_multi",quote=F,row.names=F,sep="\t",col.names=T) + + +write.table(ORFs_sign_filtered_multi,file="ORFs_CCDS/more_tapers/ORFs_sign_filtered_multi",quote=F,row.names=F,sep="\t",col.names=T) +ORFs_all<-CCDS_orfs[CCDS_orfs[,"Method"]=="more_tapers",] +ORFs_all<-ORFs_all[!is.na(ORFs_all[,"transcript_id"]),] + +write.table(ORFs_all,file="ORFs_CCDS/more_tapers/ORFs_all",quote=F,row.names=F,sep="\t",col.names=T) + +print(paste("--- CCDS ORF finding Done!","---",date(),sep=" ")) + + diff --git a/scripts/NONCCDS_orf_finder.R b/scripts/NONCCDS_orf_finder.R new file mode 100755 index 0000000..92ac6b5 --- /dev/null +++ b/scripts/NONCCDS_orf_finder.R @@ -0,0 +1,983 @@ +#!/usr/bin/Rscript + +################################################################### +# This file is part of RiboTaper. +# RiboTaper is a method for defining traslated ORFs using +# Ribosome Profiling data. 
#
# Copyright (C) 2015 Lorenzo Calviello
#
# RiboTaper is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# RiboTaper is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with RiboTaper. If not, see <http://www.gnu.org/licenses/>.
#
# Contact: Lorenzo.Calviello@mdc-berlin.de
#######################################################################


### ORF finding for non-CCDS genes.
### Command-line arguments, in order: annotation dir, RiboTaper scripts dir,
### bedtools dir, number of cores.

args <- commandArgs(trailingOnly = TRUE)

print(paste("--- non-CCDS ORF finding", "---", date(), sep = " "))

### helper functions shipped with the RiboTaper scripts

suppressMessages(source(paste(args[2], "functions.R", sep = "/")))

### parallel backend, number of cores taken from the 4th argument

registerDoMC(args[4])

### exon annotation of the non-CCDS transcripts (one exon per line)

tr_ex <- paste(args[1], "transcr_exons_nonccds.bed", sep = "/")
transcr_nonccds <- read.table(tr_ex, header = FALSE, stringsAsFactors = FALSE)
colnames(transcr_nonccds) <- c("chr", "start", "end", "transcript_id", "gene_id", "strand")

# three spellings of the exon identity: chr_start_end,
# chr_start_end_EXONnonCCDS_gene and chr:start-end(strand)
transcr_nonccds$coords_id <- paste(
  transcr_nonccds$chr, transcr_nonccds$start, transcr_nonccds$end,
  sep = "_"
)
transcr_nonccds$exon_id <- paste(
  transcr_nonccds$coords_id, "EXONnonCCDS", transcr_nonccds$gene_id,
  sep = "_"
)
transcr_nonccds$coords2 <- paste0(
  transcr_nonccds$chr, ":", transcr_nonccds$start, "-", transcr_nonccds$end,
  "(", transcr_nonccds$strand, ")"
)

### exon-level results; chr / start / end are the first three "_"-separated
### fields of exon_id (kept as character)

results_nonccds_ORFs <- read.table(
  "results_nonccds_annot",
  header = TRUE, sep = "\t", quote = "", stringsAsFactors = FALSE
)
results_nonccds_ORFs$chr <- sapply(strsplit(results_nonccds_ORFs$exon_id, split = "_"), "[[", 1)
results_nonccds_ORFs$start <- sapply(strsplit(results_nonccds_ORFs$exon_id, split = "_"), "[[", 2)
results_nonccds_ORFs$end <- sapply(strsplit(results_nonccds_ORFs$exon_id, split = "_"), "[[", 3)

### two copies of the exon-level table under other names

results_nonccds_ORFs_copy <- results_nonccds_ORFs
nonccds_res <- results_nonccds_ORFs

### per-exon data tracks and the index giving the exon_id of each track line

all_tracks_nonccds <- readBigText("data_tracks/Psit_Ribo_Rna_Cent_tracks_nonccds")

index_nonccds <- read.table("data_tracks/index_tracks_nonccds", header = FALSE, stringsAsFactors = FALSE)
colnames(index_nonccds) <- "exon_id"

### Subset: only transcripts owning at least one exon with more than 5 P-sites

#it does make sense, but it was not like this in the orig
# transcr_nonccds<-unique(merge(transcr_nonccds,results_nonccds_ORFs[,c("exon_id","P_sites_sum")],by="exon_id",all.x=T))
# transcr_sites<-aggregate(transcr_nonccds$P_sites_sum,by=list(transcr_nonccds$transcript_id),FUN=sum)
# colnames(transcr_sites)<-c("transcript_id","n_P_sites")
#transcr_sites<-transcr_sites[transcr_sites[,"n_P_sites"]>2,]

# exons passing the P-site threshold (rows made all-NA by a missing
# P_sites_sum are dropped) ...
transcript_nonccds_transl <- results_nonccds_ORFs[results_nonccds_ORFs$P_sites_sum > 5, ]
transcript_nonccds_transl <- transcript_nonccds_transl[!is.na(transcript_nonccds_transl$exon_id), ]

# ... and the transcripts they belong to
transcript_nonccds_transl <- unique(
  transcr_nonccds[transcr_nonccds$exon_id %in% transcript_nonccds_transl$exon_id, "transcript_id"]
)
transcript_nonccds_transl <- transcript_nonccds_transl[!is.na(transcript_nonccds_transl)]

transcr_sites <- unique(transcript_nonccds_transl)
transcr_sites <- transcr_sites[!is.na(transcr_sites)]

# all exons of the retained transcripts; tracks and index are cut down to them
transcr_nonccds_fin <- transcr_nonccds[transcr_nonccds$transcript_id %in% transcr_sites, ]
transcr_nonccds_fin_ids <- unique(transcr_nonccds_fin$exon_id)

all_tracks_nonccds <- all_tracks_nonccds[index_nonccds$exon_id %in% transcr_nonccds_fin_ids]
index_nonccds <- index_nonccds[index_nonccds$exon_id %in% transcr_nonccds_fin_ids, , drop = FALSE]

### all-NA result row used when no ORF can be reported for a frame

st_st_NA <- data.frame(start_pos = NA, st2vect = NA)
st_st_NA$ORF_frame <- NA
st_st_NA$ORF_length <- NA
st_st_NA$ORF_P_sites <- NA
+st_st_NA$ORF_Psit_pct_in_frame<-NA +st_st_NA$ORF_RNA_sites<-NA +st_st_NA$ORF_RNAsit_pct_in_frame<-NA +st_st_NA$ORF_freq_multi_ribo<-NA +st_st_NA$ORF_pval_multi_ribo<-NA +st_st_NA$ORF_spec_multi_ribo<-NA +st_st_NA$ORF_freq_multi_rna<-NA +st_st_NA$ORF_pval_multi_rna<-NA +st_st_NA$ORF_spec_multi_rna<-NA +st_st_NA$ORF_freq3_fft_ribo<-NA +st_st_NA$ORF_spec3_fft_ribo<-NA +st_st_NA$ORF_freq3_spec_ribo<-NA +st_st_NA$ORF_spec3_spec_ribo<-NA +st_st_NA$ORF_freq3_fft_rna<-NA +st_st_NA$ORF_spec3_fft_rna<-NA +st_st_NA$ORF_freq3_spec_rna<-NA +st_st_NA$ORF_spec3_spec_rna<-NA +st_st_NA$ORF_ORF_score_ribo<-NA +st_st_NA$ORF_ORF_score_rna<-NA +st_st_NA$ORF_chisq_ribo<-NA +st_st_NA$ORF_chisq_rna<-NA +st_st_NA$ORF_Ribo_cov_aver<-NA +st_st_NA$ORF_RNA_cov_aver<-NA +st_st_NA$ORF_pept<-NA +st_st_NA$nt_tocheck_next_start<-0 +st_st_NA$pval_next_start<-1 +st_st_NA$P_sites_next_start<-0 +st_st_NA$pct_P_sites_inframe_next_start<-0 +st_st_NA$Method<-NA +st_st_NA$to_check<-NA +st_st_NA$to_check_rem<-NA +st_st_NA$ORF_id_tr<-NA +st_st_NA$ORF_id_gen<-NA +st_st_NA$to_check_ALL<-NA + +NONCCDS_orfs<-foreach(j=1:length(transcr_sites),.combine=rbind,.multicombine=T) %dopar%{ + transcript<-transcr_sites[j] + + ###assembles transcript + + exons_in_transcr<-transcr_nonccds[transcr_nonccds[,"transcript_id"]==transcript,] + exons_in_transcr<-exons_in_transcr[order(exons_in_transcr$start,decreasing=F),] + + list_exons_transcr<-list() + list_exons_seqs<-list() + + for(k in seq(1,dim(exons_in_transcr)[1])){ + exon_track<-c() + subs_nonccds<-index_nonccds[,"exon_id"]==exons_in_transcr[k,"exon_id"] + + if(sum(subs_nonccds)>0){ + if(sum(subs_nonccds)==5){ + exon_track<-all_tracks_nonccds[subs_nonccds] + } + if(sum(subs_nonccds)>5){ + exon_track<-all_tracks_nonccds[which(subs_nonccds)[1:4]] + } + } + + + withsep<-strsplit(exon_track,split=" ") + x<-t(data.frame(withsep)) + + strand<-x[1,2] + tracks<-t(x[,-c(1:2)]) + colnames(tracks)<-c("Psites","RiboCov","RNACov","RNAcent","Seq") + seq<-tracks[,5] + 
tracks<-tracks[,1:4] + mode(tracks)<-"numeric" + length<-dim(tracks)[1] + list_exons_transcr[[k]]<-tracks + list_exons_seqs[[k]]<-seq + } + + merged_tracks<-do.call(what=rbind,list_exons_transcr) + + if(strand=="-"){ + merged_tracks<-cbind(rev(merged_tracks[,1]),rev(merged_tracks[,2]),rev(merged_tracks[,3]),rev(merged_tracks[,4])) + } + + tracks<-merged_tracks + length<-dim(tracks)[1] + if(strand=="+"){ + seq_transcr<-unlist(list_exons_seqs) + } + if(strand=="-"){ + + seq_transcr<-unlist(list_exons_seqs) + seq_transcr<-comp(rev((seq_transcr)),forceToLower=F) + } + + transcr_data<-data.frame(transcript_id=transcript,stringsAsFactors=F) + transcr_data$gene_id<-unique(transcr_nonccds[transcr_nonccds[,"transcript_id"]==transcript,"gene_id"])[1] + transcr_data$annotation<-unique(results_nonccds_ORFs[results_nonccds_ORFs[,"gene_id"]==transcr_data$gene_id,"annotation",])[1] + transcr_data$gene_symbol<-unique(results_nonccds_ORFs[results_nonccds_ORFs[,"gene_id"]==transcr_data$gene_id,"gene_symbol",])[1] + + P_sites_sum<-sum(tracks[,1]) + RNA_sites_sum<-sum(tracks[,4]) + transcr_data$strand<-strand + transcr_data$length<-length + transcr_data$n_exons<-dim(exons_in_transcr)[1] + transcr_data$P_sites_sum<-P_sites_sum + transcr_data$RNA_sites<-RNA_sites_sum + transcr_data$Ribo_cov_aver<-mean(tracks[,2]) + transcr_data$RNA_cov_aver<-mean(tracks[,3]) + + transcr_data$freq_multit_3nt<-NA + transcr_data$pval_multit_3nt<-NA + transcr_data$spec_multit_3nt<-NA + if(P_sites_sum>2 & length>5){ + if(length<25){slepians<-dpss(n=length+(50-length),k=24,nw=12)} + if(length>=25){slepians<-dpss(n=length,k=24,nw=12)} + vals_mtm<-take_freqs_Fvalues_all_around_3nt_spec(n_tapers=24,time_bw=12,tracks[,1],slepians_values=slepians)[c(1,6,7)] + transcr_data$freq_multit_3nt<-vals_mtm[1] + transcr_data$pval_multit_3nt<-vals_mtm[2] + transcr_data$spec_multit_3nt<-vals_mtm[3] + + } + Phase_P_sites_frame<-sum(tracks[seq(1,length,by=3),1]) + Phase_P_sites_frame_1<-sum(tracks[seq(2,length,by=3),1]) + 
Phase_P_sites_frame_2<-sum(tracks[seq(3,length,by=3),1]) + + transcr_data$chisq_noccds_psit<-NA + if(P_sites_sum>15){ + transcr_data$chisq_noccds_psit<-chisq.test(as.table(c(Phase_P_sites_frame,Phase_P_sites_frame_1,Phase_P_sites_frame_2)))$p.value} + if(P_sites_sum<16 & P_sites_sum>0){ + transcr_data$chisq_noccds_psit<-xmulti(obs=c(Phase_P_sites_frame,Phase_P_sites_frame_1,Phase_P_sites_frame_2),expr=c(1,1,1),statName="Prob",detail=0)$pProb + } + pctPhase_frame<-Phase_P_sites_frame/P_sites_sum + pctPhase_frame_1<-Phase_P_sites_frame_1/P_sites_sum + pctPhase_frame_2<-Phase_P_sites_frame_2/P_sites_sum + + Centered_sites_sum<-round(sum(tracks[,4]),digits=6) + + Phase_Centered_sites_frame<-sum(tracks[seq(1,length,by=3),4]) + Phase_Centered_sites_frame_1<-sum(tracks[seq(2,length,by=3),4]) + Phase_Centered_sites_frame_2<-sum(tracks[seq(3,length,by=3),4]) + + pctPhaseCentered_frame<-Phase_Centered_sites_frame/Centered_sites_sum + pctPhaseCentered_frame_1<-Phase_Centered_sites_frame_1/Centered_sites_sum + pctPhaseCentered_frame_2<-Phase_Centered_sites_frame_2/Centered_sites_sum + + transcr_data$chisq_noccds_rna<-NA + if(Centered_sites_sum>15){ + chisq_rna<-chisq.test(as.table(c(Phase_Centered_sites_frame,Phase_Centered_sites_frame_1,Phase_Centered_sites_frame_2)))$p.value} + if(Centered_sites_sum<16 & Centered_sites_sum>0){ + chisq_rna<-xmulti(obs=c(Phase_Centered_sites_frame,Phase_Centered_sites_frame_1,Phase_Centered_sites_frame_2),expr=c(1,1,1),statName="Prob",detail=0)$pProb + } + + + MAXPhase_frame<-max(c(pctPhase_frame,pctPhase_frame_1,pctPhase_frame_2)) + FRAME_MAX_phase<-max.col(t(c(pctPhase_frame,pctPhase_frame_1,pctPhase_frame_2)))-1 + + MAXPhaseCentered_frame<-max(c(pctPhaseCentered_frame,pctPhaseCentered_frame_1,pctPhaseCentered_frame_2)) + FRAME_MAX_phaseCentered<-max.col(t(c(pctPhaseCentered_frame,pctPhaseCentered_frame_1,pctPhaseCentered_frame_2)))-1 + + frame_start_pred<-FRAME_MAX_phase + frame_end_pred<-(length-(FRAME_MAX_phase+1))%%3 + + ###Finds ORFs on 
the 3 different frames + + all_sign_frames<-list() + for(u in 0:2){ + + pept<-NA + pept<-unlist(getTrans(seq_transcr,sens="F",frame=u)) + + starts<-pept=="M" + + stops<-pept=="*" + transcr_data$orf_position<-"undetected" + + start_pos<-((1:length(pept))[starts])*3 + if(length(start_pos)>0){ + start_pos<-start_pos+u-2 + } else {start_pos<-NA} + + stop_pos<-((1:length(pept))[stops])*3 + if(length(stop_pos)>0){ + stop_pos<-stop_pos+u-2 + } else {stop_pos<-NA} + + + if(sum(!is.na(start_pos))==0 | sum(!is.na(stop_pos))==0){ + st_st<-st_st_NA + transcr_data_fr_sORFs<-cbind(transcr_data,st_st_NA) + } + + if(sum(!is.na(start_pos))>0 & sum(!is.na(stop_pos))>0){ + st2vect<-c() + for(h in 1:length(start_pos)){ + st1<-start_pos[h] + diff<-stop_pos-st1 + diff<-diff[diff>0] + if(length(diff)>0){st2<-st1+min(diff)} + if(length(diff)==0){st2<-NA} + st2vect[h]<-st2 + + } + st_st<-data.frame(cbind(start_pos,st2vect)) + + st_st<-st_st[!is.na(st_st[,"st2vect"]),] + if(dim(st_st)[1]>0){ + if(dim(st_st)[1]==1){ + list_coords=list() + list_coords[[1]]<-st_st[,1]:st_st[,2] + } + if(dim(st_st)[1]>1){ + list_coords<-apply(st_st,FUN=function(x){x[1]:x[2]},1) + } + + max_period<-NA + start_pos<-NA + stop_pos<-NA + pval_max_period<-NA + } + if(dim(st_st)[1]>0){ + st_st$ORF_frame<-u + st_st$ORF_length<-NA + st_st$ORF_P_sites<-NA + st_st$ORF_Psit_pct_in_frame<-NA + st_st$ORF_RNA_sites<-NA + st_st$ORF_RNAsit_pct_in_frame<-NA + st_st$ORF_freq_multi_ribo<-NA + st_st$ORF_pval_multi_ribo<-NA + st_st$ORF_spec_multi_ribo<-NA + st_st$ORF_freq_multi_rna<-NA + st_st$ORF_pval_multi_rna<-NA + st_st$ORF_spec_multi_rna<-NA + + st_st$ORF_freq3_fft_ribo<-NA + st_st$ORF_spec3_fft_ribo<-NA + st_st$ORF_freq3_spec_ribo<-NA + st_st$ORF_spec3_spec_ribo<-NA + st_st$ORF_freq3_fft_rna<-NA + st_st$ORF_spec3_fft_rna<-NA + st_st$ORF_freq3_spec_rna<-NA + st_st$ORF_spec3_spec_rna<-NA + st_st$ORF_ORF_score_ribo<-NA + st_st$ORF_ORF_score_rna<-NA + st_st$ORF_chisq_ribo<-NA + st_st$ORF_chisq_rna<-NA + st_st$ORF_Ribo_cov_aver<-NA 
+ st_st$ORF_RNA_cov_aver<-NA + st_st$ORF_pept<-NA + st_st$Method<-NA + st_st$to_check<-NA + st_st$to_check_rem<-NA + st_st$ORF_id_tr<-NA + st_st$ORF_id_gen<-NA + st_st$to_check_ALL<-NA + + for(r in 1:dim(st_st)[1]){ + tracks_stst<-tracks[st_st[r,1]:st_st[r,2],] + length<-dim(tracks_stst)[1] + P_sites_sum<-sum(tracks_stst[,1]) + RNA_sites_sum<-sum(tracks_stst[,4]) + st_st[r,"ORF_length"]<-length-1 + st_st[r,"ORF_P_sites"]<-P_sites_sum + st_st[r,"ORF_RNA_sites"]<-RNA_sites_sum + st_st[r,"ORF_Ribo_cov_aver"]<-mean(tracks_stst[,2]) + st_st[r,"ORF_RNA_cov_aver"]<-mean(tracks_stst[,3]) + if(P_sites_sum>5 & length>5){ + Phase_P_sites_frame<-sum(tracks_stst[seq(1,length,by=3),1]) + Phase_P_sites_frame_1<-sum(tracks_stst[seq(2,length,by=3),1]) + Phase_P_sites_frame_2<-sum(tracks_stst[seq(3,length,by=3),1]) + st_st[r,"ORF_Psit_pct_in_frame"]<-Phase_P_sites_frame/P_sites_sum + if((Phase_P_sites_frame/P_sites_sum)>0.5){ + score1<-((Phase_P_sites_frame-P_sites_sum/3)^2)/(P_sites_sum/3) + score2<-((Phase_P_sites_frame_1-P_sites_sum/3)^2)/(P_sites_sum/3) + score3<-((Phase_P_sites_frame_2-P_sites_sum/3)^2)/(P_sites_sum/3) + + orfsc<-log2(score1+score2+score3+1) + st_st[r,"ORF_ORF_score_ribo"]<-orfsc + if(Phase_P_sites_frame<=Phase_P_sites_frame_1 | Phase_P_sites_frame<=Phase_P_sites_frame_2){ + st_st[r,"ORF_ORF_score_ribo"]<--orfsc + } + + if(max(tracks_stst[,1])>(P_sites_sum*.7)){ + new_track<-tracks_stst + new_track[which(new_track[,1]==max(new_track[,1]))]<-0 + st_st[r,"ORF_ORF_score_ribo"]<-NA + if(sum(new_track[,1])>2){ + Phase_P_sites_frame_corr<-sum(new_track[seq(1,length,by=3),1]) + Phase_P_sites_frame_1_corr<-sum(new_track[seq(2,length,by=3),1]) + Phase_P_sites_frame_2_corr<-sum(new_track[seq(3,length,by=3),1]) + score1<-((Phase_P_sites_frame_corr-sum(new_track[,1])/3)^2)/(sum(new_track[,1])/3) + score2<-((Phase_P_sites_frame_1_corr-sum(new_track[,1])/3)^2)/(sum(new_track[,1])/3) + score3<-((Phase_P_sites_frame_2_corr-sum(new_track[,1])/3)^2)/(sum(new_track[,1])/3) + 
st_st[r,"ORF_ORF_score_ribo"]<-log2(score1+score2+score3+1) + if(Phase_P_sites_frame_corr<=Phase_P_sites_frame_1_corr | Phase_P_sites_frame<=Phase_P_sites_frame_2_corr){ + st_st[r,"ORF_ORF_score_ribo"]<--log2(score1+score2+score3+1) + } + } + } + + if(P_sites_sum>15){ + st_st[r,"ORF_chisq_ribo"]<-chisq.test(as.table(c(Phase_P_sites_frame,Phase_P_sites_frame_1,Phase_P_sites_frame_2)))$p.value} + if(P_sites_sum<16 & P_sites_sum>0){ + st_st[r,"ORF_chisq_ribo"]<-xmulti(obs=c(Phase_P_sites_frame,Phase_P_sites_frame_1,Phase_P_sites_frame_2),expr=c(1,1,1),statName="Prob",detail=0)$pProb + } + if(length<25){slepians<-dpss(n=length+(50-length),k=24,nw=12)} + if(length>=25){slepians<-dpss(n=length,k=24,nw=12)} + values_mtm_orf<-take_freqs_Fvalues_all_around_3nt_spec(x=tracks_stst[,1],n_tapers=24,time_bw=12,slepians_values=slepians)[c(1,6,7)] + + st_st[r,"ORF_freq_multi_ribo"]<-values_mtm_orf[1] + st_st[r,"ORF_pval_multi_ribo"]<-values_mtm_orf[2] + st_st[r,"ORF_spec_multi_ribo"]<-values_mtm_orf[3] + + fft_sp<-take_maxfreq_and_power_FFT_Spec(tracks_stst[,1]) + st_st[,"ORF_freq3_fft_ribo"]<-fft_sp[1] + st_st[,"ORF_spec3_fft_ribo"]<-fft_sp[2] + st_st[,"ORF_freq3_spec_ribo"]<-fft_sp[3] + st_st[,"ORF_spec3_spec_ribo"]<-fft_sp[4] + + pept<-unlist(getTrans(seq_transcr[st_st[r,1]:st_st[r,2]],sens="F")) + st_st[r,"ORF_pept"]<-paste(pept,sep="",collapse="") + } + if(RNA_sites_sum>5 & (Phase_P_sites_frame/P_sites_sum)>0.5){ + if(length<25){slepians<-dpss(n=length+(50-length),k=24,nw=12)} + if(length>=25){slepians<-dpss(n=length,k=24,nw=12)} + values_mtm_orf_rna<-take_freqs_Fvalues_all_around_3nt_spec(x=tracks_stst[,4],n_tapers=24,time_bw=12,slepians_values=slepians)[c(1,6,7)] + + st_st[r,"ORF_freq_multi_rna"]<-values_mtm_orf_rna[1] + st_st[r,"ORF_pval_multi_rna"]<-values_mtm_orf_rna[2] + st_st[r,"ORF_spec_multi_rna"]<-values_mtm_orf_rna[3] + + + fft_sp<-take_maxfreq_and_power_FFT_Spec(tracks_stst[,4]) + st_st[,"ORF_freq3_fft_rna"]<-fft_sp[1] + st_st[,"ORF_spec3_fft_rna"]<-fft_sp[2] + 
st_st[,"ORF_freq3_spec_rna"]<-fft_sp[3] + st_st[,"ORF_spec3_spec_rna"]<-fft_sp[4] + + Phase_Centered_sites_frame<-sum(tracks_stst[seq(1,length,by=3),4]) + Phase_Centered_sites_frame_1<-sum(tracks_stst[seq(2,length,by=3),4]) + Phase_Centered_sites_frame_2<-sum(tracks_stst[seq(3,length,by=3),4]) + st_st[r,"ORF_RNAsit_pct_in_frame"]<-Phase_Centered_sites_frame/RNA_sites_sum + score1<-((Phase_Centered_sites_frame-P_sites_sum/3)^2)/(RNA_sites_sum/3) + score2<-((Phase_Centered_sites_frame_1-P_sites_sum/3)^2)/(RNA_sites_sum/3) + score3<-((Phase_Centered_sites_frame_2-P_sites_sum/3)^2)/(RNA_sites_sum/3) + + orfsc<-log2(score1+score2+score3+1) + st_st[r,"ORF_ORF_score_rna"]<-orfsc + if(Phase_Centered_sites_frame<=Phase_Centered_sites_frame_1 | Phase_Centered_sites_frame<=Phase_Centered_sites_frame_2){ + st_st[r,"ORF_ORF_score_rna"]<--orfsc + } + + if(max(tracks_stst[,4])>(RNA_sites_sum*.7)){ + new_track<-tracks_stst + new_track[which(new_track[,4]==max(new_track[,4]))]<-0 + st_st[r,"ORF_ORF_score_rna"]<-NA + if(sum(new_track[,4])>2){ + Phase_Centered_sites_frame_corr<-sum(new_track[seq(1,length,by=3),4]) + Phase_Centered_sites_frame_1_corr<-sum(new_track[seq(2,length,by=3),4]) + Phase_Centered_sites_frame_2_corr<-sum(new_track[seq(3,length,by=3),4]) + score1<-((Phase_Centered_sites_frame_corr-sum(new_track[,4])/3)^2)/(sum(new_track[,4])/3) + score2<-((Phase_Centered_sites_frame_1_corr-sum(new_track[,4])/3)^2)/(sum(new_track[,4])/3) + score3<-((Phase_Centered_sites_frame_2_corr-sum(new_track[,4])/3)^2)/(sum(new_track[,4])/3) + st_st[r,"ORF_ORF_score_rna"]<-log2(score1+score2+score3+1) + if(Phase_Centered_sites_frame_corr<=Phase_Centered_sites_frame_1_corr | Phase_Centered_sites_frame_corr<=Phase_Centered_sites_frame_2_corr){ + st_st[r,"ORF_ORF_score_rna"]<--log2(score1+score2+score3+1) + } + } + } + + if(RNA_sites_sum>15){ + st_st[r,"ORF_chisq_rna"]<-chisq.test(as.table(c(Phase_Centered_sites_frame,Phase_Centered_sites_frame_1,Phase_Centered_sites_frame_2)))$p.value} + 
if(RNA_sites_sum<16 & RNA_sites_sum>0){ + st_st[r,"ORF_chisq_rna"]<-xmulti(obs=c(Phase_Centered_sites_frame,Phase_Centered_sites_frame_1,Phase_Centered_sites_frame_2),expr=c(1,1,1),statName="Prob",detail=0)$pProb + } + + } + } + } + if(dim(st_st)[1]>0){st_st<-st_st[!is.na(st_st[,"ORF_pval_multi_ribo"]),]} + if(dim(st_st)[1]>0){st_st<-st_st[st_st[,"ORF_Psit_pct_in_frame"]>0.5,]} + if(dim(st_st)[1]>0){ + st_st$nt_tocheck_next_start<-0 + st_st$pval_next_start<-1 + st_st$P_sites_next_start<-0 + st_st$pct_P_sites_inframe_next_start<-0 + #find starts per each stop codon + list_stopsorfs<-split.data.frame(x=st_st,f=st_st[,2],drop=T) + + transcr_data_fr<-transcr_data + + list_sORFs_frame_moretap<-list() + list_sORFs_frame_bestperiod<-list() + list_sORFs_frame_maxsit<-list() + + for(g in 1:length(list_stopsorfs)){ + + stoplist<-list_stopsorfs[[g]] + max_period<-stoplist[stoplist[,"ORF_pval_multi_ribo"]==min(stoplist[,"ORF_pval_multi_ribo"]),] + list_sORFs_frame_bestperiod[[g]]<-max_period + stoplists_period<-stoplist[stoplist[,"ORF_pval_multi_ribo"]<0.05,] + if(dim(stoplists_period)[1]>0){ + stoplists_period<-stoplists_period[!is.na(stoplists_period[,"ORF_pval_multi_ribo"]),] + } + if(dim(stoplists_period)[1]>1){ + + for(b in 1:(dim(stoplists_period)[1]-1)){ + stoplists_period[b,"nt_tocheck_next_start"]<-stoplists_period[b+1,"start_pos"]-stoplist[b,"start_pos"] + tracks_stst<-tracks[stoplists_period[b,"start_pos"]:stoplists_period[b+1,"start_pos"],] + length<-dim(tracks_stst)[1] + P_sites_sum<-sum(tracks_stst[,1]) + pval_to_next<-1 + + Phase_P_sites_frame<-sum(tracks_stst[seq(1,length,by=3),1]) + Phase_P_sites_frame_1<-sum(tracks_stst[seq(2,length,by=3),1]) + Phase_P_sites_frame_2<-sum(tracks_stst[seq(3,length,by=3),1]) + + pctPhase_frame<-Phase_P_sites_frame/P_sites_sum + pctPhase_frame_1<-Phase_P_sites_frame_1/P_sites_sum + pctPhase_frame_2<-Phase_P_sites_frame_2/P_sites_sum + + if(P_sites_sum>5){ + if(length<25){slepians<-dpss(n=length+(50-length),k=24,nw=12)} + 
if(length>=25){slepians<-dpss(n=length,k=24,nw=12)} + + pval_to_next<-take_freqs_Fvalues_all_around_3nt_spec(x=tracks_stst[,1],n_tapers=24,time_bw=12,slepians_values=slepians)[6] + } + stoplists_period[b,"P_sites_next_start"]<-P_sites_sum + + stoplists_period[b,"pct_P_sites_inframe_next_start"]<-pctPhase_frame + + stoplists_period[b,"pval_next_start"]<-pval_to_next + } + + max_sit<-stoplists_period[which(stoplists_period[,"P_sites_next_start"]>5 & stoplists_period[,"pct_P_sites_inframe_next_start"]>0.5)[1],] + max_sit<-max_sit[!is.na(max_sit[,"ORF_length"]),] + if(dim(max_sit)[1]==0){ + max_sit<-max_period + } + list_sORFs_frame_maxsit[[g]]<-max_sit + + more_tap<-stoplists_period[which(stoplists_period[,"pval_next_start"]<0.05)[1],] + more_tap<-more_tap[!is.na(more_tap[,"ORF_length"]),] + if(dim(more_tap)[1]==0){ + more_tap<-max_period + } + list_sORFs_frame_moretap[[g]]<-more_tap + + } + if(dim(stoplists_period)[1]<2){ + list_sORFs_frame_maxsit[[g]]<-max_period + list_sORFs_frame_moretap[[g]]<-max_period + + } + + + } + sORFs_frame_moretap<-do.call(what=rbind.data.frame,args=list_sORFs_frame_moretap) + sORFs_frame_moretap$Method<-"more_tapers" + sORFs_frame_maxsit<-do.call(what=rbind.data.frame,args=list_sORFs_frame_maxsit) + sORFs_frame_maxsit$Method<-"max_P_sites" + sORFs_frame_bestperiod<-do.call(what=rbind.data.frame,args=list_sORFs_frame_bestperiod) + sORFs_frame_bestperiod$Method<-"best_periodicity" + sORFs_frames<-rbind(sORFs_frame_moretap,sORFs_frame_maxsit,sORFs_frame_bestperiod) + + for(w in 1:dim(sORFs_frames)[1]){ + transcr_data_fr[w,]<-transcr_data_fr[1,] + } + + transcr_data_fr_sORFs<-cbind(transcr_data_fr,sORFs_frames) + transcr_data_fr_sORFs$orf_position<-"detected" + } + } + + if(dim(st_st)[1]==0){ + st_st<-st_st_NA + transcr_data_fr_sORFs<-cbind(transcr_data,st_st_NA) + } + } + + + all_sign_frames[[u+1]]<-transcr_data_fr_sORFs + } + all_sign_frames<-do.call(what=rbind.data.frame,args=all_sign_frames) + 
transcr_all_frames_res<-unique(all_sign_frames) + transcr_all_frames_res$ORF_id_tr<-paste(transcr_all_frames_res$transcript_id,transcr_all_frames_res$start_pos,transcr_all_frames_res$st2vect,sep="_") + transcr_all_frames_ok<-transcr_all_frames_res[!is.na(transcr_all_frames_res$ORF_pept),] + if(dim(transcr_all_frames_ok)[1]>0){ + all_orfs<-unique(transcr_all_frames_ok[,c("transcript_id","length","strand","start_pos","st2vect","ORF_length","gene_id")]) + transcr<-all_orfs$transcript_id[1] + trascr_length<-all_orfs$length[1] + orf_strand<-all_orfs$strand[1] + ex_intr_coords<-exons_in_transcr$exon_id + if(orf_strand=="-"){ex_intr_coords<-rev(ex_intr_coords)} + + exons_in_transcr_data<-results_nonccds_ORFs[results_nonccds_ORFs[,"exon_id"]%in%ex_intr_coords,] + exons_in_transcr_data<-exons_in_transcr_data[match(ex_intr_coords,exons_in_transcr_data$exon_id),] + cumsumexons<-cumsum(exons_in_transcr_data$length.x) + + list_orfas<-list() + for(z in 1:dim(all_orfs)[1]){ + orfa<-all_orfs[z,] + + transcr_data<-data.frame(transcript_id=transcr) + + orf_start<-orfa$start_pos + orf_end<-orfa$st2vect + + st_ex<-which((cumsumexons-orf_start)==min(cumsumexons[cumsumexons>orf_start]-orf_start)) + end_ex<-which((cumsumexons-orf_end)==min(cumsumexons[cumsumexons>=orf_end]-orf_end)) + in_betw_ex<-st_ex:end_ex + in_betw_ex<-in_betw_ex[!in_betw_ex%in%c(st_ex,end_ex)>0] + exon_inbetween_data<-exons_in_transcr_data[in_betw_ex,] + + + coord_start<-NA + coord_end<-NA + nt_to_rem<-NA + rem_len<-0 + if(st_ex>1){rem_len<-cumsumexons[st_ex-1]} + if(orfa$strand=="+"){coord_start<-as.numeric(exons_in_transcr_data[st_ex,"start"]) + (orf_start-rem_len)} + if(orfa$strand=="-"){coord_start<-as.numeric(exons_in_transcr_data[st_ex,"end"]) - (orf_start-rem_len)} + + if(length(in_betw_ex)==0){ + if(st_ex==end_ex){nt_to_rem<-0} + if(st_ex!=end_ex){if(orfa$strand=="+"){ + nt_to_rem<-as.numeric(exons_in_transcr_data[st_ex,"end"])-coord_start + } + if(orfa$strand=="-"){ + 
nt_to_rem<-coord_start-as.numeric(exons_in_transcr_data[st_ex,"start"]) + } + } + } + + if(length(in_betw_ex)>0){ + nt_in_betw<-sum(exons_in_transcr_data[in_betw_ex,"length.x"]) + if(orfa$strand=="+"){ + nt_to_rem<-as.numeric(exons_in_transcr_data[st_ex,"end"])-coord_start + } + if(orfa$strand=="-"){ + nt_to_rem<-coord_start-as.numeric(exons_in_transcr_data[st_ex,"start"]) + } + nt_to_rem<-nt_to_rem+nt_in_betw + } + + if(st_ex==end_ex & orfa$strand=="+"){coord_end<-coord_start+orfa$ORF_length+1} + if(st_ex==end_ex & orfa$strand=="-"){coord_end<-coord_start-orfa$ORF_length+1} + + if(st_ex!=end_ex & orfa$strand=="+"){coord_end<-as.numeric(exons_in_transcr_data[end_ex,"start"]) + (orfa$ORF_length-nt_to_rem)+1} + if(st_ex!=end_ex & orfa$strand=="-"){coord_end<-as.numeric(exons_in_transcr_data[end_ex,"end"]) - (orfa$ORF_length-nt_to_rem)+1} + + if(orfa$strand=="-"){ + coord_start2<-coord_start + coord_start<-coord_end + coord_end<-coord_start2 + } + + + if(st_ex!=end_ex & orfa$strand=="+"){to_check_st<-paste(exons_in_transcr_data[st_ex,"chr"],coord_start,exons_in_transcr_data[st_ex,"end"],"CCDS",orfa$gene_id,orfa$strand,sep="_") + to_check_end<-paste(exons_in_transcr_data[end_ex,"chr"],exons_in_transcr_data[end_ex,"start"],coord_end,"CCDS",orfa$gene_id,orfa$strand,sep="_") + to_check<-paste(to_check_st,to_check_end,sep=";") + + } + if(st_ex!=end_ex & orfa$strand=="-"){to_check_st<-paste(exons_in_transcr_data[st_ex,"chr"],exons_in_transcr_data[st_ex,"start"],coord_end,"CCDS",orfa$gene_id,orfa$strand,sep="_") + to_check_end<-paste(exons_in_transcr_data[end_ex,"chr"],coord_start,exons_in_transcr_data[end_ex,"end"],"CCDS",orfa$gene_id,orfa$strand,sep="_") + to_check<-paste(to_check_st,to_check_end,sep=";") + } + + if(st_ex==end_ex){to_check<-paste(exons_in_transcr_data[st_ex,"chr"],coord_start,coord_end,"CCDS",orfa$gene_id,orfa$strand,sep="_")} + orfa$to_check<-to_check + orfa$to_check_rem<-NA + if(length(in_betw_ex)>0){ + 
orfa$to_check_rem<-paste(exon_inbetween_data$exon_id,collapse=";") + + } + orfa$ORF_id_tr<-paste(transcr_data$transcript_id,orf_start,orf_end,sep="_") + orfa$ORF_id_gen<-paste(exons_in_transcr_data[st_ex,"chr"],coord_start,coord_end,sep="_") + orfa$to_check_ALL<-paste(orfa$to_check,orfa$to_check_rem,sep=";") + list_orfas[[z]]<-orfa + + + } + list_orfas<-do.call(rbind.data.frame,args=list_orfas) + transcr_all_frames_ok$ORF_id_gen<-NULL + transcr_all_frames_ok$to_check<-NULL + transcr_all_frames_ok$to_check_rem<-NULL + transcr_all_frames_ok$to_check_ALL<-NULL + + transcr_all_frames_ok<-merge(transcr_all_frames_ok,list_orfas[,c("ORF_id_tr","ORF_id_gen","to_check","to_check_rem","to_check_ALL")],by="ORF_id_tr") + #reconcile and maybe add the rest + return(transcr_all_frames_ok) + } + if(dim(transcr_all_frames_ok)[1]==0){return(transcr_all_frames_res)} + + + +} + +NONCCDS_orfs<-NONCCDS_orfs[!is.na(NONCCDS_orfs[,"ORF_pept"]),] + +write.table(NONCCDS_orfs,file="orfs_found_nonccds",quote=F,row.names=F,sep="\t",col.names=T) + +options(scipen=999) + + +print(paste("--- checking non-CCDS ORF coverage and multi-mapping ratio,",date(),sep=" ")) + + + + +NONCCDS_orfs$ORF_id_tr_minus2<-paste(NONCCDS_orfs$transcript_id,NONCCDS_orfs$start_pos,NONCCDS_orfs$st2vect+2,sep="_") + +all_sORFs_noncod_multi<-NONCCDS_orfs + + +ex_to_check<-strsplit(all_sORFs_noncod_multi$to_check,split=";") + +ex_to_check<-unique(unlist(ex_to_check)) + +ex_to_check_spl<-strsplit(ex_to_check,split="_") + +bedfiles_to_check<-data.frame(chr=NA,start=NA,end=NA,type=NA,gene_id=NA,strand=NA) +for(h in 1:length(ex_to_check_spl)){ + to_bed<-ex_to_check_spl[[h]] + bedfiles_to_check[h,"chr"]<-to_bed[1] + bedfiles_to_check[h,"start"]<-to_bed[2] + bedfiles_to_check[h,"end"]<-to_bed[3] + bedfiles_to_check[h,"type"]<-to_bed[4] + bedfiles_to_check[h,"gene_id"]<-to_bed[5] + bedfiles_to_check[h,"strand"]<-to_bed[6] + +} + +write.table(bedfiles_to_check,file="bed_tocheck_nonccds.bed",quote=F,row.names=F,sep="\t",col.names=F) 
# ---- Multi-mapping statistics and best transcript per non-CCDS ORF ----
# Runs the two helper scripts on the candidate-ORF BED file, combines their
# per-region read/coverage table with the rows of `nonccds_res` for the exons
# lying entirely inside an ORF (`to_check_rem`), aggregates those numbers per
# ORF and finally keeps a single transcript per gene/peptide/method.
# Uses from the enclosing script: args, nonccds_res, all_sORFs_noncod_multi.

# Run a shell command; warn (without aborting) when it exits non-zero so that a
# failing helper is reported where it happens instead of by a later read error.
run_helper <- function(cmd) {
  status <- system(cmd)
  if (!isTRUE(status == 0)) {
    warning("command exited with status ", status, ": ", cmd, call. = FALSE)
  }
  invisible(status)
}

scr <- paste(args[2], "analyze_multi_clust.bash", sep = "/")
syst_scr <- paste(scr, "bed_tocheck_nonccds.bed bed_tocheck_nonccds", args[3], sep = " ")
run_helper(syst_scr)

scr <- paste(args[2], "include_multi_nomerge.R", sep = "/")
syst_scr <- paste(scr, "bed_tocheck_nonccds", sep = " ")
run_helper(syst_scr)

res_to_check <- read.table(file = "multi_table_bed_tocheck_nonccds",
                           header = TRUE, stringsAsFactors = FALSE)

# Intermediate files are parked in tmp_nonccds/, which becomes the working
# directory for the following steps.
dir.create("tmp_nonccds", showWarnings = FALSE)
run_helper("mv *tocheck_nonccds* tmp_nonccds/")
setwd("tmp_nonccds")

print(paste("--- Selecting best transcript per non-CCDS ORF,", date(), sep = " "))

if (any(!is.na(all_sORFs_noncod_multi$to_check_rem))) {
  # Exons fully contained in an ORF were already quantified in nonccds_res:
  # reuse those rows, renamed to the layout of res_to_check.
  ex_rem <- strsplit(all_sORFs_noncod_multi$to_check_rem, split = ";")
  ex_rem <- unique(unlist(ex_rem))
  ex_rem <- ex_rem[!is.na(ex_rem)]

  res_ex_rem <- nonccds_res[
    nonccds_res[, "exon_id"] %in% ex_rem,
    c("exon_id", "strand.x", "length.y",
      "reads_ribo", "reads_multi_ribo",
      "pct_region_covered_ribo", "pct_covered_onlymulti_ribo",
      "reads_rna", "reads_multi_rna",
      "pct_region_covered_rna", "pct_covered_onlymulti_rna")
  ]
  names(res_ex_rem) <- names(res_to_check)

  res_all_multi <- rbind.data.frame(res_ex_rem, res_to_check)
} else {
  res_all_multi <- res_to_check
}

# Region ids may be stored with or without the trailing strand; keep both forms
# so either can be matched below.
res_all_multi$exon_id_2 <- paste(res_all_multi$exon_id, res_all_multi$strand, sep = "_")

all_sORFs_noncod_multi$to_check_ALL <- paste(all_sORFs_noncod_multi$to_check,
                                             all_sORFs_noncod_multi$to_check_rem,
                                             sep = ";")

# One iteration per candidate ORF: pool the statistics of all its regions.
# seq_len() keeps the range empty (instead of c(1, 0)) if there are no rows.
all_sORFs_noncod_multi_final <- foreach(g = seq_len(nrow(all_sORFs_noncod_multi)),
                                        .combine = rbind,
                                        .multicombine = TRUE) %dopar% {
  s <- all_sORFs_noncod_multi[g, ]
  list_ex <- strsplit(s$to_check_ALL, split = ";")[[1]]
  with_exon2 <- which(res_all_multi[, "exon_id_2"] %in% list_ex)
  with_exon1 <- which(res_all_multi[, "exon_id"] %in% list_ex)
  to_take <- unique(c(with_exon2, with_exon1))
  res_multi <- res_all_multi[to_take, ]

  # Read counts are summed; covered fractions are averaged weighted by region
  # length (length.y). Column creation order is kept because it defines the
  # column order of the combined result.
  res_multi$reads_ribo <- sum(res_multi$reads_ribo)
  res_multi$reads_multi_ribo <- sum(res_multi$reads_multi_ribo)
  res_multi$pct_region_covered_ribo_ALL <- res_multi$pct_region_covered_ribo * res_multi$length.y
  res_multi$pct_covered_onlymulti_ribo_ALL <- res_multi$pct_covered_onlymulti_ribo * res_multi$length.y
  res_multi$pct_region_covered_ribo <- sum(res_multi$pct_region_covered_ribo_ALL) / sum(res_multi$length.y)
  res_multi$pct_covered_onlymulti_ribo <- sum(res_multi$pct_covered_onlymulti_ribo_ALL) / sum(res_multi$length.y)
  res_multi$reads_rna <- sum(res_multi$reads_rna)
  res_multi$reads_multi_rna <- sum(res_multi$reads_multi_rna)
  res_multi$pct_region_covered_rna_ALL <- res_multi$pct_region_covered_rna * res_multi$length.y
  res_multi$pct_covered_onlymulti_rna_ALL <- res_multi$pct_covered_onlymulti_rna * res_multi$length.y
  res_multi$pct_region_covered_rna <- sum(res_multi$pct_region_covered_rna_ALL) / sum(res_multi$length.y)
  res_multi$pct_covered_onlymulti_rna <- sum(res_multi$pct_covered_onlymulti_rna_ALL) / sum(res_multi$length.y)

  # Every row now carries the same pooled values; the first one is attached.
  s <- cbind(s, res_multi[1, ])
  s
}

# For every peptide choose one transcript: per gene/peptide/method keep the
# transcript(s) with the highest RNA_sites and, among those, the smallest
# `length`.
orf_key_cols <- c("ORF_id_tr_minus2", "length", "gene_id", "ORF_pept", "Method", "RNA_sites")

agg <- aggregate(x = all_sORFs_noncod_multi_final[, "RNA_sites"],
                 by = list(all_sORFs_noncod_multi_final[, "gene_id"],
                           all_sORFs_noncod_multi_final[, "ORF_pept"],
                           all_sORFs_noncod_multi_final[, "Method"]),
                 FUN = max)
names(agg) <- c("gene_id", "ORF_pept", "Method", "RNA_sites")
agg2 <- merge(x = all_sORFs_noncod_multi_final[, orf_key_cols], agg,
              by = c("gene_id", "ORF_pept", "Method", "RNA_sites"))

agg3 <- aggregate(x = agg2[, "length"],
                  by = list(agg2[, "gene_id"], agg2[, "ORF_pept"],
                            agg2[, "Method"], agg2[, "RNA_sites"]),
                  FUN = min)
names(agg3) <- c("gene_id", "ORF_pept", "Method", "RNA_sites", "length")
agg4 <- merge(x = all_sORFs_noncod_multi_final[, orf_key_cols], agg3,
              by = c("gene_id", "ORF_pept", "Method", "length", "RNA_sites"))

all_sORFs_noncod_multi_final <- all_sORFs_noncod_multi_final[
  all_sORFs_noncod_multi_final[, "ORF_id_tr_minus2"] %in% agg4[, "ORF_id_tr_minus2"],
]
# ---- Periodic ORFs and their intersection with annotated CDS regions ----
# Keeps ORFs with a significant multitaper P-value, writes all their regions
# as a sorted BED file and uses intersectBed -v to keep the regions that do
# NOT overlap any annotated CDS (sORFs_totest_nocds.bed).
# Uses from the enclosing script: all_sORFs_noncod_multi_final, NONCCDS_orfs, args.

all_sORFs_noncod_periodic <- all_sORFs_noncod_multi_final[
  all_sORFs_noncod_multi_final[, "ORF_pval_multi_ribo"] < 0.05,
]
# Comparisons against NA P-values produce all-NA rows; drop them.
all_sORFs_noncod_periodic <- all_sORFs_noncod_periodic[
  !is.na(all_sORFs_noncod_periodic[, "transcript_id"]),
]

noncod_all <- NONCCDS_orfs
noncod_found <- all_sORFs_noncod_periodic

# Without any periodic ORF the steps below cannot run (the original code
# failed inside strsplit() here); stop with an explicit message instead.
if (nrow(noncod_found) == 0) {
  stop("no non-CCDS ORF passed the periodicity filter (ORF_pval_multi_ribo < 0.05)",
       call. = FALSE)
}

# to_check_ALL is "region;region;...;NA" when there are no internal exons, so
# the literal "NA" entries are not counted.
noncod_found$n_exons_ORF <- vapply(strsplit(noncod_found$to_check_ALL, split = ";"),
                                   FUN = function(x) sum(x != "NA"),
                                   FUN.VALUE = integer(1))

print(paste("--- Checking non-CCDS ORFs intersections with annotated CDS regions,",
            date(), sep = " "))

ex_to_check <- strsplit(noncod_found$to_check_ALL, split = ";")
ex_to_check_spl <- unique(unlist(ex_to_check))
ex_to_check_spl <- strsplit(ex_to_check_spl, split = "_")

# Region ids are "chr_start_end_type_gene_strand"; take the first six fields of
# each id (missing ones become NA) and build the BED table in one step instead
# of growing a data frame row by row.
bed_fields <- c("chr", "start", "end", "type", "gene_id", "strand")
bed_matrix <- t(vapply(ex_to_check_spl,
                       FUN = function(parts) parts[seq_along(bed_fields)],
                       FUN.VALUE = character(length(bed_fields))))
bedfiles_to_check <- as.data.frame(bed_matrix, stringsAsFactors = FALSE)
names(bedfiles_to_check) <- bed_fields

write.table(bedfiles_to_check, file = "sORFs_totest", quote = FALSE,
            row.names = FALSE, sep = "\t", col.names = FALSE)

system("sort -k1,1 -k2,2n sORFs_totest > sORFs_totest.bed")

# Reading the file back turns the literal "NA" placeholder rows into real NAs,
# which are then removed before the file is rewritten.
bedfiles_to_check <- read.table("sORFs_totest.bed", stringsAsFactors = FALSE, header = FALSE)
colnames(bedfiles_to_check) <- bed_fields
bedfiles_to_check <- bedfiles_to_check[!is.na(bedfiles_to_check[, "chr"]), ]
write.table(bedfiles_to_check, file = "sORFs_totest.bed", quote = FALSE,
            row.names = FALSE, sep = "\t", col.names = FALSE)

# Intersect out ORFs overlapping CDS regions.
# NOTE(review): args[1]/args[3] are used after the earlier setwd("tmp_nonccds");
# presumably they are absolute paths - confirm against the calling shell script.
fhalf_scr <- paste(args[3], "intersectBed -v -a sORFs_totest.bed -b", sep = "/")
shalf_scr <- paste(args[1], "all_cds.bed > sORFs_totest_nocds.bed", sep = "/")
intersect_status <- system(paste(fhalf_scr, shalf_scr, sep = " "))
if (!isTRUE(intersect_status == 0)) {
  # The shell redirection creates the output file even when intersectBed fails,
  # so an empty result would otherwise be indistinguishable from "no hits".
  warning("intersectBed exited with status ", intersect_status, call. = FALSE)
}

# Count the result lines in R: parsing `wc -l` by splitting on a single blank
# breaks on platforms whose wc pads the count with leading spaces.
lines_in_file <- as.numeric(length(readLines("sORFs_totest_nocds.bed")))
# ---- Split periodic ORFs by overlap with annotated CDS exons ----
# Reads the regions that survived `intersectBed -v` (no CDS overlap), rebuilds
# their ids and flags every ORF having at least one region that is missing
# from that set, i.e. that overlaps an annotated CDS.
# Uses from the enclosing script: lines_in_file, bedfiles_to_check,
# ex_to_check, noncod_found.

if (lines_in_file > 0) {
  results_nonoverlapcdss <- read.table("sORFs_totest_nocds.bed",
                                       stringsAsFactors = FALSE, header = FALSE)
  names(results_nonoverlapcdss) <- names(bedfiles_to_check)

  # Ids are "chr_start_end_type_gene[_strand]"; the strand suffix is appended
  # only where a strand value is present.
  id_without_strand <- paste(results_nonoverlapcdss[, "chr"],
                             results_nonoverlapcdss[, "start"],
                             results_nonoverlapcdss[, "end"],
                             results_nonoverlapcdss[, "type"],
                             results_nonoverlapcdss[, "gene_id"],
                             sep = "_")
  region_strand <- results_nonoverlapcdss[, "strand"]
  results_nonoverlapcdss[, "exon_id"] <- ifelse(
    is.na(region_strand),
    id_without_strand,
    paste(id_without_strand, region_strand, sep = "_")
  )
} else {
  results_nonoverlapcdss <- data.frame(exon_id = NA, stringsAsFactors = FALSE)
  print("Warning! No ncORFs found! all ORFs overlap annotated CDS exons ")
}

# TRUE when any region of the ORF (ignoring the literal "NA" placeholder) is
# absent from the non-overlapping set.
overl_cds <- vapply(ex_to_check,
                    FUN = function(region_ids) {
                      region_ids <- region_ids[region_ids != "NA"]
                      any(!region_ids %in% results_nonoverlapcdss$exon_id)
                    },
                    FUN.VALUE = logical(1))

# Divides in overlcds (non-CCDS coding ORFs) and ncORFs.
noncod_found_overl <- noncod_found[overl_cds, ]
noncod_found_overl_sign <- noncod_found_overl[noncod_found_overl[, "ORF_pval_multi_ribo"] < 0.05, ]
noncod_found_overl_sign <- noncod_found_overl_sign[!is.na(noncod_found_overl_sign[, "transcript_id"]), ]
noncod_found_overl_sign_nomultifilt <- noncod_found_overl_sign

# Multi-mapping filter: less than 30% of the covered positions may be covered
# by multi-mapping reads only, and more than 30% of the region must be covered.
# NA comparisons yield all-NA rows, removed by the transcript_id check.
noncod_found_overl_sign <- noncod_found_overl_sign[
  (noncod_found_overl_sign$pct_covered_onlymulti_ribo /
     noncod_found_overl_sign$pct_region_covered_ribo) < 0.3,
]
noncod_found_overl_sign <- noncod_found_overl_sign[noncod_found_overl_sign$pct_region_covered_ribo > 0.3, ]
noncod_found_overl_sign <- noncod_found_overl_sign[!is.na(noncod_found_overl_sign[, "transcript_id"]), ]

# ORFs not overlapping any annotated CDS.
noncod_found <- noncod_found[!overl_cds, ]
noncod_found_nofiltmult <- noncod_found
# ---- Write the non-CCDS ORF result tables, one directory per method ----
# Uses from the enclosing script: noncod_found, noncod_found_nofiltmult,
# noncod_found_overl_sign, noncod_found_overl_sign_nomultifilt, noncod_all,
# NONCCDS_orfs.

# Multi-mapping filter for the ORFs without CDS overlap (NA rows produced by
# the comparison are dropped through the transcript_id check).
noncod_found <- noncod_found[
  (noncod_found$pct_covered_onlymulti_ribo / noncod_found$pct_region_covered_ribo) < 0.3,
]
noncod_found <- noncod_found[!is.na(noncod_found$transcript_id), ]

# Back to the directory the script was started in (tmp_nonccds was entered
# earlier in this script).
setwd("../")

dir.create("ORFs_NONCCDS", showWarnings = FALSE)
for (method in c("best_periodicity", "max_P_sites", "more_tapers")) {
  dir.create(file.path("ORFs_NONCCDS", method), showWarnings = FALSE)
}

# Tab-separated table with header, no quoting, no row names.
write_orf_table <- function(tbl, path) {
  write.table(tbl, file = path, quote = FALSE, row.names = FALSE,
              sep = "\t", col.names = TRUE)
}

# NOTE(review): the CDS-overlapping ORFs go to the "*_nocds_*" files and the
# non-overlapping ones to the "*_cds_*" files; the names are kept exactly as
# they were - confirm the intended meaning against the downstream consumers.
for (method in c("best_periodicity", "max_P_sites")) {
  out_dir <- file.path("ORFs_NONCCDS", method)

  ORFs_sign_filtered_multi <- noncod_found_overl_sign[noncod_found_overl_sign[, "Method"] == method, ]
  ORFs_sign <- noncod_found_overl_sign_nomultifilt[
    noncod_found_overl_sign_nomultifilt[, "Method"] == method,
  ]
  write_orf_table(ORFs_sign_filtered_multi, file.path(out_dir, "ORFs_sign_nocds_filtered_multi"))
  write_orf_table(ORFs_sign, file.path(out_dir, "ORFs_sign_nocds_nofilter"))

  noncod_overl_cds <- noncod_found[noncod_found[, "Method"] == method, ]
  write_orf_table(noncod_overl_cds, file.path(out_dir, "ORFs_sign_cds_filtered_multi"))

  noncod_overl_cds_nofilt <- noncod_found_nofiltmult[noncod_found_nofiltmult[, "Method"] == method, ]
  write_orf_table(noncod_overl_cds_nofilt, file.path(out_dir, "ORFs_sign_cds_notfiltered_multi"))

  # Candidates of genes without any retained ORF, with more than 10 P-sites.
  noncod_all_meth <- noncod_all[noncod_all[, "Method"] == method, ]
  noncod_noORF <- noncod_all_meth[!noncod_all_meth[, "gene_id"] %in% noncod_found[, "gene_id"], ]
  noncod_noORF <- noncod_noORF[!is.na(noncod_noORF[, "transcript_id"]), ]
  noncod_noORF <- noncod_noORF[noncod_noORF[, "P_sites_sum"] > 10, ]
  write_orf_table(noncod_noORF, file.path(out_dir, "noncod_noORF"))

  ORF_all <- NONCCDS_orfs[NONCCDS_orfs[, "Method"] == method, ]
  write_orf_table(ORF_all, file.path(out_dir, "ORFs_all"))
}

# "more_tapers": only the first table is written here; the statements that
# follow this block write the remaining more_tapers tables and rely on
# ORFs_sign holding the more_tapers selection.
ORFs_sign <- noncod_found_overl_sign_nomultifilt[
  noncod_found_overl_sign_nomultifilt[, "Method"] == "more_tapers",
]
ORFs_sign_filtered_multi <- noncod_found_overl_sign[noncod_found_overl_sign[, "Method"] == "more_tapers", ]
write_orf_table(ORFs_sign_filtered_multi, "ORFs_NONCCDS/more_tapers/ORFs_sign_nocds_filtered_multi")
+write.table(ORFs_sign,file="ORFs_NONCCDS/more_tapers/ORFs_sign_nocds_nofilter",quote=F,row.names=F,sep="\t",col.names=T) + +noncod_overl_cds_nofilt<-noncod_found_nofiltmult[noncod_found_nofiltmult[,"Method"]=="more_tapers",] +write.table(noncod_overl_cds_nofilt,file="ORFs_NONCCDS/more_tapers/ORFs_sign_cds_notfiltered_multi",quote=F,row.names=F,sep="\t",col.names=T) + + +noncod_overl_cds<-noncod_found[noncod_found[,"Method"]=="more_tapers",] +write.table(noncod_overl_cds,file="ORFs_NONCCDS/more_tapers/ORFs_sign_cds_filtered_multi",quote=F,row.names=F,sep="\t",col.names=T) + +noncod_all_meth<-noncod_all[noncod_all[,"Method"]=="more_tapers",] +noncod_noORF<-noncod_all_meth[!noncod_all_meth[,"gene_id"]%in%noncod_found[,"gene_id"],] +noncod_noORF<-noncod_noORF[!is.na(noncod_noORF[,"transcript_id"]),] +noncod_noORF<-noncod_noORF[noncod_noORF[,"P_sites_sum"]>10,] + +write.table(noncod_noORF,file="ORFs_NONCCDS/more_tapers/noncod_noORF",quote=F,row.names=F,sep="\t",col.names=T) +ORF_all<-NONCCDS_orfs[NONCCDS_orfs[,"Method"]=="more_tapers",] +write.table(ORF_all,file="ORFs_NONCCDS/more_tapers/ORFs_all",quote=F,row.names=F,sep="\t",col.names=T) + + +print(paste("--- non-CCDS ORF finding Done!","---",date(),sep=" ")) + + + + diff --git a/scripts/ORF_final_results.R b/scripts/ORF_final_results.R new file mode 100755 index 0000000..21f6870 --- /dev/null +++ b/scripts/ORF_final_results.R @@ -0,0 +1,208 @@ +#!/usr/bin/Rscript + +################################################################### +# This file is part of RiboTaper. +# RiboTaper is a method for defining traslated ORFs using +# Ribosome Profiling data. +# +# Copyright (C) 2015 Lorenzo Calviello +# +# RiboTaper is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
#
# RiboTaper is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with RiboTaper. If not, see <http://www.gnu.org/licenses/>.
#
# Contact: Lorenzo.Calviello@mdc-berlin.de
#######################################################################


### Script for plotting general results about the identified ORFs; takes no
### arguments. Reads "ORFs_max_filt" and "ORFs_max" from the working directory.

print(paste("--- plotting ORF finding results ---", date(), sep = " "))
ORFs <- read.table("ORFs_max_filt", stringsAsFactors = FALSE, header = TRUE, quote = "")
ORFs_all <- read.table("ORFs_max", stringsAsFactors = FALSE, header = TRUE, quote = "")

# Cross-tabulate category x annotation and keep the non-empty combinations;
# the count column is named `count_name`.
count_by_class <- function(orf_table, count_name) {
  counts <- data.frame(table(orf_table$category, orf_table$annotation))
  names(counts) <- c("category", "annotation", count_name)
  counts[counts[, count_name] > 0, ]
}

class_keys <- c("category", "annotation")

# ORF and gene counts, for the unfiltered and the filtered ORF sets. A gene is
# counted once per category/annotation combination.
df <- count_by_class(ORFs_all, "n_ORFs")

ORFs_genes <- unique(ORFs_all[, c(class_keys, "gene_id")])
df_genes <- count_by_class(ORFs_genes, "n_genes")

df_filt <- count_by_class(ORFs, "n_ORFs_filtered")

ORFs_genes_filt <- unique(ORFs[, c(class_keys, "gene_id")])
df_genes_filt <- count_by_class(ORFs_genes_filt, "n_genes_filtered")

# Left-join everything onto the unfiltered ORF counts, in the order
# filtered ORFs -> genes -> filtered genes, then sort by number of ORFs.
df_new <- Reduce(
  function(left, right) merge(left, right, by = class_keys, all.x = TRUE),
  list(df, df_filt, df_genes, df_genes_filt)
)
df_new <- df_new[order(df_new$n_ORFs, decreasing = TRUE), ]

write.table(x = df_new, file = "ORFs_genes_found", quote = FALSE,
            row.names = FALSE, col.names = TRUE, sep = "\t")
+
+# Lump the values of column `group_col` for plotting.
+#   orfs        data frame of ORFs (may have zero rows, returned untouched)
+#   group_col   name of the column whose values define the plotted groups
+#   other_label label given to every group ranked below the four most frequent
+# Returns `orfs` with rare groups relabelled in `group_col` and, when at least
+# four groups remain, `category` set to a factor of `group_col` ordered by
+# decreasing frequency.
+lump_top_groups<-function(orfs,group_col,other_label){
+  if(nrow(orfs)==0){
+    return(orfs)
+  }
+  tb<-as.data.frame(table(orfs[[group_col]]),stringsAsFactors=FALSE)
+  names(tb)<-c("group","counts")
+  tb<-tb[order(tb$counts,decreasing=TRUE),]
+  lvls<-tb$group
+  if(length(lvls)>4){
+    rare<-lvls[5:length(lvls)]
+    orfs[orfs[,group_col]%in%rare,group_col]<-other_label
+    lvls<-c(lvls[1:4],other_label)
+  }
+  # NOTE(review): with fewer than four groups `category` is left as it was, so
+  # ncORFs keep the single label "ncORFS" - kept as in the original, confirm intent.
+  if(length(lvls)>=4){
+    orfs$category<-factor(orfs[,group_col],levels=lvls)
+  }
+  orfs
+}
+
+plot_columns<-c("category","ORF_length","ORF_P_sites")
+
+# Filtered ORFs: coding ORFs grouped by category, ncORFs grouped by annotation.
+ORFs_coding<-lump_top_groups(ORFs[ORFs[,"annotation"]=="protein_coding",],"category","other_coding")
+ncORFs<-lump_top_groups(ORFs[ORFs[,"category"]=="ncORFS",],"annotation","other_ncORFs")
+all_filt<-rbind.data.frame(ORFs_coding[,plot_columns],ncORFs[,plot_columns])
+
+# Same grouping for the unfiltered ORFs.
+ORFs_coding<-lump_top_groups(ORFs_all[ORFs_all[,"annotation"]=="protein_coding",],"category","other_coding")
+ncORFs<-lump_top_groups(ORFs_all[ORFs_all[,"category"]=="ncORFS",],"annotation","other_ncORFs")
+all<-rbind.data.frame(ORFs_coding[,plot_columns],ncORFs[,plot_columns])
+
+
+# One colour per plotted group, in factor-level order.
+orf_cols<-c("red","dark red","yellow","orange","grey","dark blue","blue","cornflowerblue","cyan4","grey")
+
+pdf(file="Final_ORF_results.pdf",width=7,height=10,onefile=TRUE,title="ORFs_results")
+par(mar=c(10,4,4,4))
+par(mfrow=c(2,2))
+barplot(table(all_filt$category),col=orf_cols,ylab="ORFs_filtered",las=2)
+grid(lwd=1.2,col="black")
+barplot(log10(table(all_filt$category)),col=orf_cols,ylab="ORFs_filtered(logscale)",yaxt="n",las=2)
+axis(side=2,at=0:4,labels=10^(0:4))
+grid(lwd=1.2,col="black")
+
+barplot(table(all$category),col=orf_cols,ylab="ORFs_all",las=2)
+grid(lwd=1.2,col="black")
+barplot(log10(table(all$category)),col=orf_cols,ylab="ORFs_all(logscale)",yaxt="n",las=2)
+axis(side=2,at=0:4,labels=10^(0:4))
+grid(lwd=1.2,col="black")
+
+
+
+par(mfrow=c(3,1))
+
+# Fix: this page is titled "ORFs_filtered" but used to plot `all`, which made
+# it identical to the "ORFs_all" page below; it now plots the filtered set.
+boxplot(log10(all_filt$ORF_P_sites)~all_filt$category,col=orf_cols,ylab="ORF_P_sites",yaxt="n",main="ORFs_filtered",las=2)
+axis(side=2,at=1:4,labels=10^(1:4),las=2)
+grid(lwd=1.2,col="black")
+boxplot(log10(all_filt$ORF_length)~all_filt$category,col=orf_cols,ylab="ORF_length",yaxt="n",las=2)
+axis(side=2,at=1:4,labels=10^(1:4),las=2)
+grid(lwd=1.2,col="black")
+boxplot((all_filt$ORF_P_sites/(all_filt$ORF_length/3))~all_filt$category,col=orf_cols,ylab="ORF_P_sites_per_codon",ylim=c(0,6),las=2)
+grid(lwd=1.2,col="black")
+
+par(mfrow=c(3,1))
+
+boxplot(log10(all$ORF_P_sites)~all$category,col=orf_cols,ylab="ORF_P_sites",yaxt="n",main="ORFs_all",las=2)
+axis(side=2,at=1:4,labels=10^(1:4),las=2)
+grid(lwd=1.2,col="black")
+boxplot(log10(all$ORF_length)~all$category,col=orf_cols,ylab="ORF_length",yaxt="n",las=2)
+axis(side=2,at=1:4,labels=10^(1:4),las=2)
+grid(lwd=1.2,col="black")
+boxplot((all$ORF_P_sites/(all$ORF_length/3))~all$category,col=orf_cols,ylab="ORF_P_sites_per_codon",ylim=c(0,6),las=2)
+grid(lwd=1.2,col="black")
+
+
+dev.off()
+
+
+print(paste("--- ORF finding results Done!
---",date(),sep= " "))
+
diff --git a/scripts/P_sites_RNA_sites_calc.bash b/scripts/P_sites_RNA_sites_calc.bash
new file mode 100755
index 0000000..a72716c
--- /dev/null
+++ b/scripts/P_sites_RNA_sites_calc.bash
@@ -0,0 +1,75 @@
+#!/bin/bash
+
+
+###################################################################
+# This file is part of RiboTaper.
+# RiboTaper is a method for defining translated ORFs using
+# Ribosome Profiling data.
+#
+# Copyright (C) 2015 Lorenzo Calviello
+#
+# RiboTaper is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# RiboTaper is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with RiboTaper. If not, see <http://www.gnu.org/licenses/>.
+#
+# Contact: Lorenzo.Calviello@mdc-berlin.de
+#######################################################################
+
+
+##This script calculates P-sites and RNA sites positions, it uses as arguments a comma-separated list of read lengths to be used, a comma-separated list of offsets, the bedtools exec directory
+
+# NOTE(review): the usage text below lists no arguments; the argument names
+# look like they were lost (angle-bracket text stripped) - restore from upstream.
+if [ $# -ne 3 ]; then
+    echo "------------Usage: P_sites_RNA_sites_calc.bash "
+    exit 1
+fi
+
+lengths=$1
+
+# Split the comma-separated list into a bash array.
+lengths=(${lengths//,/ })
+
+bedtools_dir=$3
+
+offsets=$2
+offsets=(${offsets//,/ })
+
+# Every read length needs exactly one offset.
+if [ ${#lengths[@]} -ne ${#offsets[@]} ]; then
+    echo ${#lengths[@]}
+    echo ${#offsets[@]}
+    echo "------------The number of read lengths and offsets differ! Insert comma-separated value for read lengths and offsets e.g. 28,29 11,12 Usage: P_sites_RNA_sites_calc.bash "
+    exit 1
+
+fi
+n_frag=${#lengths[@]}
+
+for (( i=0; i<${n_frag}; i++ ));
+do
+    len=${lengths[$i]}
+    offs=${offsets[$i]}
+    echo "------------processing" $len "nt reads with offset of +" $offs
+    # Step 1: keep the BED12 records of RIBO_best.bam whose first three block
+    # sizes (column 11) sum to $len and write them, with that sum appended,
+    # to tmp_align_len. The file is only created when a record matches.
+    $bedtools_dir"/bamToBed" -cigar -bed12 -i RIBO_best.bam | awk -v env_var=$len -F"\t" '{split($11,c,","); if((c[1]+c[2]+c[3])==env_var) print $0 "\t" c[1]+c[2]+c[3] > "tmp_align_len"}'
+    # Step 2: append the sum of up to five block sizes as the last column ($NF),
+    # move column 2 by the offset across the blocks, set column 3 to column 2 + 1,
+    # and keep only rows with column 2 > 0.
+    # NOTE(review): the offset awk program is damaged in this copy - "c[1]env_var"
+    # and "(c[1]+c[2]+c[3])=$NF-env_var" look like comparisons whose "<...>"
+    # spans were stripped; it must be restored from the upstream script.
+    # NOTE(review): the last branch tests $6=="+" but uses the $NF-env_var
+    # arithmetic of the three "-" branches before it - confirm the strand.
+    less tmp_align_len | awk -F"\t" '{split($11,c,","); print $0 "\t" c[1]+c[2]+c[3]+c[4]+c[5] }' | awk -v env_var=$offs '{split($11,c,","); split($12,d,","); if($6=="+" && c[1]env_var) $2=($2)+d[2]+(env_var-c[1]); if($6=="+" && c[1]>=env_var) $2=($2)+env_var; if($6=="+" && c[1]env_var) $2=($2)+d[3]+(env_var-c[1]-c[2]); if($6=="+" && (c[1]+c[2]+c[3])=$NF-env_var) $2=$2+$NF-env_var-1; if($6=="-" && c[1]<($NF-env_var) && (c[1]+c[2])>=($NF-env_var)) $2=($2)+d[2]+(($NF-env_var-1)-c[1]); if($6=="-" && c[1]<($NF-env_var) && (c[1]+c[2])<($NF-env_var) && (c[1]+c[2]+c[3])>=($NF-env_var)) $2=($2)+d[3]+(($NF-env_var-1)-c[1]-c[2]); if($6=="+" && (c[1]+c[2]+c[3])<($NF-env_var)) $2=($2)+d[4]+(($NF-env_var)-c[1]-c[2]-c[3]); print $0}' OFS="\t" | awk '{$3=$2+1 ; print $0 }' OFS="\t" | awk '{if($2>0) print $0}' OFS="\t" > P_sites_len
+    rm tmp_align_len
+    mv P_sites_len tmp_P_sites_"$len"
+
+
+done
+
+# Merge the per-length results into one file and drop the intermediates.
+cat tmp_P_sites_* > P_sites_all
+rm tmp_P_sites_*
+echo "------------Done!"
+
+echo "------------processing RNA-seq with offset of + 25"
+
+# Same pipeline as for the P-sites, with a fixed offset of 25 and no read-length
+# filter; the shifted 1-nt positions are written to Centered_RNA.
+# NOTE(review): the offset awk program is damaged in this copy - "c[1]env_var"
+# and "(c[1]+c[2]+c[3])=$NF-env_var" look like comparisons whose "<...>" spans
+# were stripped; restore from the upstream script. The last branch also tests
+# $6=="+" while using the "-" strand arithmetic - confirm.
+$bedtools_dir"/bamToBed" -cigar -bed12 -i RNA_best.bam | awk -F"\t" '{split($11,c,","); print $0 "\t" c[1]+c[2]+c[3]+c[4]+c[5] }' | awk -v env_var=25 '{split($11,c,","); split($12,d,","); if($6=="+" && c[1]env_var) $2=($2)+d[2]+(env_var-c[1]); if($6=="+" && c[1]>=env_var) $2=($2)+env_var; if($6=="+" && c[1]env_var) $2=($2)+d[3]+(env_var-c[1]-c[2]); if($6=="+" && (c[1]+c[2]+c[3])=$NF-env_var) $2=$2+$NF-env_var-1; if($6=="-" && c[1]<($NF-env_var) && (c[1]+c[2])>=($NF-env_var)) $2=($2)+d[2]+(($NF-env_var-1)-c[1]); if($6=="-" && c[1]<($NF-env_var) && (c[1]+c[2])<($NF-env_var) && (c[1]+c[2]+c[3])>=($NF-env_var)) $2=($2)+d[3]+(($NF-env_var-1)-c[1]-c[2]); if($6=="+" && (c[1]+c[2]+c[3])<($NF-env_var)) $2=($2)+d[4]+(($NF-env_var)-c[1]-c[2]-c[3]); print $0}' OFS="\t" | awk '{$3=$2+1 ; print $0 }' OFS="\t" | awk '{if($2>0) print $0}' OFS="\t" > Centered_RNA
+
+echo "------------P_sites and RNA_sites calculated !!!"
+
diff --git a/scripts/Ribotaper.sh b/scripts/Ribotaper.sh
new file mode 100755
index 0000000..50b220d
--- /dev/null
+++ b/scripts/Ribotaper.sh
@@ -0,0 +1,191 @@
+#!/bin/bash
+
+
+###################################################################
+# This file is part of RiboTaper.
+# RiboTaper is a method for defining translated ORFs using
+# Ribosome Profiling data.
+#
+# Copyright (C) 2015 Lorenzo Calviello
+#
+# RiboTaper is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# RiboTaper is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with RiboTaper.
If not, see <http://www.gnu.org/licenses/>.
+#
+# Contact: Lorenzo.Calviello@mdc-berlin.de
+#######################################################################
+
+### RiboTaper master File
+
+
+# Abort the whole pipeline as soon as any step fails.
+set -e
+
+# Positional arguments, in the order they are assigned further down.
+if [ $# -ne 8 ]; then
+    echo "Usage: ./Ribotaper.sh <ribo_bam> <rna_bam> <annot_dir> <read_len> <cutoffs> <scripts_dir> <bedtools_dir> <n_of_cores>"
+    exit 1
+fi
+if ! [[ -f "$1" ]]; then
+    echo "!!!!! ribo_bam file not found!."
+    exit 1
+fi
+
+# Fix: this check covers the second argument, so it must name rna_bam.
+if ! [[ -f "$2" ]]; then
+    echo "!!!!! rna_bam file not found!."
+    exit 1
+fi
+
+if [ ! -d "$3" ]; then
+    echo "!!!!! annotation_directory not found!."
+    exit 1
+fi
+
+if [ ! -d "$6" ]; then
+    echo "!!!!! scripts_directory not found!."
+    exit 1
+fi
+
+if [ ! -d "$7" ]; then
+    echo "!!!!! bedtools_directory not found!."
+    exit 1
+fi
+
+
+re='^[0-9]+$'
+if ! [[ "$8" =~ $re ]] ; then
+    echo "!!!!! n of cores not valid"
+    exit 1
+fi
+
+
+# Fix: reject every value below 2 (0 used to pass), as the message demands.
+if [ "$8" -lt 2 ]; then
+    echo "!!!!! n of cores required >1."
+    exit 1
+fi
+
+
+
+# Resolve to absolute paths; quoted so paths containing spaces survive.
+ribo_bam="$(readlink -f "$1")"
+rna_bam="$(readlink -f "$2")"
+annot_dir="$(readlink -f "$3")"
+read_len=$4
+cutoffs=$5
+scripts_dir="$(readlink -f "$6")"
+bedtools_dir="$(readlink -f "$7")"
+n_of_cores=$8
+
+
+echo "Parameters used:"
+echo ""
+
+echo " $ribo_bam"
+echo " $rna_bam"
+echo " $annot_dir"
+echo " $read_len"
+echo " $cutoffs"
+echo " $scripts_dir"
+echo " $bedtools_dir"
+echo " $8"
+echo ""
+echo "---------------"
+echo ""
+
+
+
+#take bams for unique and best alignments
+
+echo "Taking unique - best alignments..."
+
+# -q 50 keeps alignments with MAPQ >= 50; -F 0X100 drops secondary alignments.
+samtools view -b -q 50 "$ribo_bam" > RIBO_unique.bam
+samtools view -b -F 0X100 "$ribo_bam" > RIBO_best.bam
+
+samtools view -b -q 50 "$rna_bam" > RNA_unique.bam
+samtools view -b -F 0X100 "$rna_bam" > RNA_best.bam
+
+
+#calculates P-sites (from argument) and RNA-sites (default 25nt offset)
+
+echo "Calculating P-sites..."
+
+"$scripts_dir/P_sites_RNA_sites_calc.bash" "$read_len" "$cutoffs" "$bedtools_dir"
+
+#creates exonic tracks for ccds regions, exons_in ccds genes and non_ccds genes (if a ccds annotation is not available, CCDS = CDS)
+
+echo "Creating tracks..."
+
+# Every path expansion below is quoted so directories with spaces do not split.
+"$scripts_dir/create_tracks.bash" "$annot_dir/unique_ccds.bed" "$annot_dir/sequences_ccds" ccds "$bedtools_dir"
+
+"$scripts_dir/create_tracks.bash" "$annot_dir/unique_exons_ccds.bed" "$annot_dir/sequences_exonsccds" exonsccds "$bedtools_dir"
+
+"$scripts_dir/create_tracks.bash" "$annot_dir/unique_nonccds.bed" "$annot_dir/sequences_nonccds" nonccds "$bedtools_dir"
+
+
+#run calculation on CCDS, ExonsCCDS, non-CCDS exons and makes quality checks plots for length-coverage statistics
+
+echo "Running calculations ccds..."
+
+"$scripts_dir/tracks_analysis.R" ccds "$scripts_dir" "$n_of_cores"
+
+echo "Running calculations exons_ccds..."
+
+"$scripts_dir/tracks_analysis.R" exonsccds "$scripts_dir" "$n_of_cores"
+
+echo "Running calculations nonccds..."
+
+"$scripts_dir/tracks_analysis.R" nonccds "$scripts_dir" "$n_of_cores"
+
+# annotates the exons relative to ccds regions TO BE ADAPTED, CHECK WHICH FILES THEY NEED.
+
+echo "Annotate exons..."
+
+"$scripts_dir/annotate_exons.R" "$annot_dir" "$scripts_dir" "$n_of_cores"
+
+echo "Making quality plots..."
+
+"$scripts_dir/quality_check.R" "$annot_dir"
+
+#echo "Calculating coherence..."
+
+#$scripts_dir"/calculate_coherence_all_draft.R" $scripts_dir $n_of_cores
+
+#echo "Calculating alternative exon usage..."
+
+#$scripts_dir"/alt_exon_usage_draft.R" $annot_dir $scripts_dir $n_of_cores
+
+#ORF-finding
+
+echo "CCDS ORF finding..."
+
+"$scripts_dir/CCDS_orf_finder.R" "$annot_dir" "$scripts_dir" "$bedtools_dir" "$n_of_cores"
+
+echo "NONCCDS ORF finding..."
+
+"$scripts_dir/NONCCDS_orf_finder.R" "$annot_dir" "$scripts_dir" "$bedtools_dir" "$n_of_cores"
+
+# Groups ORFs and creates BED files + protein fasta database
+
+echo "Grouping ORFs and creating protein fasta database..."
+
+"$scripts_dir/create_protein_db.R"
+
+# makes summary plot for the found ORFs
+
+echo "Summarizing ORF finding results"
+
+"$scripts_dir/ORF_final_results.R"
+
+echo "RiboTaper analysis finished !!!"
+
+
+
+
+
+
diff --git a/scripts/Ribotaper.sh~ b/scripts/Ribotaper.sh~
new file mode 100644
index 0000000..ee467ba
--- /dev/null
+++ b/scripts/Ribotaper.sh~
@@ -0,0 +1,191 @@
+#!/bin/bash
+
+
+###################################################################
+# This file is part of RiboTaper.
+# RiboTaper is a method for defining translated ORFs using
+# Ribosome Profiling data.
+#
+# Copyright (C) 2015 Lorenzo Calviello
+#
+# RiboTaper is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# RiboTaper is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with RiboTaper. If not, see <http://www.gnu.org/licenses/>.
+#
+# Contact: Lorenzo.Calviello@mdc-berlin.de
+#######################################################################
+
+### RiboTaper master File
+
+
+# Abort the whole pipeline as soon as any step fails.
+set -e
+
+# Positional arguments, in the order they are assigned further down.
+if [ $# -ne 8 ]; then
+    echo "Usage: ./Ribotaper.sh <ribo_bam> <rna_bam> <annot_dir> <read_len> <cutoffs> <scripts_dir> <bedtools_dir> <n_of_cores>"
+    exit 1
+fi
+if ! [[ -f "$1" ]]; then
+    echo "!!!!! ribo_bam file not found!."
+    exit 1
+fi
+
+# Fix: this check covers the second argument, so it must name rna_bam.
+if ! [[ -f "$2" ]]; then
+    echo "!!!!! rna_bam file not found!."
+    exit 1
+fi
+
+if [ ! -d "$3" ]; then
+    echo "!!!!! annotation_directory not found!."
+    exit 1
+fi
+
+if [ ! -d "$6" ]; then
+    echo "!!!!! scripts_directory not found!."
+    exit 1
+fi
+
+if [ ! -d "$7" ]; then
+    echo "!!!!! bedtools_directory not found!."
+    exit 1
+fi
+
+
+re='^[0-9]+$'
+if ! [[ "$8" =~ $re ]] ; then
+    echo "!!!!! n of cores not valid"
+    exit 1
+fi
+
+
+# Fix: reject every value below 2 (0 used to pass), as the message demands.
+if [ "$8" -lt 2 ]; then
+    echo "!!!!! n of cores required >1."
+    exit 1
+fi
+
+
+
+# Resolve to absolute paths; quoted so paths containing spaces survive.
+ribo_bam="$(readlink -f "$1")"
+rna_bam="$(readlink -f "$2")"
+annot_dir="$(readlink -f "$3")"
+read_len=$4
+cutoffs=$5
+scripts_dir="$(readlink -f "$6")"
+bedtools_dir="$(readlink -f "$7")"
+n_of_cores=$8
+
+
+echo "Parameters used:"
+echo ""
+
+echo " $ribo_bam"
+echo " $rna_bam"
+echo " $annot_dir"
+echo " $read_len"
+echo " $cutoffs"
+echo " $scripts_dir"
+echo " $bedtools_dir"
+echo " $8"
+echo ""
+echo "---------------"
+echo ""
+
+
+
+#take bams for unique and best alignments
+
+echo "Taking unique - best alignments..."
+
+# This copy filters unique alignments at MAPQ >= 255 (Ribotaper.sh uses 50).
+samtools view -b -q 255 "$ribo_bam" > RIBO_unique.bam
+samtools view -b -F 0X100 "$ribo_bam" > RIBO_best.bam
+
+samtools view -b -q 255 "$rna_bam" > RNA_unique.bam
+samtools view -b -F 0X100 "$rna_bam" > RNA_best.bam
+
+
+#calculates P-sites (from argument) and RNA-sites (default 25nt offset)
+
+echo "Calculating P-sites..."
+
+"$scripts_dir/P_sites_RNA_sites_calc.bash" "$read_len" "$cutoffs" "$bedtools_dir"
+
+#creates exonic tracks for ccds regions, exons_in ccds genes and non_ccds genes (if a ccds annotation is not available, CCDS = CDS)
+
+echo "Creating tracks..."
+
+"$scripts_dir/create_tracks.bash" "$annot_dir/unique_ccds.bed" "$annot_dir/sequences_ccds" ccds "$bedtools_dir"
+
+"$scripts_dir/create_tracks.bash" "$annot_dir/unique_exons_ccds.bed" "$annot_dir/sequences_exonsccds" exonsccds "$bedtools_dir"
+
+"$scripts_dir/create_tracks.bash" "$annot_dir/unique_nonccds.bed" "$annot_dir/sequences_nonccds" nonccds "$bedtools_dir"
+
+
+#run calculation on CCDS, ExonsCCDS, non-CCDS exons and makes quality checks plots for length-coverage statistics
+
+echo "Running calculations ccds..."
+
+"$scripts_dir/tracks_analysis.R" ccds "$scripts_dir" "$n_of_cores"
+
+echo "Running calculations exons_ccds..."
+
+"$scripts_dir/tracks_analysis.R" exonsccds "$scripts_dir" "$n_of_cores"
+
+echo "Running calculations nonccds..."
+
+# Every path expansion below is quoted so directories with spaces do not split.
+"$scripts_dir/tracks_analysis.R" nonccds "$scripts_dir" "$n_of_cores"
+
+# annotates the exons relative to ccds regions TO BE ADAPTED, CHECK WHICH FILES THEY NEED.
+
+echo "Annotate exons..."
+
+"$scripts_dir/annotate_exons.R" "$annot_dir" "$scripts_dir" "$n_of_cores"
+
+echo "Making quality plots..."
+
+"$scripts_dir/quality_check.R" "$annot_dir"
+
+#echo "Calculating coherence..."
+
+#$scripts_dir"/calculate_coherence_all_draft.R" $scripts_dir $n_of_cores
+
+#echo "Calculating alternative exon usage..."
+
+#$scripts_dir"/alt_exon_usage_draft.R" $annot_dir $scripts_dir $n_of_cores
+
+#ORF-finding
+
+echo "CCDS ORF finding..."
+
+"$scripts_dir/CCDS_orf_finder.R" "$annot_dir" "$scripts_dir" "$bedtools_dir" "$n_of_cores"
+
+echo "NONCCDS ORF finding..."
+
+"$scripts_dir/NONCCDS_orf_finder.R" "$annot_dir" "$scripts_dir" "$bedtools_dir" "$n_of_cores"
+
+# Groups ORFs and creates BED files + protein fasta database
+
+echo "Grouping ORFs and creating protein fasta database..."
+
+"$scripts_dir/create_protein_db.R"
+
+# makes summary plot for the found ORFs
+
+echo "Summarizing ORF finding results"
+
+"$scripts_dir/ORF_final_results.R"
+
+echo "RiboTaper analysis finished !!!"
+
+
+
+
+
+
diff --git a/scripts/Ribotaper_ORF_find.sh b/scripts/Ribotaper_ORF_find.sh
new file mode 100755
index 0000000..525b08c
--- /dev/null
+++ b/scripts/Ribotaper_ORF_find.sh
@@ -0,0 +1,129 @@
+#!/bin/bash
+
+
+###################################################################
+# This file is part of RiboTaper.
+# RiboTaper is a method for defining translated ORFs using
+# Ribosome Profiling data.
+#
+# Copyright (C) 2015 Lorenzo Calviello
+#
+# RiboTaper is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# RiboTaper is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with RiboTaper. If not, see <http://www.gnu.org/licenses/>.
+#
+# Contact: Lorenzo.Calviello@mdc-berlin.de
+#######################################################################
+
+### RiboTaper master File
+
+
+# Abort as soon as any step fails.
+set -e
+
+# Same eight positional arguments as the master script; only the ORF-finding
+# steps are run below.
+if [ $# -ne 8 ]; then
+    echo "Usage: ./Ribotaper_ORF_find.sh <ribo_bam> <rna_bam> <annot_dir> <read_len> <cutoffs> <scripts_dir> <bedtools_dir> <n_of_cores>"
+    exit 1
+fi
+if ! [[ -f "$1" ]]; then
+    echo "!!!!! ribo_bam file not found!."
+    exit 1
+fi
+
+# Fix: this check covers the second argument, so it must name rna_bam.
+if ! [[ -f "$2" ]]; then
+    echo "!!!!! rna_bam file not found!."
+    exit 1
+fi
+
+if [ ! -d "$3" ]; then
+    echo "!!!!! annotation_directory not found!."
+    exit 1
+fi
+
+if [ ! -d "$6" ]; then
+    echo "!!!!! scripts_directory not found!."
+    exit 1
+fi
+
+if [ ! -d "$7" ]; then
+    echo "!!!!! bedtools_directory not found!."
+    exit 1
+fi
+
+
+re='^[0-9]+$'
+if ! [[ "$8" =~ $re ]] ; then
+    echo "!!!!! n of cores not valid"
+    exit 1
+fi
+
+
+# Fix: reject every value below 2 (0 used to pass), as the message demands.
+if [ "$8" -lt 2 ]; then
+    echo "!!!!! n of cores required >1."
+    exit 1
+fi
+
+
+
+# Resolve to absolute paths; quoted so paths containing spaces survive.
+ribo_bam="$(readlink -f "$1")"
+rna_bam="$(readlink -f "$2")"
+annot_dir="$(readlink -f "$3")"
+read_len=$4
+cutoffs=$5
+scripts_dir="$(readlink -f "$6")"
+bedtools_dir="$(readlink -f "$7")"
+n_of_cores=$8
+
+
+echo "Parameters used:"
+echo ""
+
+echo " $ribo_bam"
+echo " $rna_bam"
+echo " $annot_dir"
+echo " $read_len"
+echo " $cutoffs"
+echo " $scripts_dir"
+echo " $bedtools_dir"
+echo " $8"
+echo ""
+echo "---------------"
+echo ""
+
+
+
+echo "CCDS ORF finding..."
+
+"$scripts_dir/CCDS_orf_finder.R" "$annot_dir" "$scripts_dir" "$bedtools_dir" "$n_of_cores"
+
+echo "NONCCDS ORF finding..."
+
+"$scripts_dir/NONCCDS_orf_finder.R" "$annot_dir" "$scripts_dir" "$bedtools_dir" "$n_of_cores"
+
+# Groups ORFs and creates BED files + protein fasta database
+
+echo "Grouping ORFs and creating protein fasta database..."
+
+"$scripts_dir/create_protein_db.R"
+
+# makes summary plot for the found ORFs
+
+echo "Summarizing ORF finding results"
+
+"$scripts_dir/ORF_final_results.R"
+
+echo "RiboTaper analysis finished !!!"
+
+
+
+
+
+
diff --git a/scripts/analyze_multi_clust.bash b/scripts/analyze_multi_clust.bash
new file mode 100755
index 0000000..ec9182f
--- /dev/null
+++ b/scripts/analyze_multi_clust.bash
@@ -0,0 +1,54 @@
+#!/bin/bash
+
+
+###################################################################
+# This file is part of RiboTaper.
+# RiboTaper is a method for defining translated ORFs using
+# Ribosome Profiling data.
+#
+# Copyright (C) 2015 Lorenzo Calviello
+#
+# RiboTaper is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# RiboTaper is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with RiboTaper. If not, see <http://www.gnu.org/licenses/>.
+#
+# Contact: Lorenzo.Calviello@mdc-berlin.de
+#######################################################################
+
+
+##This script counts the multi-mapping/unique reads ratio per region, including coverage information, it uses as arguments a bed file, a name as an appendix for further analysis, the bedtools exec directory
+
+# Arguments: $1 bed file, $2 suffix for the output names, $3 bedtools directory.
+if [ $# -ne 3 ]; then
+    echo "Usage: analyze_multi_clust.bash <bed_file> <name> <bedtools_dir>"
+    exit 1
+fi
+if ! [[ -f "$1" ]]; then
+    echo "!!!!! bed file not found!."
+    exit 1
+fi
+
+bedtools_dir=$3
+
+echo "-----Intersecting with unique/best alignments-----"
+
+# One coverage table per BAM in the working directory. Underscores are removed
+# from the coverageBed output before columns 1-5 are joined with "_" into a
+# region id; columns 6-10 follow tab-separated.
+# Output file: <BAM name>_counts_<name>.
+for sample in RIBO_unique RNA_unique RIBO_best RNA_best; do
+    "$bedtools_dir/coverageBed" -s -split -abam "$sample.bam" -b "$1" | sort -k1,1 -k2,2g | sed 's/_//g' | awk '{ print $1 "_" $2 "_" $3 "_" $4 "_" $5"\t" $6 "\t" $7 "\t" $8 "\t" $9 "\t" $10}' > "${sample}_counts_$2"
+done
+
+echo "-----Done !!!-----"
+
+
+
+
diff --git a/scripts/annotate_exons.R b/scripts/annotate_exons.R
new file mode 100755
index 0000000..672ef7c
--- /dev/null
+++ b/scripts/annotate_exons.R
@@ -0,0 +1,118 @@
+#!/usr/bin/Rscript
+
+###################################################################
+# This file is part of RiboTaper.
+# RiboTaper is a method for defining translated ORFs using
+# Ribosome Profiling data.
+#
+# Copyright (C) 2015 Lorenzo Calviello
+#
+# RiboTaper is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# RiboTaper is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with RiboTaper. If not, see <http://www.gnu.org/licenses/>.
+#
+# Contact: Lorenzo.Calviello@mdc-berlin.de
+#######################################################################
+
+
+###script for annotating exons, it takes as arguments the annotation directory, the RiboTaper scripts directory, the n of cores
+
+args <- commandArgs(trailingOnly = TRUE)
+
+
+print(paste("--- annotating exons","---",date(),sep=" "))
+
+
+# functions.R is sourced from the scripts directory; annotate_exons() is used below.
+suppressMessages(source(paste(args[2],"functions.R",sep = "/")))
+
+
+genes<-paste(args[1],"gene_annot_names",sep = "/")
+
+genes_annot<-read.table(genes,stringsAsFactors=FALSE,header=FALSE)
+
+colnames(genes_annot)<-c("gene_id","annotation","gene_symbol")
+
+# The gene id is the fifth "_"-separated field of an exon id.
+exon_gene_id<-function(exon_ids){
+  as.character(lapply(strsplit(exon_ids,split="_"),"[[",5))
+}
+
+nonccds_res<-read.table("results_nonccds",header=TRUE,stringsAsFactors=FALSE)
+nonccds_res$gene_id<-exon_gene_id(nonccds_res$exon_id)
+
+nonccds_res<-merge(nonccds_res,genes_annot,by="gene_id")
+nonccds_res$type<-"non_ccds_exon"
+
+
+
+ccds_res<-read.table("results_ccds",header=TRUE,stringsAsFactors=FALSE)
+ccds_res$gene_id<-exon_gene_id(ccds_res$exon_id)
+
+ccds_res<-merge(ccds_res,genes_annot,by="gene_id")
+ccds_res$type<-"ccds"
+
+
+exons_ccds_res<-read.table("results_exonsccds",header=TRUE,stringsAsFactors=FALSE)
+exons_ccds_res$gene_id<-exon_gene_id(exons_ccds_res$exon_id)
+
+exons_ccds_res<-merge(exons_ccds_res,genes_annot,by="gene_id")
+exons_ccds_res$type<-"exon"
+
+
+all<-rbind(ccds_res,exons_ccds_res)
+
+# "chr_start_end" key taken from the first three fields of the exon id.
+# Fix: seq_len() performs no iteration when the table is empty, whereas
+# seq(1, 0) walked rows 1 and 0.
+coords<-matrix(nrow=dim(all)[1],ncol=1)
+for(i in seq_len(dim(all)[1])){
+  coords[i,1]<-paste(strsplit(all$exon_id[i],split="_")[[1]][1:3],collapse="_")
+}
+all$coords<-coords
+
+all <-all[order(all$coords,all$type,decreasing=FALSE),]
+
+
+# Same three fields, kept as separate chr / start / end columns.
+coords2<-matrix(nrow=dim(all)[1],ncol=3)
+for(i in seq_len(dim(all)[1])){
+  fields<-strsplit(all$exon_id[i],split="_")[[1]]
+  coords2[i,1]<-fields[1]
+  coords2[i,2]<-fields[2]
+  coords2[i,3]<-fields[3]
+}
+
+
+all$chr<-coords2[,1]
+all$start<-as.integer(coords2[,2])
+all$end<-as.integer(coords2[,3])
+
+# Placeholder columns, NA here; presumably filled by annotate_exons() - confirm in functions.R.
+all$nt_more<-NA
+all$nt_more_ribocovered<-NA
+all$nt_more_P_sites<-NA
+all$nt_more_rnacovered<-NA
+all$nt_more_cent_sites<-NA
+all$overlapping_ccds_start<-NA
+all$overlapping_ccds_end<-NA
+
+
+list_genes_exon_ccds<-split.data.frame(all,f=all$gene_id,drop=TRUE)
+
+
+list_genes_exon_ccds_annot<-list()
+
+# Command-line arguments are strings; pass the core count as an integer.
+list_genes_exon_ccds_annot<-mclapply(X=list_genes_exon_ccds,FUN=annotate_exons,mc.cores=as.integer(args[3]),mc.preschedule = TRUE)
+
+
+all_annot<-do.call(rbind.data.frame,list_genes_exon_ccds_annot)
+
+
+
+write.table(file="results_nonccds_annot",sep="\t",nonccds_res,quote=FALSE,row.names=FALSE)
+
+
+write.table(file="all_calculations_ccdsgenes_annot_new",sep="\t",all_annot,quote=FALSE,row.names=FALSE)
+print(paste("--- annotating exons, Done!","---",date(),sep=" "))
diff --git a/scripts/bowrrna_star.q b/scripts/bowrrna_star.q
new file mode 100644
index 0000000..7945f68
--- /dev/null
+++ b/scripts/bowrrna_star.q
@@ -0,0 +1,34 @@
+#!/bin/bash
+#$ -pe smp 4
+#$ -l h_vmem=15G
+#$ -e "error_mapp_bowrrnastar"
+#$ -o "out_mapp_bowrrnastar"
+#$ -cwd
+
+
+##fastq, rRNA ref, star_in, start_stop bed file
+
+fastq=$1
+
+full_fastq="`readlink -f $fastq`"
+
+name_exp="`echo $fastq | sed 's/\.fastq//g'`"
+
+full_name_exp="`echo $full_fastq | sed 's/\.fastq//g'`"
+
+/data/ohler/Lorenzo/bins/bowtie1/bowtie --best -S -p 4 --al $name_exp"_rRNA.fastq" --un $name_exp"_notrRNA.fastq" $2 $1 > /dev/null
+
+
+mkdir "starmapp_star_"$name_exp/
+
+cd "starmapp_star_"$name_exp/
+
+/data/ohler/Lorenzo/STAR_2.3.1z1/STAR --genomeDir $3 --alignEndsType EndToEnd --readFilesIn $full_name_exp"_notrRNA.fastq" --runThreadN 4 --outFilterMismatchNmax 4 --outFilterMultimapNmax 8 --chimScoreSeparation 10 --chimScoreMin 20 --chimSegmentMin 15 --outSAMattributes All --outFilterIntronMotifs RemoveNoncanonicalUnannotated --alignSJoverhangMin 500 --outFileNamePrefix "star_"$name_exp"_" --outReadsUnmapped Fastx
+samtools view -bS "star_"$name_exp"_"Aligned.out.sam | samtools sort - "star_"$name_exp"_"Aligned.out.sorted
+samtools index "star_"$name_exp"_"Aligned.out.sorted.bam
+
+
+/data/ohler/website/files/RiboTaper/Version_1.2/create_metaplots.bash "star_"$name_exp"_"Aligned.out.sorted.bam $4 $name_exp"_metaplots"
+
+echo "done"$name_exp"!!!"
+
diff --git a/scripts/create_annotations_files.bash b/scripts/create_annotations_files.bash
new file mode 100755
index 0000000..19609ec
--- /dev/null
+++ b/scripts/create_annotations_files.bash
@@ -0,0 +1,231 @@
+#!/bin/bash
+
+
+###################################################################
+# This file is part of RiboTaper.
+# RiboTaper is a method for defining translated ORFs using
+# Ribosome Profiling data.
+#
+# Copyright (C) 2015 Lorenzo Calviello
+#
+# RiboTaper is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# RiboTaper is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with RiboTaper. If not, see <http://www.gnu.org/licenses/>.
+# +# Contact: Lorenzo.Calviello@mdc-berlin.de +####################################################################### + + +##This script creates annotation files to be used in the RiboTaper pipeline, it uses as arguments a gtf file, a genome fasta file, a logical value for using the CCDS annotation (true or false) , a logical value for using the APPRIS annotation (true or false), a destination folder, the bedtools executables directory, the RiboTaper scripts directory + + + + +if [ $# -ne 7 ]; then + echo "Usage: ./create_annotation_files.bash " + exit 1 +fi +if ! [[ -f "$1" ]]; then + echo "!!!!! gtf_file not found!." + exit 1 + fi + +if ! [[ -f "$2" ]]; then + echo "!!!!! genome fasta not found!." + exit 1 + fi + +if ! [ "$3" = true ] ; then + if ! [ "$3" = false ]; then + echo "use_ccdsid = "true" or "false"" + exit 1 + fi +fi + +if ! [ "$4" = true ] ; then + if ! [ "$4" = false ]; then + echo "use_appris = "true" or "false"" + exit 1 + fi +fi + + + + +gencode_ann=$1 +genc_full="`readlink -e $gencode_ann`" + + +genome=$2 +genome_full=`readlink -e $genome` + +scripts_dir=$7 +scripts_dir_full=`readlink -e $scripts_dir` + +dest_folder=$5 +dest_folder_full=`readlink -f $dest_folder` + + +bedtools_path=$6 +bedtools_path_full=`readlink -e $bedtools_path` + +echo "Parameters used:" +echo "" + + +echo " $genc_full" +echo " $genome_full" +echo " $3" +echo " $4" +echo " $dest_folder_full" +echo " $bedtools_path_full" +echo " $scripts_dir_full" +echo "" +echo "---------------" +echo "" + + + + +echo "creating directory..." +mkdir -p $dest_folder_full + +cd $dest_folder_full + +echo "Extracting gene names + biotypes from gtf..." + +#grep out at each step! 
+ +awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") for (y=1;y<=NF;y++) if ($y~/gene_type|gene_biotype/) for (z=1;z<=NF;z++) if ($z~"gene_name") print $(x+1) "\t" $(y+1) "\t" $(z+1)}' $genc_full | sort | uniq | sed 's/;//g' | sed 's/"//g' > gene_name_type +less gene_name_type | cut -f 1 | grep -Fvf - $genc_full | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") for (z=1;z<=NF;z++) if ($z~"gene_name") print $(x+1) "\t" "no_biotype" "\t" $(z+1)}' | sort | uniq | sed 's/;//g' | sed 's/"//g' > gene_name_notype +less gene_name_type | cut -f 1 | grep -Fvf - $genc_full | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") for (y=1;y<=NF;y++) if ($y~/gene_type|gene_biotype/) print $(x+1) "\t" $(y+1) "\t" "no_name"}' | sort | uniq | sed 's/;//g' | sed 's/"//g' > gene_noname_type + + +cat gene_name_type gene_name_notype gene_noname_type > gene_annot_name_pre +less gene_annot_name_pre | cut -f 1 | grep -Fvf - $genc_full | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") print $(x+1) "\t" "no_biotype" "\t" "no_name"}' | sort | uniq | sed 's/;//g' | sed 's/"//g' > gene_noname_notype + +cat gene_annot_name_pre gene_noname_notype | sort | uniq > gene_annot_names + + +rm gene_name_type gene_name_notype gene_noname_type gene_annot_name_pre gene_noname_notype + + + +echo "creating bed_files..." 
+ +#TAKE CDS OF CCDS REGIONS + +if [ "$3" = true ] ; then + awk '{if($3=="CDS") print $0}' $genc_full | grep ccdsid | awk '{ for (x=1;x<=NF;x++) if ($x~"^gene_id") print $1 "\t" $4-1 "\t" $5 "\t" "CCDS" "\t" $(x+1) "\t" $7 }' | sort -k1,1 -k2,2n | uniq | sed 's/;//g' | sed 's/"//g' > unique_ccds.bed + less $genc_full | grep ccdsid | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") for (y=1;y<=NF;y++) if ($y~"^transcript_id") if($3=="exon") print $1 "\t" $4-1 "\t" $5 "\t" $(y+1)"\t" $(x+1) "\t" $7}'| sed 's/"//g' | sed 's/;//g' | sort -k1,1 -k2,2n | uniq > transcr_exons_ccds_ccdsid.bed + +fi + +if [ "$3" = false ] ; then + awk '{if($3=="CDS") print $0}' $genc_full | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") print $1 "\t" $4-1 "\t" $5 "\t" "CCDS" "\t" $(x+1) "\t" $7 }' | sort -k1,1 -k2,2n | uniq | sed 's/;//g' | sed 's/"//g' > unique_ccds.bed +fi + +#STORE CCDS GENES +less unique_ccds.bed | cut -f 5 | sort | uniq > genes_ccds + +#STORE COORDINATES CDS CCDS +less unique_ccds.bed | cut -f 1-3 | tr '\t' '_' > coords_ccds + +#TAKE ALL EXONS OF CCDS GENES +grep -Ff genes_ccds $genc_full | awk '{if($3=="exon") print $0}' | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") print $1 "\t" $4-1 "\t" $5 "\t" "EXONCCDS" "\t" $(x+1) "\t" $7 }' | sort -k1,1 -k2,2n | uniq | sed 's/;//g' | sed 's/"//g' > unique_exons_ccds.bed + +#STORE COORDINATES EXONS CCDS +less unique_exons_ccds.bed | awk '{print $1"_"$2"_"$3 "\t" $0}' > coords_unique_exons_ccds.bed +#TAKE OUT CDS CCDS FROM EXONS CCDS +grep -Fvf coords_ccds coords_unique_exons_ccds.bed | awk '{print $2 "\t" $3 "\t" $4 "\t" "EXONCCDS" "\t" $6 "\t" $7}' > unique_exons_ccds.bed +#REMOVE COORDS +rm coords_ccds coords_unique_exons_ccds.bed + + + + +#TAKE EXONS OF NONCCDS GENES +grep -Fvf genes_ccds $genc_full | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") if($3=="exon") print $1 "\t" $4-1 "\t" $5 "\t" "EXONnonCCDS" "\t" $(x+1) "\t" $7 }' | sort -k1,1 -k2,2n | uniq | sed 's/;//g' | sed 's/"//g' > unique_nonccds.bed + +#TAKE SEQUENCES, 
STRANDED INFO + +echo "creating fasta sequences..." + +fastaFb="$bedtools_path_full/fastaFromBed" + +$fastaFb -s -fi $genome_full -bed unique_ccds.bed -fo unique_ccds_seq.fa + +awk '{$4=$1"_"$2"_"$3"_"$4"_"$5"_"$6; print $0}' OFS="\t" unique_ccds.bed | $fastaFb -fi $genome_full -name -bed - -tab -fo unique_ccds_seq_name_tab +paste <(cut -f 1 unique_ccds_seq_name_tab| tr '_' '\t') <(cut -f 2 unique_ccds_seq_name_tab | sed 's/[A-Z]/& /g') > sequences_ccds + +awk '{$4=$1"_"$2"_"$3"_"$4"_"$5"_"$6; print $0}' OFS="\t" unique_exons_ccds.bed | $fastaFb -fi $genome_full -name -bed - -tab -fo unique_exons_ccds_seq_name_tab +paste <(cut -f 1 unique_exons_ccds_seq_name_tab | tr '_' '\t') <(cut -f 2 unique_exons_ccds_seq_name_tab | sed 's/[A-Z]/& /g') > sequences_exonsccds + +awk '{$4=$1"_"$2"_"$3"_"$4"_"$5"_"$6; print $0}' OFS="\t" unique_nonccds.bed | $fastaFb -fi $genome_full -name -bed - -tab -fo unique_nonccds_seq_name_tab +paste <(cut -f 1 unique_nonccds_seq_name_tab | tr '_' '\t') <(cut -f 2 unique_nonccds_seq_name_tab | sed 's/[A-Z]/& /g') > sequences_nonccds + + +$fastaFb -s -fi $genome_full -bed unique_exons_ccds.bed -fo unique_exons_ccds_seq.fa +$fastaFb -s -fi $genome_full -bed unique_nonccds.bed -fo unique_exons_nonccds_seq.fa + +#CAT SEQUENCES TOGETHER FOR ORF FINDING +cat unique_ccds_seq.fa unique_exons_ccds_seq.fa > unique_ccds_exonccds_seq.fa + +#make all CDS regions +less $genc_full | awk '{if($3=="CDS") print $0}' | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") print $1 "\t" $4-1 "\t" $5 "\t" "cds" "\t" $(x+1) "\t" $7 }' | sort -k1,1 -k2,2n | uniq | sed 's/;//g' | sed 's/"//g' > all_cds.bed + +echo "assembling transcript information..." 
+ +#TAKE TRANSCR CCDS +grep -Ff genes_ccds $genc_full | awk '{ for (x=1;x<=NF;x++) if ($x~"^gene_id") for (y=1;y<=NF;y++) if ($y~"^transcript_id") if($3=="exon") print $1 "\t" $4-1 "\t" $5 "\t" $(y+1) "\t" $(x+1) "\t" $7}'| sed 's/"//g' | sed 's/;//g' | sort -k1,1 -k2,2n | uniq > transcr_exons_ccds.bed + + +#TAKE TRANSCR APPRIS CCDS + +if [ "$4" = true ] ; then + +grep -Ff genes_ccds $genc_full | grep appris_prin | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") for (y=1;y<=NF;y++) if ($y~"^transcript_id") if($3=="exon") print $1 "\t" $4-1 "\t" $5 "\t" $(y+1)"\t" $(x+1) "\t" $7}'| sed 's/"//g' | sed 's/;//g' | sort -k1,1 -k2,2n | uniq > transcr_exons_ccds_appris_prin.bed +cut -f 5 transcr_exons_ccds_appris_prin.bed | sort | uniq > genes_appris_prin +grep -Ff genes_ccds $genc_full | grep -Fvf genes_appris_prin - | grep appris | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") for (y=1;y<=NF;y++) if ($y~"^transcript_id") if($3=="exon") print $1 "\t" $4-1 "\t" $5 "\t" $(y+1)"\t" $(x+1) "\t" $7}'| sed 's/"//g' | sed 's/;//g' | sort -k1,1 -k2,2n | uniq > transcr_exons_ccds_appris_noprin.bed +cut -f 5 transcr_exons_ccds_appris_noprin.bed | sort | uniq > genes_appris_noprin +grep -Ff genes_ccds $genc_full | grep -Fvf genes_appris_prin - | grep -Fvf genes_appris_noprin | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") for (y=1;y<=NF;y++) if ($y~"^transcript_id") if($3=="exon") print $1 "\t" $4-1 "\t" $5 "\t" $(y+1)"\t" $(x+1) "\t" $7}'| sed 's/"//g' | sed 's/;//g' | sort -k1,1 -k2,2n | uniq > transcr_exons_ccds_noappris_noprin.bed +cut -f 5 transcr_exons_ccds_noappris_noprin.bed | sort | uniq > genes_noappris_noprin +cat transcr_exons_ccds_appris_prin.bed transcr_exons_ccds_appris_noprin.bed transcr_exons_ccds_noappris_noprin.bed > transcr_exons_ccds_appris.bed +cat genes_appris_prin genes_appris_noprin genes_noappris_noprin > genes_ccds_appris + +fi + +#TAKE TRANSCR NONCCDS +grep -Fvf genes_ccds $genc_full | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") for (y=1;y<=NF;y++) if 
($y~"^transcript_id") if($3=="exon") print $1 "\t" $4-1 "\t" $5 "\t" $(y+1)"\t" $(x+1) "\t" $7}'| sed 's/"//g' | sed 's/;//g' | sort -k1,1 -k2,2n | uniq > transcr_exons_nonccds.bed + +#start_stop_cds + +less $genc_full | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") if($3=="start_codon" || $3=="stop_codon") print $1 "\t"$4-1 "\t"$5 "\t" $3 "\t" $(x+1) "\t"$7}' | sed 's/;//g' | sed 's/"//g' | sort -k1,1 -k2,2g | uniq | awk 'p{print $0 "\t" $2-p}{p=$2}' | tac | awk 'p{print $0 "\t" $2-p}{p=$2}' | tac | awk '{if($NF<-100 || $(NF-1)>100) print $1 "\t" $2 "\t" $3 "\t" $4 "\t" $5 "\t" $6}' > start_stops_FAR.bed + +#make cds transcript coords +echo "Creating transcript cds coordinates from gtf..." + +awk '{ for (x=1;x<=NF;x++) if ($x~"^transcript_id") if ( $3=="exon" || $3=="CDS" ) print $1 "\t" $3 "\t" $4 "\t" $5 "\t" $7 "\t" $(x+1)}' $genc_full | sed 's/"//g' | sed 's/;//g' | sort -k1,1 -k3,3g > exons_cds_all +$scripts_dir_full"/gtf_to_start_stop_tr.R" + +#make cds frames + +less $genc_full | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") if($3=="CDS") print $1 "_" $4-1 "_" $5 "_" "CCDS" "_" $(x+1) "\t" $8 "\t"$7 "\t" $5-($4-1)}' | sed 's/;//g' | sed 's/"//g' | sort -k1,1 -k2,2 | uniq > frames_ccds + + +#take all exonic regions +less $genc_full | awk '{if($3=="exon") print $0}' | awk '{for (x=1;x<=NF;x++) if ($x~"^gene_id") print $1 "\t" $4-1 "\t" $5 "\t" "exon" "\t" $(x+1) "\t" $7 }' | sort -k1,1 -k2,2n | uniq | sed 's/;//g' | sed 's/"//g' > all_exons.bed + +$scripts_dir_full"/genes_coor.R" +echo "Done!" + + diff --git a/scripts/create_metaplots.bash b/scripts/create_metaplots.bash new file mode 100755 index 0000000..bd1525b --- /dev/null +++ b/scripts/create_metaplots.bash @@ -0,0 +1,76 @@ +#!/bin/bash + + +################################################################### +# This file is part of RiboTaper. +# RiboTaper is a method for defining traslated ORFs using +# Ribosome Profiling data. 
#
# Copyright (C) 2015 Lorenzo Calviello
#
# RiboTaper is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# RiboTaper is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with RiboTaper. If not, see <http://www.gnu.org/licenses/>.
#
# Contact: Lorenzo.Calviello@mdc-berlin.de
#######################################################################


## This script creates aggregate plots around start-stop codons.
## Arguments:
##   $1  Ribo-seq BAM file
##   $2  BED file of start/stop positions (e.g. the one produced by the
##       create_annotations_files.bash script)
##   $3  name of the intersection output file; it is also handed to metag.R
##   $4  bedtools executables directory
##   $5  RiboTaper scripts directory
## Side effects in the current directory: sample_to_metapl.sam,
## sample_to_metapl.bam, the file named by $3 and a metaplots/ directory.


if [ $# -ne 5 ]; then
    echo "Usage: create_metaplots.bash <ribo_bam> <start_stop_bed> <name> <bedtools_dir> <scripts_dir>"
    exit 1
fi
if ! [[ -f "$1" ]]; then
    echo "!!!!! ribo.bam file not found!."
    exit 1
fi
if ! [[ -f "$2" ]]; then
    echo "!!!!! start_stop bed file not found!."
    exit 1
fi

ribo_bam=$1
start_stop_bed=$2
out_name=$3
bedtools_dir=$4
scripts_dir=$5

# The tools used to be invoked as $dir"tool", which only works when the
# directory is given with a trailing slash. Normalise so both forms work;
# an empty value still means "look the tool up on PATH".
if [[ -n "$bedtools_dir" ]]; then
    bedtools_dir="${bedtools_dir%/}/"
fi
if [[ -n "$scripts_dir" ]]; then
    scripts_dir="${scripts_dir%/}/"
fi

# samtools view -s takes SEED.FRACTION: 1.03 = seed 1, keep ~3% of the reads.
# NOTE(review): the message used to say 10%; the sampling itself is unchanged.
subsample="1.03"

echo "Downsampling to 3%..."

samtools view -s "$subsample" "$ribo_bam" > sample_to_metapl.sam

# Re-attach the original header so the subsample can be converted to BAM.
cat <(samtools view -H "$ribo_bam") sample_to_metapl.sam | samtools view - -bS > sample_to_metapl.bam

echo "Intersecting alignments with start/stop sites ..."

# windowBed/closestBed were previously called from a hard-coded
# /data/ohler/Lorenzo/bins/ path; they now come from the bedtools directory
# argument like bamToBed.
"${bedtools_dir}bamToBed" -i sample_to_metapl.bam -bed12 -split \
    | "${bedtools_dir}windowBed" -w 100 -sm -b stdin -a "$start_stop_bed" \
    | awk '{print $7 "\t" $8 "\t" $9 "\t" $10 "\t" $11 "\t" $12 "\t" $13 "\t" $14 "\t" $15 "\t" $16 "\t" $17 "\t" $18}' \
    | sort -k1,1 -k2,2g \
    | "${bedtools_dir}closestBed" -s -t "last" -a stdin -b "$start_stop_bed" \
    > "$out_name"

if ! [[ -s "$out_name" ]]; then
    echo "!!!!! no intersections found, check input files"
    exit 1
fi



echo "Creating metaplots..."

"${scripts_dir}metag.R" "$out_name"

# -p: do not fail when the script is re-run in the same directory.
mkdir -p metaplots

mv *.png metaplots/



echo "Done !!! "


diff --git a/scripts/create_protein_db.R b/scripts/create_protein_db.R
new file mode 100755
index 0000000..7cf4c3d
--- /dev/null
+++ b/scripts/create_protein_db.R
@@ -0,0 +1,310 @@
#!/usr/bin/Rscript

###################################################################
# This file is part of RiboTaper.
# RiboTaper is a method for defining traslated ORFs using
# Ribosome Profiling data.
#
# Copyright (C) 2015 Lorenzo Calviello
#
# RiboTaper is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# RiboTaper is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with RiboTaper. If not, see .
#
# Contact: Lorenzo.Calviello@mdc-berlin.de
#######################################################################


###script for grouping and filtering identified ORFs, writing BED files and creating a protein FASTA file.


suppressMessages(library("seqinr"))

print(paste("--- create protein db and output final ORFs ---",date(),sep=" "))


# Result sub-directories, one per ORF-calling method; the names are the short
# labels used below to express merge priority.
method_dirs <- c(more = "more_tapers", max = "max_P_sites", best = "best_periodicity")

# Columns that only some input tables carry and that the final output needs.
annot_cols <- c("annotated_start", "annotated_stop", "ORF_id_tr_annotated")

# Read the same result file from every method directory under `root`.
# Returns a list of data frames named more / max / best.
read_orf_set <- function(root, file) {
  lapply(method_dirs, function(method_dir) {
    read.table(file.path(root, method_dir, file),
               stringsAsFactors = FALSE, header = TRUE, quote = "")
  })
}

ccds_tabs    <- read_orf_set("ORFs_CCDS", "ORFs_sign_notfiltered_multi")
nonccds_tabs <- read_orf_set("ORFs_NONCCDS", "ORFs_sign_nocds_nofilter")
sorf_tabs    <- read_orf_set("ORFs_CCDS", "sORFs_sign_filtered_cds")
nc_tabs      <- read_orf_set("ORFs_NONCCDS", "ORFs_sign_cds_notfiltered_multi")

# TRUE when every table of the set has at least one row.
# NOTE(review): a category is skipped entirely as soon as one of its three
# tables is empty (behaviour kept from the original script) - confirm that
# this is intended rather than "at least one table has rows".
all_nonempty <- function(tabs) {
  all(vapply(tabs, nrow, integer(1)) > 0)
}

# Stack the tables of a set in `priority` order, adding from each later table
# only the rows whose gene_id is not already present.
merge_by_priority <- function(tabs, priority) {
  merged <- tabs[[priority[1]]]
  for (method in priority[-1]) {
    candidate <- tabs[[method]]
    is_new <- !candidate[, "gene_id"] %in% merged[, "gene_id"]
    merged <- rbind(merged, candidate[is_new, ])
  }
  merged
}

# Colon-separated FASTA header for every ORF of a table.
fasta_header <- function(orfs) {
  paste(orfs$ORF_id_tr, orfs$gene_id, orfs$Method, orfs$annotation,
        orfs$category, orfs$ORF_P_sites, orfs$ORF_spec3_spec_ribo,
        orfs$ORF_spec_multi_ribo, sep = ":")
}

# Add each of `cols` as an all-NA column unless the table already has it.
add_missing_columns <- function(orfs, cols) {
  for (col in cols) {
    if (is.null(orfs[[col]])) {
      orfs[[col]] <- NA
    }
  }
  orfs
}

# Build the four ORF categories with the given method priority and stack them
# (CCDS ORFs, ncORFs, non-CCDS coding ORFs, sORFs - in that order).
collect_orfs <- function(priority) {
  sets <- list()

  if (all_nonempty(ccds_tabs)) {
    orfs <- merge_by_priority(ccds_tabs, priority)
    orfs$category <- "ORFs_ccds"
    orfs$annotation <- "protein_coding"
    orfs$header_tofasta <- fasta_header(orfs)
    sets[["ORFs_new"]] <- add_missing_columns(orfs, annot_cols)
  }

  if (all_nonempty(nc_tabs)) {
    orfs <- merge_by_priority(nc_tabs, priority)
    orfs$category <- "ncORFS"
    orfs$header_tofasta <- fasta_header(orfs)
    orfs[, annot_cols] <- NA
    sets[["ncORFS_new"]] <- orfs
  }

  if (all_nonempty(nonccds_tabs)) {
    orfs <- merge_by_priority(nonccds_tabs, priority)
    orfs$category <- "nonccds_coding_ORFs"
    orfs$header_tofasta <- fasta_header(orfs)
    orfs[, annot_cols] <- NA
    sets[["nonccdsORFS_new"]] <- orfs
  }

  if (all_nonempty(sorf_tabs)) {
    orfs <- merge_by_priority(sorf_tabs, priority)
    orfs$category <- orfs$type
    orfs$type <- NULL
    orfs$annotation <- "protein_coding"
    orfs$header_tofasta <- fasta_header(orfs)
    sets[["sORFS_new"]] <- add_missing_columns(orfs, annot_cols)
  }

  if (length(sets) == 0) {
    stop("no ORF category has results for all three methods", call. = FALSE)
  }
  do.call(rbind.data.frame, sets)
}

# One BED row per exon listed in to_check_ALL (";"-separated entries whose
# first three "_"-separated fields are used as chr, start and end).
orfs_to_bed <- function(orfs) {
  exons <- lapply(strsplit(orfs$to_check_ALL, split = ";"), function(ex) {
    ex[!is.na(ex) & ex != "NA"]
  })
  n_exons <- vapply(exons, length, integer(1))
  fields <- strsplit(as.character(unlist(exons)), split = "_")
  field <- function(k) vapply(fields, function(f) f[k], character(1))
  orf_name <- paste(orfs$ORF_id_tr, orfs$category, orfs$annotation, sep = ";")
  data.frame(chr = field(1), start = field(2), end = field(3),
             orf_name = rep(orf_name, n_exons),
             P_sites = rep(orfs$ORF_P_sites, n_exons),
             strand_bed = rep(orfs$strand, n_exons),
             stringsAsFactors = FALSE)
}

# Write <prefix>_sorted.bed; the unsorted intermediate <prefix>.bed is removed.
export_bed <- function(orfs, prefix) {
  unsorted <- paste0(prefix, ".bed")
  write.table(file = unsorted, x = orfs_to_bed(orfs),
              col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")
  system(paste("sort -k1,1 -k2,2n", unsorted, ">", paste0(prefix, "_sorted.bed")))
  system(paste("rm", unsorted))
}

# Rename the per-ORF read-count columns for the final tables.
rename_read_columns <- function(orfs) {
  names(orfs)[which(names(orfs) == "reads_ribo")] <- "ORF_reads_ribo"
  names(orfs)[which(names(orfs) == "reads_rna")] <- "ORF_reads_rna"
  orfs
}


# The script used to build a first result set with priority more > max > best
# whose outputs were disabled; that dead pass is no longer computed. To get
# it back, call collect_orfs(priority = c("more", "max", "best")), apply the
# same filters as below and re-enable:
#write.table(ORFs_ALL_filt,file="ORFs_more_filt",quote=F,col.names=T,row.names=F,sep="\t")
#write.table(ORFs_ALL,file="ORFs_more",quote=F,col.names=T,row.names=F,sep="\t")
#write.fasta(sequences=as.list(ORFs_ALL$ORF_pept),names=ORFs_ALL$header_tofasta,file.out="protein_db_more.fasta")

# Final result set: max_P_sites first, then more_tapers, then best_periodicity.
ORFs_ALL <- collect_orfs(priority = c("max", "more", "best"))
ORFs_ALL <- ORFs_ALL[(ORFs_ALL$ORF_pval_multi_ribo < 0.05), ]

ORFs_ALL <- ORFs_ALL[!is.na(ORFs_ALL$ORF_pept), ]
names(ORFs_ALL) <- gsub(x = names(ORFs_ALL), pattern = "st2vect", replacement = "stop_pos")

# Filtered set: multi-mapping-only coverage below 30% of the covered region,
# and no non-CCDS "coding" ORFs from genes not annotated as protein_coding.
ORFs_ALL_filt <- ORFs_ALL[which((ORFs_ALL$pct_covered_onlymulti_ribo / ORFs_ALL$pct_region_covered_ribo) < 0.3), ]
rem <- which(ORFs_ALL_filt[, "category"] == "nonccds_coding_ORFs" & ORFs_ALL_filt[, "annotation"] != "protein_coding")
if (length(rem) > 0) {
  ORFs_ALL_filt <- ORFs_ALL_filt[-rem, ]
}

ORFs_ALL_filt <- ORFs_ALL_filt[!is.na(ORFs_ALL_filt$ORF_pept), ]


# BED export needs to_check_ALL, so it runs before the column selection below.
export_bed(ORFs_ALL, "translated_ORFs")
export_bed(ORFs_ALL_filt, "translated_ORFs_filtered")


output_cols <- c("gene_id", "gene_symbol", "transcript_id", "annotation",
                 "length", "strand", "n_exons", "P_sites_sum", "RNA_sites", "Ribo_cov_aver",
                 "RNA_cov_aver", "category", "ORF_id_tr", "start_pos", "stop_pos", "annotated_start", "annotated_stop", "ORF_id_gen",
                 "ORF_length", "reads_ribo", "reads_rna", "ORF_P_sites", "ORF_Psit_pct_in_frame",
                 "ORF_RNA_sites", "ORF_RNAsit_pct_in_frame", "ORF_pval_multi_ribo",
                 "ORF_pval_multi_rna", "ORF_spec_multi_ribo", "ORF_spec_multi_rna", "ORF_id_tr_annotated", "n_exons_ORF", "pct_region_covered_ribo", "pct_covered_onlymulti_ribo", "pct_region_covered_rna",
                 "pct_covered_onlymulti_rna", "Method", "header_tofasta", "ORF_pept")

ORFs_ALL <- rename_read_columns(ORFs_ALL[, output_cols])
ORFs_ALL_filt <- rename_read_columns(ORFs_ALL_filt[, output_cols])


write.table(ORFs_ALL_filt, file = "ORFs_max_filt", quote = FALSE, col.names = TRUE, row.names = FALSE, sep = "\t")
write.table(ORFs_ALL, file = "ORFs_max", quote = FALSE, col.names = TRUE, row.names = FALSE, sep = "\t")
write.fasta(sequences = as.list(ORFs_ALL$ORF_pept), names = ORFs_ALL$header_tofasta, file.out = "protein_db_max.fasta")

print(paste("--- protein db and output final ORFs, Done! ---",date(),sep=" "))


diff --git a/scripts/create_tracks.bash b/scripts/create_tracks.bash
new file mode 100755
index 0000000..40f1305
--- /dev/null
+++ b/scripts/create_tracks.bash
@@ -0,0 +1,75 @@
#!/bin/bash


###################################################################
# This file is part of RiboTaper.
+# RiboTaper is a method for defining traslated ORFs using +# Ribosome Profiling data. +# +# Copyright (C) 2015 Lorenzo Calviello +# +# RiboTaper is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# RiboTaper is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with RiboTaper. If not, see . +# +# Contact: Lorenzo.Calviello@mdc-berlin.de +####################################################################### + + +##This script creates the data_tracks files, it uses as arguments a bed file, a name as an appendix for further analysis, the bedtools exec directory + +if [ $# -ne 4 ]; then + echo "Usage: create_tracks.bash " + exit 1 +fi +if ! [[ -f "$1" ]]; then + echo "!!!!! bed file not found!." + exit 1 + fi + +if ! [[ -f "$2" ]]; then + echo "!!!!! fasta file not found!." 
+ exit 1 + fi +bedtools_dir=$4 + +mkdir -p data_tracks + +echo "-----Calculating coverage tracks for each exon-----" + +$bedtools_dir"/coverageBed" -s -split -abam RIBO_unique.bam -b $1 | sort -k1,1 -k2,2g | sed 's/_//g' | awk '{ print $1 "_" $2 "_" $3 "_" $4 "_" $5"\t" $6 "\t" $7 "\t" $8 "\t" $9 "\t" $10}' > RIBO_unique_counts"_$3" + +$bedtools_dir"/coverageBed" -s -split -abam RNA_unique.bam -b $1 | sort -k1,1 -k2,2g | sed 's/_//g' | awk '{ print $1 "_" $2 "_" $3 "_" $4 "_" $5"\t" $6 "\t" $7 "\t" $8 "\t" $9 "\t" $10}' > RNA_unique_counts"_$3" + +$bedtools_dir"/coverageBed" -s -split -abam RIBO_best.bam -b $1 | sort -k1,1 -k2,2g | sed 's/_//g' | awk '{ print $1 "_" $2 "_" $3 "_" $4 "_" $5"\t" $6 "\t" $7 "\t" $8 "\t" $9 "\t" $10}' > RIBO_best_counts"_$3" + +$bedtools_dir"/coverageBed" -s -split -abam RNA_best.bam -b $1 | sort -k1,1 -k2,2g | sed 's/_//g' | awk '{ print $1 "_" $2 "_" $3 "_" $4 "_" $5"\t" $6 "\t" $7 "\t" $8 "\t" $9 "\t" $10}' > RNA_best_counts"_$3" + + +$bedtools_dir"/coverageBed" -s -d -a P_sites_all -b $1 | awk '{ print $1 ";" $2 ";" $3 ";" $4 ";" $5";" $6 "\t" "_" $8}' | awk -F"\t" '{if(a[$1])a[$1]=a[$1]" "$NF; else a[$1]=$NF}END{for (i in a)print i "\t" a[i]}'| sed 's/_//g' | sed 's/;/\t/g' | sort -k1,1 -k4,4 -k2,2g > data_tracks/P_sites_all_tracks"_$3" + + +$bedtools_dir"/coverageBed" -s -d -split -abam RIBO_best.bam -b $1 | awk '{ print $1 ";" $2 ";" $3 ";" $4 ";" $5";" $6 "\t" "_" $8}' | awk -F"\t" '{if(a[$1])a[$1]=a[$1]" "$NF; else a[$1]=$NF}END{for (i in a)print i "\t" a[i]}'| sed 's/_//g' | sed 's/;/\t/g' | sort -k1,1 -k4,4 -k2,2g > data_tracks/RIBO_tracks"_$3" + +$bedtools_dir"/coverageBed" -s -d -split -abam RNA_best.bam -b $1 | awk '{ print $1 ";" $2 ";" $3 ";" $4 ";" $5";" $6 "\t" "_" $8}' | awk -F"\t" '{if(a[$1])a[$1]=a[$1]" "$NF; else a[$1]=$NF}END{for (i in a)print i "\t" a[i]}'| sed 's/_//g' | sed 's/;/\t/g' | sort -k1,1 -k4,4 -k2,2g > data_tracks/RNA_tracks"_$3" + +$bedtools_dir"/coverageBed" -s -d -a Centered_RNA -b $1 | awk '{ 
print $1 ";" $2 ";" $3 ";" $4 ";" $5";" $6 "\t" "_" $8}' | awk -F"\t" '{if(a[$1])a[$1]=a[$1]" "$NF; else a[$1]=$NF}END{for (i in a)print i "\t" a[i]}'| sed 's/_//g' | sed 's/;/\t/g' | sort -k1,1 -k4,4 -k2,2g > data_tracks/Centered_RNA_tracks"_$3" + +echo "-----Merging tracks together-----" + + +cat data_tracks/P_sites_all_tracks"_$3" data_tracks/RIBO_tracks"_$3" data_tracks/RNA_tracks"_$3" data_tracks/Centered_RNA_tracks"_$3" $2 | tr '\t' '_' | sed 's/_/\t/6' | awk -F"\t" '{a[$1]=a[$1]"\n" $1 "\t" $2}END{for (i in a)print i "\t" a[i];}' | awk -F"\t" '{if($2>=0)print $0}' | sed 's/_/ /5' | sed 's/\t/ /1' > data_tracks/Psit_Ribo_Rna_Cent_tracks"_$3" + +cut -f 1 data_tracks/Psit_Ribo_Rna_Cent_tracks"_$3" -d" " > data_tracks/index_tracks"_$3" + + + diff --git a/scripts/functions.R b/scripts/functions.R new file mode 100755 index 0000000..a6b1cb0 --- /dev/null +++ b/scripts/functions.R @@ -0,0 +1,1734 @@ +library("XNomial") +library("foreach") +library("doMC") + +library("multitaper") +library("seqinr") + +################################################################### +# This file is part of RiboTaper. +# RiboTaper is a method for defining traslated ORFs using +# Ribosome Profiling data. +# +# Copyright (C) 2015 Lorenzo Calviello +# +# RiboTaper is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# RiboTaper is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with RiboTaper. If not, see . 
#
# Contact: Lorenzo.Calviello@mdc-berlin.de
#######################################################################


### Reads a big text file efficiently and returns its lines as a character
### vector: from http://www.r-bloggers.com/faster-files-in-r/
### x: path of the file to read.
readBigText<-function(x){
  f <- file(x, "rb")
  # Close the connection even when reading fails.
  on.exit(close(f), add = TRUE)
  a <- readChar(f, file.info(x)$size, useBytes = TRUE)
  a <- strsplit(a, "\n", fixed = TRUE, useBytes = TRUE)[[1]]
  return(a)
}



### Returns the frequency assigned to each FFT bin of `data`, from
### http://stackoverflow.com/questions/3485456/useful-little-functions-in-r
### Nyq.Freq: the Nyquist frequency; data: the vector that is transformed.
### The first ceiling(n/2) values run from 0 to Nyq.Freq, the remaining
### floor(n/2) values from -Nyq.Freq to 0.


getFFTFreqs<-function(Nyq.Freq, data)
{
  n_samples <- length(data)
  # Odd n: (n+1)/2 and (n-1)/2 bins; even n: n/2 and n/2 bins.
  n_upper <- ceiling(n_samples / 2)
  n_lower <- floor(n_samples / 2)
  FFTFreqs <- c(seq(0, Nyq.Freq, length.out = n_upper),
                seq(-Nyq.Freq, 0, length.out = n_lower))

  return (FFTFreqs)
}

### Returns c(freq_fft, power_fft, freq_spec, power_spec): the frequency
### closest to 1/3 and its value, once from the raw FFT modulus and once from
### spectrum(). Inputs shorter than 10 are padded with three zeros on each
### side. Frequencies tied for "closest" are all returned, so the result can
### hold more than four values.

take_maxfreq_and_power_FFT_Spec<-function(x){

  if (length(x) < 10) {
    x <- c(rep(0, 3), x, rep(0, 3))
  }
  fft_freqs <- getFFTFreqs(Nyq.Freq = 0.5, data = x)
  fft_mod <- Mod(fft(x))

  fft_dist <- abs(abs(fft_freqs) - (1/3))
  fft_hit <- which(fft_dist == min(fft_dist))
  freq3_fft <- abs(fft_freqs[fft_hit])
  power3_fft <- fft_mod[fft_hit]

  spect_x <- spectrum(x, plot = FALSE)

  sp_dist <- abs(spect_x$freq - (1/3))
  sp_hit <- which(sp_dist == min(sp_dist))
  freq3_sp <- abs(spect_x$freq[sp_hit])
  power3_sp <- abs(spect_x$spec[sp_hit])

  return(c(freq3_fft, power3_fft, freq3_sp, power3_sp))
}


### Plots the raw FFT periodogram of `y`, from
### http://stackoverflow.com/questions/3485456/useful-little-functions-in-r
### x: sample positions (only max(x) is used, for the shaded rectangle);
### samplingFreq: sampling frequency; shadeNyq: shade the low-frequency band;
### showPeriod: add a period axis on top.
### Invisibly returns list(freq, FFT, modFFT).


plotFFT<-function(x, y, samplingFreq, shadeNyq=TRUE, showPeriod = TRUE)
{
  Nyq.Freq <- samplingFreq/2
  FFTFreqs <- getFFTFreqs(Nyq.Freq, y)

  FFT <- fft(y)
  modFFT <- Mod(FFT)
  FFTdata <- cbind(FFTFreqs, modFFT)
  # First half of the bins. The former index 1:nrow(FFTdata)/2 parsed as
  # (1:n)/2 and selected row 0 plus every row of the first half twice.
  first_half <- seq_len(nrow(FFTdata) %/% 2)
  plot(FFTdata[first_half, , drop = FALSE], type="l", pch=20, lwd=2, cex=0.8, main="",
       xlab="Frequency (Hz)", ylab="Power")
  if (showPeriod == TRUE)
  {
    # Period axis on top
    a <- axis(3, lty=0, labels=FALSE)
    axis(3, cex.axis=0.6, labels=format(1/a, digits=2), at=a)
  }
  if (shadeNyq == TRUE)
  {
    # Gray out lower frequencies
    rect(0, 0, 2/max(x), max(FFTdata[,2])*2, col="gray", density=30)
  }

  invisible(list("freq"=FFTFreqs, "FFT"=FFT, "modFFT"=modFFT))
}

### This function calculates the CSCPD as in Michel et al 2012 Gen Res


dual_take_CSCPDs<-function(tracks_to_analyze=all_tracks,index_tracks=all_tracks_index,exon_ids=all_tracks_index){
  unique_index<-unique(as.data.frame(exon_ids)[,"exon_id"])
  interpolation_mat1 = matrix(NA, nrow = length(unique_index), ncol=100)
  interpolation_mat2 = matrix(NA, nrow = length(unique_index), ncol=100)
  interpolation_mat3 = matrix(NA, nrow = length(unique_index), ncol=100)
  rownames(interpolation_mat1)<-unique_index
  rownames(interpolation_mat2)<-unique_index
  rownames(interpolation_mat3)<-unique_index
  for(i in 1:length(unique_index)){
    id<-unique_index[i]
    exon_track<-tracks_to_analyze[index_tracks[,1]==id]
    withsep<-strsplit(exon_track,split=" ")
    x<-t(data.frame(withsep))
    #rnames[i]<-x[1,1]
    strand<-x[1,2]
    if(length(grep("CCDS",id))>0){tracks_pre<-t(x[,-c(1:3)])} else {
      tracks_pre<-t(x[,-c(1:2)])}
    if(strand=="-"){
      tracks<-cbind(rev(tracks_pre[,1]),rev(tracks_pre[,2]),rev(tracks_pre[,3]),rev(tracks_pre[,4]))
    } else if (strand=="+"){
      tracks<-tracks_pre}
    colnames(tracks)<-c("Psites","RiboCov","RNACov","RNAcent")
    mode(tracks)<-"numeric"
    length<-dim(tracks)[1]
    Phase_P_sites_frame<-sum(tracks[seq(1,length,by=3),1])
    Phase_P_sites_frame_1<-sum(tracks[seq(2,length,by=3),1])
    Phase_P_sites_frame_2<-sum(tracks[seq(3,length,by=3),1])
    FRAME_MAX_phase<-max.col(t(c(Phase_P_sites_frame,Phase_P_sites_frame_1,Phase_P_sites_frame_2)))-1
nts_toadd<-(3-FRAME_MAX_phase)%%3 + + read_mat<-t(as.matrix(c(rep(x=0,nts_toadd),tracks[,1]))) + length<-(length(read_mat)) + + upprop = matrix(NA,nrow = 1,ncol = length) + downprop = matrix(NA,nrow = 1,ncol = length) + seq_length1 = (length/3)-1 + seq_length2 = seq_length1 - 1 + + #calculate the cumulative upstream proportions + denom = cumsum(read_mat[1,]) + upprop[1,(1+3*(0:seq_length1))] = cumsum(read_mat[1,(1+3*(0:seq_length1))])/ + denom[(3+3*(0:seq_length1))] + upprop[1,(2+3*(0:seq_length1))] = cumsum(read_mat[1,(2+3*(0:seq_length1))])/ + denom[(3+3*(0:seq_length1))] + upprop[1,(3+3*(0:seq_length1))] = cumsum(read_mat[1,(3+3*(0:seq_length1))])/ + denom[(3+3*(0:seq_length1))] + #calculate the cumulative downstream proportions + totalsDown = rep(NA, seq_length1) + for (j in 1:seq_length1){ + totalsDown[j] = sum(read_mat[1+3*(j:seq_length1)]+ read_mat[2+3*(j:seq_length1)]+ + read_mat[3+3*(j:seq_length1)])} + downprop[1,(1+3*(0:seq_length2))] = rev(cumsum(rev(read_mat[1,(1+3*(1:seq_length1))])))/ + totalsDown[0:seq_length1] + downprop[1,(2+3*(0:seq_length2))] = rev(cumsum(rev(read_mat[1,(2+3*(1:seq_length1))])))/ + totalsDown[0:seq_length1] + downprop[1,(3+3*(0:seq_length2))] = rev(cumsum(rev(read_mat[1,(3+3*(1:seq_length1))])))/ + totalsDown[0:seq_length1] + #Calculate the CSCPD (absolute difference between cumulative upstream and downstream + #proportions for sub-codon positions 1,2,3) + third_full_seq_minus1 = (length/3)-1 + y1 = abs(upprop-downprop)[1+3*(0:third_full_seq_minus1)] + y1_withoutNA = y1[which(y1 != "NA")] + y1_withoutNaN = y1_withoutNA[which(y1_withoutNA != "NaN")] + + y2 = abs(upprop-downprop)[2+3*(0:third_full_seq_minus1)] + y2_withoutNA = y2[which(y2 != "NA")] + y2_withoutNaN = y2_withoutNA[which(y2_withoutNA != "NaN")] + + y3 = abs(upprop-downprop)[3+3*(0:third_full_seq_minus1)] + y3_withoutNA = y3[which(y3 != "NA")] + y3_withoutNaN = y3_withoutNA[which(y3_withoutNA != "NaN")] + + + + if(length(y1_withoutNaN)==0 ){ + 
y1_withoutNaN<-c(0.1,0.1,0.1,0.1,0.1) + } + if(length(y2_withoutNaN)==0 ){ + y2_withoutNaN<-c(0.1,0.1,0.1,0.1,0.1) + } + if(length(y3_withoutNaN)==0 ){ + y3_withoutNaN<-c(0.1,0.1,0.1,0.1,0.1) + } + + + + + if(length(y1_withoutNaN)<4 & length(y1_withoutNaN)>2 ){ + y1_withoutNaN<-c(y1_withoutNaN[1],y1_withoutNaN,y1_withoutNaN[length(y1_withoutNaN)]) + } + if(length(y2_withoutNaN)<4 & length(y2_withoutNaN)>2 ){ + y2_withoutNaN<-c(y2_withoutNaN[1],y2_withoutNaN,y2_withoutNaN[length(y2_withoutNaN)]) + } + if(length(y3_withoutNaN)<4 & length(y3_withoutNaN)>2 ){ + y3_withoutNaN<-c(y3_withoutNaN[1],y3_withoutNaN,y3_withoutNaN[length(y3_withoutNaN)]) + } + + + + if(length(y1_withoutNaN)<3 & length(y1_withoutNaN)>0 ){ + y1_withoutNaN<-c(y1_withoutNaN[1],y1_withoutNaN[1],y1_withoutNaN,y1_withoutNaN[length(y1_withoutNaN)],y1_withoutNaN[length(y1_withoutNaN)]) + } + if(length(y2_withoutNaN)<3 & length(y2_withoutNaN)>0 ){ + y2_withoutNaN<-c(y2_withoutNaN[1],y2_withoutNaN[1],y2_withoutNaN,y2_withoutNaN[length(y2_withoutNaN)],y2_withoutNaN[length(y2_withoutNaN)]) + } + if(length(y3_withoutNaN)<3 & length(y3_withoutNaN)>0 ){ + y3_withoutNaN<-c(y3_withoutNaN[1],y3_withoutNaN[1],y3_withoutNaN,y3_withoutNaN[length(y3_withoutNaN)],y3_withoutNaN[length(y3_withoutNaN)]) + } + + + + length_third_seq_withoutNaN = length(y1_withoutNaN) + length_third_seq_withoutNaN_minus1= length(y1_withoutNaN)-1 + + + + x1 = 1+3*(0:length_third_seq_withoutNaN_minus1) + x2 = 2+3*(0:length_third_seq_withoutNaN_minus1) + x3 = 3+3*(0:length_third_seq_withoutNaN_minus1) + #Converting all coordinates in coding region to relative values between 0 and 1 and using a smoothing function + ys1 = smooth.spline(x1/(length_third_seq_withoutNaN*3),y1_withoutNaN) + ys2 = smooth.spline(x2/(length_third_seq_withoutNaN*3),y2_withoutNaN) + ys3 = smooth.spline(x3/(length_third_seq_withoutNaN*3),y3_withoutNaN) + #Sampling 100 equidistant CSCPD values between 0 and 1 + xout = 0.01*(1:100) + yout1 = predict(ys1,xout)$y + yout2 = 
predict(ys2,xout)$y + yout3 = predict(ys3,xout)$y + interpolation_mat1[i,] = yout1 + interpolation_mat2[i,] = yout2 + interpolation_mat3[i,] = yout3 + } + interpolation_list<-list(interpolation_mat1,interpolation_mat2,interpolation_mat3) + return(interpolation_list) +} + + +### This function calculates the PTS as in Michel et al 2012 Gen Res + +dual_calculate_PTSs<-function(all_tracks,index,prev_percentiles){ + data_frame<-as.data.frame(prev_percentiles) + attach(data_frame) + unique_index<-unique(index) + difference_mat1 = matrix(NA, nrow = dim(unique_index)[1], ncol=100) + difference_mat2 = matrix(NA, nrow = dim(unique_index)[1], ncol=100) + difference_mat3 = matrix(NA, nrow = dim(unique_index)[1], ncol=100) + + + rnames = rep("",dim(unique_index)[1]) + cnames = c("PTS1", "PTS2", "PTS3", "PTS") + PTS = matrix(0, nrow = dim(unique_index)[1], ncol=4, dimnames=list(rnames,cnames)) + + + for(i in 1:dim(unique_index)[1]){ + id<-unique_index[i,] + exon_track<-all_tracks[index==id] + withsep<-strsplit(exon_track,split=" ") + x<-t(data.frame(withsep)) + rnames[i]<-x[1,1] + strand<-x[1,2] + if(length(grep("CCDS",id))>0){tracks_pre<-t(x[,-c(1:3)])} else { + tracks_pre<-t(x[,-c(1:2)])} + if(strand=="-"){ + tracks<-cbind(rev(tracks_pre[,1]),rev(tracks_pre[,2]),rev(tracks_pre[,3]),rev(tracks_pre[,4])) + } else if (strand=="+"){ + tracks<-tracks_pre} + colnames(tracks)<-c("Psites","RiboCov","RNACov","RNAcent") + mode(tracks)<-"numeric" + length<-dim(tracks)[1] + Phase_P_sites_frame<-sum(tracks[seq(1,length,by=3),1]) + Phase_P_sites_frame_1<-sum(tracks[seq(2,length,by=3),1]) + Phase_P_sites_frame_2<-sum(tracks[seq(3,length,by=3),1]) + FRAME_MAX_phase<-max.col(t(c(Phase_P_sites_frame,Phase_P_sites_frame_1,Phase_P_sites_frame_2)))-1 + nts_toadd<-(3-FRAME_MAX_phase)%%3 + + read_mat<-t(as.matrix(c(rep(x=0,nts_toadd),tracks[,1]))) + length<-(length(read_mat)) + + upprop = matrix(NA,nrow = 1,ncol = length) + downprop = matrix(NA,nrow = 1,ncol = length) + seq_length1 = (length/3)-1 + 
seq_length2 = seq_length1 - 1 + + #calculate the cumulative upstream proportions + denom = cumsum(read_mat[1,]) + upprop[1,(1+3*(0:seq_length1))] = cumsum(read_mat[1,(1+3*(0:seq_length1))])/ + denom[(3+3*(0:seq_length1))] + upprop[1,(2+3*(0:seq_length1))] = cumsum(read_mat[1,(2+3*(0:seq_length1))])/ + denom[(3+3*(0:seq_length1))] + upprop[1,(3+3*(0:seq_length1))] = cumsum(read_mat[1,(3+3*(0:seq_length1))])/ + denom[(3+3*(0:seq_length1))] + #calculate the cumulative downstream proportions + totalsDown = rep(NA, seq_length1) + for (j in 1:seq_length1){ + totalsDown[j] = sum(read_mat[1+3*(j:seq_length1)]+ read_mat[2+3*(j:seq_length1)]+ + read_mat[3+3*(j:seq_length1)])} + downprop[1,(1+3*(0:seq_length2))] = rev(cumsum(rev(read_mat[1,(1+3*(1:seq_length1))])))/ + totalsDown[0:seq_length1] + downprop[1,(2+3*(0:seq_length2))] = rev(cumsum(rev(read_mat[1,(2+3*(1:seq_length1))])))/ + totalsDown[0:seq_length1] + downprop[1,(3+3*(0:seq_length2))] = rev(cumsum(rev(read_mat[1,(3+3*(1:seq_length1))])))/ + totalsDown[0:seq_length1] + #Calculate the CSCPD (absolute difference between cumulative upstream and downstream + #proportions for sub-codon positions 1,2,3) + third_full_seq_minus1 = (length/3)-1 + y1 = abs(upprop-downprop)[1+3*(0:third_full_seq_minus1)] + y1_withoutNA = y1[which(y1 != "NA")] + y1_withoutNaN = y1_withoutNA[which(y1_withoutNA != "NaN")] + + y2 = abs(upprop-downprop)[2+3*(0:third_full_seq_minus1)] + y2_withoutNA = y2[which(y2 != "NA")] + y2_withoutNaN = y2_withoutNA[which(y2_withoutNA != "NaN")] + + y3 = abs(upprop-downprop)[3+3*(0:third_full_seq_minus1)] + y3_withoutNA = y3[which(y3 != "NA")] + y3_withoutNaN = y3_withoutNA[which(y3_withoutNA != "NaN")] + + if(length(y1_withoutNaN)<4){ + y1_withoutNaN<-c(y1_withoutNaN[1],y1_withoutNaN[1],y1_withoutNaN,y1_withoutNaN[length(y1_withoutNaN)],y1_withoutNaN[length(y1_withoutNaN)]) + } + if(length(y2_withoutNaN)<4){ + 
y2_withoutNaN<-c(y2_withoutNaN[1],y2_withoutNaN[1],y2_withoutNaN,y2_withoutNaN[length(y2_withoutNaN)],y2_withoutNaN[length(y2_withoutNaN)]) + } + if(length(y3_withoutNaN)<4){ + y3_withoutNaN<-c(y3_withoutNaN[1],y3_withoutNaN[1],y3_withoutNaN,y3_withoutNaN[length(y3_withoutNaN)],y3_withoutNaN[length(y3_withoutNaN)]) + } + + + length_third_seq_withoutNaN = length(y1_withoutNaN) + length_third_seq_withoutNaN_minus1= length(y1_withoutNaN)-1 + + + + x1 = 1+3*(0:length_third_seq_withoutNaN_minus1) + x2 = 2+3*(0:length_third_seq_withoutNaN_minus1) + x3 = 3+3*(0:length_third_seq_withoutNaN_minus1) + #Converting all coordinates in coding region to relative values between 0 and 1 and using a smoothing function + ys1 = smooth.spline(x1/(length_third_seq_withoutNaN*3),y1_withoutNaN) + ys2 = smooth.spline(x2/(length_third_seq_withoutNaN*3),y2_withoutNaN) + ys3 = smooth.spline(x3/(length_third_seq_withoutNaN*3),y3_withoutNaN) + #Sampling 100 equidistant CSCPD values between 0 and 1 + + xout = 0.01*(1:100) + yout1 = predict(ys1,xout)$y + yout2 = predict(ys2,xout)$y + yout3 = predict(ys3,xout)$y + + + difference_mat1[i,] = yout1 - Percentile_P1 + difference_mat2[i,] = yout2 - Percentile_P2 + difference_mat3[i,] = yout3 - Percentile_P3 + gene_counter<-i + + for (k in 1:100) + { + if (as.numeric(difference_mat1[gene_counter,k]) >= 0){ + PTS[gene_counter,1] = PTS[gene_counter,1] + difference_mat1[gene_counter,k] + } + if (difference_mat2[gene_counter,k] >= 0){ + PTS[gene_counter,2] = PTS[gene_counter,2] + difference_mat2[gene_counter,k] + } + if (difference_mat3[gene_counter,k] >= 0){ + PTS[gene_counter,3] = PTS[gene_counter,3] + difference_mat3[gene_counter,k] + } + } + + PTS[gene_counter,4] = PTS[gene_counter,1] + PTS[gene_counter,2] + PTS[gene_counter,3] + + } + detach(data_frame) + PTS<-as.data.frame(PTS,row.names=F,stringsAsFactors=F) + PTS$exon_id<-unique_index[,1] + PTS<-PTS[,c("exon_id","PTS1","PTS2","PTS3","PTS")] + return(PTS) +} + + +### This function plots the PTS as in 
### Michel et al 2012 Gen Res (outdated)
### Plots the three CSCPD curves (sub-codon positions 1, 2, 3) of one exon.
###   x: list of three matrices as returned by dual_take_CSCPDs.
###   y: row (exon) to plot.
plot_CSCPDs <- function(x, y) {
  exon_id <- rownames(x[[1]])[y]
  plot(x[[1]][y, ], type = "l", col = "red", ylim = c(0, 1), ylab = "CSCPDs")
  lines(x[[2]][y, ], type = "l", col = "green")
  lines(x[[3]][y, ], type = "l", col = "blue")
  legend("top", exon_id)
}


### This function plots data_tracks information (P-sites distribution, FFT
### etc...) (outdated)
###   exon_id:         id of the exon to plot.
###   complete_tracks: character vector of track lines.
###   index:           exon id of each element of complete_tracks.
### Opens a new x11() device and draws, per exon: P-sites, their frame
### fractions and FFT, Ribo coverage, centered RNA sites with frame fractions
### and FFT, and RNA coverage.
plot_tracks_fig <- function(exon_id, complete_tracks = all_tracks,
                            index = all_tracks_index) {

  exon_track <- complete_tracks[index == exon_id]
  withsep <- strsplit(exon_track, split = " ")
  x <- t(data.frame(withsep))
  strand <- x[1, 2]
  # Ids matching "CCDS" carry one extra leading field.
  if (length(grep("CCDS", exon_id)) > 0) {
    tracks_pre <- t(x[, -c(1:3)])
  } else {
    tracks_pre <- t(x[, -c(1:2)])
  }
  if (strand == "-") {
    tracks <- cbind(rev(tracks_pre[, 1]), rev(tracks_pre[, 2]),
                    rev(tracks_pre[, 3]), rev(tracks_pre[, 4]))
  } else if (strand == "+") {
    tracks <- tracks_pre
  } else {
    stop("unexpected strand '", strand, "' for exon ", exon_id, call. = FALSE)
  }
  colnames(tracks) <- c("Psites", "RiboCov", "RNACov", "RNAcent")
  mode(tracks) <- "numeric"
  n_nt <- dim(tracks)[1]

  # Fraction of the signal of column `col` falling in frame `offset` (1, 2, 3).
  frame_fraction <- function(col, offset) {
    round(x = sum(tracks[seq(offset, n_nt, by = 3), col]) / sum(tracks[, col]),
          digits = 4)
  }
  Phase0 <- frame_fraction(1, 1)
  Phase1 <- frame_fraction(1, 2)
  Phase2 <- frame_fraction(1, 3)

  Phase0_RNA <- frame_fraction(4, 1)
  Phase1_RNA <- frame_fraction(4, 2)
  Phase2_RNA <- frame_fraction(4, 3)

  # Closed outlines for the coverage polygons.
  valuesribo <- c(min(tracks[, 2]), tracks[, 2], min(tracks[, 2]))
  valuesrna <- c(min(tracks[, 3]), tracks[, 3], min(tracks[, 3]))
  positions <- seq(1, n_nt)
  nucleot <- c(min(positions), positions, max(positions))
  frame_cols <- c("red", "dark green", "blue")

  x11(width = 16, height = 10)
  par(mar = c(4, 4, 1, 1))
  split.screen(figs = c(2, 2))
  split.screen(figs = c(2, 1), screen = 1)

  screen(5)
  plot(tracks[, 1], type = "h", col = frame_cols, ylab = "P_sites", xlab = "nt")

  split.screen(figs = c(1, 2), screen = 6)
  screen(7)
  barplot(c(Phase0, Phase1, Phase2), xlab = "Phases", ylim = c(0, 1),
          ylab = "%_Alignments", main = "%Frames_RIBO", col = frame_cols)
  screen(8)
  plotFFT(x = positions, y = tracks[, 1], samplingFreq = 1)

  screen(2)
  plot(tracks[, 2], type = "l", col = "red", ylab = "Ribo_cov", xlab = "nt",
       main = x[1, 1])
  polygon(x = nucleot, y = valuesribo, col = "red")
  split.screen(figs = c(2, 1), screen = 3)
  screen(9)
  plot(tracks[, 4], type = "h", col = frame_cols, ylab = "RNA_center", xlab = "nt")
  split.screen(figs = c(1, 2), screen = 10)
  screen(11)
  barplot(c(Phase0_RNA, Phase1_RNA, Phase2_RNA), ylim = c(0, 1), xlab = "Phases",
          ylab = "%_Alignments", main = "%Frames_RNA", col = frame_cols)
  screen(12)
  plotFFT(x = positions, y = tracks[, 4], samplingFreq = 1)
  screen(4)
  plot(tracks[, 3], type = "l", col = "dark grey", ylab = "RNA_cov", xlab = "nt")
  polygon(x = nucleot, y = valuesrna, col = "dark grey")
  close.screen(all.screens = TRUE)
}


### This function calculates the PTS from the CSPD, as in Michel et al 2012 Gen Res
###   list_cscpds:    list of three matrices (rows = exons with row names,
###                   columns = sampled positions), e.g. from dual_take_CSCPDs.
###   quantile_value: per-column quantile used as threshold.
###   Returns a data.frame with columns exon_id, PTS1, PTS2, PTS3, PTS, where
###   PTSk sums the non-negative differences between each exon's values and
###   the column quantiles, and PTS = PTS1 + PTS2 + PTS3.
calculate_PTS_from_CSPDs <- function(list_cscpds, quantile_value = 0.95) {

  CSCPDs_1 <- list_cscpds[[1]]

  excess_over_quantile <- function(cscpd) {
    thresholds <- unname(apply(cscpd, 2, quantile, probs = quantile_value,
                               na.rm = TRUE))
    differences <- sweep(cscpd, 2, thresholds, "-")
    apply(differences, 1, function(v) sum(v[v >= 0]))
  }

  PTS <- matrix(0, nrow = dim(CSCPDs_1)[1], ncol = 4)
  PTS[, 1] <- excess_over_quantile(list_cscpds[[1]])
  PTS[, 2] <- excess_over_quantile(list_cscpds[[2]])
  PTS[, 3] <- excess_over_quantile(list_cscpds[[3]])
  # drop = FALSE keeps a one-exon input as a matrix; apply() on the dropped
  # vector used to fail.
  PTS[, 4] <- rowSums(PTS[, 1:3, drop = FALSE])

  PTS <- as.data.frame(PTS, row.names = NULL, stringsAsFactors = FALSE)
  colnames(PTS) <- c("PTS1", "PTS2", "PTS3", "PTS")
  PTS$exon_id <- rownames(CSCPDs_1)
  PTS <- PTS[, c("exon_id", "PTS1", "PTS2", "PTS3", "PTS")]
  PTS
}


### This function takes frequencies, F-test p-values and the spectral
### coefficient for a data-track object using the multitaper method.
### (you have to calculate slepian functions beforehand)
###   x:               numeric signal; signals shorter than 25 are zero-padded
###                    to length 50.
###   n_tapers:        number of tapers (k of spec.mtm).
###   time_bw:         time-bandwidth parameter (nw of spec.mtm).
###   slepians_values: precomputed dpss object passed as dpssIN.
###   Returns c(freq_max, P_all, freq_max_around_3nt, P_around_3nt,
###             freq_max_3nt, P_3nt, Spec_3nt):
###     1-2: frequency with the largest F statistic in [0.1, 0.45] and its p-value,
###     3-4: the same restricted to [0.29, 0.39],
###     5-7: frequency closest to 1/3, its p-value and its spectral estimate.
take_freqs_Fvalues_all_around_3nt_spec <- function(x, n_tapers, time_bw,
                                                   slepians_values) {
  if (length(x) < 25) {
    remain <- 50 - length(x)
    x <- c(rep(0, as.integer(remain / 2)), x,
           rep(0, remain %% 2 + as.integer(remain / 2)))
  }
  if (length(x) < 1024 / 2) {
    padding <- 1024
  }
  if (length(x) >= 1024 / 2) {
    padding <- "default"
  }
  resSpec1 <- spec.mtm(as.ts(x), k = n_tapers, nw = time_bw, nFFT = padding,
                       centreWithSlepians = TRUE, Ftest = TRUE,
                       maxAdaptiveIterations = 100, returnZeroFreq = FALSE,
                       plot = FALSE, dpssIN = slepians_values)
  p_from_F <- function(f_value) {
    pf(q = f_value, df1 = 2, df2 = (2 * n_tapers) - 2, lower.tail = FALSE)
  }

  # which.max()/which.min() always pick a single index, so the result keeps
  # its 7 positions even when two frequencies tie (callers index it by
  # position).
  resSpec2 <- dropFreqs(resSpec1, 0.1, 0.45)
  i_all <- which.max(resSpec2$mtm$Ftest)
  freq_max <- resSpec2$freq[i_all]
  P_all <- p_from_F(resSpec2$mtm$Ftest[i_all])

  resSpec2 <- dropFreqs(resSpec2, 0.29, 0.39)
  i_around <- which.max(resSpec2$mtm$Ftest)
  freq_max_around_3nt <- resSpec2$freq[i_around]
  P_around_3nt <- p_from_F(resSpec2$mtm$Ftest[i_around])

  i_3nt <- which.min(abs(resSpec1$freq - (1 / 3)))
  freq_max_3nt <- resSpec1$freq[i_3nt]
  P_3nt <- p_from_F(resSpec1$mtm$Ftest[i_3nt])
  Spec_3nt <- resSpec1$spec[i_3nt]

  c(freq_max, P_all, freq_max_around_3nt, P_around_3nt, freq_max_3nt, P_3nt,
    Spec_3nt)
}


### This function calculates periodicity and other statistics on single exon tracks
###   x: character matrix with one row per track (Psites, RiboCov, RNACov,
###      RNAcent, Seq); column 1 is the exon id, column 2 the strand, the
###      remaining columns are per-nucleotide values.
###   Returns a one-row data.frame of statistics for the exon (frame
###   fractions, multitaper / FFT / spectrum values, ORF scores, frame
###   uniformity p-values, uncovered stretches, predicted frames).
make_analysis_exons <- function(x) {
  strand <- x[1, 2]
  tracks_pre <- t(x[, -c(1:2)])

  if (strand == "-") {
    tracks <- cbind(rev(tracks_pre[, 1]), rev(tracks_pre[, 2]),
                    rev(tracks_pre[, 3]), rev(tracks_pre[, 4]),
                    rev(tracks_pre[, 5]))
  } else if (strand == "+") {
    tracks <- tracks_pre
  } else {
    stop("unexpected strand '", strand, "' for exon ", x[1, 1], call. = FALSE)
  }
  colnames(tracks) <- c("Psites", "RiboCov", "RNACov", "RNAcent", "Seq")
  tracks <- tracks[, 1:4]
  if (is.null(dim(tracks))) {
    # A single-nucleotide exon drops to a vector; restore the 1 x 4 matrix.
    tracks <- t(as.matrix(tracks))
  }

  mode(tracks) <- "numeric"

  exon <- data.frame(exon_id = x[1, 1], stringsAsFactors = FALSE, row.names = NULL)
  exon$strand <- strand
  exon$frame_start_pred <- NA
  exon$frame_end_pred <- NA

  exon$length <- dim(tracks)[1]
  length <- dim(tracks)[1]

  P_sites_sum <- round(sum(tracks[, 1]), digits = 6)
  exon$P_sites_sum <- P_sites_sum

  Centered_sites_sum <- round(sum(tracks[, 4]), digits = 6)
  exon$RNA_sites_sum <- Centered_sites_sum
  exon$Ribocov_aver <- round(mean(tracks[, 2]), digits = 6)
  exon$RNAseqcov_aver <- round(mean(tracks[, 3]), digits = 6)
  for (column in c("pctPhase_frame", "pctPhase_frame_1", "pctPhase_frame_2",
                   "pctPhaseCentered_frame", "pctPhaseCentered_frame_1",
                   "pctPhaseCentered_frame_2")) {
    exon[[column]] <- NA
  }

  if (length > 2) {
    Phase_P_sites_frame <- sum(tracks[seq(1, length, by = 3), 1])
    Phase_P_sites_frame_1 <- sum(tracks[seq(2, length, by = 3), 1])
    Phase_P_sites_frame_2 <- sum(tracks[seq(3, length, by = 3), 1])

    exon$pctPhase_frame <- Phase_P_sites_frame / P_sites_sum
    exon$pctPhase_frame_1 <- Phase_P_sites_frame_1 / P_sites_sum
    exon$pctPhase_frame_2 <- Phase_P_sites_frame_2 / P_sites_sum

    Phase_Centered_sites_frame <- sum(tracks[seq(1, length, by = 3), 4])
    Phase_Centered_sites_frame_1 <- sum(tracks[seq(2, length, by = 3), 4])
    Phase_Centered_sites_frame_2 <- sum(tracks[seq(3, length, by = 3), 4])

    exon$pctPhaseCentered_frame <- Phase_Centered_sites_frame / Centered_sites_sum
    exon$pctPhaseCentered_frame_1 <- Phase_Centered_sites_frame_1 / Centered_sites_sum
    exon$pctPhaseCentered_frame_2 <- Phase_Centered_sites_frame_2 / Centered_sites_sum

    # max.col() breaks ties at random. Both calls are kept (the second result
    # is unused) so that the random number stream is consumed as before.
    FRAME_MAX_phase <- max.col(t(c(exon$pctPhase_frame, exon$pctPhase_frame_1,
                                   exon$pctPhase_frame_2))) - 1
    FRAME_MAX_phaseCentered <- max.col(t(c(exon$pctPhaseCentered_frame,
                                           exon$pctPhaseCentered_frame_1,
                                           exon$pctPhaseCentered_frame_2))) - 1
  }

  for (column in c("multit_freq_best_ribo", "pval_multit_3nt_ribo",
                   "spec_multit_3nt_ribo", "fft_max_freq_ribo",
                   "fft_power_3_ribo", "fft_aver_ribo", "spec_max_freq_ribo",
                   "spec_power_3_ribo", "spec_aver_power_ribo",
                   "multit_freq_best_rna", "pval_multit_3nt_rna",
                   "spec_multit_3nt_rna", "fft_max_freq_rna",
                   "fft_power_3_rna", "fft_aver_rna", "spec_max_freq_rna",
                   "spec_power_3_rna", "spec_aver_power_rna",
                   "ORF_score_ribo", "ORF_score_rna")) {
    exon[[column]] <- NA
  }

  # log2(chi-square-like statistic + 1) of the three frame counts against a
  # uniform split of `total`.
  orf_score <- function(frame0, frame1, frame2, total) {
    score1 <- ((frame0 - total / 3)^2) / (total / 3)
    score2 <- ((frame1 - total / 3)^2) / (total / 3)
    score3 <- ((frame2 - total / 3)^2) / (total / 3)
    log2(score1 + score2 + score3 + 1)
  }

  if (P_sites_sum > 2 & length > 5) {
    # take_freqs_Fvalues_all_around_3nt_spec() zero-pads signals shorter than
    # 25 to length 50, so the tapers must be 50 long in that case.
    if (length < 25) {
      slepians <- dpss(n = length + (50 - length), k = 24, nw = 12)
    }
    if (length >= 25) {
      slepians <- dpss(n = length, k = 24, nw = 12)
    }
    bestfreq_3ntpval_ribo <- take_freqs_Fvalues_all_around_3nt_spec(
      x = tracks[, 1], n_tapers = 24, time_bw = 12,
      slepians_values = slepians)[c(1, 6, 7)]
    exon$multit_freq_best_ribo <- bestfreq_3ntpval_ribo[1]
    exon$pval_multit_3nt_ribo <- bestfreq_3ntpval_ribo[2]
    exon$spec_multit_3nt_ribo <- bestfreq_3ntpval_ribo[3]
    exon$ORF_score_ribo <- orf_score(Phase_P_sites_frame, Phase_P_sites_frame_1,
                                     Phase_P_sites_frame_2, P_sites_sum)

    # One position holding more than 70% of the P-sites: recompute the score
    # without it (NA when too little signal is left).
    if (max(tracks[, 1]) > (P_sites_sum * .7)) {
      new_track <- tracks
      new_track[which(new_track[, 1] == max(new_track[, 1])), 1] <- 0
      exon$ORF_score_ribo <- NA
      if (sum(new_track[, 1]) > 2) {
        exon$ORF_score_ribo <- orf_score(
          sum(new_track[seq(1, length, by = 3), 1]),
          sum(new_track[seq(2, length, by = 3), 1]),
          sum(new_track[seq(3, length, by = 3), 1]),
          sum(new_track[, 1]))
      }
    }

    gino <- getFFTFreqs(Nyq.Freq = 0.5, data = tracks[, 1])
    modFFT <- Mod(fft(tracks[, 1]))
    FFTdata <- cbind(gino, modFFT)
    exon$fft_aver_ribo <- mean(FFTdata[, 2])
    exon$fft_power_3_ribo <- FFTdata[which(abs((gino - (1 / 3))) == min(abs((gino - (1 / 3))))), 2]
    # NOTE(review): `10:dim(FFTdata)[1]/2` parses as (10:n)/2, i.e. rows 5..n/2
    # (each twice), not 10:(n/2). Left unchanged because the intended range is
    # not clear from this file - confirm.
    exon$fft_max_freq_ribo <- abs(gino[which(FFTdata == max((FFTdata[10:dim(FFTdata)[1] / 2, 2])), arr.ind = TRUE)[1]])[1]

    spect_P_sites <- spectrum(tracks[, 1], plot = FALSE)
    exon$spec_max_freq_ribo <- spect_P_sites$freq[which(spect_P_sites$spec == max(spect_P_sites$spec), arr.ind = TRUE)][1]
    exon$spec_power_3_ribo <- spect_P_sites$spec[which(abs((spect_P_sites$freq - (1 / 3))) == min(abs((spect_P_sites$freq - (1 / 3)))))]
    exon$spec_aver_power_ribo <- mean(spect_P_sites$spec)

    if (Centered_sites_sum > 2) {

      gino <- getFFTFreqs(Nyq.Freq = 0.5, data = tracks[, 4])
      modFFT <- Mod(fft(tracks[, 4]))
      FFTdata <- cbind(gino, modFFT)
      exon$fft_aver_rna <- mean(FFTdata[, 2])
      exon$fft_power_3_rna <- FFTdata[which(abs((gino - (1 / 3))) == min(abs((gino - (1 / 3))))), 2]
      # NOTE(review): unlike fft_max_freq_ribo this stores 1/frequency (a
      # period). Left unchanged; confirm which of the two is intended.
      exon$fft_max_freq_rna <- 1 / abs(gino[which(FFTdata == max((FFTdata[10:dim(FFTdata)[1] / 2, 2])), arr.ind = TRUE)[1]])[1]

      spect_rna <- spectrum(tracks[, 4], plot = FALSE)
      exon$spec_max_freq_rna <- spect_rna$freq[which(spect_rna$spec == max(spect_rna$spec), arr.ind = TRUE)][1]
      exon$spec_power_3_rna <- spect_rna$spec[which(abs((spect_rna$freq - (1 / 3))) == min(abs((spect_rna$freq - (1 / 3)))))]
      exon$spec_aver_power_rna <- mean(spect_rna$spec)
      bestfreq_3ntpval_rna <- take_freqs_Fvalues_all_around_3nt_spec(
        x = tracks[, 4], n_tapers = 24, time_bw = 12,
        slepians_values = slepians)[c(1, 6, 7)]

      exon$multit_freq_best_rna <- bestfreq_3ntpval_rna[1]
      exon$pval_multit_3nt_rna <- bestfreq_3ntpval_rna[2]
      exon$spec_multit_3nt_rna <- bestfreq_3ntpval_rna[3]
      exon$ORF_score_rna <- orf_score(Phase_Centered_sites_frame,
                                      Phase_Centered_sites_frame_1,
                                      Phase_Centered_sites_frame_2,
                                      Centered_sites_sum)
      if (max(tracks[, 4]) > (Centered_sites_sum * .7)) {
        new_track <- tracks
        # Zero the dominant position in column 4. The former linear index
        # new_track[which(...)] zeroed column 1 (P-sites) instead, so the
        # "corrected" RNA score was computed on uncorrected data.
        new_track[which(new_track[, 4] == max(new_track[, 4])), 4] <- 0
        exon$ORF_score_rna <- NA
        if (sum(new_track[, 4]) > 2) {
          exon$ORF_score_rna <- orf_score(
            sum(new_track[seq(1, length, by = 3), 4]),
            sum(new_track[seq(2, length, by = 3), 4]),
            sum(new_track[seq(3, length, by = 3), 4]),
            sum(new_track[, 4]))
        }
      }
    }
  }

  exon$chisq_ribo <- NA
  exon$chisq_rna <- NA

  # Frame uniformity test: chi-square for more than 15 sites, exact
  # multinomial test (xmulti) for 1 to 15 sites.
  if (P_sites_sum > 15 & length > 5) {
    exon$chisq_ribo <- chisq.test(as.table(c(Phase_P_sites_frame, Phase_P_sites_frame_1, Phase_P_sites_frame_2)))$p.value
  }
  if (P_sites_sum < 16 & P_sites_sum > 0 & length > 5) {
    exon$chisq_ribo <- xmulti(obs = c(Phase_P_sites_frame, Phase_P_sites_frame_1, Phase_P_sites_frame_2), expr = c(1, 1, 1), statName = "Prob", detail = 0)$pProb
  }

  if (Centered_sites_sum > 15 & length > 5) {
    exon$chisq_rna <- chisq.test(as.table(c(Phase_Centered_sites_frame, Phase_Centered_sites_frame_1, Phase_Centered_sites_frame_2)))$p.value
  }
  if (Centered_sites_sum < 16 & Centered_sites_sum > 0 & length > 5) {
    exon$chisq_rna <- xmulti(obs = c(Phase_Centered_sites_frame, Phase_Centered_sites_frame_1, Phase_Centered_sites_frame_2), expr = c(1, 1, 1), statName = "Prob", detail = 0)$pProb
  }

  # For every position: its 1-based rank inside the current run of equal
  # coverage values, zeroed where coverage is non-zero. The maximum is taken
  # as the longest uncovered stretch.
  zero_run_position <- function(coverage) {
    (!coverage) * unlist(lapply(rle(coverage)$lengths, seq_len))
  }
  runs_ribo <- zero_run_position(tracks[, 2])
  exon$max_notcov_ribo <- max(runs_ribo)
  exon$coords_notcov_ribo <- max.col(t(runs_ribo)) - max(runs_ribo)

  runs_rna <- zero_run_position(tracks[, 3])
  exon$max_notcov_rna <- max(runs_rna)
  exon$coords_notcov_rna <- max.col(t(runs_rna)) - max(runs_rna)

  if (strand == "-") {
    exon$coords_notcov_ribo <- length - exon$coords_notcov_ribo
    exon$coords_notcov_rna <- length - exon$coords_notcov_rna
  }

  # The string "NA" (not NA) is stored on purpose-preserving grounds: this is
  # what the original code writes when there is no uncovered stretch.
  if (exon$max_notcov_ribo == 0) {
    exon$max_notcov_ribo <- "NA"
  }

  if (exon$max_notcov_rna == 0) {
    exon$max_notcov_rna <- "NA"
  }

  exon$notcovered_ribo <- sum(tracks[, 2] == 0)
  exon$notcovered_rna <- sum(tracks[, 3] == 0)

  if (length > 2) {
    exon$frame_start_pred <- FRAME_MAX_phase
    exon$frame_end_pred <- (length - (FRAME_MAX_phase + 1)) %% 3
  }
  if (x[1, 2] == "-" & length > 2) {
    exon$frame_end_pred <- FRAME_MAX_phase
    exon$frame_start_pred <- (length - (FRAME_MAX_phase + 1)) %% 3
  }

  exon
}


### This function annotates exons based on their position relative to CCDS exons

### Annotates the exons of one table relative to its CCDS exons.
###   x: data.frame with (at least) the 15 columns selected below plus
###      "strand.x"; rows are presumably the exons of one gene ordered by
###      genomic position - TODO confirm against the caller.
###   Returns x with those 15 columns overwritten: "type" is relabelled and,
###   for exons overlapping exactly one CCDS exon (or containing the first /
###   last one), the nt_more* and overlapping_ccds_* columns are filled in.
###   When x has no row of type "ccds" the columns are written back unchanged.
annotate_exons<-function(x){
  annot_pos<-x[,c("type","start","end","length.x","P_sites_sum","RNA_sites_sum","notcovered_ribo","notcovered_rna","nt_more","nt_more_ribocovered","nt_more_P_sites","nt_more_rnacovered","nt_more_cent_sites","overlapping_ccds_start","overlapping_ccds_end")]

  ccdss<-which(annot_pos$type=="ccds")
  if(length(ccdss)>0){
    # Columns 2:3 are "start" and "end".
    ccdss_coords<-annot_pos[ccdss,2:3]
    ccdss_all<-annot_pos[ccdss,]


    middle_ex<-which(annot_pos[,1]=="exon")
    middle_ex_coords<-annot_pos[which(annot_pos[,1]=="exon"),]
    # One integer vector of covered positions per CCDS exon.
    listcoordsccds<-list()

    for(i in seq(1,dim(ccdss_coords)[1])){
      listcoordsccds[[i]]<-seq(from=ccdss_coords[i,1],to=ccdss_coords[i,2])
    }

    if(length(middle_ex)>0){
      for(y in seq(1,dim(middle_ex_coords)[1])){
        a<-seq(from=middle_ex_coords[y,2],to=middle_ex_coords[y,3])
        # Per CCDS exon: does it share positions with this exon, and are the
        # exon's first / last positions inside it?
        intersect<-c()
        beginning<-c()
        endpos<-c()
        for(i in seq(1,length(listcoordsccds))){
          b<-listcoordsccds[[i]]
          if(sum(a%in%b)>0){
            intersect[i]<-TRUE
            beginning[i]<-(a%in%b)[1]
            endpos[i]<-(a%in%b)[length(a%in%b)]} else {
            intersect[i]<-FALSE
            beginning[i]<-FALSE
            endpos[i]<-FALSE
          }
        }
        # both ends inside a CCDS -> inside; one end -> overlapping;
        # neither end -> the exon contains the CCDS.
        if(sum(intersect)>0 & sum(beginning)>0 & sum(endpos)>0){middle_ex_coords[y,1]<-"inside_ccds"}
        if(sum(intersect)>0 & sum(beginning)==0 & sum(endpos)>0){middle_ex_coords[y,1]<-"overlapping_ccds"}
        if(sum(intersect)>0 & sum(beginning)>0 & sum(endpos)==0){middle_ex_coords[y,1]<-"overlapping_ccds"}
        if(sum(intersect)>0 & sum(beginning)==0 & sum(endpos)==0){middle_ex_coords[y,1]<-"containing_ccds"}
        ccds_inters<-ccdss_all[intersect,]
        if(dim(ccds_inters)[1]>1){middle_ex_coords[y,1]<-"overlapping_multiple_ccdss"}
        # Exactly one overlapping CCDS exon: record how the exon differs from it.
        if(dim(ccds_inters)[1]==1){middle_ex_coords[y,"nt_more"]<-middle_ex_coords[y,"length.x"]-ccds_inters[,"length.x"]
          middle_ex_coords[y,"nt_more_ribocovered"]<-1-((middle_ex_coords[y,"notcovered_ribo"]-ccds_inters[,"notcovered_ribo"])/middle_ex_coords[y,"nt_more"])
          middle_ex_coords[y,"nt_more_P_sites"]<-middle_ex_coords[y,"P_sites_sum"]-ccds_inters[,"P_sites_sum"]
          middle_ex_coords[y,"nt_more_rnacovered"]<-1-((middle_ex_coords[y,"notcovered_rna"]-ccds_inters[,"notcovered_rna"])/middle_ex_coords[y,"nt_more"])
          middle_ex_coords[y,"nt_more_cent_sites"]<-middle_ex_coords[y,"RNA_sites_sum"]-ccds_inters[,"RNA_sites_sum"]
          middle_ex_coords[y,"overlapping_ccds_start"]<-ccds_inters[,"start"]
          middle_ex_coords[y,"overlapping_ccds_end"]<-ccds_inters[,"end"]}
      }
    }

    # Remember which exons were "inside_ccds" before the relabelling below.
    inside_ex<-middle_ex_coords[,1]=="inside_ccds"


    if(length(middle_ex)>0){
      # Exons sharing their start with a CCDS exon are labelled alt_donor,
      # those sharing their end alt_acceptor (labels are swapped for "-"
      # strand rows at the end of the function).
      middle_ex_coords[middle_ex_coords[,2]%in%ccdss_coords[,1] & middle_ex_coords[,1]!="overlapping_multiple_ccdss",1]<-"exon_alt_donor"
      middle_ex_coords[middle_ex_coords[,3]%in%ccdss_coords[,2] & middle_ex_coords[,1]!="overlapping_multiple_ccdss",1]<-"exon_alt_acceptor"
      middle_ex_coords[middle_ex_coords[,2]%in%ccdss_coords[,1] & middle_ex_coords[,1]=="overlapping_multiple_ccdss",1]<-"overlapping_multiple_ccdss_alt_donor"
      middle_ex_coords[middle_ex_coords[,3]%in%ccdss_coords[,2] & middle_ex_coords[,1]=="overlapping_multiple_ccdss",1]<-"overlapping_multiple_ccdss_alt_acceptor"
      annot_pos[middle_ex,]<-middle_ex_coords
    }

    if(sum(inside_ex)>0){
      middle_ex_coords[inside_ex & middle_ex_coords[,1]=="exon_alt_donor",1]<-"int_exon_alt_donor"
      middle_ex_coords[inside_ex & middle_ex_coords[,1]=="exon_alt_acceptor",1]<-"int_exon_alt_acceptor"
    }
    annot_pos[middle_ex,]<-middle_ex_coords



    # Every row before the first CCDS row becomes a 5' UTR exon. When the
    # first CCDS is row 1, 1:(ccdss[1]-1) is c(1, 0) and row 1 itself is
    # overwritten; the next statement restores the "ccds" label.
    annot_pos[1:(ccdss[1]-1),1]<-"5_utrs_ex"
    annot_pos[ccdss,1]<-"ccds"
    coords_start<-c(annot_pos[ccdss[1],2],annot_pos[ccdss[1],3])
    ccdss_start<-annot_pos[ccdss[1],]
    # Non-CCDS exons that span the whole first CCDS exon.
    five_with_cds<-which(annot_pos[,2]<=coords_start[1] & annot_pos[,3]>=coords_start[2])
    five_with_cds<-five_with_cds[!five_with_cds%in%ccdss]
    annot_pos[five_with_cds,1]<-"5_utrs_st"
    annot_pos_fiveutr<-annot_pos[five_with_cds,]
    # NOTE(review): seq(1, 0) is c(1, 0), so this loop also runs when
    # annot_pos_fiveutr has no rows - confirm that this case is harmless.
    for(f in seq(1,dim(annot_pos_fiveutr)[1])){
      annot_pos_fiveutr[f,"nt_more"]<-as.numeric(annot_pos_fiveutr[f,"length.x"]-ccdss_start[,"length.x"])
      annot_pos_fiveutr[f,"nt_more_ribocovered"]<-1-((annot_pos_fiveutr[f,"notcovered_ribo"]-ccdss_start[,"notcovered_ribo"])/annot_pos_fiveutr[f,"nt_more"])
      annot_pos_fiveutr[f,"nt_more_P_sites"]<-annot_pos_fiveutr[f,"P_sites_sum"]-ccdss_start[,"P_sites_sum"]
      annot_pos_fiveutr[f,"nt_more_rnacovered"]<-1-((annot_pos_fiveutr[f,"notcovered_rna"]-ccdss_start[,"notcovered_rna"])/annot_pos_fiveutr[f,"nt_more"])
      annot_pos_fiveutr[f,"nt_more_cent_sites"]<-annot_pos_fiveutr[f,"RNA_sites_sum"]-ccdss_start[,"RNA_sites_sum"]
      annot_pos_fiveutr[f,"overlapping_ccds_start"]<-ccdss_start[1,2]
      annot_pos_fiveutr[f,"overlapping_ccds_end"]<-ccdss_start[1,3]
    }
    annot_pos[five_with_cds,]<-annot_pos_fiveutr


    # Same for the rows after the last CCDS row. When the last CCDS is the
    # last row, (n+1):n is c(n+1, n): the assignment then targets row n+1 as
    # well; rows with NA "start" are dropped by the filter at the end.
    annot_pos[(1+(ccdss[length(ccdss)])):dim(annot_pos)[1],1]<-"3_utrs_ex"
    annot_pos[ccdss,1]<-"ccds"
    coords_stop<-c(annot_pos[tail(ccdss,1),2],annot_pos[tail(ccdss,1),3])
    ccdss_stop<-annot_pos[tail(ccdss,1),]
    # Non-CCDS exons that span the whole last CCDS exon.
    three_with_cds<-which(annot_pos[,2]<=coords_stop[1] & annot_pos[,3]>=coords_stop[2])
    three_with_cds<-three_with_cds[!three_with_cds%in%ccdss]
    annot_pos[three_with_cds,1]<-"3_utrs_st"
    annot_pos_threeutr<-annot_pos[three_with_cds,]
    for(f in seq(1,dim(annot_pos_threeutr)[1])){
      annot_pos_threeutr[f,"nt_more"]<-as.numeric(annot_pos_threeutr[f,"length.x"]-ccdss_stop[,"length.x"])
      annot_pos_threeutr[f,"nt_more_ribocovered"]<-1-((annot_pos_threeutr[f,"notcovered_ribo"]-ccdss_stop[,"notcovered_ribo"])/annot_pos_threeutr[f,"nt_more"])
      annot_pos_threeutr[f,"nt_more_P_sites"]<-annot_pos_threeutr[f,"P_sites_sum"]-ccdss_stop[,"P_sites_sum"]
      annot_pos_threeutr[f,"nt_more_rnacovered"]<-1-((annot_pos_threeutr[f,"notcovered_rna"]-ccdss_stop[,"notcovered_rna"])/annot_pos_threeutr[f,"nt_more"])
      annot_pos_threeutr[f,"nt_more_cent_sites"]<-annot_pos_threeutr[f,"RNA_sites_sum"]-ccdss_stop[,"RNA_sites_sum"]
      annot_pos_threeutr[f,"overlapping_ccds_start"]<-ccdss_stop[,"start"]
      annot_pos_threeutr[f,"overlapping_ccds_end"]<-ccdss_stop[,"end"]
    }
    annot_pos[three_with_cds,]<-annot_pos_threeutr

    # On the "-" strand, swap donor <-> acceptor and 5' <-> 3' labels. All
    # indices are collected first so that a label is not swapped twice.
    if(x$strand.x[1]=="-"){
      int_don<-which(annot_pos[,1]=="int_exon_alt_donor")
      int_acc<-which(annot_pos[,1]=="int_exon_alt_acceptor")
      don<-which(annot_pos[,1]=="exon_alt_donor")
      acc<-which(annot_pos[,1]=="exon_alt_acceptor")
      multi_don<-which(annot_pos[,1]=="overlapping_multiple_ccdss_alt_donor")
      multi_acc<-which(annot_pos[,1]=="overlapping_multiple_ccdss_alt_acceptor")
      fiveex<-which(annot_pos[,1]=="5_utrs_ex")
      fivest<-which(annot_pos[,1]=="5_utrs_st")
      threeex<-which(annot_pos[,1]=="3_utrs_ex")
      threest<-which(annot_pos[,1]=="3_utrs_st")
      annot_pos[don,1]<-"exon_alt_acceptor"
      annot_pos[acc,1]<-"exon_alt_donor"
      annot_pos[int_don,1]<-"int_exon_alt_acceptor"
      annot_pos[int_acc,1]<-"int_exon_alt_donor"
      annot_pos[multi_don,1]<-"overlapping_multiple_ccdss_alt_acceptor"
      annot_pos[multi_acc,1]<-"overlapping_multiple_ccdss_alt_donor"
      annot_pos[fiveex,1]<-"3_utrs_ex"
      annot_pos[fivest,1]<-"3_utrs_st"
      annot_pos[threeex,1]<-"5_utrs_ex"
      annot_pos[threest,1]<-"5_utrs_st"

    }
    # Drop rows without a start coordinate.
    annot_pos<-annot_pos[!is.na(annot_pos[,"start"]),]
  }


  x[,c("type","start","end","length.x","P_sites_sum","RNA_sites_sum","notcovered_ribo","notcovered_rna","nt_more","nt_more_ribocovered","nt_more_P_sites","nt_more_rnacovered","nt_more_cent_sites","overlapping_ccds_start","overlapping_ccds_end")]<-annot_pos
  x
}


### This function calculates periodicity on NON-CCDS region of an exons


alt_exon_analysis<-function(x,sequences=seq_exons,tracks_exons=all_tracks,index_tracks=tracks_index){


  exon<-x
  names_exons<-names(sequences)

  seq_exon<-sequences[which(names_exons%in%exon["coords2"])][[1]]
  myexon_id<-exon[,"exon_id"]
  exon_track<-tracks_exons[index_tracks==myexon_id]
  withsep<-strsplit(exon_track,split=" ")
  x<-t(data.frame(withsep))

  strand<-x[1,2]
  tracks_pre<-t(x[,-c(1:2)])

  if(strand=="-"){
tracks<-cbind(rev(tracks_pre[,1]),rev(tracks_pre[,2]),rev(tracks_pre[,3]),rev(tracks_pre[,4]),rev(tracks_pre[,5])) + } else if (strand=="+"){ + tracks<-tracks_pre} + colnames(tracks)<-c("Psites","RiboCov","RNACov","RNAcent","Seq") + + tracks<-tracks[,1:4] + + mode(tracks)<-"numeric" + length<-dim(tracks)[1] + exon$exon_id_noccds<-exon$exon_id + if(exon$type=="exon_alt_acceptor"){ + + tracks<-tracks[1:exon$nt_more,] + seq_exon<-seq_exon[1:(exon$nt_more)] + length<-exon$nt_more + if(strand=="+"){exon$end=exon$overlapping_ccds_start-1} + if(strand=="-"){exon$start=exon$overlapping_ccds_end+1} + exon$exon_id_noccds<-paste(exon$chr,exon$start,exon$end,exon$type,exon$gene_id,sep="_") + } + + + if(exon$type=="exon_alt_donor"){ + tracks<-tracks[(length+1-exon$nt_more):length,] + seq_exon<-seq_exon[(length+1-exon$nt_more):length] + length<-exon$nt_more + if(strand=="+"){exon$start=exon$overlapping_ccds_end+1} + if(strand=="-"){exon$end=exon$overlapping_ccds_start-1} + exon$exon_id_noccds<-paste(exon$chr,exon$start,exon$end,exon$type,exon$gene_id,sep="_") + } + + exon<-data.frame(exon_id=exon$exon_id_noccds,exon_id_orig=exon$exon_id,type=exon$type,gene_id=exon$gene_id,annotation=exon$annotation) + exon$strand<-strand + exon$length<-dim(tracks)[1] + exon$frame_start_pred<-NA + exon$frame_end_pred<-NA + + + length<-dim(tracks)[1] + + + P_sites_sum<-round(sum(tracks[,1]),digits=6) + exon$P_sites_sum<-P_sites_sum + + + Centered_sites_sum<-round(sum(tracks[,4]),digits=6) + exon$RNA_sites_sum<-Centered_sites_sum + exon$Ribocov_aver<-round(mean(tracks[,2]),digits=6) + exon$RNAseqcov_aver<-round(mean(tracks[,3]),digits=6) + exon$pctPhase_frame<-NA + exon$pctPhase_frame_1<-NA + exon$pctPhase_frame_2<-NA + exon$pctPhaseCentered_frame<-NA + exon$pctPhaseCentered_frame_1<-NA + exon$pctPhaseCentered_frame_2<-NA + if(length>2){ + Phase_P_sites_frame<-sum(tracks[seq(1,length,by=3),1]) + Phase_P_sites_frame_1<-sum(tracks[seq(2,length,by=3),1]) + 
Phase_P_sites_frame_2<-sum(tracks[seq(3,length,by=3),1]) + + + exon$pctPhase_frame<-Phase_P_sites_frame/P_sites_sum + exon$pctPhase_frame_1<-Phase_P_sites_frame_1/P_sites_sum + exon$pctPhase_frame_2<-Phase_P_sites_frame_2/P_sites_sum + + + Phase_Centered_sites_frame<-sum(tracks[seq(1,length,by=3),4]) + Phase_Centered_sites_frame_1<-sum(tracks[seq(2,length,by=3),4]) + Phase_Centered_sites_frame_2<-sum(tracks[seq(3,length,by=3),4]) + + + exon$pctPhaseCentered_frame<-Phase_Centered_sites_frame/Centered_sites_sum + exon$pctPhaseCentered_frame_1<-Phase_Centered_sites_frame_1/Centered_sites_sum + exon$pctPhaseCentered_frame_2<-Phase_Centered_sites_frame_2/Centered_sites_sum + + + MAXPhase_frame<-max(c(exon$pctPhase_frame,exon$pctPhase_frame_1,exon$pctPhase_frame_2)) + FRAME_MAX_phase<-max.col(t(c(exon$pctPhase_frame,exon$pctPhase_frame_1,exon$pctPhase_frame_2)))-1 + + MAXPhaseCentered_frame<-max(c(exon$pctPhaseCentered_frame,exon$pctPhaseCentered_frame_1,exon$pctPhaseCentered_frame_2)) + FRAME_MAX_phaseCentered<-max.col(t(c(exon$pctPhaseCentered_frame,exon$pctPhaseCentered_frame_1,exon$pctPhaseCentered_frame_2)))-1 + } + + + exon$multit_freq_best_ribo<-NA + exon$pval_multit_3nt_ribo<-NA + exon$spec_multit_3nt_ribo<-NA + exon$fft_max_freq_ribo<-NA + exon$fft_power_3_ribo<-NA + exon$fft_aver_ribo<-NA + exon$spec_max_freq_ribo<-NA + exon$spec_power_3_ribo<-NA + exon$spec_aver_power_ribo<-NA + + exon$multit_freq_best_rna<-NA + exon$pval_multit_3nt_rna<-NA + exon$spec_multit_3nt_rna<-NA + exon$fft_max_freq_rna<-NA + exon$fft_power_3_rna<-NA + exon$fft_aver_rna<-NA + exon$spec_max_freq_rna<-NA + exon$spec_power_3_rna<-NA + exon$spec_aver_power_rna<-NA + + exon$ORF_score_ribo<-NA + exon$ORF_score_rna<-NA + + if(P_sites_sum>2 & length>5){ + if(length<25){slepians<-dpss(n=length+(50-length),k=24,nw=12)} + if(length>=25){slepians<-dpss(n=length,k=24,nw=12)} + 
bestfreq_3ntpval_ribo<-take_freqs_Fvalues_all_around_3nt_spec(x=tracks[,1],n_tapers=24,time_bw=12,slepians_values=slepians)[c(1,6,7)] + exon$multit_freq_best_ribo<-bestfreq_3ntpval_ribo[1] + exon$pval_multit_3nt_ribo<-bestfreq_3ntpval_ribo[2] + exon$spec_multit_3nt_ribo<-bestfreq_3ntpval_ribo[3] + score1<-((Phase_P_sites_frame-P_sites_sum/3)^2)/(P_sites_sum/3) + score2<-((Phase_P_sites_frame_1-P_sites_sum/3)^2)/(P_sites_sum/3) + score3<-((Phase_P_sites_frame_2-P_sites_sum/3)^2)/(P_sites_sum/3) + exon$ORF_score_ribo<-log2(score1+score2+score3+1) + + + gino<-getFFTFreqs(Nyq.Freq=0.5,data=tracks[,1]) + modFFT <- Mod(fft(tracks[,1])) + FFTdata <- cbind(gino, modFFT) + exon$fft_aver_ribo<-mean(FFTdata[,2]) + exon$fft_power_3_ribo<-FFTdata[which(abs((gino-(1/3)))==min(abs((gino-(1/3))))),2] + exon$fft_max_freq_ribo<-abs(gino[which(FFTdata==max((FFTdata[10:dim(FFTdata)[1]/2,2])),arr.ind=TRUE)[1]])[1] + + + spect_P_sites<-spectrum(tracks[,1],plot=FALSE) + exon$spec_max_freq_ribo<-spect_P_sites$freq[which(spect_P_sites$spec==max(spect_P_sites$spec),arr.ind=TRUE)][1] + exon$spec_power_3_ribo<-spect_P_sites$spec[which(abs((spect_P_sites$freq-(1/3)))==min(abs((spect_P_sites$freq-(1/3)))))] + exon$spec_aver_power_ribo<-mean(spect_P_sites$spec) + if(Centered_sites_sum>2){ + + gino<-getFFTFreqs(Nyq.Freq=0.5,data=tracks[,4]) + modFFT <- Mod(fft(tracks[,4])) + FFTdata <- cbind(gino, modFFT) + exon$fft_aver_rna<-mean(FFTdata[,2]) + exon$fft_power_3_rna<-FFTdata[which(abs((gino-(1/3)))==min(abs((gino-(1/3))))),2] + exon$fft_max_freq_rna<-1/abs(gino[which(FFTdata==max((FFTdata[10:dim(FFTdata)[1]/2,2])),arr.ind=TRUE)[1]])[1] + + + + spect_rna<-spectrum(tracks[,4],plot=FALSE) + exon$spec_max_freq_rna<-spect_rna$freq[which(spect_rna$spec==max(spect_rna$spec),arr.ind=TRUE)][1] + exon$spec_power_3_rna<-spect_rna$spec[which(abs((spect_rna$freq-(1/3)))==min(abs((spect_rna$freq-(1/3)))))] + exon$spec_aver_power_rna<-mean(spect_rna$spec) + 
bestfreq_3ntpval_rna<-take_freqs_Fvalues_all_around_3nt_spec(x=tracks[,4],n_tapers=24,time_bw=12,slepians_values=slepians)[c(1,6,7)] + + exon$multit_freq_best_rna<-bestfreq_3ntpval_rna[1] + exon$pval_multit_3nt_rna<-bestfreq_3ntpval_rna[2] + exon$spec_multit_3nt_rna<-bestfreq_3ntpval_rna[3] + + score_rna_1<-((Phase_Centered_sites_frame-Centered_sites_sum/3)^2)/(Centered_sites_sum/3) + score_rna_2<-((Phase_Centered_sites_frame_1-Centered_sites_sum/3)^2)/(Centered_sites_sum/3) + score_rna_3<-((Phase_Centered_sites_frame_2-Centered_sites_sum/3)^2)/(Centered_sites_sum/3) + exon$ORF_score_rna<-log2(score_rna_1+score_rna_2+score_rna_3+1) + } + + + + } + + + exon$chisq_ribo<-NA + exon$chisq_rna<-NA + + + if(P_sites_sum>15 & length>5){ + exon$chisq_ribo<-chisq.test(as.table(c(Phase_P_sites_frame,Phase_P_sites_frame_1,Phase_P_sites_frame_2)))$p.value} + if(P_sites_sum<16 & P_sites_sum>0 & length>5){ + exon$chisq_ribo<-xmulti(obs=c(Phase_P_sites_frame,Phase_P_sites_frame_1,Phase_P_sites_frame_2),expr=c(1,1,1),statName="Prob",detail=0)$pProb + } + + + if(Centered_sites_sum>15 & length>5){ + exon$chisq_rna<-chisq.test(as.table(c(Phase_Centered_sites_frame,Phase_Centered_sites_frame_1,Phase_Centered_sites_frame_2)))$p.value} + if(Centered_sites_sum<16 & Centered_sites_sum>0 & length>5){ + exon$chisq_rna<-xmulti(obs=c(Phase_Centered_sites_frame,Phase_Centered_sites_frame_1,Phase_Centered_sites_frame_2),expr=c(1,1,1),statName="Prob",detail=0)$pProb + } + + + exon$max_notcov_ribo<-max((!tracks[,2]) * unlist(lapply(rle(tracks[,2])$lengths, seq_len))) + exon$coords_notcov_ribo<-max.col(t((!tracks[,2]) * unlist(lapply(rle(tracks[,2])$lengths, seq_len))))-max((!tracks[,2]) * unlist(lapply(rle(tracks[,2])$lengths, seq_len))) + + exon$max_notcov_rna<-max((!tracks[,3]) * unlist(lapply(rle(tracks[,3])$lengths, seq_len))) + exon$coords_notcov_rna<-max.col(t((!tracks[,3]) * unlist(lapply(rle(tracks[,3])$lengths, seq_len))))-max((!tracks[,3]) * unlist(lapply(rle(tracks[,3])$lengths, 
seq_len))) + + + + if(strand=="-"){ + exon$coords_notcov_ribo<-length-exon$coords_notcov_ribo + exon$coords_notcov_rna<-length-exon$coords_notcov_rna + } + + if(exon$max_notcov_ribo==0){ + exon$max_notcov_ribo<-"NA" + } + + if(exon$max_notcov_rna==0){ + exon$max_notcov_rna<-"NA" + } + + exon$notcovered_ribo<-sum(tracks[,2] == 0) + exon$notcovered_rna<-sum(tracks[,3] == 0) + + + + + if(length>2){ + exon$frame_start_pred<-FRAME_MAX_phase + exon$frame_end_pred<-(length-(FRAME_MAX_phase+1))%%3 + } + if(x[1,2]=="-" & length>2){ + + exon$frame_end_pred<-FRAME_MAX_phase + exon$frame_start_pred<-(length-(FRAME_MAX_phase+1))%%3 + } + + pept<-NA + exon$transl_pept_notccds<-NA + if(P_sites_sum>0){ + if(exon$strand=="-"){ + pept<-unlist(getTrans(seq_exon,sens="F",frame=exon$frame_end_pred)) + } else {pept<-unlist(getTrans(seq_exon,sens="F",frame=exon$frame_start_pred))} + exon$transl_pept_notccds<-paste(pept,sep="",collapse="") + } + + return(exon) +} + + + +### This function calculates coherence values for candidate regions with multi-frame translation + + +calculate_coherence<-function(x){ + strand<-x[1,2] + tracks_pre<-t(x[,-c(1:2)]) + + + if(strand=="-"){ + tracks<-cbind(rev(tracks_pre[,1]),rev(tracks_pre[,2]),rev(tracks_pre[,3]),rev(tracks_pre[,4])) + } else if (strand=="+"){ + tracks<-tracks_pre} + colnames(tracks)<-c("Psites","RiboCov","RNACov","RNAcent") + mode(tracks)<-"numeric" + + exon<-data.frame(exon_id=x[1,1],stringsAsFactors=F,row.names=NULL) + exon$strand<-strand + exon$frame_start_pred<-NA + exon$frame_end_pred<-NA + + exon$length<-dim(tracks)[1] + length<-dim(tracks)[1] + + + P_sites_sum<-round(sum(tracks[,1]),digits=6) + exon$P_sites_sum<-P_sites_sum + + + Centered_sites_sum<-round(sum(tracks[,4]),digits=6) + exon$RNA_sites_sum<-Centered_sites_sum + exon$Ribocov_aver<-round(mean(tracks[,2]),digits=6) + exon$RNAseqcov_aver<-round(mean(tracks[,3]),digits=6) + exon$pctPhase_frame<-NA + exon$pctPhase_frame_1<-NA + exon$pctPhase_frame_2<-NA + 
exon$pctPhaseCentered_frame<-NA + exon$pctPhaseCentered_frame_1<-NA + exon$pctPhaseCentered_frame_2<-NA + if(length>2){ + Phase_P_sites_frame<-sum(tracks[seq(1,length,by=3),1]) + Phase_P_sites_frame_1<-sum(tracks[seq(2,length,by=3),1]) + Phase_P_sites_frame_2<-sum(tracks[seq(3,length,by=3),1]) + + + exon$pctPhase_frame<-Phase_P_sites_frame/P_sites_sum + exon$pctPhase_frame_1<-Phase_P_sites_frame_1/P_sites_sum + exon$pctPhase_frame_2<-Phase_P_sites_frame_2/P_sites_sum + + + Phase_Centered_sites_frame<-sum(tracks[seq(1,length,by=3),4]) + Phase_Centered_sites_frame_1<-sum(tracks[seq(2,length,by=3),4]) + Phase_Centered_sites_frame_2<-sum(tracks[seq(3,length,by=3),4]) + + + exon$pctPhaseCentered_frame<-Phase_Centered_sites_frame/Centered_sites_sum + exon$pctPhaseCentered_frame_1<-Phase_Centered_sites_frame_1/Centered_sites_sum + exon$pctPhaseCentered_frame_2<-Phase_Centered_sites_frame_2/Centered_sites_sum + + + MAXPhase_frame<-max(c(exon$pctPhase_frame,exon$pctPhase_frame_1,exon$pctPhase_frame_2)) + FRAME_MAX_phase<-max.col(t(c(exon$pctPhase_frame,exon$pctPhase_frame_1,exon$pctPhase_frame_2)))-1 + + MAXPhaseCentered_frame<-max(c(exon$pctPhaseCentered_frame,exon$pctPhaseCentered_frame_1,exon$pctPhaseCentered_frame_2)) + FRAME_MAX_phaseCentered<-max.col(t(c(exon$pctPhaseCentered_frame,exon$pctPhaseCentered_frame_1,exon$pctPhaseCentered_frame_2)))-1 + } + + + exon$multit_freq_best_ribo<-NA + exon$pval_multit_3nt_ribo<-NA + exon$spec_multit_3nt_ribo<-NA + + exon$coherence_1_2_ribo<-NA + exon$coherence_1_3_ribo<-NA + exon$coherence_2_3_ribo<-NA + exon$min_coherence_ribo<-NA + exon$multit_freq_best_rna<-NA + exon$pval_multit_3nt_rna<-NA + exon$spec_multit_3nt_rna<-NA + exon$coherence_1_2_rna<-NA + exon$coherence_1_3_rna<-NA + exon$coherence_2_3_rna<-NA + exon$min_coherence_rna<-NA + if(P_sites_sum>10 & length>5){ + + if(length<25){slepians<-dpss(n=length+(50-length),k=24,nw=12)} + if(length>=25){slepians<-dpss(n=length,k=24,nw=12)} + 
bestfreq_3ntpval_ribo<-take_freqs_Fvalues_all_around_3nt_spec(x=tracks[,1],n_tapers=24,time_bw=12,slepians_values=slepians)[c(1,6,5,7)] + exon$multit_freq_best_ribo<-bestfreq_3ntpval_ribo[1] + exon$pval_multit_3nt_ribo<-bestfreq_3ntpval_ribo[2] + exon$spec_multit_3nt_ribo<-bestfreq_3ntpval_ribo[4] + + y<-tracks[,1] + + if(length(y)<25){ + remain<-50-length(y) + y<-c(rep(0,as.integer(remain/2)),y,rep(0,remain%%2+as.integer(remain/2))) + } + if(length(y)<1024/2){padding<-1024} + if(length(y)>=1024/2){padding<-"default"} + length<-length(y) + + y1<-rep(0,length) + y2<-rep(0,length) + y3<-rep(0,length) + + y1[seq(1,length,by=3)]<-y[seq(1,length,by=3)] + y2[seq(2,length,by=3)]<-y[seq(2,length,by=3)] + y3[seq(3,length,by=3)]<-y[seq(3,length,by=3)] + + + + spec_y1<-spec.mtm(timeSeries=as.ts(y1),nw=12,k=24,dpssIN=slepians,returnInternals=T,plot=F,nFFT=padding) + spec_y2<-spec.mtm(timeSeries=as.ts(y2),nw=12,k=24,dpssIN=slepians,returnInternals=T,plot=F,nFFT=padding) + spec_y3<-spec.mtm(timeSeries=as.ts(y3),nw=12,k=24,dpssIN=slepians,returnInternals=T,plot=F,nFFT=padding) + + coh1_2<-mtm.coh(spec_y1,spec_y2,plot=F) + coh1_3<-mtm.coh(spec_y1,spec_y3,plot=F) + coh2_3<-mtm.coh(spec_y2,spec_y3,plot=F) + + exon$coherence_1_2_ribo<-coh1_2$msc[which(coh1_2$freq==bestfreq_3ntpval_ribo[3])] + exon$coherence_1_3_ribo<-coh1_3$msc[which(coh1_3$freq==bestfreq_3ntpval_ribo[3])] + exon$coherence_2_3_ribo<-coh2_3$msc[which(coh2_3$freq==bestfreq_3ntpval_ribo[3])] + exon$min_coherence_ribo<-min(c(exon$coherence_1_2_ribo,exon$coherence_1_3_ribo,exon$coherence_2_3_ribo),na.rm=T) + if((Phase_Centered_sites_frame > 5 & Phase_Centered_sites_frame_1 > 5) | (Phase_Centered_sites_frame > 5 & Phase_Centered_sites_frame_2 > 5) | (Phase_Centered_sites_frame_1 > 5 & Phase_Centered_sites_frame_2 > 5)){ + + bestfreq_3ntpval_rna<-take_freqs_Fvalues_all_around_3nt_spec(x=tracks[,4],n_tapers=24,time_bw=12,slepians_values=slepians)[c(1,6,5,7)] + exon$multit_freq_best_rna<-bestfreq_3ntpval_rna[1] + 
# ---- remainder of calculate_coherence() (the function starts on an earlier line); code unchanged ----
      exon$pval_multit_3nt_rna<-bestfreq_3ntpval_rna[2]
      exon$spec_multit_3nt_rna<-bestfreq_3ntpval_rna[4]

      y<-tracks[,4]

      if(length(y)<25){
        remain<-50-length(y)
        y<-c(rep(0,as.integer(remain/2)),y,rep(0,remain%%2+as.integer(remain/2)))
      }
      if(length(y)<1024/2){padding<-1024}
      if(length(y)>=1024/2){padding<-"default"}
      length<-length(y)

      y1<-rep(0,length)
      y2<-rep(0,length)
      y3<-rep(0,length)

      y1[seq(1,length,by=3)]<-y[seq(1,length,by=3)]
      y2[seq(2,length,by=3)]<-y[seq(2,length,by=3)]
      y3[seq(3,length,by=3)]<-y[seq(3,length,by=3)]



      spec_y1<-spec.mtm(timeSeries=as.ts(y1),nw=12,k=24,dpssIN=slepians,returnInternals=T,plot=F,nFFT=padding)
      spec_y2<-spec.mtm(timeSeries=as.ts(y2),nw=12,k=24,dpssIN=slepians,returnInternals=T,plot=F,nFFT=padding)
      spec_y3<-spec.mtm(timeSeries=as.ts(y3),nw=12,k=24,dpssIN=slepians,returnInternals=T,plot=F,nFFT=padding)

      coh1_2<-mtm.coh(spec_y1,spec_y2,plot=F)
      coh1_3<-mtm.coh(spec_y1,spec_y3,plot=F)
      coh2_3<-mtm.coh(spec_y2,spec_y3,plot=F)

      exon$coherence_1_2_rna<-coh1_2$msc[which(coh1_2$freq==bestfreq_3ntpval_rna[3])]
      exon$coherence_1_3_rna<-coh1_3$msc[which(coh1_3$freq==bestfreq_3ntpval_rna[3])]
      exon$coherence_2_3_rna<-coh2_3$msc[which(coh2_3$freq==bestfreq_3ntpval_rna[3])]
      exon$min_coherence_rna<-min(c(exon$coherence_1_2_rna,exon$coherence_1_3_rna,exon$coherence_2_3_rna),na.rm=T)

    }
  }
  exon
}


### Shared worker for pre_multi_nonCCDS_ORFs() and pre_multi_CCDS_ORFs().
###
### Locates an ORF on the ordered exons of its transcript and derives the
### identifiers used downstream for multimapping / CDS-overlap checks.
###
### x                      one-row data frame describing the ORF; the columns read
###                        here are start_pos, st2vect, ORF_length, strand, gene_id
### transcr                transcript id(s) used to build ORF_id_tr
### exons_in_transcr_data  exons of the transcript, already ordered the way the
###                        caller wants them walked (reversed for "-" ORFs), with
###                        columns chr, start, end, length.x and exon_id
### region_label           tag pasted into the to_check ids ("EXONnonCCDS"/"CCDS")
###
### Returns x with four extra columns: to_check, to_check_rem, ORF_id_tr, ORF_id_gen.
.map_orf_to_exons <- function(x, transcr, exons_in_transcr_data, region_label) {
  orf_start <- x$start_pos
  orf_end <- x$st2vect
  orf_strand <- x$strand

  # Previously an unknown strand produced NA coordinates (single exon) or an
  # "object 'to_check' not found" error (several exons); fail with a clear message.
  if (!orf_strand %in% c("+", "-")) {
    stop("ORF strand must be \"+\" or \"-\"", call. = FALSE)
  }
  on_plus <- orf_strand == "+"

  # ORF start/end are compared against cumulative exon lengths to find the
  # exon holding the ORF start and the exon holding the ORF end.
  cumsumexons <- cumsum(exons_in_transcr_data$length.x)
  st_ex <- which((cumsumexons - orf_start) == min(cumsumexons[cumsumexons > orf_start] - orf_start))
  end_ex <- which((cumsumexons - orf_end) == min(cumsumexons[cumsumexons >= orf_end] - orf_end))
  same_exon <- st_ex == end_ex

  # Exons strictly between the first and the last ORF exon.
  in_betw_ex <- st_ex:end_ex
  in_betw_ex <- in_betw_ex[!(in_betw_ex %in% c(st_ex, end_ex))]
  exon_inbetween_data <- exons_in_transcr_data[in_betw_ex, ]

  # Offset of the ORF start inside its exon.
  rem_len <- 0
  if (st_ex > 1) {
    rem_len <- cumsumexons[st_ex - 1]
  }
  offset_in_exon <- orf_start - rem_len

  # Coordinate of the ORF start and number of ORF nucleotides between it and
  # the far edge of the first exon (direction depends on the strand).
  if (on_plus) {
    coord_start <- exons_in_transcr_data[st_ex, "start"] + offset_in_exon
    nt_in_first_exon <- exons_in_transcr_data[st_ex, "end"] - coord_start
  } else {
    coord_start <- exons_in_transcr_data[st_ex, "end"] - offset_in_exon
    nt_in_first_exon <- coord_start - exons_in_transcr_data[st_ex, "start"]
  }

  # Nucleotides of the ORF consumed before reaching the last exon.
  if (length(in_betw_ex) > 0) {
    nt_to_rem <- nt_in_first_exon + sum(exons_in_transcr_data[in_betw_ex, "length.x"])
  } else if (same_exon) {
    nt_to_rem <- 0
  } else {
    nt_to_rem <- nt_in_first_exon
  }

  if (same_exon) {
    if (on_plus) {
      coord_end <- coord_start + x$ORF_length + 1
    } else {
      coord_end <- coord_start - x$ORF_length + 1
    }
  } else if (on_plus) {
    coord_end <- exons_in_transcr_data[end_ex, "start"] + (x$ORF_length - nt_to_rem) + 1
  } else {
    coord_end <- exons_in_transcr_data[end_ex, "end"] - (x$ORF_length - nt_to_rem) + 1
  }

  # On the minus strand the two coordinates are swapped so that
  # coord_start is the smaller one from here on.
  if (!on_plus) {
    swapped <- coord_start
    coord_start <- coord_end
    coord_end <- swapped
  }

  make_id <- function(exon_row, from, to) {
    paste(exons_in_transcr_data[exon_row, "chr"], from, to, region_label, x$gene_id, x$strand, sep = "_")
  }

  if (same_exon) {
    to_check <- make_id(st_ex, coord_start, coord_end)
  } else if (on_plus) {
    to_check <- paste(
      make_id(st_ex, coord_start, exons_in_transcr_data[st_ex, "end"]),
      make_id(end_ex, exons_in_transcr_data[end_ex, "start"], coord_end),
      sep = ";"
    )
  } else {
    to_check <- paste(
      make_id(st_ex, exons_in_transcr_data[st_ex, "start"], coord_end),
      make_id(end_ex, coord_start, exons_in_transcr_data[end_ex, "end"]),
      sep = ";"
    )
  }

  x$to_check <- to_check
  x$to_check_rem <- NA
  if (length(in_betw_ex) > 0) {
    # Exons fully spanned by the ORF are reported by id.
    x$to_check_rem <- paste(exon_inbetween_data$exon_id, collapse = ";")
  }
  x$ORF_id_tr <- paste(transcr, orf_start, orf_end, sep = "_")
  x$ORF_id_gen <- paste(exons_in_transcr_data[st_ex, "chr"], coord_start, coord_end, sep = "_")
  x
}


### This function calculates exonic information on nonCCDS ORFs, to calculate multimapping information and CDS overlaps
###
### x        one-row data frame for the ORF (transcript_id, strand, start_pos,
###          st2vect, ORF_length, gene_id)
### counter  not used
### all_exons_in_the_sign_transcr  exon table; column 4 is matched against the
###          transcript id and column "exon_id" gives the exons of the transcript
### signif_exons  kept for interface compatibility; NOT used. As in the original
###          code, exon data are looked up (by exon_id) in the global `nonccds_res`.
###
### Returns x with columns to_check, to_check_rem, ORF_id_tr and ORF_id_gen added.
pre_multi_nonCCDS_ORFs<-function(x,counter,all_exons_in_the_sign_transcr=exons_transcr_nonccds_sign,signif_exons=nonccds_res_sign){
  transcr <- x[, "transcript_id"]

  exons_in_transcr <- all_exons_in_the_sign_transcr[all_exons_in_the_sign_transcr[, 4] %in% transcr, "exon_id"]
  if (x$strand == "-") {
    exons_in_transcr <- rev(exons_in_transcr)
  }

  exons_in_transcr_data <- nonccds_res[nonccds_res[, "exon_id"] %in% exons_in_transcr, ]
  exons_in_transcr_data <- exons_in_transcr_data[match(exons_in_transcr, exons_in_transcr_data$exon_id), ]

  .map_orf_to_exons(x, transcr, exons_in_transcr_data, "EXONnonCCDS")
}


### This function calculates exonic information on CCDS ORFs, to calculate multimapping information and CDS overlaps
###
### Same contract as pre_multi_nonCCDS_ORFs(), except that the exons of the
### transcript are taken from column "coords_id" of all_exons_in_the_sign_transcr
### and looked up in `signif_exons` through its "coords" column, and the ids in
### to_check carry the "CCDS" tag.
pre_multi_CCDS_ORFs<-function(x,counter,all_exons_in_the_sign_transcr=exons_transcr_nonccds_sign,signif_exons=nonccds_res){
  transcr <- x[, "transcript_id"]

  exons_in_transcr <- all_exons_in_the_sign_transcr[all_exons_in_the_sign_transcr[, 4] %in% transcr, "coords_id"]
  if (x$strand == "-") {
    exons_in_transcr <- rev(exons_in_transcr)
  }

  exons_in_transcr_data <- signif_exons[signif_exons[, "coords"] %in% exons_in_transcr, ]
  exons_in_transcr_data <- exons_in_transcr_data[match(exons_in_transcr, exons_in_transcr_data$coords), ]

  .map_orf_to_exons(x, transcr, exons_in_transcr_data, "CCDS")
}




### This function calculates results for real and simulated exons for the multitaper analysis


# ---- first part of take_simuls_multi() (the function continues on the following line); code unchanged ----
take_simuls_multi<-function(x,tapers,bw,nsimul){
  unique_ex_id<-x[,"exon_id"]
  list_exons_tracks<-list()
  for(i in seq(1:length(unique_ex_id))){
    list_exons_tracks[[i]]<-all_tracks[index==unique_ex_id[i]]
  }
  simuls_eachexons<-list()
  for(s in 1:length(unique_ex_id)){
    withsep<-strsplit(list_exons_tracks[[s]],split=" ")
    x<-t(data.frame(withsep))
    id<-unique_ex_id[s]
    exon<-data.frame(exon_id=id,stringsAsFactors=F,row.names=NULL)
    strand<-x[1,2]
    tracks_pre<-t(x[,-c(1:2)])
    if(strand=="-"){
      tracks<-cbind(rev(tracks_pre[,1]),rev(tracks_pre[,2]),rev(tracks_pre[,3]),rev(tracks_pre[,4]))
    } else if (strand=="+"){
      tracks<-tracks_pre}
    colnames(tracks)<-c("Psites","RiboCov","RNACov","RNAcent")
    mode(tracks)<-"numeric"
    length<-dim(tracks)[1]
    exon$length<-length

    if(length<25){
      slepians<-dpss(n=length+(50-length),k=tapers,nw=bw)
    }
# ---- remainder of take_simuls_multi() (the function starts on the preceding line); code unchanged ----
    if(length>=25){
      slepians<-dpss(n=length,k=tapers,nw=bw)
    }

    exon$pval_multi_ribo<-take_freqs_Fvalues_all_around_3nt_spec(x=tracks[,1],n_tapers=tapers,time_bw=bw,slepians_values=slepians)[6]
    ribo_covered_pos<-which(tracks[,2]>0)
    P_sites_sum<-sum(tracks[,1])
    exon$P_sites_sum<-P_sites_sum
    exon$RNA_sites_sum<-sum(tracks[,4])
    Phase_P_sites_frame<-sum(tracks[seq(1,length,by=3),1])
    Phase_P_sites_frame_1<-sum(tracks[seq(2,length,by=3),1])
    Phase_P_sites_frame_2<-sum(tracks[seq(3,length,by=3),1])
    score1<-((Phase_P_sites_frame-P_sites_sum/3)^2)/(P_sites_sum/3)
    score2<-((Phase_P_sites_frame_1-P_sites_sum/3)^2)/(P_sites_sum/3)
    score3<-((Phase_P_sites_frame_2-P_sites_sum/3)^2)/(P_sites_sum/3)

    simuls_results<-foreach(j=1:nsimul,.combine=c,.multicombine=T) %dopar%{
      set.seed(j)
      simtrack<-rep(0,length)
      rand_pos<-sample(ribo_covered_pos,P_sites_sum,replace=T)
      for(i in rand_pos){
        simtrack[i]<-simtrack[i]+1
      }

      simul_Pval_multi_3nt<-take_freqs_Fvalues_all_around_3nt_spec(x=simtrack,n_tapers=tapers,time_bw=bw,slepians_values=slepians)[6]

      return(simul_Pval_multi_3nt)
    }
    exon$n_simul_sign_multi<-sum(simuls_results<0.05)
    exon$pct_simul_sign_multi<-sum(simuls_results<0.05)/length(simuls_results)
    simuls_eachexons[[s]]<-exon

  }
  results_simuls<-do.call(args=simuls_eachexons,what=rbind.data.frame)
  results_simuls
}


### This function calculates results for real and simulated exons, for Chi-square and ORFscore
###
### For every exon id in x[,"exon_id"] the track lines are fetched from the
### globals `all_tracks` / `index`, the P-site counts per frame are tested
### (chisq.test() above 15 P-sites, xmulti() from 1 to 15) and an ORF score is
### computed. The same statistics are then recomputed on `nsimul` simulated
### tracks in which the P-sites are redistributed at random (seed = simulation
### number) over the positions that have Ribo-seq coverage.
###
### x                data frame with an "exon_id" column
### nsimul           number of simulated tracks per exon
### cutoff_ORFscore  ORF score above which a simulation counts as significant
###
### Returns one row per exon with columns exon_id, ORF_score, chisq,
### n_simul_sign_Chiq, n_simul_sign_ORFscore, pct_simul_sign_Chiq and
### pct_simul_sign_ORFscore.
take_simuls_chisq_ORFscore<-function(x,nsimul,cutoff_ORFscore=quantile85_ORFscore){
  unique_ex_id <- x[, "exon_id"]
  simuls_eachexons <- vector("list", length(unique_ex_id))

  for (s in seq_along(unique_ex_id)) {
    id <- unique_ex_id[s]

    # Each track line is space separated; field 2 holds the strand and the
    # fields after it the per-position values.
    fields <- t(data.frame(strsplit(all_tracks[index == id], split = " ")))
    exon <- data.frame(exon_id = id, stringsAsFactors = FALSE, row.names = NULL)
    strand <- fields[1, 2]
    tracks_pre <- t(fields[, -c(1:2)])
    if (strand == "-") {
      # Minus-strand tracks are reversed before the frames are counted.
      tracks <- cbind(rev(tracks_pre[, 1]), rev(tracks_pre[, 2]), rev(tracks_pre[, 3]), rev(tracks_pre[, 4]))
    } else if (strand == "+") {
      tracks <- tracks_pre
    }
    colnames(tracks) <- c("Psites", "RiboCov", "RNACov", "RNAcent")
    mode(tracks) <- "numeric"
    n_pos <- dim(tracks)[1]

    # Positions belonging to each of the three frames.
    frame_pos_0 <- seq(1, n_pos, by = 3)
    frame_pos_1 <- seq(2, n_pos, by = 3)
    frame_pos_2 <- seq(3, n_pos, by = 3)

    ribo_covered_pos <- which(tracks[, 2] > 0)
    P_sites_sum <- sum(tracks[, 1])
    expected_per_frame <- P_sites_sum / 3

    observed <- c(sum(tracks[frame_pos_0, 1]), sum(tracks[frame_pos_1, 1]), sum(tracks[frame_pos_2, 1]))
    frame_scores <- ((observed - expected_per_frame)^2) / expected_per_frame
    exon$ORF_score <- log2(frame_scores[1] + frame_scores[2] + frame_scores[3] + 1)

    # NA when the exon has no P-sites; previously the column was simply
    # missing in that case, which broke the final rbind.
    chisq_pvalue <- NA_real_
    if (P_sites_sum > 15) {
      chisq_pvalue <- chisq.test(as.table(observed))$p.value
    }
    if (P_sites_sum < 16 && P_sites_sum > 0) {
      chisq_pvalue <- xmulti(obs = observed, expr = c(1, 1, 1), statName = "Prob", detail = 0)$pProb
    }
    exon$chisq <- chisq_pvalue

    simuls_results <- foreach(j = seq_len(nsimul), .combine = rbind, .multicombine = TRUE) %dopar% {
      set.seed(j)
      # Index through sample.int() so that a single covered position k is
      # drawn as k itself; sample(k, ...) would draw from 1:k instead.
      rand_pos <- ribo_covered_pos[sample.int(length(ribo_covered_pos), P_sites_sum, replace = TRUE)]
      simtrack <- tabulate(rand_pos, nbins = n_pos)

      sim_observed <- c(sum(simtrack[frame_pos_0]), sum(simtrack[frame_pos_1]), sum(simtrack[frame_pos_2]))
      sim_scores <- ((sim_observed - expected_per_frame)^2) / expected_per_frame
      simul_ORF_score <- log2(sim_scores[1] + sim_scores[2] + sim_scores[3] + 1)

      simul_Chisq <- NA_real_
      if (P_sites_sum > 15) {
        simul_Chisq <- chisq.test(as.table(sim_observed))$p.value
      }
      if (P_sites_sum < 16 && P_sites_sum > 0) {
        simul_Chisq <- xmulti(obs = sim_observed, expr = c(1, 1, 1), statName = "Prob", detail = 0)$pProb
      }
      c(simul_Chisq, simul_ORF_score)
    }
    # Keep a two-column matrix even if a single simulation comes back as a
    # plain vector.
    simuls_results <- matrix(simuls_results, ncol = 2)
    colnames(simuls_results) <- c("simul_Chisq", "simul_ORF_score")

    n_sign_chisq <- sum(simuls_results[, "simul_Chisq"] < 0.05)
    n_sign_orfscore <- sum(simuls_results[, "simul_ORF_score"] > cutoff_ORFscore)
    exon$n_simul_sign_Chiq <- n_sign_chisq
    exon$n_simul_sign_ORFscore <- n_sign_orfscore
    exon$pct_simul_sign_Chiq <- n_sign_chisq / dim(simuls_results)[1]
    # Same cutoff as the count above (this used to be a hard-coded 6).
    exon$pct_simul_sign_ORFscore <- n_sign_orfscore / dim(simuls_results)[1]
    simuls_eachexons[[s]] <- exon
  }

  results_simuls <- do.call(args = simuls_eachexons, what = rbind.data.frame)
  results_simuls
}


# Multiple plot function, from http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_%28ggplot2%29/
#
# ggplot objects can be passed in ..., or to plotlist (as a list of ggplot objects)
# - cols: Number of columns in layout
# - layout: A matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
multiplot <- function(..., plotlist=NULL, file, cols=1, layout=NULL) {
  require(grid)

  # Make a list from the ...
arguments and plotlist + plots <- c(list(...), plotlist) + + numPlots = length(plots) + + # If layout is NULL, then use 'cols' to determine layout + if (is.null(layout)) { + # Make the panel + # ncol: Number of columns of plots + # nrow: Number of rows needed, calculated from # of cols + layout <- matrix(seq(1, cols * ceiling(numPlots/cols)), + ncol = cols, nrow = ceiling(numPlots/cols)) + } + + if (numPlots==1) { + print(plots[[1]]) + + } else { + # Set up the page + grid.newpage() + pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout)))) + + # Make each plot, in the correct location + for (i in 1:numPlots) { + # Get the i,j matrix positions of the regions that contain this subplot + matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE)) + + print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row, + layout.pos.col = matchidx$col)) + } + } +} diff --git a/scripts/genes_coor.R b/scripts/genes_coor.R new file mode 100755 index 0000000..ad2f02f --- /dev/null +++ b/scripts/genes_coor.R @@ -0,0 +1,14 @@ +#!/usr/bin/Rscript + +all_ex<-read.table("all_exons.bed",stringsAsFactors=F,header=F) +spli<-split.data.frame(all_ex,f=all_ex$V5) + +spli2<-lapply(spli,FUN=function(x){ + minc<-min(x[,2]) + maxc<-max(x[,3]) + data.frame(chr=x[1,1],start=minc,end=maxc,le=maxc-minc,gene_id=x[1,5],strand=x[1,6],stringsAsFactors=F) +}) +spli3<-do.call(what=rbind.data.frame,args=spli2) +write.table(file="genes_start_end",x=spli3,col.names=F,row.names=F,quote=F,sep="\t") +system("sort -k1,1 -k2,2n genes_start_end > genes_start_end.bed ") +system("rm genes_start_end") \ No newline at end of file diff --git a/scripts/gtf_to_start_stop_tr.R b/scripts/gtf_to_start_stop_tr.R new file mode 100755 index 0000000..5fa320f --- /dev/null +++ b/scripts/gtf_to_start_stop_tr.R @@ -0,0 +1,93 @@ +#!/usr/bin/Rscript + +################################################################### +# This file is part of RiboTaper. 
+# RiboTaper is a method for defining traslated ORFs using +# Ribosome Profiling data. +# +# Copyright (C) 2015 Lorenzo Calviello +# +# RiboTaper is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# RiboTaper is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with RiboTaper. If not, see . +# +# Contact: Lorenzo.Calviello@mdc-berlin.de +####################################################################### + + +###script for creating transcript-level coordinates of CDS positions (start and stop) from a .gtf file + +print(paste("--- extracting transcript-level CDS cordinates","---",date(),sep=" ")) + +exons_cds_all<-read.table("exons_cds_all",stringsAsFactors=F,header=F) +colnames(exons_cds_all)<-c("chr","type","start","end","strand","transcript_id") +tr_cds<-unique(exons_cds_all[exons_cds_all[,"type"]=="CDS","transcript_id"]) + +exons_cds_all2<-exons_cds_all[exons_cds_all[,"transcript_id"]%in%tr_cds,] +exons_cds_all2$length<-1+(exons_cds_all2$end-exons_cds_all2$start) +list_exons_cds_tr<-split.data.frame(x=exons_cds_all2,f=exons_cds_all2$transcript_id,drop=T) + +list_coords<-list() +for(i in 1:length(list_exons_cds_tr)){ + transcr<-tr_cds[i] + trascr_data<-list_exons_cds_tr[[transcr]] + + strand<-trascr_data$strand[1] + + exons_in_transcr<-trascr_data[trascr_data[,"type"]=="exon",] + if(strand=="-"){exons_in_transcr<-exons_in_transcr[dim(exons_in_transcr)[1]:1,]} + + + + cds_in_transcr<-trascr_data[trascr_data[,"type"]=="CDS",] + if(strand=="-"){cds_in_transcr<-cds_in_transcr[dim(cds_in_transcr)[1]:1,]} + + + 
###script to group the information about the multimapping read coverage vs uniquely mapping reads

# Usage: the first command-line argument is the tag that suffixes every
# input file (<prefix>_<tag>) and the output file (multi_table_<tag>).

cli_args <- commandArgs(trailingOnly = TRUE)
track_tag <- as.character(cli_args[1])

coverage_columns <- c("exon_id", "strand", "reads", "bases_covered",
                      "total_bases", "pct_region_covered")

# Read one headerless coverage table named <prefix>_<tag> and label its columns.
read_coverage <- function(prefix) {
  tab <- read.table(paste(prefix, track_tag, sep = "_"),
                    stringsAsFactors = FALSE, header = FALSE)
  colnames(tab) <- coverage_columns
  tab
}

# Join the "best" and "unique" tables of one library on exon_id and add the
# part attributable to multimappers only (best minus unique).
# After merge(), ".x" columns come from `best` and ".y" columns from `uniq`.
multi_only <- function(best, uniq) {
  joined <- merge(best, uniq, by = "exon_id")
  joined$pct_covered_onlymulti <-
    joined$pct_region_covered.x - joined$pct_region_covered.y
  joined$reads_multi <- joined$reads.x - joined$reads.y
  joined
}

ribo_joined <- multi_only(read_coverage("RIBO_best_counts"),
                          read_coverage("RIBO_unique_counts"))
rna_joined <- multi_only(read_coverage("RNA_best_counts"),
                         read_coverage("RNA_unique_counts"))

# Same columns as positions c(1,2,5,3,13,6,12) of the merged table.
ribo_summary <- ribo_joined[, c("exon_id", "strand.x", "total_bases.x", "reads.x",
                                "reads_multi", "pct_region_covered.x",
                                "pct_covered_onlymulti")]
names(ribo_summary) <- c("exon_id", "strand", "length.y", "reads_ribo",
                         "reads_multi_ribo", "pct_region_covered_ribo",
                         "pct_covered_onlymulti_ribo")

# Same columns as positions c(1,3,13,6,12) of the merged table.
rna_summary <- rna_joined[, c("exon_id", "reads.x", "reads_multi",
                              "pct_region_covered.x", "pct_covered_onlymulti")]
names(rna_summary) <- c("exon_id", "reads_rna", "reads_multi_rna",
                        "pct_region_covered_rna", "pct_covered_onlymulti_rna")

combined <- merge(ribo_summary, rna_summary, by = "exon_id")

write.table(combined, file = paste("multi_table", track_tag, sep = "_"),
            sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)
###script for making aggregate plots around start-stop codons, takes as argument the bed file from create_metaplots.bash

# For every read length found in the input, one PNG named
# <input>_<length>.png is written with four panels:
#   read 5' ends around start codons  (-20..20 and 16..56)
#   read 5' ends around stop codons   (-68..-28 and -32..10)
# Only reads whose length_introns column equals "0" are used.

print(paste("--- plotting aggreate start-stop profiles","---",date(),sep=" "))

args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 1) {
  stop("usage: metag.R <bed file from create_metaplots.bash>", call. = FALSE)
}
# Only the first argument names the input; it is also the prefix of the
# output files (using the whole args vector would yield several file names).
bed_file <- args[1]

reads <- read.table(bed_file, stringsAsFactors = FALSE, header = FALSE,
                    sep = "\t", comment.char = "")
colnames(reads) <- c("chr","start","end","read_id","map_quality","strand",".1",".2",".3","spanning_exons","length_per_exon","length_introns","chr_stst","start_stst","end_stst","type_stst","gene_id_stst","strand_stst")

reads_simpl <- reads[reads[, "length_introns"] == "0", ]
reads_simpl$count <- 1

# Distance from the codon feature to the read 5' end, per strand. Computed
# in one vectorised step so that an input lacking one of the two strands is
# handled like any other input.
reads_simpl <- reads_simpl[reads_simpl[, "strand"] %in% c("+", "-"), ]
if (nrow(reads_simpl) == 0) {
  stop("no usable reads in ", bed_file, call. = FALSE)
}
on_plus <- reads_simpl[, "strand"] == "+"
reads_simpl$distance <- ifelse(on_plus,
                               reads_simpl[, "start"] - reads_simpl[, "start_stst"],
                               reads_simpl[, "end_stst"] - reads_simpl[, "end"])

dists_all <- with(reads_simpl,
                  aggregate(count, by = list(type_stst, length_per_exon, distance), FUN = sum))
colnames(dists_all) <- c("type", "length", "distance", "counts")

lw <- 3

# Draw one panel: counts per position inside `window` (an integer vector of
# distances). `hits` holds the aggregated rows of one codon type and one read
# length, so every distance occurs at most once. An empty placeholder plot
# is drawn when no read falls inside the window.
plot_window <- function(hits, window, label, read_length) {
  hits <- hits[hits[, "distance"] %in% window, ]
  if (nrow(hits) == 0) {
    plot(1, 1, type = "n")
    return(invisible(NULL))
  }
  counts <- hits$counts[match(window, hits$distance)]
  counts[is.na(counts)] <- 0   # positions without any read
  plot(counts, col = c("red","blue","green"), type = "h", xlab = "Distance",
       ylab = "Alignments", xaxt = "n",
       main = paste(label, as.character(hits$length[1]), "nt\n", read_length, sep = " "),
       lwd = lw)
  axis(1, at = seq(1, length(counts), by = 1),
       labels = as.character(seq(min(window), max(window), by = 1)),
       xaxp = c(-40,40,80), las = 2)
  invisible(NULL)
}

read_lengths <- as.numeric(sort(unique(dists_all$length), decreasing = FALSE))
for (i in read_lengths) {
  png_name <- paste(bed_file, "_", as.character(i), ".png", sep = "")

  png(filename = png_name, width = 3024, height = 1968)
  par(mfrow = c(2, 2), cex = 2.6)

  starts_ok <- dists_all[dists_all[, "length"] == i & dists_all[, "type"] == "start_codon", ]
  stops_ok <- dists_all[dists_all[, "length"] == i & dists_all[, "type"] == "stop_codon", ]

  plot_window(starts_ok, -20:20, "distance 5' - starts", i)
  plot_window(starts_ok, 16:56, "distance 5' - starts", i)
  plot_window(stops_ok, -68:-28, "distance 5' - stops", i)
  plot_window(stops_ok, -32:10, "distance 5' - stops", i)

  dev.off()
}

print(paste("--- aggregate start-stop plots, Done!","---",date(),sep=" "))
###script for plotting general results about the exon-level analysis as a QC step, takes as arguments the annotation directory

# Reads, from the working directory: RIBO_*.bam / RNA_*.bam (via samtools),
# P_sites_all, Centered_RNA, all_calculations_ccdsgenes_annot_new and
# results_nonccds_annot; from the annotation directory (args[1]): frames_ccds.
# Writes quality_check_plots.pdf.

print(paste("--- create QC plots ---",date(),sep=" "))

args <- commandArgs(trailingOnly = TRUE)

# Fractions of tests that are c(not significant, significant) at p < 0.05,
# divided by `denom`. Always returns two values in that order, also when
# only one of the two outcomes (or none) is present in `pvals`.
periodic_fractions <- function(pvals, denom) {
  calls <- factor(pvals < 0.05, levels = c(FALSE, TRUE))
  as.numeric(table(calls)) / denom
}

# First space-delimited token of `wc -l` output; leading blanks (printed by
# some wc implementations) are dropped first.
first_token <- function(txt) {
  strsplit(sub("^ +", "", txt), split = " ")[[1]][1]
}

ribo_best <- system("samtools view -c RIBO_best.bam", intern = TRUE)
ribo_unique <- system("samtools view -c RIBO_unique.bam", intern = TRUE)
ribo_psit <- first_token(system("wc -l P_sites_all ", intern = TRUE))

rna_best <- system("samtools view -c RNA_best.bam", intern = TRUE)
rna_unique <- system("samtools view -c RNA_unique.bam", intern = TRUE)
rna_psit <- first_token(system("wc -l Centered_RNA ", intern = TRUE))

all_annot <- read.table("all_calculations_ccdsgenes_annot_new", header = TRUE,
                        stringsAsFactors = FALSE, quote = "")

ccds <- all_annot[all_annot[, "type"] == "ccds" & all_annot[, "P_sites_sum"] > 5 &
                    !is.na(all_annot$pval_multit_3nt_ribo), ]

nonccds <- read.table("results_nonccds_annot", header = TRUE,
                      stringsAsFactors = FALSE, quote = "")
noncoding <- nonccds[nonccds[, "annotation"] != "protein_coding", ]
noncoding <- noncoding[noncoding[, "P_sites_sum"] > 5, ]
noncoding <- noncoding[noncoding[, "length.x"] > 5, ]
utrs <- all_annot[all_annot[, "type"] %in% c("3_utrs_ex", "5_utrs_ex"), ]
utrs <- utrs[utrs[, "P_sites_sum"] > 5, ]

fra <- paste(args[1], "frames_ccds", sep = "/")
frames <- read.table(fra, stringsAsFactors = FALSE, header = FALSE)
colnames(frames) <- c("exon_id", "frame_start_annot", "strand", "length")

# An exon agrees with the annotation when the annotated start frame equals
# the predicted start frame (+ strand) or the predicted end frame (- strand).
ccds_frames <- merge(ccds, frames, by = "exon_id")
ccds_frames$ok_annot <- FALSE
ccds_frames[ccds_frames[, "strand"] == "+" &
              ccds_frames[, "frame_start_annot"] == ccds_frames[, "frame_start_pred"], "ok_annot"] <- TRUE
ccds_frames[ccds_frames[, "strand"] == "-" &
              ccds_frames[, "frame_start_annot"] == ccds_frames[, "frame_end_pred"], "ok_annot"] <- TRUE

lib_size <- as.numeric(ribo_psit)

all_ccds <- ccds
all_ccds$RPKM_ribo <- (10^9 * as.numeric(all_ccds$P_sites_sum)) /
  (lib_size * as.numeric(all_ccds$length.x))

# Seven quantile bins for RPKM and for exon length. include.lowest = TRUE
# keeps the minimum value in bin 1; without it the minimum becomes NA and
# NA comparisons below would add all-NA rows to the per-bin subsets.
quantiles_RPKM_ribo <- quantile(all_ccds$RPKM_ribo, probs = seq(0, 1, length.out = 8))
all_ccds$quant_RPKM_ribo <- cut(x = all_ccds$RPKM_ribo, breaks = quantiles_RPKM_ribo,
                                labels = as.character(1:7), include.lowest = TRUE)
quantiles_length <- quantile(all_ccds$length.x, probs = seq(0, 1, length.out = 8))
all_ccds$quant_length <- cut(x = all_ccds$length.x, breaks = quantiles_length,
                             labels = as.character(1:7), include.lowest = TRUE)

# Bins 2, 4 and 6 stand for short/medium/long exons and low/medium/high RPKM.
length_rpkm <- rbind(c(2,2), c(2,4), c(2,6), c(4,2), c(4,4), c(4,6), c(6,2), c(6,4), c(6,6))
rownames(length_rpkm) <- c("short_low","short_med","short_high","medium_low","medium_med","medium_high","long_low","long_med","long_high")
colnames(length_rpkm) <- c("length", "rpkm")

# Label "<lower>-<upper>" for quantile bin `bin` of the break vector `breaks`.
bin_label <- function(breaks, bin) {
  paste(round(breaks[bin], digits = 1), round(breaks[bin + 1], digits = 1), sep = "-")
}

results <- vector("list", nrow(length_rpkm))
for (i in seq_len(nrow(length_rpkm))) {
  combin <- length_rpkm[i, ]
  in_bin <- which(all_ccds[, "quant_length"] == combin["length"] &
                    all_ccds[, "quant_RPKM_ribo"] == combin["rpkm"])
  exons_all <- all_ccds[in_bin, ]
  # Combinations with 10 exons or fewer are left out of the summary.
  if (nrow(exons_all) <= 10) {
    next
  }
  ribo_frac <- periodic_fractions(exons_all[, "pval_multit_3nt_ribo"], nrow(exons_all))
  rna_frac <- periodic_fractions(exons_all[, "pval_multit_3nt_rna"],
                                 sum(!is.na(exons_all$pval_multit_3nt_rna)))
  # Column positions 1:4 are relied upon by the barplots further down.
  results[[i]] <- data.frame(
    "non-periodic" = ribo_frac[1],
    "periodic" = ribo_frac[2],
    "non-periodic_rna" = rna_frac[1],
    "periodic_rna" = rna_frac[2],
    n_exons = nrow(exons_all),
    RPKM = paste(bin_label(quantiles_RPKM_ribo, combin[2]), "RPKM"),
    length = paste(bin_label(quantiles_length, combin[1]), "nt"),
    category = rownames(length_rpkm)[i],
    stringsAsFactors = FALSE, check.names = FALSE
  )
}
results <- do.call(rbind.data.frame, args = results)

results$length <- factor(results$length, levels = unique(results$length))
results$RPKM <- factor(results$RPKM, levels = unique(results$RPKM))

###

pdf(file = "quality_check_plots.pdf", width = 35, height = 25, onefile = TRUE, title = "")

# Screen layout: two rows of four panels, then three rows of two panels.
lefts <- c(0,.25,.5,.75, 0,.25,.5,.75, 0,.5, 0,.5, 0,.5)
rights <- c(.25,.5,.75,1, .25,.5,.75,1, .5,1, .5,1, .5,1)
bottoms <- c(.75,.75,.75,.75, .5,.5,.5,.5, .33,.33,.166,.166,0,0)
tops <- c(1,1,1,1, .75,.75,.75,.75, .5,.5,.33,.33,.166,.166)
matfig <- cbind(lefts, rights, bottoms, tops)

close.screen(all.screens = TRUE)

n_ccds <- length(which(all_annot$type == "ccds"))
n_ccds_5nt <- dim(all_ccds)[1]
n_ccds_5nt_rna <- length(which(!is.na(ccds$chisq_rna)))

split.screen(matfig)

# Panel 1: Ribo-seq read and P-site counts
screen(1)
par(mar = c(6.1,8,2,2))
barp <- barplot(as.numeric(c(ribo_best, ribo_unique, ribo_psit)), beside = TRUE,
                col = c("orange","red","dark red"), names.arg = c(""),
                cex.axis = 1.8, cex.lab = 1.8, cex.main = 1.8, cex = 1.8,
                mgp = c(13, 1, 0), main = "")
axis(side = 1, labels = c("Ribo\naligned reads","Ribo\nunique reads","P-sites\npositions"),
     at = barp, cex.axis = 1.8, cex.main = 1.8, cex = 1.8, mgp = c(3,2.1,0))

# Panel 2: RNA-seq read and RNA-site counts
screen(2)
par(mar = c(6.1,8,2,2))
barp <- barplot(as.numeric(c(rna_best, rna_unique, rna_psit)), beside = TRUE,
                col = c("white","grey","dark grey"), names.arg = c(""),
                cex.axis = 1.8, cex.lab = 1.8, cex.main = 1.8, cex = 1.8, main = "")
axis(side = 1, labels = c("RNA\naligned reads","RNA\nunique reads","RNA-sites\npositions"),
     at = barp, cex.axis = 1.8, cex.main = 1.8, cex = 1.8, mgp = c(3,2.1,0))

# Panel 3: number of CCDS exons at each filtering step
screen(3)
par(mar = c(6.1,8,2,2))
barp <- barplot(c(n_ccds, n_ccds_5nt, n_ccds_5nt_rna), names.arg = c(""),
                col = c("white","indianred2","red"),
                cex.main = 1.8, cex.axis = 1.8, cex.lab = 1.8, cex.names = 1.8, main = "")
axis(side = 1, labels = c("all ccds\nexons","ccds exons\n>5 P-sites","ccds exons\n>5 RNA & P-sit"),
     at = barp, cex.axis = 1.8, cex.main = 1.8, cex = 1.8, mgp = c(3,2.1,0))

# Panel 4: significant fraction for Ribo-seq, non-significant fraction for RNA-seq
screen(4)
par(mar = c(6.1,8,2,2))
rna_ok <- length(which(!is.na(ccds$pval_multit_3nt_rna)))
rna_ok2 <- length(which(!is.na(ccds$chisq_rna)))
m_r <- periodic_fractions(ccds$pval_multit_3nt_ribo, dim(ccds)[1])
m_rn <- periodic_fractions(ccds$pval_multit_3nt_rna, rna_ok)
c_r <- periodic_fractions(ccds$chisq_ribo, dim(ccds)[1])
c_rn <- periodic_fractions(ccds$chisq_rna, rna_ok2)

barp <- barplot(c(m_r[2], c_r[2], m_rn[1], c_rn[1]), xpd = FALSE,
                col = c("red","red","grey","grey"), space = 0.1, names.arg = "",
                ylab = "% CCDS exons", main = "",
                cex.main = 1.8, cex.axis = 1.8, cex.lab = 1.8, cex.names = 1.8)
axis(side = 1, labels = c("Multitap\nribo","Chi-sq\nribo","Multitap\nrna","Chi-sq\nrna"),
     at = barp, cex.axis = 1.8, cex.main = 1.8, cex = 1.8, mgp = c(3,2.1,0))

# Panel 5: RNA-seq multitaper P-value distribution
screen(5)
par(mar = c(6.1,8,2,2))
hist(ccds$pval_multit_3nt_rna, breaks = 50, col = "grey", main = "",
     cex.main = 1.8, cex.axis = 1.8, cex.lab = 1.8, xlab = "")
axis(side = 1, labels = "P-values multitaper test \n RNA-seq", at = .5,
     cex.axis = 1.8, cex.main = 1.8, cex = 1.8, mgp = c(3,5,0))

# Panel 6: RNA-seq chi-squared P-value distribution
screen(6)
par(mar = c(6.1,8,2,2))
hist(ccds$chisq_rna, breaks = 50, col = "grey", main = "", xlab = "",
     cex.main = 1.8, cex.axis = 1.8, cex.lab = 1.8)
axis(side = 1, labels = "P-values Chi-squared test \n RNA-seq", at = .5,
     cex.axis = 1.8, cex.main = 1.8, cex = 1.8, mgp = c(3,5,0))

# Panel 7: density of the largest per-frame P-site fraction
screen(7)
par(mar = c(6.1,8,2,2))
phase_cols <- c("pctPhase_frame", "pctPhase_frame_1", "pctPhase_frame_2")
max_frame_density <- function(tab) {
  density(apply(tab[, phase_cols], FUN = max, 1), from = 0, to = 1)
}
plot(max_frame_density(ccds), col = "violet", main = "",
     xlab = "% of P-sites on the max frame", cex.main = 2, cex.axis = 2, lwd = 5, cex.lab = 2)
lines(max_frame_density(utrs), col = "dark grey", lwd = 5)
lines(max_frame_density(noncoding), col = "orange", lwd = 5)
# Legend colours follow the curves drawn above: UTRs are dark grey and
# non-coding exons are orange.
legend("topleft", c("CCDS exons","UTRs","non-coding"), lty = c(1,1,1),
       col = c("violet","dark grey","orange"), lwd = c(2.2,2.2,2.2), cex = 1.8)

# Panel 8: agreement between predicted and annotated frame
screen(8)
par(mar = c(6.1,1,2,2))
annot_calls <- factor(ccds_frames$ok_annot, levels = c(FALSE, TRUE))
same_as_annot <- as.numeric(table(annot_calls)) / dim(ccds_frames)[1]
names(same_as_annot) <- c("diff_annot", "same_annot")
same_as_annot <- round(same_as_annot, digits = 3)
pie(same_as_annot,
    labels = paste(names(same_as_annot), ":\n", as.character(100 * same_as_annot), "%"),
    col = c("dark grey","violet"), cex = 1.8, cex.main = 2, main = "", init.angle = 270)

# Panels 9-14: one row per exon-length bin, Ribo-seq on the left (screens
# 9, 11, 13) and RNA-seq on the right (screens 10, 12, 14).
to_barpl <- split.data.frame(results, f = results$length)
for (j in 1:3) {
  # Fewer than three length bins may have passed the >10 exons filter.
  if (j > length(to_barpl)) {
    next
  }
  barpl <- to_barpl[[j]]

  screen(7 + 2 * j)
  par(mar = c(6.1,8,2,2))
  barp <- barplot(t(100 * as.matrix(barpl[, 2:1])), ylim = c(0,100),
                  col = c("dark red","grey"), names.arg = rep("", dim(barpl)[1]),
                  ylab = "% periodic CCDS exons\nRibo-seq",
                  main = paste(barpl$length[1], "exons"),
                  cex.axis = 1.8, cex.lab = 1.8, cex.main = 1.8, cex = 1.8)
  axis(side = 1, labels = paste(barpl$RPKM, "\n n_of exons=", barpl$n_exons), at = barp,
       cex.axis = 1.8, cex.main = 1.8, cex = 1.8, mgp = c(3,2.7,0))

  screen(8 + 2 * j)
  par(mar = c(6.1,8,2,2))
  barp <- barplot(t(100 * as.matrix(barpl[, 4:3])), ylim = c(0,100),
                  col = c("dark red","dark grey"), names.arg = rep("", dim(barpl)[1]),
                  ylab = "% periodic CCDS exons\nRNA-seq",
                  main = paste(barpl$length[1], "exons"),
                  cex.axis = 1.8, cex.lab = 1.8, cex.main = 1.8, cex = 1.8)
  axis(side = 1, labels = paste(barpl$RPKM, "\n n_of exons=", barpl$n_exons), at = barp,
       cex.axis = 1.8, cex.main = 1.8, cex = 1.8, mgp = c(3,2.7,0))
}
###

dev.off()

print(paste("--- QC plots Done! ---",date(),sep=" "))
# Arguments: args[1] = name tag of the track (suffix of every input and
# output file), args[2] = RiboTaper scripts directory (functions.R is
# sourced from there), args[3] = number of cores handed to registerDoMC.

args <- commandArgs(trailingOnly = TRUE)

print(paste("--- analyzing",args[1],"exonic tracks","---",date(),sep=" "))

suppressMessages(source(paste(args[2], "functions.R", sep = "/")))

registerDoMC(args[3])

sink(file = NULL, type = "message")

track_tag <- as.character(args[1])

all_tracks <- readBigText(paste("data_tracks/Psit_Ribo_Rna_Cent_tracks", track_tag, sep = "_"))

all_index <- read.table(paste("data_tracks/index_tracks", track_tag, sep = "_"),
                        stringsAsFactors = FALSE, header = FALSE)

# Text before the first "_" of each index entry; used below to restrict the
# search to entries sharing that prefix with the region being analysed.
index_chrs <- sapply(strsplit(all_index$V1, split = "_"), "[[", 1)

regions <- unique(all_index)

# Analyse every region in parallel. A region whose analysis fails contributes
# the string "error" instead of a result row.
results_regions <- foreach(r = 1:dim(regions)[1], .combine = rbind, .multicombine = TRUE) %dopar% {
  tryCatch({
    region_id <- regions[r, 1]
    region_chr <- sapply(strsplit(region_id, split = "_"), "[[", 1)
    same_chr <- index_chrs == region_chr
    chr_tracks <- all_tracks[which(same_chr)]
    chr_index <- subset(all_index, same_chr)
    region_tracks <- chr_tracks[chr_index == region_id]
    region_tracks <- t(data.frame(strsplit(region_tracks, split = " ")))
    return(make_analysis_exons(region_tracks))

  }, error = function(x) {

    return("error")
  }
  )
}

coverage_columns <- c("exon_id", "strand", "reads", "bases_covered",
                      "total_bases", "pct_region_covered")

# Read one headerless coverage table named <prefix>_<tag> and label its columns.
read_coverage <- function(prefix) {
  tab <- read.table(paste(prefix, track_tag, sep = "_"),
                    stringsAsFactors = FALSE, header = FALSE)
  colnames(tab) <- coverage_columns
  tab
}

# Join "best" and "unique" counts on exon_id and add the multimapper-only
# part (best minus unique). ".x" columns come from `best`, ".y" from `uniq`.
multi_only <- function(best, uniq) {
  joined <- merge(best, uniq, by = "exon_id")
  joined$pct_covered_onlymulti <-
    joined$pct_region_covered.x - joined$pct_region_covered.y
  joined$reads_multi <- joined$reads.x - joined$reads.y
  joined
}

ribo_joined <- multi_only(read_coverage("RIBO_best_counts"),
                          read_coverage("RIBO_unique_counts"))
rna_joined <- multi_only(read_coverage("RNA_best_counts"),
                         read_coverage("RNA_unique_counts"))

# Same columns as positions c(1,2,5,3,13,6,12) of the merged table.
ribo_summary <- ribo_joined[, c("exon_id", "strand.x", "total_bases.x", "reads.x",
                                "reads_multi", "pct_region_covered.x",
                                "pct_covered_onlymulti")]
names(ribo_summary) <- c("exon_id", "strand", "length", "reads_ribo",
                         "reads_multi_ribo", "pct_region_covered_ribo",
                         "pct_covered_onlymulti_ribo")

# Same columns as positions c(1,3,13,6,12) of the merged table.
rna_summary <- rna_joined[, c("exon_id", "reads.x", "reads_multi",
                              "pct_region_covered.x", "pct_covered_onlymulti")]
names(rna_summary) <- c("exon_id", "reads_rna", "reads_multi_rna",
                        "pct_region_covered_rna", "pct_covered_onlymulti_rna")

multi_table <- merge(ribo_summary, rna_summary, by = "exon_id")

# Attach the coverage summary to the per-exon analysis (joined on the first
# column of each table).
RESULTS <- merge(results_regions, multi_table, by = 1)

write.table(RESULTS, file = paste("results", args[1], sep = "_"),
            quote = FALSE, sep = "\t", row.names = FALSE)

print(paste("--- track_analysis Done!","---",date(),sep=" "))