all modules

lixin4306ren · Aug 12, 2016 · 7b1c36c · 7b1c36c
1 parent a956183
commit 7b1c36c
Show file tree

Hide file tree

Showing 5 changed files with 190 additions and 85 deletions.
diff --git a/README.md b/README.md
@@ -1,29 +1,51 @@
 # RNA-Seq分析流程
 ## 安装
-此流程用OSA进行比对，安装OSA参考此[网页](http://www.arrayserver.com/wiki/index.php?title=Oshell#Overview)。
+此流程依赖于array suite，安装array suite参考此[网页](http://www.arrayserver.com/wiki/index.php?title=Oshell#Overview)。
 
-sample.list格式为所有样品目录路径，目录下为fastq文件
+## 主要参数
+主程序为`RNA_seq_align_all_module.pl`，用`-h`显示所有参数如下：
 ```
-/home/others/xli/raw_data/2016_08_01_ZY/RNA-Seq/S1
-/home/others/xli/raw_data/2016_08_01_ZY/RNA-Seq/S2
-/home/others/xli/raw_data/2016_08_01_ZY/RNA-Seq/S3
-/home/others/xli/raw_data/2016_08_01_ZY/RNA-Seq/S4
-/home/others/xli/raw_data/2016_08_01_ZY/RNA-Seq/S5
-/home/others/xli/raw_data/2016_08_01_ZY/RNA-Seq/S6
+        RNA-Seq pipeline
+
+        Usage: RNA_seq_align_all_module.pl <options>
+
+                -sample_list        sample list
+
+                -out_dir            output result folder
+
+                -ref_dir            reference OSA base path
+
+                -ref                reference name
+
+                -gene_model         gene_model name
+
+                -gtf                gene gtf file
+
+                -gtf_exon           exon gff file for DEXSeq
+
+                -step               align or merge_rmdup or count or count_exon or tdf
+
+                -h or -help         Show Help , have a choice
 ```
 
-nohup /usr/local/bin/bcl2fastq --runfolder-dir /home/yliu/data/Xin/2016_08_01_ZY/ --output-dir 2016_08_01_ZY_100bp --sample-sheet 2016_08_01_ZY.csv --use-bases-mask Y100n51,I6,Y100n51 
 
-mono /home/others/xli/soft/oshell/16-8-11/oshell.exe --runscript /home/others/xli/data/genome /home/others/xli/RNA-Seq/Alignment.oscript /tmp/ /opt/apps/mono/bin/mono
+`sample.list`为所有样品目录路径，目录下为fastq.gz文件，格式如下：  
+```
+/home/others/xli/raw_data/2016_08_01_ZY/RNA-Seq/S1
+/home/others/xli/raw_data/2016_08_01_ZY/RNA-Seq/S2
+```
+`-ref`默认基因组版本Human.B37.3  
+`-gene_model`默认版本为RefGene  
+基因组和基因注释版本对应信息参考此[网页](http://www.arrayserver.com/wiki/index.php?title=A_list_of_compiled_genome_and_gene_model_from_OmicSoft)
 
 
-perl ~/scripts/RNA_seq_align_all_module.pl -sample_list sample.list -step align
+## alignment
+```
+perl ~/scripts/rnaseq_pipeline/RNA_seq_align_all_module.pl -sample_list sample.list -out_dir /home/others/xli/RNA-Seq -step align
+```
+生成`align.sh`文件和响应的`*.oscript`文件。用`qsub`提交`align.sh`运行alignment步骤。完成后所有结果在`/home/others/xli/RNA-Seq/sample_name`目录。
 
-perl ~/scripts/RNA_seq_align.pl -sample_list sample.list -ref_dir ~/amber3/no_back_up/data/genome/hg19_OSA/ -ref Human.B37 -gene_model RefGene -step align
+## 合并bam文件去duplicates
 
-perl ~/scripts/RNA_seq_align.pl -sample_list sample.list -ref_dir ~/amber3/no_back_up/data/genome/galgal4/for_OSA/ -ref gal4 -gene_model gal4gene -step merge
-perl ~/scripts/RNA_seq_align.pl -sample_list sample.list -gtf ~/amber3/no_back_up/data/gene/chicken/Gallus_gallus.Galgal4.71.gtf -step count
-perl ~/scripts/RNA_seq_align.pl -sample_list sample.list -gtf_exon ~/amber3/no_back_up/data/gene/chicken/Gallus_gallus.Galgal4.71.for.dexseq.gff -step count_exon
-perl ~/scripts/RNA_seq_align.pl -sample_list sample.list -ref_dir ~/amber3/no_back_up/data/genome/galgal4/for_OSA/ -ref gal4 -gene_model gal4gene -step align_sum
-perl ~/scripts/RNA_seq_align.pl -sample_list sample.list -ref_dir ~/amber3/no_back_up/data/genome/galgal4/for_OSA/ -step tdf
+## 生成可供IGV broswer浏览的tdf文件
 
diff --git a/RNA_seq_align_all_module.pl b/RNA_seq_align_all_module.pl
@@ -40,7 +40,7 @@
 if ($step eq 'align') {
 open O,">align.sh"||die;
 chomp($out_dir);
-my $cmd="perl ~xli/scripts/Run_OSA_all_modules.pl $sample_list $out_dir $ref_dir $ref $gene_model";
+my $cmd="perl ~xli/scripts/rnaseq_pipeline/Run_OSA_all_modules.pl $sample_list $out_dir $ref_dir $ref $gene_model";
 my $cmd2=`$cmd`;
 print O $cmd2,"\n";
 

diff --git a/Run_OSA_all_modules.pl b/Run_OSA_all_modules.pl
@@ -34,6 +34,9 @@
 
 chomp($fastq_files);
 
+my $slash="\\\\";
+my $back="\b";
+
 print O <<"EOT";
 Begin Macro;
  
@@ -53,156 +56,157 @@
 \@GeneModelName\@ $gene_model;
  
 End;
+EOT
 
 
 
-
-
+print O <<'EOT2';
 #region Oshell run
 //Create the OmicSoft project enviroment
 Begin NewProject;
-File "\@ProjectFolder\@/\@ProjectName\@.osprj";
+File "@ProjectFolder@/@[email protected]";
 Options /Distributed=true;
 End;
  
 #region Raw data QC
 //QC: Raw sequence, using NgsQCWizard
 Begin NgsQCWizard /Namespace=NgsLib;
 Files
-"\@FileNames\@";
-Options /ThreadNumberPerJob=\@ThreadNum\@ /FileFormat=AUTO /CompressionMethod=\@CompressionMethod\@ /QualityEncoding=Automatic /PreviewMode=false 
+"@FileNames@";
+Options /ThreadNumberPerJob=@ThreadNum@ /FileFormat=AUTO /CompressionMethod=@CompressionMethod@ /QualityEncoding=Automatic /PreviewMode=false 
 /BasicStatistics=true /BaseDistribution=true /QualityBoxPlot=true /KMerAnalysis=true /SequenceDuplication=true;
 Output rawseq_qc;
 End;
+
  
 // save the OmicSoft project enviroment after each major step
 Begin SaveProject;
-Project \@ProjectName\@;
-File "\@ProjectFolder\@/\@ProjectName\@.osprj";
+Project @ProjectName@;
+File "@ProjectFolder@/@[email protected]";
 End;
 #endregion
  
 #region RNA-Seq alignment to human reference using OSA
 Begin MapRnaSeqReadsToGenome /Namespace=NgsLib;
 Files 
-"\@FileNames\@";
-Reference \@ReferenceName\@;
-GeneModel \@GeneModelName\@;
+"@FileNames@";
+Reference @ReferenceName@;
+GeneModel @GeneModelName@;
 Trimming  /Mode=TrimByQuality /ReadTrimQuality=2;
-Options  /BamSubFolder=primary_alignment /ParallelJobNumber=1 /PairedEnd=\@PairedEnd\@ /FileFormat=AUTO /AutoPenalty=True 
+Options  /BamSubFolder=primary_alignment /ParallelJobNumber=1 /PairedEnd=@PairedEnd@ /FileFormat=AUTO /AutoPenalty=True 
 /FixedPenalty=2 /Greedy=false /IndelPenalty=2 /DetectIndels=False /MaxMiddleInsertionSize=10 /MaxMiddleDeletionSize=10 
 /MaxEndInsertionSize=10 /MaxEndDeletionSize=10 /MinDistalEndSize=3 /ExcludeNonUniqueMapping=True /ReportCutoff=10 
-/WriteReadsInSeparateFiles=True /OutputFolder="" /GenerateSamFiles=True /ThreadNumberPerJob=\@ThreadNum\@ 
+/WriteReadsInSeparateFiles=True /OutputFolder="" /GenerateSamFiles=True /ThreadNumberPerJob=@ThreadNum@ 
 /InsertSizeStandardDeviation=40 /ExpectedInsertSize=300 /InsertOnSameStrand=False /InsertOnDifferentStrand=True 
-/QualityEncoding=Automatic /CompressionMethod=\@CompressionMethod\@ /Gzip=\@Gzip\@ /SearchNovelExonJunction=True /ExcludeUnmappedInBam=True;
+/QualityEncoding=Automatic /CompressionMethod=@CompressionMethod@ /Gzip=@Gzip@ /SearchNovelExonJunction=True /ExcludeUnmappedInBam=True;
 Output primary_alignment;
 End;
  
 Begin SaveProject;
-Project \@ProjectName\@;
-File "\@ProjectFolder\@/\@ProjectName\@.osprj";
+Project @ProjectName@;
+File "@ProjectFolder@/@[email protected]";
 End;
  
 #endregion
  
 #region Alignment QC
 //Alignment QC Metrics using RnaSeqQCMetrics
 Begin RnaSeqQCMetrics /Namespace=NgsLib;
-Project \@ProjectName\@;
-Data \@ProjectName\@\\primary_alignment;
-GeneModel \@GeneModelName\@;
+Project @ProjectName@;
+Data @ProjectName@\\primary_alignment;
+GeneModel @GeneModelName@;
 Metrics Alignment,Flag,Profile,Source,InsertSize,Duplication,Coverage,Strand;
-Options /ThreadNumberPerJob=\@ThreadNum\@ /ExcludeFailedAlignments=true /ExcludeSecondaryAlignments=true 
+Options /ThreadNumberPerJob=@ThreadNum@ /ExcludeFailedAlignments=true /ExcludeSecondaryAlignments=true 
 /ExcludeMultiReads=false /ExcludeSingletons=false;
 Output alignment_qc;
 End;
  
 //RNA-Seq 5'->3' trend using SummarizeRnaSeqTrend53
 Begin SummarizeRnaSeqTrend53 /Namespace=NgsLib;
-Project \@ProjectName\@;
-Data \@ProjectName\@\\primary_alignment;
-GeneModel \@GeneModelName\@;
-Options /OutputFolder="\@ProjectFolder\@/\@ProjectName\@/primary_alignment" /ThreadNumberPerJob=\@ThreadNum\@ 
+Project @ProjectName@;
+Data @ProjectName@\\primary_alignment;
+GeneModel @GeneModelName@;
+Options /OutputFolder="@ProjectFolder@/@ProjectName@/primary_alignment" /ThreadNumberPerJob=@ThreadNum@ 
 /BinNumber=100 /TranscriptLengthBins=500, 1000, 2000, 3000, 4000, 5000 /ExcludeGenesWithMultipleIsoforms=true 
 /ScaleCoverage=true /ReportTranscriptData=false;
 Output qc_trend53;
 End;
  
 Begin SaveProject;
-Project \@ProjectName\@;
-File "\@ProjectFolder\@/\@ProjectName\@.osprj";
+Project @ProjectName@;
+File "@ProjectFolder@/@[email protected]";
 End;
 #endregion
  
 #region Quantification at gene, isoform, exon and exon-junction level
 //gene level rsem estimation
 Begin ReportGeneTranscriptCounts /Namespace=NgsLib;
-Project \@ProjectName\@;
-Data \@ProjectName\@\\primary_alignment;
-GeneModel \@GeneModelName\@;
-Options /ExpressionMeasurement=RPKM /ThreadNumberPerJob=\@ThreadNum\@ /Add1=False /CountFragments=True /ExcludeMultiReads=False 
+Project @ProjectName@;
+Data @ProjectName@\\primary_alignment;
+GeneModel @GeneModelName@;
+Options /ExpressionMeasurement=RPKM /ThreadNumberPerJob=@ThreadNum@ /Add1=False /CountFragments=True /ExcludeMultiReads=False 
 /UseEffectiveTranscriptLength=True /CountStrandedReads=True /CountReverseStrandedReads=True;
 Output quantification_gene;
 End;
  
 //transcript level rsem estimation
 Begin ReportGeneTranscriptCounts /Namespace=NgsLib;
-Project \@ProjectName\@;
-Data \@ProjectName\@\\primary_alignment;
-GeneModel \@GeneModelName\@;
-Options /ExpressionMeasurement=RPKM_Transcript /ThreadNumberPerJob=\@ThreadNum\@ /Add1=False /CountFragments=True /ExcludeMultiReads=False 
+Project @ProjectName@;
+Data @ProjectName@\\primary_alignment;
+GeneModel @GeneModelName@;
+Options /ExpressionMeasurement=RPKM_Transcript /ThreadNumberPerJob=@ThreadNum@ /Add1=False /CountFragments=True /ExcludeMultiReads=False 
 /UseEffectiveTranscriptLength=True /CountStrandedReads=True /CountReverseStrandedReads=True;
 Output quantification_tx;
 End;
  
 //quantify the known exon and exon junctions based on a gene model
 Begin ReportExonCounts /Namespace=NgsLib;
-Project \@ProjectName\@;
-Data \@ProjectName\@\\primary_alignment;
-GeneModel \@GeneModelName\@;
-Options /ThreadNumberPerJob=\@ThreadNum\@ /ReportExonCounts=True /ReportExonJunctionCounts=True /ExcludeSingletons=False /Rpkm=True 
+Project @ProjectName@;
+Data @ProjectName@\\primary_alignment;
+GeneModel @GeneModelName@;
+Options /ThreadNumberPerJob=@ThreadNum@ /ReportExonCounts=True /ReportExonJunctionCounts=True /ExcludeSingletons=False /Rpkm=True 
 /CountStrandedReads=True /CountReverseStrandedReads=True;
 Output quantification_known;
 End;
  
 //quantify the each exon junction based on alignment and annotate with gene model info (known or novel exon junctions)
 Begin ReportExonJunctions /Namespace=NgsLib;
-Project \@ProjectName\@;
-Data \@ProjectName\@\\primary_alignment;
+Project @ProjectName@;
+Data @ProjectName@\\primary_alignment;
 Options /ExcludeMultiReads=False /ExcludeSingletons=False /NMCutoff=4 /JunctionOverhangCutoff=4 /GenerateReport=True 
-/GenerateBedFile=False /BedFileOutputFolder="" /MinimalHit=1 /ThreadNumberPerJob=\@ThreadNum\@ 
-/OutputFolder="\@ProjectFolder\@/\@ProjectName\@/primary_alignment";
+/GenerateBedFile=False /BedFileOutputFolder="" /MinimalHit=1 /ThreadNumberPerJob=@ThreadNum@ 
+/OutputFolder="@ProjectFolder@/@ProjectName@/primary_alignment";
 Output quantification_jxn;
 End;
  
 Begin SaveProject;
-Project \@ProjectName\@;
-File "\@ProjectFolder\@/\@ProjectName\@.osprj";
+Project @ProjectName@;
+File "@ProjectFolder@/@[email protected]";
 End;
 #endregion
  
 #region Fusion gene detection
 //Detect fusion based on inter-transcript read pairs (PE)
 Begin ReportPairedEndFusionGenes /Namespace=NgsLib;
-Project \@ProjectName\@;
-Data \@ProjectName\@\\primary_alignment;
-GeneModel \@GeneModelName\@;
-Options /GenerateData=True /OutputFusionReads=True /MinimalHit=2 /FusionReportCutoff=1 /ThreadNumberPerJob=\@ThreadNum\@ 
+Project @ProjectName@;
+Data @ProjectName@\\primary_alignment;
+GeneModel @GeneModelName@;
+Options /GenerateData=True /OutputFusionReads=True /MinimalHit=2 /FusionReportCutoff=1 /ThreadNumberPerJob=@ThreadNum@ 
 /FilterBy=DefaultList /DefaultFilterListVersion=v1 /FilterGeneListFileName="" /FilterGeneFamilyFileName="" 
-/GenerateTableland=True /OutputFolder="\@ProjectFolder\@/\@ProjectName\@/fusion_PEReads";
+/GenerateTableland=True /OutputFolder="@ProjectFolder@/@ProjectName@/fusion_PEReads";
 Output FusionPE;
 End;
  
 //Detect fusion based alignment of junction spanning reads (SE)
 Begin MapFusionReads /Namespace=NgsLib;
-Project \@ProjectName\@;
-Data \@ProjectName\@\\primary_alignment;
-Reference \@ReferenceName\@;
-GeneModel \@GeneModelName\@;
+Project @ProjectName@;
+Data @ProjectName@\\primary_alignment;
+Reference @ReferenceName@;
+GeneModel @GeneModelName@;
 Trimming /Mode=TrimByQuality /ReadTrimQuality=2;
 Options /FusionVersion=2 /ParallelJobNumber=1 /PairedEnd=False /RnaMode=True /FileFormat=BAM /AutoPenalty=True /FixedPenalty=2 
-/OutputFolder="\@ProjectFolder\@/\@ProjectName\@/fusion_SE_alignment" /MaxMiddleInsertionSize=10 
-/ThreadNumberPerJob=\@ThreadNum\@ /QualityEncoding=Automatic /CompressionMethod=None /Gzip=False /MinimalFusionAlignmentLength=25 
+/OutputFolder="@ProjectFolder@/@ProjectName@/fusion_SE_alignment" /MaxMiddleInsertionSize=10 
+/ThreadNumberPerJob=@ThreadNum@ /QualityEncoding=Automatic /CompressionMethod=None /Gzip=False /MinimalFusionAlignmentLength=25 
 /FilterUnlikelyFusionReads=True /FullLengthPenaltyProportion=8 /OutputFusionReads=True /MinimalHit=4 /MinimalFusionSpan=5000 
 /FusionReportCutoff=1 /NonCanonicalSpliceJunctionPenalty=2 /FilterBy=DefaultList /DefaultFilterListVersion=v1 
 /FilterGeneListFileName="" /FilterGeneFamilyFileName="" /GenerateTableland=True;
@@ -213,47 +217,47 @@
 #region Mutation detection
 //detect mutation and annotate with dbSNP; will download dbSNP from omicsoft at first time 
 Begin SummarizeMutation /Namespace=NgsLib;
-Project \@ProjectName\@;
-Data \@ProjectName\@\\primary_alignment;
+Project @ProjectName@;
+Data @ProjectName@\\primary_alignment;
 Options /BaseQualityCutoff=20 /MapQualityCutoff=20 /MinimalIndelSize=1 /ExcludeSingletons=False /ExcludeMultiReads=False 
-/LeftExclusion=5 /RightExclusion=5 /ThreadNumberPerJob=\@ThreadNum\@ /MinimalTotalHit=20 /MinimalMutationHit=5 /MinimalMutationFrequency=0.15 
+/LeftExclusion=5 /RightExclusion=5 /ThreadNumberPerJob=@ThreadNum@ /MinimalTotalHit=20 /MinimalMutationHit=5 /MinimalMutationFrequency=0.15 
 /ExcludeNonMutantSites=True /GenerateSummarizedReport=True /GenerateIndividualReport=False /GenerateTableland=True 
-/MaxFrequencyCutoff=0.15 /DbsnpVersion=v135 /OutputFolder="\@ProjectFolder\@/\@ProjectName\@/mutation";
+/MaxFrequencyCutoff=0.15 /DbsnpVersion=v135 /OutputFolder="@ProjectFolder@/@ProjectName@/mutation";
 Output MutationSummary;
 End;
  
 //Annotate Mutation 
 Begin AnnotateMutation /Namespace=NgsLib;
-Project \@ProjectName\@;
-Data \@ProjectName\@\\MutationSummary.MutationReport;
+Project @ProjectName@;
+Data @ProjectName@\\MutationSummary.MutationReport;
 ID ID;
 Chromosome Chromosome;
 Position Position;
 Mutation Mutation;
 Other (default);
-Options /ReferenceLibraryID=\@ReferenceName\@ /GeneModelID=\@GeneModelName\@ /DbsnpVersion=v135 /GenerateClusteringFlag=True 
+Options /ReferenceLibraryID=@ReferenceName@ /GeneModelID=@GeneModelName@ /DbsnpVersion=v135 /GenerateClusteringFlag=True 
 /ClusteringFlagWindowSize=10 /GenerateTableland=True /AnnotateLongestTranscriptOnly=False /AnnotateFunctionalMutation=False 
 /AnnotateSomaticMutation=False;
 Output MutationAnnotated.MutationReport;
 End;
  
 Begin SaveProject;
-Project \@ProjectName\@;
-File "\@ProjectFolder\@/\@ProjectName\@.osprj";
+Project @ProjectName@;
+File "@ProjectFolder@/@[email protected]";
 End;
  
 #endregion
  
 Begin ExportView;
-Project \@ProjectName\@;
-OutputFolder "\@ProjectFolder\@/\@ProjectName\@/ExportedResults";
+Project @ProjectName@;
+OutputFolder "@ProjectFolder@/@ProjectName@/ExportedResults";
 End;
  
 Begin CloseProject;
-Project \@ProjectName\@;
+Project @ProjectName@;
 End;
 
-EOT
+EOT2