-
Notifications
You must be signed in to change notification settings - Fork 17
/
main.nf
2000 lines (1719 loc) · 73.1 KB
/
main.nf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#!/usr/bin/env nextflow
/*
* Copyright (c) 2020, Sheynkman Lab and the authors.
*
* This file is part of 'proteogenomics-nf' a pipeline repository to run
* the "long read proteogenomics" pipeline.
*
*
* @authors
* Ben Jordan
* Rachel Miller
* Gloria Sheynkman
* Christina Chatzipantsiou
* Anne Deslattes Mays
*/
def helpMessage() {
log.info logHeader()
log.info """
Usage:
The typical command for running the pipeline is as follows:
Other:
--max_cpus Maximum number of CPUs (int)
--max_memory Maximum memory (memory unit)
--max_time Maximum time (time unit)
See here for more info: https://github.com/sheynkman-lab/Long-Read-Proteogenomics/blob/master/docs/usage.md
""".stripIndent()
}
// Show help message
if (params.help) {
helpMessage()
exit 0
}
log.info "Longread Proteogenomics - N F ~ version 0.1"
log.info "====================================="
// Header log info
log.info "\nPARAMETERS SUMMARY"
log.info "coding_score_cutoff : ${params.coding_score_cutoff}"
log.info "config : ${params.config}"
log.info "fastq_read_1 : ${params.fastq_read_1}"
log.info "fastq_read_2 : ${params.fastq_read_2}"
log.info "gencode_gtf : ${params.gencode_gtf}"
log.info "gencode_transcript_fasta : ${params.gencode_transcript_fasta}"
log.info "gencode_translation_fasta : ${params.gencode_translation_fasta}"
log.info "genome_fasta : ${params.genome_fasta}"
log.info "hexamer : ${params.hexamer}"
log.info "logit_model : ${params.logit_model}"
log.info "mainScript : ${params.mainScript}"
log.info "mass_spec : ${params.mass_spec}"
log.info "max_cpus : ${params.max_cpus}"
log.info "metamorpheus toml : ${params.metamorpheus_toml}"
log.info "name : ${params.name}"
log.info "normalized_ribo_kallisto : ${params.normalized_ribo_kallisto}"
log.info "outdir : ${params.outdir}"
log.info "primers_fasta : ${params.primers_fasta}"
log.info "rescue and resolve toml : ${params.rescue_resolve_toml}"
log.info "sample_ccs : ${params.sample_ccs}"
log.info "sample_kallisto_tpm : ${params.sample_kallisto_tpm}"
log.info "sqanti classification : ${params.sqanti_classification}"
log.info "sqanti fasta : ${params.sqanti_fasta}"
log.info "sqanti gtf : ${params.sqanti_gtf}"
log.info "star_genome_dir : ${params.star_genome_dir}"
log.info "uniprot_protein_fasta : ${params.uniprot_protein_fasta}"
log.info ""
if (!params.gencode_gtf) exit 1, "Cannot find gtf file for parameter --gencode_gtf: ${params.gencode_gtf}"
ch_gencode_gtf = Channel.value(file(params.gencode_gtf))
if (!params.gencode_transcript_fasta) exit 1, "Cannot find any file for parameter --gencode_transcript_fasta: ${params.gencode_transcript_fasta}"
if (!params.gencode_translation_fasta) exit 1, "Cannot find any file for parameter --gencode_translation_fasta: ${params.gencode_translation_fasta}"
if (!params.uniprot_protein_fasta) exit 1, "Cannot find any file for parameter --uniprot_protein_fasta: ${params.uniprot_protein_fasta}"
if (!params.hexamer) exit 1, "Cannot find headmer file for parameter --hexamer: ${params.hexamer}"
ch_hexamer = Channel.value(file(params.hexamer))
if (!params.logit_model) exit 1, "Cannot find any logit model file for parameter --logit_model: ${params.logit_model}"
if (!params.sample_kallisto_tpm) exit 1, "Cannot find any sample_kallisto_tpm file for parameter --sample_kallisto_tpm: ${params.sample_kallisto_tpm}"
ch_sample_kallisto = Channel.value(file(params.sample_kallisto_tpm))
if (!params.normalized_ribo_kallisto) exit 1, "Cannot find any normalized_ribo_kallisto file for parameter --normalized_ribo_kallisto: ${params.normalized_ribo_kallisto}"
ch_normalized_ribo_kallisto = Channel.value(file(params.normalized_ribo_kallisto))
//if (!params.metamorpheus_toml) exit 1, "Cannot find any file for parameter --metamorpheus_toml: ${params.metamorpheus_toml}"
// if (!params.fastq_read_1) exit 1, "No file found for the parameter --fastq_read_1 at the location ${params.fastq_read_1}"
// if (!params.fastq_read_2) exit 1, "No file found for the parameter --fastq_read_2 at the location ${params.fastq_read_2}"
if (params.mass_spec != false & params.rescue_resolve_toml == false){
exit 1, "Cannot find file for parameter --rescue_resolve_toml: ${params.rescue_resolve_toml}"
} else if (params.mass_spec != false & params.rescue_resolve_toml != false){
ch_rr_toml = Channel.value(file(params.rescue_resolve_toml))
} else{
ch_rr_toml = Channel.from("no mass spec")
}
if (params.gencode_translation_fasta.endsWith('.gz')){
ch_gencode_translation_fasta = Channel.value(file(params.gencode_translation_fasta))
} else {
ch_gencode_translation_fasta_uncompressed = Channel.value(file(params.gencode_translation_fasta))
}
if (params.gencode_transcript_fasta.endsWith('.gz')){
ch_gencode_transcript_fasta = Channel.value(file(params.gencode_transcript_fasta))
} else {
ch_gencode_transcript_fasta_uncompressed = Channel.value(file(params.gencode_transcript_fasta))
}
if (params.genome_fasta.endsWith('.gz')){
ch_genome_fasta = Channel.value(file(params.genome_fasta))
} else {
ch_genome_fasta_uncompressed = Channel.value(file(params.genome_fasta))
}
if (params.uniprot_protein_fasta.endsWith('.gz')){
ch_uniprot_protein_fasta = Channel.value(file(params.uniprot_protein_fasta))
} else {
ch_uniprot_protein_fasta_uncompressed = Channel.value(file(params.uniprot_protein_fasta))
}
if (params.logit_model.endsWith('.gz')) {
ch_logit_model = Channel.value(file(params.logit_model))
} else {
ch_logit_model_uncompressed = Channel.value(file(params.logit_model))
}
if (!params.sqanti_fasta == false) {
if (params.sqanti_fasta.endsWith('.gz')) {
ch_sqanti_fasta = Channel.value(file(params.sqanti_fasta))
} else {
ch_sqanti_fasta_uncompressed = Channel.value(file(params.sqanti_fasta))
}
}
ch_fastq_reads = Channel.from(params.fastq_read_1, params.fastq_read_2).filter(String).flatMap{ files(it) }
// Implements logic for cloud compatibility, NO_TOML_FILE as variable only works for envs with local file system
projectDir = workflow.projectDir
if (!params.metamorpheus_toml) {
log.warn "No toml file specified via --metamorpheus_toml for Metamorpheus, proceeding without"
ch_metamorpheus_toml = Channel.value(file("${projectDir}/assets/NO_TOML_FILE"))
}
if (params.metamorpheus_toml) {
log.warn "Metamorpheus toml file specified: ${params.metamorpheus_toml}"
ch_metamorpheus_toml = Channel.value(file(params.metamorpheus_toml))
}
ch_metamorpheus_toml.into{
ch_metamorpheus_toml_gencode
ch_metamorpheus_toml_uniprot
ch_metamorpheus_toml_pacbio_refined
ch_metamorpheus_toml_pacbio_filtered
ch_metamorpheus_toml_pacbio_hybrid
}
if (!params.mass_spec == false) {
if (!params.mass_spec.endsWith("tar.gz")) {
ch_mass_spec_raw = Channel.fromPath("${params.mass_spec}/*.raw")
ch_mass_spec_mzml = Channel.fromPath("${params.mass_spec}/*.{mzml,mzML}")
} else {
if (params.mass_spec.endsWith("tar.gz")){
ch_mass_spec_raw_mzml_tar_gz = Channel.value(file(params.mass_spec))
}
}
} else {
ch_mass_spec_raw = Channel.from("no mass spec")
ch_mass_spec_mzml = Channel.from("no mass spec")
}
if (!params.star_genome_dir == false) {
if (!params.star_genome_dir.endsWith("tar.gz")) {
ch_genome_dir = Channel.fromPath(params.star_genome_dir, type:'dir')
} else {
if (params.star_genome_dir.endsWith("tar.gz")) {
ch_genome_dir_tar_gz = Channel.fromPath(params.star_genome_dir)
}
}
}
/*--------------------------------------------------
Decompress Logit Model
---------------------------------------------------*/
if (params.logit_model.endsWith('.gz')) {
process gunzip_logit_model {
tag "decompress logit model"
cpus 1
input:
file(logit_model) from ch_logit_model
output:
file("*.RData") into ch_logit_model_uncompressed
script:
"""
gunzip -f ${logit_model}
"""
}
}
/*--------------------------------------------------
Untar & decompress mass spec file
---------------------------------------------------*/
if (params.mass_spec != false) {
if (params.mass_spec.endsWith("tar.gz")) {
process untar_mass_spec {
tag "${raw_mzml_tar_gz}"
cpus 1
input:
file(raw_mzml_tar_gz) from ch_mass_spec_raw_mzml_tar_gz
output:
file("${raw_mzml_tar_gz.simpleName}/*.raw") optional true into ch_mass_spec_raw
file("${raw_mzml_tar_gz.simpleName}/*.{mzml,mzML}") optional true into ch_mass_spec_mzml
script:
"""
tar xvzf $raw_mzml_tar_gz
"""
}
}
}
/*--------------------------------------------------
Decompress gencode translation fasta file
---------------------------------------------------*/
if (params.gencode_translation_fasta.endsWith('.gz')) {
process gunzip_gencode_translation_fasta {
tag "decompress gzipped gencode translation fasta"
cpus 1
input:
file(gencode_translation_fasta) from ch_gencode_translation_fasta
output:
file("*.{fa,fasta}") into ch_gencode_translation_fasta_uncompressed
script:
"""
gunzip -f ${gencode_translation_fasta}
"""
}
}
/*--------------------------------------------------
Decompress gencode transcript fasta file
---------------------------------------------------*/
if (params.gencode_transcript_fasta.endsWith('.gz')) {
process gunzip_gencode_transcript_fasta {
tag "decompress gzipped gencode transcript fasta"
cpus 1
input:
file(gencode_transcript_fasta) from ch_gencode_transcript_fasta
output:
file("*.{fa,fasta}") into ch_gencode_transcript_fasta_uncompressed
script:
"""
gunzip -f ${gencode_transcript_fasta}
"""
}
}
/*--------------------------------------------------
Decompress genome fasta file
---------------------------------------------------*/
if (params.genome_fasta.endsWith('.gz')) {
process gunzip_gencome_fasta {
tag "decompress gzipped genome fasta"
cpus 1
input:
file(genome_fasta) from ch_genome_fasta
output:
file("*.{fa,fasta}") into ch_genome_fasta_uncompressed
script:
"""
gunzip -f ${genome_fasta}
"""
}
}
/*--------------------------------------------------
Decompress uniprot protein fasta file
---------------------------------------------------*/
if (params.uniprot_protein_fasta.endsWith('.gz')) {
process gunzip_uniprot_protein_fasta {
tag "decompress gzipped uniprot protein fasta"
cpus 1
input:
file(uniprot_protein_fasta) from ch_uniprot_protein_fasta
output:
file("*.{fa,fasta}") into ch_uniprot_protein_fasta_uncompressed
script:
"""
gunzip -f ${uniprot_protein_fasta}
"""
}
}
/*--------------------------------------------------
Reference Tables
---------------------------------------------------*/
process generate_reference_tables {
tag "${gencode_gtf}, ${gencode_transcript_fasta}"
cpus 1
publishDir "${params.outdir}/${params.name}/reference_tables/", mode: 'copy'
input:
file(gencode_gtf) from ch_gencode_gtf
file(gencode_transcript_fasta) from ch_gencode_transcript_fasta_uncompressed
output:
file("ensg_gene.tsv") into ch_ensg_gene
file("enst_isoname.tsv") into ch_enst_isoname
file("gene_ensp.tsv") into ch_gene_ensp
file("gene_isoname.tsv") into ch_gene_isoname
file("isoname_lens.tsv") into ch_isoname_lens
file("gene_lens.tsv") into ch_gene_lens
file("protein_coding_genes.txt") into ch_protein_coding_genes
script:
"""
prepare_reference_tables.py \
--gtf $gencode_gtf \
--fa $gencode_transcript_fasta \
--ensg_gene ensg_gene.tsv \
--enst_isoname enst_isoname.tsv \
--gene_ensp gene_ensp.tsv \
--gene_isoname gene_isoname.tsv \
--isoname_lens isoname_lens.tsv \
--gene_lens gene_lens.tsv \
--protein_coding_genes protein_coding_genes.txt
"""
}
// partition channels for use by multiple modules
ch_protein_coding_genes.into{
ch_protein_coding_genes_gencode_fasta
ch_protein_coding_genes_filter_sqanti
}
ch_ensg_gene.into{
ch_ensg_gene_filter
ch_ensg_gene_six_frame
ch_ensg_gene_pclass
}
ch_gene_lens.into{
ch_gene_lens_transcriptome
ch_gene_lens_aggregate
}
ch_gene_isoname.into{
ch_gene_isoname_pep_viz
ch_gene_isoname_pep_analysis
}
/*--------------------------------------------------
GENCODE Database
* Clusters same-protein GENCODE entries
* Output: gencode_protein.fasta
* - gencode fasta file without duplicate entries
* gencode_isoname_clusters.tsv
* - transcript accessions that were clustered
---------------------------------------------------*/
process make_gencode_database {
tag "${gencode_translation_fasta}"
cpus 1
publishDir "${params.outdir}/${params.name}/gencode_db/", mode: 'copy'
input:
file(gencode_translation_fasta) from ch_gencode_translation_fasta_uncompressed
file(protein_coding_genes) from ch_protein_coding_genes_gencode_fasta
output:
file("gencode_protein.fasta") into ch_gencode_protein_fasta
file("gencode_isoname_clusters.tsv")
script:
"""
make_gencode_database.py \
--gencode_fasta $gencode_translation_fasta \
--protein_coding_genes $protein_coding_genes \
--output_fasta gencode_protein.fasta \
--output_cluster gencode_isoname_clusters.tsv
"""
}
ch_gencode_protein_fasta.into{
ch_gencode_protein_fasta_metamorpheus
ch_gencode_protein_fasta_mapping
ch_gencode_protein_fasta_hybrid
ch_gencode_protein_fasta_novel
}
/*--------------------------------------------------
Run IsoSeq3 and SQANTI3 if SQANTI output not provided
---------------------------------------------------*/
if( params.sqanti_classification==false || params.sqanti_fasta==false || params.sqanti_gtf==false ){
if (!params.sample_ccs) exit 1, "Cannot find file for parameter --sample_ccs: ${params.sample_ccs}"
ch_sample_ccs = Channel.value(file(params.sample_ccs))
if (!params.primers_fasta) exit 1, "Cannot find any seq file for parameter --primers_fasta: ${params.primers_fasta}"
ch_primers_fasta = Channel.value(file(params.primers_fasta))
if (!params.genome_fasta) exit 1, "Cannot find any seq file for parameter --genome_fasta: ${params.genome_fasta}"
ch_genome_fasta_uncompressed.into{
ch_genome_fasta_star
ch_genome_fasta_isoseq
ch_genome_fasta_sqanti
}
/*--------------------------------------------------
IsoSeq3
* Runs IsoSeq3 on CCS reads, aligning to genome and
* collapsing redundant reads
* STEPS
* - ensure only qv10 reads from ccs are kept as input
* - find and remove adapters/barcodes
* - filter for non-concatamer, polya containing reads
* - clustering of reads
* - align reads to the genome
* - collapse redundant reads
---------------------------------------------------*/
process isoseq3 {
tag "${sample_ccs}, ${genome_fasta}, ${primers_fasta}"
cpus params.max_cpus
publishDir "${params.outdir}/${params.name}/isoseq3/", mode: 'copy'
input:
file(sample_ccs) from ch_sample_ccs
file(genome_fasta) from ch_genome_fasta_isoseq
file(primers_fasta) from ch_primers_fasta
output:
file("${params.name}.collapsed.gff") into ch_isoseq_gtf
file("${params.name}.collapsed.abundance.txt") into ch_fl_count
file("${params.name}.collapsed.fasta")
file("${params.name}.collapsed.report.json")
file("${params.name}.demult.lima.summary")
file("${params.name}.flnc.bam")
file("${params.name}.flnc.bam.pbi")
file("${params.name}.flnc.filter_summary.json")
script:
"""
# ensure that only qv10 reads from ccs are input
bamtools filter -tag 'rq':'>=0.90' -in $sample_ccs -out filtered.$sample_ccs
# create an index for the ccs bam
pbindex filtered.$sample_ccs
# find and remove adapters/barcodes
lima --isoseq --dump-clips --peek-guess -j ${task.cpus} filtered.$sample_ccs $primers_fasta ${params.name}.demult.bam
# filter for non-concatamer, polya containing reads
isoseq3 refine --require-polya ${params.name}.demult.NEB_5p--NEB_3p.bam $primers_fasta ${params.name}.flnc.bam
# clustering of reads, can only make faster by putting more cores on machine (cannot parallelize)
isoseq3 cluster ${params.name}.flnc.bam ${params.name}.clustered.bam --verbose --use-qvs
# align reads to the genome, takes few minutes (40 core machine)
pbmm2 align $genome_fasta ${params.name}.clustered.hq.bam ${params.name}.aligned.bam --preset ISOSEQ --sort -j ${task.cpus} --log-level INFO
# collapse redundant reads
isoseq3 collapse ${params.name}.aligned.bam ${params.name}.collapsed.gff
"""
}
/*--------------------------------------------------
Untar & Decompress star genome directory
---------------------------------------------------*/
if (params.star_genome_dir != false) {
if (params.star_genome_dir.endsWith("tar.gz")) {
process untar_star_genome_dir {
tag "${genome_dir_tar_gz}"
cpus 1
input:
file(genome_dir_tar_gz) from ch_genome_dir_tar_gz
output:
file("star_genome") into ch_genome_dir
script:
"""
tar xvzf $genome_dir_tar_gz
"""
}
}
}
/*--------------------------------------------------
STAR Alignment
* STAR alignment is run only if sqanti has not been
* previously been run and if fastq (short read RNAseq) files
* have been provided.
* if( params.sqanti_classification==false || params.sqanti_fasta==false || params.sqanti_gtf==false )
* STAR alignment is run if fastq reads are provided
* Junction alignments are fed to SQANTI3 where
* information is used in classificaiton filtering
*
* STEPS
* - generate star genome index (skipped if provided)
* - star read alignment
---------------------------------------------------*/
if(!params.star_genome_dir){
process star_generate_genome{
cpus params.max_cpus
publishDir "${params.outdir}/${params.name}/star_index", mode: "copy"
when:
(params.fastq_read_1 != false | params.fastq_read_2 !=false) & params.star_genome_dir == false
input :
file(gencode_gtf) from ch_gencode_gtf
file(genome_fasta) from ch_genome_fasta_star
output:
path("star_genome") into ch_genome_dir
script:
"""
mkdir star_genome
STAR --runThreadN ${task.cpus} \
--runMode genomeGenerate \
--genomeDir star_genome \
--genomeFastaFiles $genome_fasta \
--sjdbGTFfile $gencode_gtf \
--genomeSAindexNbases 11
"""
}
}
if(params.fastq_read_1 != false | params.fastq_read_2 !=false){
process star_alignment{
cpus params.max_cpus
publishDir "${params.outdir}/${params.name}/star", mode: "copy"
when:
params.fastq_read_1 != false | params.fastq_read_2 !=false
input :
file(fastq_reads) from ch_fastq_reads.collect()
path(genome_dir) from ch_genome_dir
output:
file("*SJ.out.tab") into ch_star_junction
file("*Log.final.out")
script:
"""
STAR --runThreadN ${task.cpus} \
--genomeDir $genome_dir \
--outFileNamePrefix ./${params.name} \
--readFilesIn $fastq_reads \
--readFilesCommand zcat
"""
}
}
else{
Channel
.from('none')
.set{ch_star_junction}
}
/*--------------------------------------------------
SQANTI3
* https://github.com/ConesaLab/SQANTI3
* Corrects any errors in alignment from IsoSeq3 and
* classifies each accession in relation to the reference
* genome
---------------------------------------------------*/
process sqanti3 {
tag "${fl_count}, ${gencode_gtf}, ${gencode_fasta}, ${sample_gtf},"
cpus params.max_cpus
publishDir "${params.outdir}/${params.name}/sqanti3/", mode: 'copy'
input:
file(fl_count) from ch_fl_count
file(gencode_gtf) from ch_gencode_gtf
file(genome_fasta) from ch_genome_fasta_sqanti
file(sample_gtf) from ch_isoseq_gtf
file(star_junction) from ch_star_junction
output:
file("${params.name}_classification.txt") into ch_sample_unfiltered_classification
file("${params.name}_corrected.fasta") into ch_sample_unfiltered_fasta
file("${params.name}_corrected.gtf") into ch_sample_unfiltered_gtf
file("${params.name}_junctions.txt")
file("${params.name}_sqanti_report.pdf")
file("${params.name}.params.txt")
script:
if(params.fastq_read_1 == false & params.fastq_read_2 ==false)
"""
sqanti3_qc.py \
$sample_gtf \
$gencode_gtf \
$genome_fasta \
--skipORF \
-o ${params.name} \
--fl_count $fl_count \
--gtf
"""
else
"""
sqanti3_qc.py \
$sample_gtf \
$gencode_gtf \
$genome_fasta \
--skipORF \
-o ${params.name} \
--fl_count $fl_count \
--gtf \
-c $star_junction
"""
//
}
}
else{
ch_sample_unfiltered_classification = Channel.value(file(params.sqanti_classification))
// ch_sample_unfiltered_fasta = Channel.value(file(params.sqanti_fasta))
ch_sample_unfiltered_fasta = ch_sqanti_fasta_uncompressed
ch_sample_unfiltered_gtf = Channel.value(file(params.sqanti_gtf))
}
/*--------------------------------------------------
Filter SQANTI
* Filter SQANTI results based on several criteria
* - protein coding only
* PB transcript aligns to a GENCODE-annotated protein coding gene.
* - percent A downstream
* perc_A_downstreamTTS : percent of genomic "A"s in the downstream 20 bp window.
* If this number if high (> 80%), the 3' end have arisen from intra-priming during the RT step
* - RTS stage
* RTS_stage: TRUE if one of the junctions could be an RT template switching artifact.
* - Structural Category
* keep only transcripts that have a isoform structural category of:
* -novel_not_in_catalog
* -novel_in_catalog
* -incomplete-splice_match
* -full-splice_match
---------------------------------------------------*/
process filter_sqanti {
publishDir "${params.outdir}/${params.name}/sqanti3-filtered/", mode: 'copy'
input:
file(classification) from ch_sample_unfiltered_classification
file(sample_fasta) from ch_sample_unfiltered_fasta
file(sample_gtf) from ch_sample_unfiltered_gtf
file(protein_coding_genes) from ch_protein_coding_genes_filter_sqanti
file(ensg_gene) from ch_ensg_gene_filter
output:
file("${params.name}_classification.5degfilter.tsv") into ch_sample_classification
file("${params.name}_corrected.5degfilter.fasta") into ch_sample_fasta
file("${params.name}_corrected.5degfilter.gff") into ch_sample_gtf
file("*")
script:
"""
filter_sqanti.py \
--sqanti_classification $classification \
--sqanti_corrected_fasta $sample_fasta \
--sqanti_corrected_gtf $sample_gtf \
--protein_coding_genes $protein_coding_genes \
--ensg_gene $ensg_gene \
--filter_protein_coding yes \
--filter_intra_polyA yes \
--filter_template_switching yes \
--percent_A_downstream_threshold 95 \
--structural_categories_level strict \
--minimum_illumina_coverage 3 \
collapse_isoforms.py \
--name ${params.name} \
--sqanti_gtf filtered_${params.name}_corrected.gtf \
--sqanti_fasta filtered_${params.name}_corrected.fasta
collapse_classification.py \
--name ${params.name} \
--collapsed_fasta ${params.name}_corrected.5degfilter.fasta \
--classification filtered_${params.name}_classification.tsv
"""
}
ch_sample_classification.into{
ch_sample_classification_six_frame
ch_sample_classification_transcriptome
ch_sample_classification_orf
}
ch_sample_fasta.into{
ch_sample_fasta_cpat
ch_sample_fasta_six_frame
ch_sample_fasta_orf
ch_sample_fasta_refine
}
ch_sample_gtf.into{
ch_sample_gtf_orf
ch_sample_gtf_cds
}
/*--------------------------------------------------
Six-Frame Translation
* Generates a fasta file of all possible protein sequences
* derivable from each PacBio transcript, by translating the
* fasta file in all six frames (3+, 3-). This is used to examine
* what peptides could theoretically match the peptides found via
* a mass spectrometry search against GENCODE.
---------------------------------------------------*/
process six_frame_translation {
cpus 1
tag "${classification}, ${ensg_gene}"
publishDir "${params.outdir}/${params.name}/pacbio_6frm_gene_grouped/", mode: 'copy'
input:
file(classification) from ch_sample_classification_six_frame
file(ensg_gene) from ch_ensg_gene_six_frame
file(sample_fasta) from ch_sample_fasta_six_frame
output:
file("${params.name}.6frame.fasta") into ch_six_frame
script:
"""
six_frame_translation.py \
--iso_annot $classification \
--ensg_gene $ensg_gene \
--sample_fasta $sample_fasta \
--output_fasta ${params.name}.6frame.fasta
"""
}
/*--------------------------------------------------
Transcriptome Summary
* Compares the abundance (CPM) based on long-read sequencing
* to the abundances (TPM) inferred from short-read sequencing,
* as computed by Kallisto (analyzed outside of this pipeline).
* Additionally produces a pacbio-gene reference table
---------------------------------------------------*/
process transcriptome_summary {
cpus 1
publishDir "${params.outdir}/${params.name}/transcriptome_summary/", mode: 'copy'
input:
file(sqanti_classification) from ch_sample_classification_transcriptome
file(tpm) from ch_sample_kallisto
file(ribo) from ch_normalized_ribo_kallisto
file(ensg_to_gene) from ch_ensg_gene
file(enst_to_isoname) from ch_enst_isoname
file(len_stats) from ch_gene_lens_transcriptome
output:
file("gene_level_tab.tsv") into ch_gene_level
file("sqanti_isoform_info.tsv") into ch_sqanti_isoform_info
file("pb_gene.tsv") into ch_pb_gene
script:
"""
transcriptome_summary.py \
--sq_out $sqanti_classification \
--tpm $tpm \
--ribo $ribo \
--ensg_to_gene $ensg_to_gene \
--enst_to_isoname $enst_to_isoname \
--len_stats $len_stats
"""
}
ch_pb_gene.into{
ch_pb_gene_orf
ch_pb_gene_cds
ch_pb_gene_peptide_analysis
}
/*--------------------------------------------------
CPAT
* CPAT is a bioinformatics tool to predict an RNA’s coding probability
* based on the RNA sequence characteristics.
* To achieve this goal, CPAT calculates scores of sequence-based features
* from a set of known protein-coding genes and background set of non-coding genes.
* ORF size
* ORF coverage
* Fickett score
* Hexamer usage bias
*
* CPAT will then builds a logistic regression model using these 4 features as
* predictor variables and the “protein-coding status” as the response variable.
* After evaluating the performance and determining the probability cutoff,
* the model can be used to predict new RNA sequences.
*
* https://cpat.readthedocs.io/en/latest/
---------------------------------------------------*/
process cpat {
cpus 1
tag "${hexamer}, ${logit_model}, ${sample_fasta}"
publishDir "${params.outdir}/${params.name}/cpat/", mode: 'copy'
input:
file(hexamer) from ch_hexamer
file(logit_model) from ch_logit_model_uncompressed
file(sample_fasta) from ch_sample_fasta_cpat
output:
file("${params.name}.ORF_prob.tsv") into ch_cpat_all_orfs
file("${params.name}.ORF_prob.best.tsv") into ch_cpat_best_orf
file("${params.name}.ORF_seqs.fa") into ch_cpat_protein_fasta
file("*")
script:
"""
cpat.py \
-x $hexamer \
-d $logit_model \
-g $sample_fasta \
--min-orf=50 \
--top-orf=50 \
-o ${params.name} \
1> ${params.name}_cpat.output \
2> ${params.name}_cpat.error
"""
}
ch_cpat_all_orfs.into{
ch_cpat_all_orfs_for_orf_calling
ch_cpat_all_orfs_for_peptide_analysis
}
ch_cpat_protein_fasta.into{
ch_cpat_protein_fasta_orf_calling
ch_cpat_protein_fasta_peptide_analysis
}
/*--------------------------------------------------
ORF Calling
* Selects the most plausible ORF from each pacbio transcript,
* using the following information
* comparison of ATG start to reference (GENCODE)
* - selects ORF with ATG start matching the one in the reference, if it exists
* coding probability score from CPAT
* number of upstream ATGs for the candidate ORF
* - decrease score as number of upstream ATGs increases
* using sigmoid function
* Additionally provides calling confidence of each ORF called
* - Clear Best ORF : best score and fewest upstream ATGs of all called ORFs
* - Plausible ORF : not clear best, but decent CPAT coding_score (>0.364)
* - Low Quality ORF : low CPAT coding_score (<0.364)
---------------------------------------------------*/
process orf_calling {
tag "${orf_coord}, ${gencode_gtf}, ${sample_gtf}, ${pb_gene}, ${classification}, ${sample_fasta} "
cpus params.max_cpus
publishDir "${params.outdir}/${params.name}/orf_calling/", mode: 'copy'
input:
file(cpat_orfs) from ch_cpat_all_orfs_for_orf_calling
file(cpat_fasta) from ch_cpat_protein_fasta_orf_calling
file(gencode_gtf) from ch_gencode_gtf
file(sample_gtf) from ch_sample_gtf_orf
file(sample_fasta) from ch_sample_fasta_orf
file(pb_gene) from ch_pb_gene_orf
file(classification) from ch_sample_classification_orf
output:
file("${params.name}_best_orf.tsv") into ch_best_orf
script:
"""
orf_calling.py \
--orf_coord $cpat_orfs \
--orf_fasta $cpat_fasta \
--gencode $gencode_gtf \
--sample_gtf $sample_gtf \
--pb_gene $pb_gene \
--classification $classification \
--sample_fasta $sample_fasta \
--num_cores ${task.cpus} \
--output ${params.name}_best_orf.tsv
"""
}
ch_best_orf.into{
ch_best_orf_refine
ch_best_orf_cds
ch_best_orf_sqanti_protein
ch_best_orf_pclass
}
/*--------------------------------------------------
Refined DB Generation
* - Filteres ORF database to only include accessions
* with a CPAT coding score above a threshold (default 0.0)
* - Filters ORFs to only include ORFs that have a stop codon
* - Collapses transcripts that produce the same protein
* into one entry, keeping a base accession (first alphanumeric).
* Abundances of transcripts (CPM) are collapsed during this process.
---------------------------------------------------*/
process refine_orf_database {
cpus 1
tag "${best_orfs}, ${sample_fasta}, ${params.coding_score_cutoff}"
publishDir "${params.outdir}/${params.name}/refined_database/", mode: 'copy'
input:
file(best_orfs) from ch_best_orf_refine
file(sample_fasta) from ch_sample_fasta_refine
output:
// file("*")
file("${params.name}_orf_refined.tsv") into ch_refined_info
file("${params.name}_orf_refined.fasta") into ch_refined_fasta
script:
"""
refine_orf_database.py \
--name ${params.name} \
--orfs $best_orfs \
--pb_fasta $sample_fasta \
--coding_score_cutoff ${params.coding_score_cutoff} \
"""
}
ch_refined_info.into{
ch_refined_info_rename
ch_refined_info_cds
ch_refined_info_pclass
ch_refined_info_aggregate
}
/*--------------------------------------------------
PacBio CDS GTF
* derive a GTF file that includes the ORF regions (as CDS features)
---------------------------------------------------*/
process make_pacbio_cds_gtf {
cpus 1
publishDir "${params.outdir}/${params.name}/pacbio_cds/", mode: 'copy'
input:
file(sample_gtf) from ch_sample_gtf_cds
file(refined_info) from ch_refined_info_cds
file(called_orfs) from ch_best_orf_cds
file(pb_gene) from ch_pb_gene_cds
output:
file("${params.name}_with_cds.gtf") into ch_pb_cds
file("*")
script:
"""
make_pacbio_cds_gtf.py \
--name ${params.name} \
--sample_gtf $sample_gtf \
--refined_database $refined_info \
--called_orfs $called_orfs \
--pb_gene $pb_gene \
--include_transcript yes
make_pacbio_cds_gtf.py \
--name ${params.name}_no_transcript \
--sample_gtf $sample_gtf \
--refined_database $refined_info \
--called_orfs $called_orfs \
--pb_gene $pb_gene \