-
Notifications
You must be signed in to change notification settings - Fork 4
/
get_SNPs.pl
executable file
·810 lines (708 loc) · 28.3 KB
/
get_SNPs.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
#!/usr/bin/env perl
## Pombert JF, Illinois Tech - 2020
my $version = '2.0f';
my $name = 'get_SNPs.pl';
my $updated = '2023-05-03';
use strict;
use warnings;
use File::Basename;
use File::Path qw(make_path);
use Getopt::Long qw(GetOptions);
#########################################################################
### Command line options
#########################################################################
my $hint = "Type get_SNPs.pl -h (--help) for list of options\n";
my $usage = <<"USAGE";
NAME ${name}
VERSION ${version}
UPDATED ${updated}
SYNOPSIS Automates read-mapping of FASTQ datasets against reference sequence(s) and
performs variant calling (if desired)
USAGE ${name} [options]
EXAMPLES:
simple ${name} -fa *.fasta -fq *.fastq -o RESULTS
advanced ${name} --fasta *.fasta --fastq *.fastq --mapper bowtie2 --caller varscan2 --type both --var ./VarScan.v2.4.3.jar --threads 16
paired ends ${name} --fasta *.fasta --pe1 *R1.fastq --pe2 *R2.fastq --X 1000 --mapper bowtie2 --caller bcftools --threads 16
read-mapping only ${name} --fasta *.fasta --pe1 *R1.fastq --pe2 *R2.fastq --X 1000 --mapper bowtie2 --rmo --bam --threads 16
USAGE
unless (@ARGV){
print "\n$usage\n$hint\n";
exit(0);
};
my $options = <<'END_OPTIONS';
OPTIONS:
-h (--help) Display this list of options
-v (--version) Display script version
-o (--outdir) Output directory [Default: ./]
# Mapping options
-fa (--fasta) Reference genome(s) in fasta file
-fq (--fastq) Fastq reads (single ends) to be mapped against reference(s)
-pe1 Fastq reads #1 (paired ends) to be mapped against reference(s)
-pe2 Fastq reads #2 (paired ends) to be mapped against reference(s)
-mapper Read mapping tool: bowtie2, minimap2, winnowmap, ngmlr or hisat2 [default: minimap2]
-threads Number of processing threads [default: 16]
-mem Max total memory for samtools (in Gb) [default: 16] ## mem/threads = memory per thread
-bam Keeps BAM files generated
-idx (--index) Type of bam index generated (bai or csi) [default = csi] ## .bai not compatible with -mem
-sam Keeps SAM files generated; SAM files can be quite large
-rmo (--read_mapping_only) Do not perform variant calling; useful when only interested in bam/sam files and/or mapping stats
-ns (--no_stats) Do not calculate stats; stats can take a while to compute for large eukaryote genomes
# Mapper-specific options
-preset MINIMAP2/Winnowmap - Preset: sr, map-ont, map-pb or asm20 [default: sr]
-X BOWTIE2 - Maximum paired ends insert size [default: 750]
# Variant calling options
-caller Variant caller: varscan2 or bcftools [default: varscan2]
-type Type of variant called: snp, indel, or both [default: snp]
-ploidy BCFtools ploidy option [default: 1]
-B (--noBAQ) Disables Samtools base alignment quality (BAQ) computation - http://www.htslib.org/doc/samtools-mpileup.html
## Can cause problem with long reads
# VarScan2 parameters (see http://dkoboldt.github.io/varscan/using-varscan.html)
-var Which varscan jar file to use [default: VarScan.v2.4.6.jar]
-mc (--min-coverage) Minimum read depth at a position to make a call [default: 15]
-mr (--min-reads2) Minimum supporting reads at a position to call variants [default: 15]
-maq (--min-avg-qual) Minimum base quality at a position to count a read [default: 28]
-mvf (--min-var-freq) Minimum variant allele frequency threshold [default: 0.7]
-mhom (--min-freq-for-hom) Minimum frequency to call homozygote [default: 0.75]
-pv (--p-value) P-value threshold for calling variants [default: 1e-02]
-sf (--strand-filter) Strand filter: 0 or 1 [default: 0] ## 1 ignores variants with >90% support on one strand
END_OPTIONS
my @command = @ARGV;
my $help ='';
my $vn;
my $outdir = './';
## Mapping
my $mapper = 'minimap2';
my $threads = 16;
my $mem = 16;
my $index = 'csi';
my $sam = '';
my $bam = '';
my @fasta;
my @fastq;
my @pe1;
my @pe2;
my $rmo;
my $nostat;
## Mapper-specific
my $preset = 'sr';
my $maxins = '750';
## Variant calling
my $caller = 'varscan2';
my $type = 'snp';
my $ploidy = 1;
my $nobaq;
## VarScan-specific
my $varjar = 'VarScan.v2.4.6.jar';
my $mc = 15;
my $mr = 15;
my $maq = 28;
my $mvf = '0.7';
my $mhom = '0.75';
my $pv = '1e-02';
my $sf = 0;
GetOptions(
'h|help' => \$help,
'v|version', => \$vn,
'o|outdir=s' => \$outdir,
## Mapping
'mapper=s' => \$mapper,
'threads=i' => \$threads,
'mem=i' => \$mem,
'idx|index=s' => \$index,
'sam' => \$sam,
'bam' => \$bam,
'fa|fasta=s@{1,}' => \@fasta,
'fq|fastq=s@{1,}' => \@fastq,
'pe1=s@{1,}' => \@pe1,
'pe2=s@{1,}' => \@pe2,
'rmo|read_mapping_only' => \$rmo,
'ns|no_stats' => \$nostat,
## Mapper-specific
'preset=s' => \$preset,
'X=i' => \$maxins,
## Variant calling
'caller=s' => \$caller,
'type=s' => \$type,
'ploidy=i' => \$ploidy,
'B|noBAQ' => \$nobaq,
## VarScan-specific
'var=s' => \$varjar,
'mc|min-coverage=i' => \$mc,
'mr|min-reads2=i' => \$mr,
'maq|min-avg-qual=i' => \$maq,
'mvf|min-var-freq=s' => \$mvf,
'mhom|min-freq-for-hom=s' => \$mhom,
'pv|p-value=s' => \$pv,
'sf|strand-filter=i' => \$sf,
);
#########################################################################
### Help
#########################################################################
if ($help){
print "\n";
print $usage."\n";
print "$options\n";
}
#########################################################################
### Version
#########################################################################
if ($vn){
print "\n";
print "Script: $name\n";
print "Version: $version\n";
print "Updated: $updated\n\n";
exit(0);
}
#########################################################################
### Program checks for samtools, read mappers and variant callers
#########################################################################
chkinstall('samtools');
if ($mapper =~ /bowtie2|hisat2|minimap2|winnowmap|ngmlr/){
chkinstall($mapper);
}
else {
print "\n";
print "[E] Mapper option $mapper is unrecognized.\n";
print "[E] Please use bowtie2, hisat2, minimap2, winnowmap or ngmlr...\n\n";
exit(1);
}
unless ($rmo){
if ($caller =~ /bcftools/){
chkinstall($caller);
}
elsif ($caller eq 'varscan2'){
unless (-f $varjar){
die "Cannot find varscan jar file: $varjar\n";
}
}
else {
print "\n";
print "[E] Variant caller option $caller is unrecognized.\n";
print "[E] Please use varscan2 or bcftools...\n\n";
exit(1);
}
}
## Option checks
if (scalar@fasta == 0) {
die "\nPlease enter at least one FASTA reference before proceeding...\n\n";
}
if ((scalar@fastq == 0) && (scalar@pe1 == 0)) {
die "\nPlease enter at least one FASTQ dataset before proceeding...\n\n";
}
## Setting the caller variable to rmo (read-mapping only); doesn't make sense to
## keep the default variant caller label for file names if -rmo is enabled...
if ($rmo){
$caller = 'rmo';
}
###################################################################################################
### Output directory
###################################################################################################
unless (-d $outdir){
make_path( $outdir, { mode => 0755 } ) or die "Can't create output directory $outdir: $!\n";
}
###################################################################################################
### Timestamps and logs
###################################################################################################
my $start = localtime();
my $tstart = time;
my $todo = 0;
if (@fastq){
$todo += scalar(@fastq) * scalar(@fasta);
}
if (@pe1 && @pe2){
$todo += scalar(@pe1) * scalar(@fasta);
}
my $map_log = "${outdir}/mapping.$mapper.log";
my $time_log = "${outdir}/time.$mapper.$caller.log";
open MAP, '>>', $map_log or die "Can't create file $map_log: $!\n";
open LOG, '>', $time_log or die "Can't create file $time_log.log: $!\n";
print MAP "COMMAND LINE:\n${name} @command\n";
print LOG "Mapping/SNP calling started on: $start\n";
print LOG "A total of $todo pairwise comparisons will be performed\n";
print_options();
my $comparison = 0;
#########################################################################
### Read mapping/SNP calling
#########################################################################
my $fasta;
my $fastq;
my $file;
my $fa;
my $dir;
my $qdir;
my $flagstat;
## Creating indexes and/or kmer frequencies
foreach (@fasta){
$fasta = $_;
($fa, $dir) = fileparse($fasta);
unless (-f $fasta) {
die "\nFASTA file named $fasta not found. Please check your command line...\n\n";
}
if ($mapper eq 'bowtie2'){
system ("bowtie2-build --threads $threads $_ $fa.bt2") == 0 or checksig();
}
elsif ($mapper eq 'hisat2'){
system ("hisat2-build $_ $fa.ht") == 0 or checksig();
}
elsif ($mapper eq 'winnowmap'){
my $winnow_dir = "$outdir/winnowmap.files";
unless (-d $winnow_dir) {
mkdir ($winnow_dir,0755) or die "Can't create $winnow_dir: $!\n";
}
system ("meryl count k=15 output $winnow_dir/merylDB $fasta") == 0 or checksig();
system ("meryl print greater-than distinct=0.9998 merylDB > $winnow_dir/$fa.repetitive_k15.txt") == 0 or checksig();
}
}
my $index_time = time - $tstart;
print LOG "Time required to create all indexes: $index_time seconds\n";
## Read mapping files
my $log_file = "${outdir}/mapping.$mapper.log";
my $sam_file;
my $bam_file;
## Single end read-mapping
if (@fastq){
foreach (@fastq){
$fastq = $_;
($file, $qdir) = fileparse($fastq);
unless (-f $fastq) {
die "\nFASTQ file named $fastq not found. Please check your command line...\n\n";
}
print "\n## FASTQ information:\n";
print "FASTQ parsed as: $file\n";
print "FASTQ input DIR parsed as: $qdir\n";
foreach (@fasta){
$fasta = $_;
($fa, $dir) = fileparse($fasta);
$sam_file = "${outdir}/$file.$fa.$mapper.sam";
$bam_file = "${outdir}/$file.$fa.$mapper.bam";
print "\n## FASTA information:\n";
print "FASTA parsed as: $fa\n";
print "FASTA input DIR parsed as: $dir\n\n";
print "Mapping $fastq on $fasta with $mapper...\n";
my $mstart = localtime();
print MAP "\n".'###'." Mapping started on $mstart\n";
print MAP "\n$fastq vs. $fasta\n";
## Read-mapping
if ($mapper eq 'bowtie2'){
system ("bowtie2 \\
--rg-id $fastq \\
--rg SM:$fasta \\
-p $threads \\
-x $fa.bt2 \\
-U $fastq \\
-S $sam_file \\
2>> $log_file") == 0 or checksig();
}
elsif ($mapper eq 'minimap2'){ #-R \@RG\\\\tID:$fastq\\\\tSM:$fasta
system ("minimap2 \\
-t $threads \\
--MD \\
-R \@RG\\\\tID:$fastq\\\\tSM:$fasta \\
-L \\
-ax $preset \\
$fasta \\
$fastq \\
1> $sam_file \\
2>> $log_file") == 0 or checksig();
}
elsif ($mapper eq 'winnowmap'){
system ("winnowmap \\
-t $threads \\
-W $outdir/winnowmap.files/$fa.repetitive_k15.txt \\
--MD \\
-R \@RG\\\\tID:$fastq\\\\tSM:$fasta \\
-L \\
-ax $preset \\
$fasta \\
$fastq \\
1> $sam_file \\
2>> $log_file") == 0 or checksig();
}
elsif ($mapper eq 'ngmlr'){
system ("ngmlr \\
-t $threads \\
--rg-id $fastq \\
--rg-sm $fasta \\
-r $fasta \\
-q $fastq \\
-o $sam_file \\
2>&1 | tee $log_file") == 0 or checksig();
}
elsif ($mapper eq 'hisat2'){
system ("hisat2 \\
-p $threads \\
--phred33 \\
--rg-id $fastq \\
--rg SM:$fasta \\
-x $fa.ht \\
-U $fastq \\
--no-spliced-alignment \\
-S $sam_file \\
2>> $log_file") == 0 or checksig();
}
samtools(); ## Converting to BAM
unless ($rmo){variant();} ## Calling variants
unless ($nostat){stats();} ## Printing stats
logs();
## Cleaning SAM/BAM files
unless ($bam) {system "rm $bam_file $bam_file.$index";}
unless ($sam) {system "rm $sam_file";}
}
}
}
## Paired ends read mapping
if (@pe1 && @pe2){
while (my $pe1 = shift@pe1){
my $pe2 = shift@pe2;
unless (-f $pe1) { die "\nFASTQ PE1 file named $pe1 not found. Please check your command line...\n\n"; }
unless (-f $pe2) { die "\nFASTQ PE2 file named $pe2 not found. Please check your command line...\n\n"; }
($file, $qdir) = fileparse($pe1);
my $r2 = fileparse($pe2);
print "\n## FASTQ information:\n";
print "R1 FASTQ parsed as: $file\n";
print "R2 FASTQ parsed as: $r2\n";
print "FASTQ input DIR parsed as: $qdir\n";
foreach (@fasta){
$fasta = $_;
($fa, $dir) = fileparse($fasta);
$sam_file = "${outdir}/$file.$fa.$mapper.sam";
$bam_file = "${outdir}/$file.$fa.$mapper.bam";
print "\n## FASTA information:\n";
print "FASTA parsed as: $fa\n";
print "FASTA input DIR parsed as: $dir\n\n";
print "Mapping $pe1 and $pe2 on $fasta with $mapper...\n";
my $mstart = localtime();
print MAP "\n".'###'." Mapping started on $mstart\n";
print MAP "\n$pe1 + $pe2 vs. $fasta\n";
## Read mapping
if ($mapper eq 'bowtie2'){
system ("bowtie2 \\
--rg-id $pe1 \\
--rg SM:$fasta \\
-p $threads \\
-x $fa.bt2 \\
-X $maxins \\
-1 $pe1 \\
-2 $pe2 \\
-S $sam_file \\
2>> $log_file") == 0 or checksig();
}
elsif ($mapper eq 'hisat2'){
system ("hisat2 \\
-p $threads \\
--phred33 \\
--rg-id $pe1 \\
--rg SM:$fasta \\
-x $fa.ht \\
-1 $pe1 \\
-2 $pe2 \\
--no-spliced-alignment \\
-S $sam_file \\
2>> $log_file") == 0 or checksig();
}
elsif ($mapper eq 'minimap2'){
system ("minimap2 \\
-t $threads \\
-R \@RG\\\\tID:$pe1\\\\tSM:$fasta \\
-ax $preset \\
$fasta \\
$pe1 \\
$pe2 \\
1> $sam_file \\
2>> $log_file") == 0 or checksig();
}
samtools(); ## Converting to BAM
unless ($rmo){variant();} ## Calling variants
unless ($nostat){stats();} ## Printing stats
logs();
## Cleaning SAM/BAM files
unless ($bam) {system "rm $bam_file $bam_file.$index";}
unless ($sam) {system "rm $sam_file";}
}
}
}
## Cleaning up
if ($mapper =~ /bowtie2|hisat2|ngmlr/){
mkdir ("${outdir}/$mapper.indexes",0755);
}
if ($mapper eq 'bowtie2'){
system "mv ${outdir}/*.bt2 ${outdir}/*.fai ${outdir}/$mapper.indexes/";
}
elsif ($mapper eq 'hisat2'){
system "mv ${outdir}/*.ht2 ${outdir}/*.fai ${outdir}/$mapper.indexes/";
}
elsif ($mapper eq 'ngmlr'){
system "mv ${outdir}/*.ngm ${outdir}/$mapper.indexes/";
}
if ($bam){
mkdir ("${outdir}/$mapper.BAM",0755);
system "mv ${outdir}/*.bam ${outdir}/*.$index ${outdir}/$mapper.BAM/";
}
if ($sam){
mkdir ("${outdir}/$mapper.SAM",0755);
system "mv ${outdir}/*.sam ${outdir}/$mapper.SAM/";
}
unless ($rmo) {
mkdir ("${outdir}/$mapper.$caller.VCFs",0755);
system "mv ${outdir}/*.vcf ${outdir}/$mapper.$caller.VCFs/";
}
unless ($nostat){
mkdir ("${outdir}/$mapper.$caller.depth",0755);
mkdir ("${outdir}/$mapper.$caller.stats",0755);
system "mv ${outdir}/*.$mapper.depth ${outdir}/$mapper.$caller.depth/";
system "mv ${outdir}/*.$mapper.$type.stats ${outdir}/$mapper.$caller.stats/";
}
mkdir ("${outdir}/$mapper.$caller.coverage",0755);
system "mv ${outdir}/*.$mapper.coverage ${outdir}/$mapper.$caller.coverage/";
## Printing timestamps and logs
my $end = localtime();
my $time_taken = time - $tstart;
my $average_time = sprintf("%.1f", $time_taken/$comparison);
print "\nMapping/SNP calling started on: $start\n";
print "Mapping/SNP calling ended on: $end\n";
print "Time elapsed: $time_taken seconds\n";
print LOG "Mapping/SNP calling ended on: $end\n";
print LOG "Time elapsed: $time_taken seconds\n";
print LOG "Average time per pairwise comparison: $average_time seconds\n";
#########################################################################
### Subroutines
#########################################################################
# sub to check if programs are installed
sub chkinstall {
my $exe = $_[0];
my $prog = `echo \$(command -v $exe)`;
chomp $prog;
if ($prog eq ''){die "\nERROR: Cannot find $exe. Please install $exe in your \$PATH\n\n";}
}
# sub to check for SIGINT and SIGTERM
sub checksig {
my $exit_code = $?;
my $modulo = $exit_code % 255;
print "\nExit code = $exit_code; modulo = $modulo \n";
if ($modulo == 2) {
print "\nSIGINT detected: Ctrl+C => exiting...\n";
exit(2);
}
elsif ($modulo == 131) {
print "\nSIGTERM detected: Ctrl+\\ => exiting...\n";
exit(131);
}
}
# sub to run convert SAM files to binary (BAM) format with samtools
sub samtools {
my $coverage_file = "${outdir}/$file.$fa.$mapper.coverage";
my $sammem = int(($mem/$threads)*1024);
print "Running samtools on $sam_file...\n";
print "Using $sammem Mb per thread for samtools\n";
system "samtools view -@ $threads -bS $sam_file -o ${outdir}/tmp.bam";
system "samtools sort -@ $threads -m ${sammem}M -o $bam_file ${outdir}/tmp.bam";
if($index eq 'bai') { ## Can't use the memory -m CMD line switch with -b / .bai indexes
print "samtools index (-b) not compatible with (-m); ";
print "running samtools index with default memory settings...\n";
system "samtools index -b -@ $threads $bam_file";
}
else { ## .csi indexes; work with -m
system "samtools index -c -@ $threads -m ${sammem}M $bam_file";
}
system "samtools depth -aa $bam_file > $coverage_file"; ## Printing per base coverage information
system "rm ${outdir}/tmp.bam"; ## Discarding unsorted temporary BAM file
$flagstat = `samtools flagstat $bam_file`;
}
# sub to run the variant calling process
sub variant {
(my $passQC) = ($flagstat =~ /(\d+)\s+\+\s+\d+ in total/s);
print "\nQC-passed reads mapping to the genome = $passQC\n\n";
my $vcf_file = "${outdir}/$file.$fa.$mapper.$type.vcf";
## Checking if BAM file is empty
if ($passQC == 0){
print "No QC-passed reads mapped to the genome; skipping variants calling\n\n";
open VCF, ">", "$vcf_file" or die "Can't create file $vcf_file: $!\n";
print VCF '## No QC-passed reads mapped to the genome'."\n";
close VCF;
}
## If not empty, proceed; empty BAM files create isssues when piping mpileup
## to some variant callers (e.g. VarScan2).
else{
print "Calling variants with $caller on $fasta...\n\n";
my $baq_flag = '';
if ($nobaq){
$baq_flag = '-B';
}
if ($caller eq 'varscan2'){
my $vflag = '';
my $cns;
if (($type eq 'snp')||($type eq 'indel')){ $cns = $type; }
elsif ($type eq 'both'){
$cns = 'cns';
$vflag = '--variants';
}
else {die "\nERROR: Unrecognized variant type. Please use: snp, indel, or both\n\n";}
system "samtools \\
mpileup \\
$baq_flag \\
-f $fasta \\
$bam_file \\
| \\
java -jar $varjar \\
mpileup2$cns \\
--min-coverage $mc \\
--min-reads2 $mr \\
--min-avg-qual $maq \\
--min-var-freq $mvf \\
--min-freq-for-hom $mhom \\
--p-value $pv \\
--strand-filter $sf \\
$vflag \\
--output-vcf \\
> $vcf_file";
}
elsif ($caller eq 'bcftools'){ ## tested with 1.10.2
unless ($type =~ /snp|indel|both/){ die "\nERROR: Unrecognized variant type. Please use: snp, indel, or both\n\n"; }
my $varV = '';
if ($type eq 'snp'){ $varV = '-V indels'; }
elsif ($type eq 'indel') { $varV = '-V snps'; }
system "bcftools \\
mpileup \\
$baq_flag \\
-f $fasta \\
$bam_file \\
| \\
bcftools \\
call \\
-vmO v \\
$varV \\
--ploidy $ploidy \\
--output $vcf_file";
}
}
}
## Sub to calculate read mapping stats/metrics
sub stats {
my $run_time = time - $tstart; my $mend = localtime();
my $coverage_file = "${outdir}/$file.$fa.$mapper.coverage";
my $stats_file = "${outdir}/$file.$fa.$mapper.$type.stats";
my $depth_file = "${outdir}/$file.$fa.$mapper.depth";
my $vcf_file = "${outdir}/$file.$fa.$mapper.$type.vcf";
open COV, "<", "$coverage_file" or die "Can't read file $coverage_file: $!\n";
open STATS, ">", "$stats_file" or die "Can't create file $stats_file: $!\n";
open DEPTH, ">", "$depth_file" or die "Can't create file $depth_file: $!\n";
unless ($rmo){open SN, "<", "$vcf_file" or die "Can't read file $vcf_file.vcf: $!\n";}
print "\nCalculating stats...\n";
print DEPTH "Contig\tAverage depth\tAverage (all contigs)\tDifference\tRatio Contig\/Average\n";
my $total = 0; my $covered = 0; my $nocov = 0; my $max = 0; my $sumcov; my $sn =0;
## Run only if variant calling has been performed
unless ($rmo){
while (my $line = <SN>){
if ($line =~ /^#/){next;}
else {$sn++;}
}
}
## Checks for sequencing depth
my %data = (); my @contigs = ();
while (my $line = <COV>){
chomp $line;
$total++;
if ($line =~ /^(\S+)\s+(\d+)\s+(\d+)/){
my $contig = $1; my $position = $2; my $coverage = $3;
$sumcov += $coverage;
if ($coverage >= 1) {
$covered++;
if ($coverage > $max){$max = $coverage;}
}
else {$nocov++;}
if (exists $data{$contig}){
$data{$contig}[0] += 1; ## Keeping track of contig size
$data{$contig}[1] += $coverage; ## Keeping track of cumulative sequencing depth
}
else{
$data{$contig}[0] = 1; ## Initializing new contig
$data{$contig}[1] += $coverage; ## Keeping track of cumulative sequencing depth
push (@contigs, $contig);
}
}
}
## In case the output of samtools depth -aa generates a blank file (if no read maps to the reference)
my $avc; if ($total > 0){$avc = sprintf("%.2f", ($sumcov/$total));} else{$avc = 0;}
## Printing sequencing depths per contig
while (my $tmp = shift@contigs){
my $average = ($data{$tmp}[1]/$data{$tmp}[0]);
$average = sprintf("%.2f", $average);
my $diff = $average - $avc; $diff = sprintf("%.2f", $diff);
my $ratio;
## Preventing possible division by zero
if ($avc > 0){$ratio = $average/$avc; $ratio = sprintf("%.2f", $ratio);}
else{$ratio = 0;}
print DEPTH "$tmp\t$average\t$avc\t$diff\t$ratio\n";
}
print STATS "FASTQ file(s) used: $file (and mate, if PE)\n";
print STATS "Reference fasta file used: $fasta\n";
if ($total > 0){
print STATS "\nTotal number of bases in the reference genome\t$total bp\n";
print STATS "Number of bases covered by at least one read\t$covered\n";
print STATS "Number of bases without coverage\t$nocov\n";
print STATS "Maximum sequencing depth\t$max"."X\n";
print STATS "Average sequencing depth\t$avc"."X\n";
if ($total == $covered){print STATS "Sequencing breadth (percentage of bases covered by at least one read)\t100%\n";}
if ($total != $covered){my $av_cov = sprintf("%.2f%%", ($covered/$total)*100);
print STATS "Sequencing breadth (percentage of bases covered by at least one read)\t$av_cov\n";}
}
else{
print STATS "\nNo read was found to map to the reference: ";
print STATS "the output of samtools depth -aa ($coverage_file) is blank.\n";
}
## Execute only if variant calling has been performed
unless ($rmo){
my $snkb = sprintf("%.2f", ($sn/$covered)*1000);
if ($type eq 'both'){
print STATS "Total number of SNPs + indels found: $sn\n";
print STATS "Average number of SNPs + indels per Kb: $snkb\n";
}
elsif ($type eq 'snp') {
print STATS "Total number of SNPs found: $sn\n";
print STATS "Average number of SNPs per Kb: $snkb\n";
}
elsif ($type eq 'indel') {
print STATS "Total number of indels found: $sn\n";
print STATS "Average number of indels per Kb: $snkb\n";
}
}
print STATS "\n## SAMTOOLS flagstat metrics\n";
print STATS "$flagstat\n";
close STATS;
print "Time to calculate stats: $run_time seconds\n";
}
## LOG subroutines
sub logs {
my $run_time = time - $tstart; my $mend = localtime();
$comparison++;
print LOG "Comparison # $comparison : $file (and mate, if PE) vs. $fasta - cumulative time elapsed: $run_time seconds\n";
print MAP "\n".'###'." Mapping ended on $mend\n\n";
}
sub print_options {
print MAP "\nOPTIONS:\n\n";
print MAP "get_SNP.pl version: $version\n";
print MAP "Number of threads: $threads\n";
print MAP "Read mapper: $mapper\n";
if ($mapper eq 'bowtie'){print MAP "Max insert size for bowtie: $maxins nt\n";}
if ($rmo){print MAP "Read-mapping only requested, skipping variant calling.\n";}
if ($bam){print MAP "Keeping BAM files are requested.\n";}
if ($sam){print MAP "Keeping SAM files are requested.\n";}
if ($nostat){print MAP "Skipping stats calculations.\n";}
unless ($rmo){
print MAP "Variant caller: $caller\n";
if ($caller eq 'varscan2'){
print MAP " Varscan jar file used: $varjar\n";
print MAP " Minimum read depth at a position to make a call: $mc\n";
print MAP " Minimum supporting reads at a position to call variants: $mr\n";
print MAP " Minimum base quality at a position to count a read: $maq\n";
print MAP " Minimum variant allele frequency threshold: $mvf\n";
print MAP " Minimum frequency to call homozygote: $mhom\n";
print MAP " P-value threshold for calling variants: $pv\n";
print MAP " Strand filter: $sf\t\#\# 1 ignores variants with >90% support on one strand\n";
}
if ($caller eq "bcftools"){print MAP "Setting ploidy to: $ploidy\n";}
if ($type eq 'snp'){print MAP "Searching for SNPs...\n";}
elsif ($type eq 'indel') {print MAP "Searching for indels...\n";}
elsif ($type eq 'both') {print MAP "Searching for SNPs and indels...\n";}
}
}