-
Notifications
You must be signed in to change notification settings - Fork 18
/
gatk_split_align
executable file
·124 lines (104 loc) · 2.27 KB
/
gatk_split_align
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/perl
#Programmer: Nicholas Hazekamp
#Date: 6/10/2014
use strict;
use integer;
use Symbol;
my $numargs = $#ARGV + 1;
if ($numargs != 3) {
print STDERR "Usage: perl gran_split_align <number of reads> <reference file> <sam file>\n";
exit 1;
}
my $num_reads = shift;
my $ref_file = shift;
my $sam_file = shift;
my $ref = substr $ref_file, 0, -3;
my $sam = substr $sam_file, 0, -4;
my @ref_limits;
my @files;
my $i = 0;
my $position = 0;
my $read_count = 0;
my $num_outputs = 0;
#Open input file
open(INPUT, $ref_file);
open (OUTPUT,">$ref.$num_outputs.fa");
push @ref_limits, 0;
my $file = gensym;
open($file, ">$sam.$num_outputs.sam");
push @files, $file;
my $last_loc;
while (my $line = <INPUT>) {
chomp $line;
#FASTQ files begin sequence with '@' character
#If line begins with '@' then it is a new sequence
if ($line =~ /^>/){
(my $loc) = $line =~ m/loc(\d+)/;
#Check if the new sequence should be placed in a new file, otherwise place it in same file
if ($read_count == $num_reads){
close(OUTPUT);
$num_outputs++;
$read_count = 1;
open(OUTPUT, ">$ref.$num_outputs.fa");
push @ref_limits, $loc;
$file = gensym;
open($file, ">$sam.$num_outputs.sam");
push @files, $file;
print OUTPUT "$line\n";
print "HELP $num_outputs\n";
}
else{
$last_loc = $loc;
print OUTPUT "$line\n";
$read_count++;
}
}
#place all other lines in FASTQ file under same sequence
else {
print OUTPUT "$line\n";
}
}
push @ref_limits, $last_loc+1;
close(OUTPUT);
close(INPUT);
my $output;
print "START SAM\n";
#Open input file
open(INPUT, $sam_file);
my $in_header = 1;
while (my $line = <INPUT>) {
chomp $line;
(my $loc) = $line =~ /loc(\d+)/;
for( $i = 0; $i <= $num_outputs; $i++ ){
if($loc >= $ref_limits[$i]){
$position = $i;
}
}
if($loc eq ""){
if(not $line =~ /index/){
foreach $output (@files) {
print $output "$line\n";
}
}
}
elsif ($line =~ /^\@SQ/){
if($in_header eq 0){
print "\@SQ found after end of header.\n";
exit;
}
$output = $files[$position];
print $output "$line\n";
}
#place all other lines in FASTQ file under same sequence
else {
if($in_header == 1){
$in_header == 0;
}
$output = $files[$position];
print $output "$line\n";
}
}
close(INPUT);
foreach $output (@files) {
close($output);
}