-
Notifications
You must be signed in to change notification settings - Fork 7
/
make-SIFT-db-all.pl
162 lines (123 loc) · 5.3 KB
/
make-SIFT-db-all.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
#!/usr/bin/perl -w
# create the SIFT databases, must be run in scripts directory because it's lookin for metadocs
use strict;
#require 'common-utils.pl';
#use Getopt::Std;
use File::Basename;
use Cwd qw(abs_path);
my $directory_of_script = dirname(abs_path(__FILE__));
require $directory_of_script . '/common-utils.pl';
use Getopt::Long qw (GetOptions);
my %options = ();
#getopts ("mhf:", \%options);
my $help = "";
my $ensembl_download = "";
my $metafile = "";
GetOptions ('help|h' => \$help,
'ensembl_download' => \$ensembl_download,
'config=s' => \$metafile );
if ($help || !$metafile || !$metafile) {
print "make-SIFT-db-all.pl\n";
print " -h help\n";
print " -config [required config file]\n";
print " -ensembl_download \n";
}
my $meta_href = readMeta($metafile);
my %meta_hash = %{$meta_href};
make_dir ($meta_hash{"PARENT_DIR"});
make_dir ($meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"ORG_VERSION"});
for my $key (keys %meta_hash) {
my $value = $meta_hash{$key};
# print $value . "\n";
if ($key =~ /_DIR$/ && $key ne "PARENT_DIR") {
# print "making $value\n";
my $dir = $meta_hash{"PARENT_DIR"} . "/" . $value;
if (! -d $dir) { mkdir($dir, 0775) or die $dir . $!; }
}
}
my $siftscore_dir = $meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"SINGLE_REC_WITH_SIFTSCORE_DIR"};
if (! -d $siftscore_dir) { mkdir ($siftscore_dir, 0775); }
if ($ensembl_download) {
# not manual, this is from Ensembl and needs to download
print "downloading gene annotation\n";
`perl download-annotation-srcs.pl $metafile`;
print "done downloading gene annotation\n";
print "downloading fasta files\n";
`perl download-fasta.pl $metafile`;
print "done downloading DNA fasta sequences";
print "download dbSNP files\n";
`perl download-dbSNP-files.pl $metafile`;
if (exists ($meta_hash{"DBSNP_VCF_FILE"}) && $meta_hash{"DBSNP_VCF_FILE"} ne "")
{
print "splitting dbSNP file\n";
`perl split-dbSNP-by-chr.pl $metafile`;
}
} # end if downloading files from Ensembl
print "converting gene format to use-able input\n";
`perl gff_gene_format_to_ucsc.pl $metafile`;
#`perl ensembl_gene_format_to_ucsc.pl $metafile`;
print "done converting gene format\n";
# decompress chromosome files so that Bio::DB can process them
my $fasta_dir = $meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"CHR_DOWNLOAD_DEST"} ;
if (glob ("$fasta_dir/*.gz")) {
$fasta_dir .= "/*.gz";
`gunzip $fasta_dir`;
# check that all DNA files finished downloading, otherwise do it again.
# because all future programs won't work properly otherwise.
if ($?) {
die "DNA files do not exist or did not unzip properly\n";
}
} elsif (glob ("$fasta_dir/*.fa") || glob ("$fasta_dir/*.fasta") || glob ("$fasta_dir/*.fna")) {
# *.fa or *.fasta files already exist
} else {
die "no DNA fasta files in $fasta_dir\n";
}
print "making single records file\n";
`perl make-single-records-BIOPERL.pl $metafile`;
print "done making single records template\n";
print "making noncoding records file\n";
`perl make-single-records-noncoding.pl $metafile`;
print "done making noncoding records\n";
print "make the fasta sequences\n";
`perl generate-fasta-subst-files-BIOPERL.pl $metafile`;
print "done making the fasta sequences\n";
print "start siftsharp, getting the alignments\n";
# remove all_prot.fasta if it already exists
my $all_prot_fasta = $meta_hash{"PARENT_DIR"} . "/all_prot.fasta";
if ( -e $all_prot_fasta ) {
unlink $all_prot_fasta;
}
my $combine_prot_fasta_command = "for file in " . $meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"FASTA_DIR"} . "/*.fasta ; do cat \"\$file\" >> " . $all_prot_fasta . "; done";
`$combine_prot_fasta_command`;
my $sift4g_command = $meta_hash{"SIFT4G_PATH"} . " -d " . $meta_hash{"PROTEIN_DB"} . " -q " . $all_prot_fasta . " --subst " . $meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"SUBST_DIR"} . " --out " . $meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"SIFT_SCORE_DIR"} . " --sub-results " ;
print $sift4g_command . "\n";
#`$siftsharp_command`;
`$sift4g_command`;
if ($? != 0) {
exit (-1);
}
print "done getting all the scores\n";
print "populating databases\n";
# check quality
# Pauline to do , by chromosame in script
`perl make-sift-scores-db-batch.pl $metafile`;
print "checking the databases\n";
system ("perl check_genes.pl $metafile");
my @chromosomes = getChr ($meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"GENE_DOWNLOAD_DEST"});
foreach my $chr (@chromosomes) {
my $db_file = $meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"SINGLE_REC_WITH_SIFTSCORE_DIR"} . "/" . $chr . "_scores.Srecords.with_dbSNPid.sorted";
my $check_outfile = $meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"ORG_VERSION"} . "/" . $chr . "_SIFTDB_stats.txt";
`python check_SIFTDB.py $db_file $check_outfile`;
}
my $pep_check_file = $meta_hash{"PARENT_DIR"} . "/ENS_peptide_check.txt";
`perl checkENSPep.pl $metafile ENS $pep_check_file`;
my $final_outfolder = $meta_hash{"PARENT_DIR"} ."/". $meta_hash{"ORG_VERSION"};
system ("cp $metafile $final_outfolder");
# make some space
my $zip_dir = $meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"CHR_DOWNLOAD_DEST"} . "/*" ;
print "zipping up $zip_dir\n";
system ("gzip $zip_dir");
my $rm_dir = $meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"SINGLE_REC_WITH_SIFTSCORE_DIR"} . "/*";
system ("rm $rm_dir");
print "All done!\n";
exit (0);