-
Notifications
You must be signed in to change notification settings - Fork 7
/
make-single-records-noncoding.pl
111 lines (88 loc) · 3.87 KB
/
make-single-records-noncoding.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
#!/usr/bin/perl -w
# Pauline Ng
# Last modified: 2013-10-04
#
# Used BioPerl to generate nt sequence, and
# write directly out to file instead of storing
# and calling writeOutputToFilesByChromosomes
###############################################################
use strict;
use DBI;
use Class::Struct;
use Bio::DB::Fasta;
#require 'common-utils.pl';
#require 'dna_protein_subs.pl';
use File::Basename;
use Cwd qw(abs_path);
my $directory_of_script = dirname(abs_path(__FILE__));
require $directory_of_script . '/common-utils.pl';
require $directory_of_script . '/dna_protein_subs.pl';
our $TRUE = 0;
our $FALSE = 1;
if (scalar @ARGV != 1) {
die "Usage: perl $0 <metafile> \n" .
"Example: perl $0 metadocs/mouse.txt\n";
}
my ($metafile) = @ARGV;
my $meta_href = readMeta($metafile);
my %meta_hash = %{$meta_href};
my $outputdir = $meta_hash{"PARENT_DIR"} . "/". $meta_hash{"SINGLE_REC_BY_CHR_DIR"};
# Check and create directory if it does not exist. DO NOT RE-CREATE IF IT EXISTS!
if (! -d $outputdir) {
print "Output directory does not exist, it will be created.\n";
make_dir($outputdir);
} else {
# remove existing files because this program appends to files
if (glob ("$outputdir/*.singleRecords_noncoding")) {
system ("rm $outputdir/*.singleRecords_noncoding");
}
}
my $chr_dir = $meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"CHR_DOWNLOAD_DEST"};
my $noncoding_transcripts_file = $meta_hash{"PARENT_DIR"} . "/" . $meta_hash{"GENE_DOWNLOAD_DEST"} . "/noncoding.txt";
###### MAIN PROGRAM
&generateOutput($meta_href, $chr_dir, $noncoding_transcripts_file, $outputdir);
sub generateOutput() {
print "Generating Single Records... takes a long time\n";
my ($meta_href, $dbdir, $gene_transcripts_file, $outputdir) = @_;
open (IN_TX, $gene_transcripts_file) || die "can't open $gene_transcripts_file";
my $tx_line;
# my $chr_db = Bio::DB::Fasta->new("$dbdir", -reindex=>1) || die "Died: $!\n";
my $chr_db = Bio::DB::Fasta->new("$dbdir") || die "Died: $!\n";
while ($tx_line = <IN_TX>) {
my ($chr, $biotype, $tx_start, $tx_end, $geneid, $transcriptid, $genename, $transcript_name) = split("\t", $tx_line);
# Open file for writing.
my $chr_file = $outputdir . "/" . $chr . ".singleRecords_noncoding";
#print "$outputdir\n";
open(FILE, ">>$chr_file") || die "Unable to write out to $chr_file\n";
write_out_for_noncoding ($tx_line, $chr_db);
close (FILE);
} # end while reading gene file
} # end generateOutput
sub write_out_for_noncoding {
my ($tx_line, $chr_db) = @_;
my ($chr, $biotype, $exon_start , $exon_end, $geneid, $txid, $genename, $txname) = split("\t", $tx_line);
my $fasta_subseq = $chr_db->seq("$chr:$exon_start,$exon_end");
# print $tx_line;
my @nucs = ("A", "C", "G", "T");
my $length = $exon_end - $exon_start + 1;
if (defined($fasta_subseq)) { # some chr can't retrieve sequence{
for (my $pos = 0; $pos < $length; $pos++) {
foreach my $nt2 (@nucs) {
my $genome_pos = $pos + $exon_start;
my $line = $chr . "," .
($genome_pos- 1) . "," . ($genome_pos) . "," .
"," . "," . # RSID, to be filled up later
"," . # ENSG will be removed
$txid . "," . # Takes the place of ENST
"," . # ENSP -> will be obtained via query to GENE_TRN_2_PROTEIN table
"$biotype," . # REGION
"," .
substr ($fasta_subseq , $pos, 1). "," . $nt2 . "," x 15 .
",$txid,$geneid,$genename,,$txname";
print FILE $line . "\n";;
} # end for each nucleotide
} #end for $pos loop
} else { # end couldn't retrieve sequence
print "skipping because couldn't $tx_line retrieve sequence";
}
}