-
Notifications
You must be signed in to change notification settings - Fork 2
/
update_coordinates_gff.pl
executable file
·199 lines (162 loc) · 6.8 KB
/
update_coordinates_gff.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
#!/usr/bin/perl
=head1 NAME
update_coordinates_gff.pl
=head1 SYNOPSIS
update_coordinates_gff.pl -o [old AGP file] -n [new AGP file] -g [GFF file] -m [output GFF file] -c 0 -d 0
=head1 COMMAND-LINE OPTIONS
-o old AGP file (required)
-n new scaffold AGP file with updated coordinates (required)
-g GFF3 file based on old AGP to update to new AGP file (required)
-m output mapped GFF file (required)
-c remove children of dropped features (1 or 0) (required)
-d debugging messages (1 or 0)
-h Help
=cut
use strict;
use warnings;
use Bio::GenomeUpdate::AGP;
use Bio::GenomeUpdate::GFF;
use Utilities;
use File::Slurp;
use Getopt::Std;
our ( $opt_o, $opt_n, $opt_g, $opt_m, $opt_c, $opt_d, $opt_h );
getopts('o:n:g:m:c:d:h');
if ($opt_h) {
help();
exit;
}
if ( !$opt_o || !$opt_n || !$opt_g || !$opt_m || !defined($opt_c) ) {
print
"\nOld AGP, New AGP, GFF3 based on old AGP, new mapped GFF filename and children flag are required.
See help below\n\n\n";
help();
}
#get input files
my $old_agp_input_file = $opt_o;
my $input_old_agp = read_file($old_agp_input_file)
or die "Could not open old AGP input file: $old_agp_input_file\n";
my $new_agp_input_file = $opt_n;
my $input_new_agp = read_file($new_agp_input_file)
or die "Could not open new AGP input file: $new_agp_input_file\n";
my $gff_input_file = $opt_g;
my $input_gff = read_file($gff_input_file)
or die "Could not open GFF input file: $gff_input_file\n";
my $gff_output_file = $opt_m;
my $drop_children = $opt_c;
my $util = Utilities->new();
if ($opt_d) {
print STDERR "Params parsed..\n";
$util->mem_used();
$util->run_time();
}
#object for old AGP
my $old_agp = Bio::GenomeUpdate::AGP->new();
$old_agp->parse_agp($input_old_agp);
#object for new AGP
my $new_agp = Bio::GenomeUpdate::AGP->new();
$new_agp->parse_agp($input_new_agp);
#object for gff
my $gff = Bio::GenomeUpdate::GFF->new();
$gff->parse_gff( $input_gff, $gff_input_file );
if ($opt_d) {
print STDERR "Files read..\n";
$util->mem_used();
$util->run_time();
}
# Deprecated: get coordinates mapped from old AGP to new AGP space using hash
#my %coords = $gff->get_reordered_coordinates($old_agp,$new_agp);
#my %flips = $gff->get_flipped_coordinates($old_agp,$new_agp);
#if ($opt_d){
# print STDERR "Hashes populated..\n";
# mem_used();
#}
#
##remapping the GFF
#$gff->remap_coordinates_hash(\%coords,\%flips);
#get coordinates mapped from old AGP to new AGP space using optimized routine
if ($drop_children) {
$gff->remap_coordinates_remove_children( $old_agp, $new_agp );
} else {
$gff->remap_coordinates( $old_agp, $new_agp );
}
if ($opt_d) {
print STDERR "Coords remapped..\n";
$util->mem_used();
$util->run_time();
}
#print the GFF ($gff_output_file)
my $new_gff = $gff->get_formatted_gff();
if ( !defined($new_gff) ) {
print STDERR "No valid GFF records found..\n";
exit 1;
} else {
unless ( open( OGFF, ">$gff_output_file" ) ) {
print STDERR "Cannot open $gff_output_file\n";
exit 1;
}
print OGFF $new_gff;
close(OGFF);
if ($opt_d) {
print STDERR "GFF written..\n";
$util->mem_used();
$util->run_time();
}
}
#----------------------------------------------------------------------------
sub help {
print STDERR <<EOF;
$0:
Description:
This script updates the coordinates of features in a Generic Feature Format version 3 (GFF)
using a new Accessioned Golden Path (AGP) file.
NOTE:
The component ids in both AGP files need to be identical.
Usage:
update_coordinates_gff.pl -o [old AGP file] -n [new AGP file] -g [GFF file] -m [output GFF file] -c 0
Flags:
-o old AGP file (required)
-n new scaffold AGP file with updated coordinates (required)
-g GFF3 file based on old AGP to update to new AGP file (required)
-m output mapped GFF file (required)
-c remove children of dropped features (1 or 0)
-d debugging messages (1 or 0)
-h Help
EOF
exit(1);
}
=head1 LICENSE
Same as Perl.
=head1 AUTHORS
Surya Saha <[email protected] , @SahaSurya>
=cut
__END__
# AGP old
SL2.40ch00 21803722 21803821 6262 U 100 contig no
SL2.40ch00 21803822 21805821 6263 W SL2.40sc04627 1 2000 0
SL2.40ch01 1 32987597 1 W SL2.40sc04133 1 32987597 -
SL2.40ch01 32987598 32987697 2 U 100 contig no
SL2.40ch01 32987698 35621724 3 W SL2.40sc03666 1 2634027 0
SL2.40ch01 35621725 35621824 4 U 100 contig no
# AGP new
SL2.40ch00 21801721 21803721 6261 W SL2.40sc03931 1 2001 0
SL2.40ch00 21803722 21803821 6262 U 100 contig no
SL2.40ch00 21803822 21805821 6263 W SL2.40sc04627 1 2000 0
SL2.40ch01 1 32987597 1 F SL2.40SC04133 1 32987597 +
SL2.40ch01 32987598 35267597 2 N 2280000 contig no
SL2.40ch01 35267598 36990194 3 F SL2.40SC04191 1 1722597 +
SL2.40ch01 36990195 39120194 4 N 2130000 contig no
SL2.40ch01 39120195 41754221 5 F SL2.40SC03666 1 2634027 +
SL2.40ch01 41754222 42324221 6 N 570000 contig no
# GFF
SL2.40ch00 SL2.40_assembly supercontig 21801721 21803721 . + . ID=SL2.40sc03931;Parent=SL2.40ch00;Name=SL2.40sc03931;Target=SL2.40sc03931 1 2001 +;reliably_oriented=0
SL2.40ch00 SL2.40_assembly contig 21801721 21803721 . + . ID=SL2.40ct09637;Name=SL2.40ct09637;Parent=SL2.40sc03931;Target=SL2.40ct09637 1 2001 +;scaffold_reliably_oriented=0;reliably_oriented=1
SL2.40ch00 SL2.40_assembly remark 21803722 21803821 . + . Name=contig_gap;Note=type: unknown_gap%2C description: a gap between clone contigs (also called a "layout gap")
SL2.40ch00 SL2.40_assembly supercontig 21803822 21805821 . + . ID=SL2.40sc04627;Parent=SL2.40ch00;Name=SL2.40sc04627;Target=SL2.40sc04627 1 2000 +;reliably_oriented=0
SL2.40ch00 SL2.40_assembly contig 21803822 21805821 . + . ID=SL2.40ct16352;Name=SL2.40ct16352;Parent=SL2.40sc04627;Target=SL2.40ct16352 1 2000 +;scaffold_reliably_oriented=0;reliably_oriented=1
SL2.40ch01 SL2.40_assembly ultracontig 1 90304244 . . . ID=SL2.40ch01;Name=SL2.40ch01
SL2.40ch01 SL2.40_assembly supercontig 1 32987597 . - . ID=SL2.40sc04133;Parent=SL2.40ch01;Name=SL2.40sc04133;Target=SL2.40sc04133 1 32987597 +;reliably_oriented=1
SL2.40ch01 SL2.40_assembly contig 1 3610 . - . ID=SL2.40ct13037;Name=SL2.40ct13037;Parent=SL2.40sc04133;Target=SL2.40ct13037 1 3610 +;scaffold_reliably_oriented=1;reliably_oriented=1
SL2.40ch01 SL2.40_assembly contig 3864 12848 . - . ID=SL2.40ct13036;Name=SL2.40ct13036;Parent=SL2.40sc04133;Target=SL2.40ct13036 1 8985 +;scaffold_reliably_oriented=1;reliably_oriented=1
SL2.40ch01 SL2.40_assembly contig 12869 15910 . - . ID=SL2.40ct13035;Name=SL2.40ct13035;Parent=SL2.40sc04133;Target=SL2.40ct13035 1 3042 +;scaffold_reliably_oriented=1;reliably_oriented=1
SL2.40ch01 SL2.40_assembly contig 16487 76523 . - . ID=SL2.40ct13034;Name=SL2.40ct13034;Parent=SL2.40sc04133;Target=SL2.40ct13034 1 60037 +;scaffold_reliably_oriented=1;reliably_oriented=1
SL2.40ch01 SL2.40_assembly contig 77166 85600 . - . ID=SL2.40ct13033;Name=SL2.40ct13033;Parent=SL2.40sc04133;Target=SL2.40ct13033 1 8435 +;scaffold_reliably_oriented=1;reliably_oriented=1