-
Notifications
You must be signed in to change notification settings - Fork 2
/
validate_gff.pl
executable file
·163 lines (129 loc) · 5.51 KB
/
validate_gff.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/perl
=head1 NAME
validate_gff.pl
=head1 SYNOPSIS
validate_gff.pl -g [old GFF file] -f [old Fasta file] -u [updated GFF file] -n [new Fasta file] <other params>
=head1 COMMAND-LINE OPTIONS
-g old GFF file (required)
-f old Fasta file based on old scaffold AGP (required)
-u Updated GFF file output by update_coordinates_gff.pl (required)
-n New Fasta file based on new scaffold AGP (required)
-d debugging messages (1 or 0)
-h Help
=cut
use strict;
use warnings;
use Bio::GenomeUpdate::GFF;
use Bio::SeqIO;
use Utilities;
use File::Slurp;
use Getopt::Std;
our ( $opt_g, $opt_f, $opt_u, $opt_n, $opt_d, $opt_h );
getopts('g:f:u:n:d:h');
if ($opt_h) {
help();
exit;
}
if ( !$opt_g || !$opt_f || !$opt_u || !$opt_n ) {
print "\nOld GFF3/Fasta and updated GFF3/Fasta are required.
See help below\n\n\n";
help();
}
#prep input data
my $gff_old_file = $opt_g;
my $gff_old = read_file($gff_old_file)
or die "Could not open old GFF file: $gff_old_file\n";
my $fasta_old_file;
if ( -e $opt_f ) {
$fasta_old_file = $opt_f;
}
else {
die "Could not open old Fasta file: $opt_f\n";
}
my $fasta_old = read_file($fasta_old_file);
my $gff_updated_file = $opt_u;
my $gff_updated = read_file($gff_updated_file)
or die "Could not open updated GFF file: $gff_updated_file\n";
my $fasta_updated_file;
if ( -e $opt_n ) {
$fasta_updated_file = $opt_n;
}
else {
die "Could not open old Fasta file: $opt_n\n";
}
my $fasta_updated = read_file($fasta_updated_file);
my $util = Utilities->new();
if ($opt_d) {
print STDERR "Params parsed..\n";
$util->mem_used();
$util->run_time();
}
#Read in gff and fasta files
my $gff_old_obj = Bio::GenomeUpdate::GFF->new();
$gff_old_obj->parse_gff( $gff_old, $gff_old_file );
my $fasta_old_str = $gff_old_obj->get_fasta($fasta_old);
my $gff_updated_obj = Bio::GenomeUpdate::GFF->new();
$gff_updated_obj->parse_gff( $gff_updated, $gff_updated_file );
my $fasta_updated_str = $gff_updated_obj->get_fasta($fasta_updated);
if ($opt_d) {
print STDERR "Files read..\n";
$util->mem_used();
$util->run_time();
}
#Compare both Fasta files
if ( $fasta_old_str ne $fasta_updated_str ) {
open( OF, ">${gff_old_file}.features.fasta" );
print OF $fasta_old_str;
close(OF);
open( UF, ">${gff_updated_file}.features.fasta" );
print UF $fasta_updated_str;
close(UF);
die "Fasta of features from old and updated GFFs do not match.
Were any features rejected during update??
See ${gff_old_file}.features.fasta and ${gff_updated_file}.features.fasta";
}
elsif ( $fasta_old_str eq $fasta_updated_str ) {
print STDERR "\n\nUpdated GFF validated..\n\n";
}
if ($opt_d) {
$util->mem_used();
$util->run_time();
}
#----------------------------------------------------------------------------
sub help {
print STDERR <<EOF;
$0:
Description:
This script checks if the sequences produced using the updated GFF file match the sequences produced by the old GFF file. GFF files should be in Generic Feature Format version 3.
NOTE:
Usage:
validate_gff.pl -g [old GFF file] -f [old Fasta file] -u [updated GFF file] -n [new Fasta file] <other params>
Flags:
-g old GFF file (required)
-f old Fasta file based on old scaffold AGP (required)
-u Updated GFF file output by update_coordinates_gff.pl (required)
-n New Fasta file based on new scaffold AGP (required)
-d debugging messages (1 or 0)
-h Help
EOF
exit(1);
}
=head1 LICENSE
Same as Perl.
=head1 AUTHORS
Surya Saha <[email protected] , @SahaSurya>
=cut
__END__
# GFF
SL2.40ch00 SL2.40_assembly supercontig 21801721 21803721 . + . ID=SL2.40sc03931;Parent=SL2.40ch00;Name=SL2.40sc03931;Target=SL2.40sc03931 1 2001 +;reliably_oriented=0
SL2.40ch00 SL2.40_assembly contig 21801721 21803721 . + . ID=SL2.40ct09637;Name=SL2.40ct09637;Parent=SL2.40sc03931;Target=SL2.40ct09637 1 2001 +;scaffold_reliably_oriented=0;reliably_oriented=1
SL2.40ch00 SL2.40_assembly remark 21803722 21803821 . + . Name=contig_gap;Note=type: unknown_gap%2C description: a gap between clone contigs (also called a "layout gap")
SL2.40ch00 SL2.40_assembly supercontig 21803822 21805821 . + . ID=SL2.40sc04627;Parent=SL2.40ch00;Name=SL2.40sc04627;Target=SL2.40sc04627 1 2000 +;reliably_oriented=0
SL2.40ch00 SL2.40_assembly contig 21803822 21805821 . + . ID=SL2.40ct16352;Name=SL2.40ct16352;Parent=SL2.40sc04627;Target=SL2.40ct16352 1 2000 +;scaffold_reliably_oriented=0;reliably_oriented=1
SL2.40ch01 SL2.40_assembly ultracontig 1 90304244 . . . ID=SL2.40ch01;Name=SL2.40ch01
SL2.40ch01 SL2.40_assembly supercontig 1 32987597 . - . ID=SL2.40sc04133;Parent=SL2.40ch01;Name=SL2.40sc04133;Target=SL2.40sc04133 1 32987597 +;reliably_oriented=1
SL2.40ch01 SL2.40_assembly contig 1 3610 . - . ID=SL2.40ct13037;Name=SL2.40ct13037;Parent=SL2.40sc04133;Target=SL2.40ct13037 1 3610 +;scaffold_reliably_oriented=1;reliably_oriented=1
SL2.40ch01 SL2.40_assembly contig 3864 12848 . - . ID=SL2.40ct13036;Name=SL2.40ct13036;Parent=SL2.40sc04133;Target=SL2.40ct13036 1 8985 +;scaffold_reliably_oriented=1;reliably_oriented=1
SL2.40ch01 SL2.40_assembly contig 12869 15910 . - . ID=SL2.40ct13035;Name=SL2.40ct13035;Parent=SL2.40sc04133;Target=SL2.40ct13035 1 3042 +;scaffold_reliably_oriented=1;reliably_oriented=1
SL2.40ch01 SL2.40_assembly contig 16487 76523 . - . ID=SL2.40ct13034;Name=SL2.40ct13034;Parent=SL2.40sc04133;Target=SL2.40ct13034 1 60037 +;scaffold_reliably_oriented=1;reliably_oriented=1
SL2.40ch01 SL2.40_assembly contig 77166 85600 . - . ID=SL2.40ct13033;Name=SL2.40ct13033;Parent=SL2.40sc04133;Target=SL2.40ct13033 1 8435 +;scaffold_reliably_oriented=1;reliably_oriented=1