-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathadd_metadata_to_fasta.pl
executable file
·123 lines (105 loc) · 3.22 KB
/
add_metadata_to_fasta.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
#!/usr/bin/perl
## Pombert Lab, IIT, 2020
use strict;
use warnings;
use Getopt::Long qw(GetOptions);

## Script identity; interpolated into the help text below
my $name = 'add_metadata_to_fasta.pl';
my $version = '0.4b';
my $updated = '2024-03-03';

## Help text describing the script, its two usage modes and its switches
my $usage = <<"END_OF_USAGE";
NAME ${name}
VERSION ${version}
UPDATED ${updated}
SYNOPSIS This script adds metadata to fasta headers. This metadata is required for submission to NCBI GenBank.
USAGE 1 ${name} -f *.fasta -o 'Chloropicon primus RCC138' -s RCC138 -g 1 ## Using CMD line switches
USAGE 2 ${name} -f *.fasta -k metakeys_NCBI.tsv -c chromosomes.tsv ## Using metadata files
OPTIONS:
-f (--fasta) Specifies which FASTA files to add metadata to
## Single metadata keys
-o (--organism) Full organism name; e.g. 'Chloropicon primus RCC138'
-s (--strain) Strain definition; e.g. RCC138
-i (--isolate) Isolate name; e.g. 'Pacific Isolate'
-g (--gcode) NCBI genetic code ## https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi
-m (--moltype) NCBI moltype descriptor (e.g. genomic)
## Metadata files
-k (--keys) Tab-delimited NCBI metadata key -> value file
-c (--chromosome) Tab-delimited contig -> chromosome assignment file
END_OF_USAGE

## Invoked without any argument: show the help text and stop
if (!@ARGV){
    die "$usage\n";
}
## Command line state
my @fasta;          ## FASTA files to annotate (-f)
my $chromosomes;    ## Contig -> chromosome assignment file (-c)
my $metakeys;       ## Metadata key -> value file (-k)

## NCBI Fasta headers
## Every key holding a value is later written to the FASTA headers
## as a [key=value] modifier; keys left undefined are skipped.
my %meta = (
    "organism" => undef,
    "strain" => undef,
    "isolate" => undef,
    "gcode" => undef,
    "moltype" => undef,
);

## GetOptions returns false on unknown switches or malformed values
## (e.g. a non-integer --gcode). The FASTA files are overwritten in place
## further down, so stopping here is safer than running with options
## that were only partially understood.
GetOptions(
    'f|fasta=s@{1,}' => \@fasta,
    'k|keys=s' => \$metakeys,
    'c|chromosome=s' => \$chromosomes,
    'o|organism=s' => \$meta{"organism"},
    's|strain=s' => \$meta{"strain"},
    'i|isolate=s' => \$meta{"isolate"},
    'g|gcode=i' => \$meta{"gcode"},
    'm|moltype=s' => \$meta{"moltype"},
) or die "[E] Invalid command line options.\n\n$usage\n";

die "[E] Fasta files required.\n" unless @fasta;
## Populating database of metadata keys and their values, if desired
## Expected format: one "key<whitespace>value" pair per line; lines starting
## with # are skipped. A value read here replaces any value already stored
## under the same key (including one given on the command line).
if ($metakeys){
    open my $meta_fh, '<', $metakeys or die "Can't open metadata file $metakeys: $!\n";
    while (my $line = <$meta_fh>){
        chomp $line;
        next if $line =~ /^#/; ## Ignoring comments
        if ($line =~ /^(\S+)\s+(.*)$/){
            my ($metakey, $metavalue) = ($1, $2);
            $metavalue =~ s/\s+$//; ## Removing trailing spaces, if any
            $meta{$metakey} = $metavalue;
        }
    }
    close $meta_fh;
}
## Populating database of contigs and their assigned chromosomes, if desired
## Expected format: one "contig<whitespace>chromosome" pair per line; lines
## starting with # are skipped. Keys are contig names as they appear right
## after the > of a FASTA header.
my %chromo;
if ($chromosomes){
    open my $chr_fh, '<', $chromosomes or die "Can't open chromosome file $chromosomes: $!\n";
    while (my $line = <$chr_fh>){
        chomp $line;
        next if $line =~ /^#/; ## Ignoring comments
        if ($line =~ /^(\S+)\s+(.*)$/){
            my ($contig, $chromo_assig) = ($1, $2);
            $chromo_assig =~ s/\s+$//; ## Removing trailing spaces, if any
            $chromo{$contig} = $chromo_assig;
        }
    }
    close $chr_fh;
}
## Working on FASTA files
## Each file is copied line by line to "$file.headers"; header lines are
## rewritten as ">contig [key=value]..." (anything after the first
## whitespace of the original header is dropped) and sequence lines are
## copied as is. The copy then replaces the original file.
for my $file (@fasta){ ## for/in rather than while/shift: a file named "0" must not end the loop

    my $tmp_file = "$file.headers";

    open my $in, '<', $file or die "Can't open $file: $!\n";
    open my $out, '>', $tmp_file or die "Can't create $file.headers: $!\n";

    while (my $line = <$in>){
        chomp $line;
        if ($line =~ /^>(\S+)/){
            my $contig = $1;
            print {$out} ">$contig ";

            ## Sorted so that the modifier order is the same on every run
            for my $key (sort keys %meta){
                my $value = $meta{$key};
                ## Skipping unset/empty keys only; a value of 0 is kept
                next unless (defined($value) && length($value));
                print {$out} "[$key=$value]";
            }

            if ($chromosomes && exists $chromo{$contig}){
                print {$out} "[location=chromosome][chromosome=$chromo{$contig}]";
            }

            print {$out} "\n";
        }
        else { print {$out} "$line\n"; }
    }

    close $in;
    ## Buffered write errors (e.g. disk full) only surface at close
    close $out or die "Can't close $tmp_file: $!\n";

    ## Overwrites original file. Builtin rename instead of a shell mv:
    ## no shell is involved, so file names containing spaces or shell
    ## metacharacters are handled safely, and failures are reported.
    rename($tmp_file, $file) or die "Can't rename $tmp_file to $file: $!\n";
}