forked from oushujun/LTR_retriever
-
Notifications
You must be signed in to change notification settings - Fork 0
/
LAI
executable file
·164 lines (149 loc) · 8.14 KB
/
LAI
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/perl -w
use strict;
use File::Basename;
##This script is to estimate the mean LTR sequence identity in the genome
##The script also extracts coordinate information for intact LTRs and all LTRs to calculate the LAI score
##Users should provide the LTR_retriever *pass.list file and the RepeatMasker *out file as inputs
##Author: Shujun Ou ([email protected])
##First version: 11/23/2016
##Beta v2: 02/05/2018
##Beta v3: 02/19/2018
my $usage="
The LTR Assembly Index (LAI) is developed to evaluate the assembly continunity of repetitive sequences
Usage: ./LAI -genome genome.fa -intact intact.pass.list -all genome.out [options]
Options:
-genome [file] The genome file that is used to generate everything.
-intact [file] A list of intact LTR-RTs generated by LTR_retriever (genome.fa.pass.list).
-all [file] RepeatMasker annotation of all LTR sequences in the genome (genome.fa.out).
-window [int] Window size for LAI estimation. Default: 3000000 (3 Mb)
-step [int] Step size for the estimation window to move forward. Default: 300000 (300 Kb)
Set step size = window size if prefer non-overlapping outputs.
-q Quick estimation of LTR identity (much faster for large genomes, may sacrifice ~0.5% of accuracy).
-qq No estimation of LTR identity, only output raw LAI for within species comparison (very quick).
-mono [file] This parameter is mainly for ployploid genomes. User provides a list of sequence names that represent a monoploid (1x).
LAI will calculated only on these sequences if provided. So user can also specify sequence of interest for LAI calculation.
-iden [0-100] Mean LTR identity (%) in the monoploid (1x) genome. This parameter will inactivate de novo estimation (same speed to -qq).
-totLTR [0-100] Specify the total LTR sequence content (%) in the genome instead of estimating from the -all RepeatMasker file.
-blast [path] The path to the blastn program. If left unspecified, then blastn must be accessible via shell ENV.
-t [number] Number of threads to run blastn.
-h Display this help info.
\n";
my $window="3000000"; #3Mb/window
my $step="300000"; #300Kb/step
my $iden="NA"; #mean identity (%) of LTR sequences in the monoploid (1x) genome
my $iden_slope="2.8138"; #the correction slope for LTR sequence identity (age). Don't change this unless you are confident.
my $totLTR="NA"; #the total LTR sequence content (%) in the genome. Will be estimated from the -all RepeatMasker file if unspecified.
my $quick=0; #quick estimation of LTR identity (may sacrifice ~0.5% of accuracy)
my $qquick=0; #Obtain only raw LAI for within species comparison (very quick)
my $threads=4; #Number of threads to run blastn
my $blast=''; #path to the blastn and makeblastdb
my $mode="standard mode"; #the mode for LTR identity estimation
my $fuse=1; #1, limit LTR identity to [92, 96]; 0, no limitation of LTR identity
my $monoploid=''; #A list of sequence names from the same monoploid (1x) for ployploid genomes
my $genome='';
my $intact='';
my $all='';
my $version="beta3";
my $k=0;
foreach (@ARGV){
$genome=$ARGV[$k+1] if /^-genome$/i;
$intact=$ARGV[$k+1] if /^-intact$/i;
$all=$ARGV[$k+1] if /^-all$/i;
$window=$ARGV[$k+1] if /^-window$/i;
$step=$ARGV[$k+1] if /^-step$/i;
$quick=1 if /^-q$/i;
$qquick=1 if /^-qq$/i;
$monoploid=$ARGV[$k+1] if /^-mono$/i;
$iden=$ARGV[$k+1] if /^-iden$/i;
$iden_slope=$ARGV[$k+1] if /^-k_iden$/i;
$totLTR=$ARGV[$k+1] if /^-totLTR$/i;
$blast=$ARGV[$k+1] if /^-blast$/i;
$threads=$ARGV[$k+1] if /^-t$/i;
$fuse=0 if /^-unlock$/i;
die $usage if /^-?-h|help$/i;
$k++;
}
$mode="quick mode" if $quick==1;
$mode="rapid mode" if $qquick==1;
die "Input file(s) not exist!\n$usage" unless -s "$genome" and -s "$intact" and -s "$all";
print "
######################################
### LTR Assembly Index (LAI) $version ###
######################################\n
Developer: Shujun Ou
Parameters: @ARGV\n\n\n";
my $date=`date`;
chomp ($date);
print "$date\tDependency checking: ";
#check blastn
$blast=`which blastn 2>/dev/null` if $blast eq '';
$blast=~s/blastn\n?//;
$blast="$blast/" unless $blast=~/\/$/;
die "blastn is not exist in the BLAST+ path $blast!\n" unless -X "${blast}blastn";
#check makeblastdb
$blast=`which makeblastdb 2>/dev/null` if $blast eq '';
$blast=~s/makeblastdb\n//;
die "makeblastdb is not exist in the BLAST+ path $blast!\n" unless -X "${blast}makeblastdb";
print "Passed!\n";
#obtain the exact path for the program location
my $script_path = dirname(__FILE__);
my $rand=int(rand(1000000));
#extract sequence for targeted LAI calculation
$date=`date`;
chomp ($date);
if ($monoploid ne ''){
print "$date\tUser specified sequence list is provided: $monoploid\n\t\t\t\tCalculation of LAI will be based only on these sequences.\n";
`perl $script_path/bin/output_by_list.pl 1 $genome 1 $monoploid -FA > $genome.$monoploid.fa`;
`for i in \`cat $monoploid\`; do cat $all | perl -snle 's/^\\s+//; my \$chr=(split)[4]; \$i=~s/^>//; print \$_ if \$chr eq \$i' -- -i=\$i; done > $all.$monoploid.out`;
$genome = "$genome.$monoploid.fa";
$all = "$all.$monoploid.out";
} else {
print "$date\tCalculation of LAI will be based on the whole genome.\n\t\t\t\tPlease use the -mono parameter if your genome is a recent ployploid, for high identity between homeologues will overcorrect raw LAI scores.\n";
}
#estimate mean identity of LTR sequences in the genome
if ($qquick != 1 and $iden eq "NA"){
$date=`date`;
chomp ($date);
print "$date\tEstimate the identity of LTR sequences in the genome: $mode\n";
$iden=`perl $script_path/bin/Age_est.pl -RMout $all -genome $genome -blast $blast -t $threads -iden_cut 100 -q ` if $quick == 1;
$iden=`perl $script_path/bin/Age_est.pl -RMout $all -genome $genome -blast $blast -t $threads -iden_cut 100 ` if $quick == 0;
$iden = $1 if $iden=~/Mean_identity:([0-9.]+)/;
}
#update info
$date=`date`;
chomp ($date);
print "$date\tThe identity of LTR sequences: $iden%\n";
if ($fuse==1 and $qquick==0){
($iden=92, print "\n\t\t\t\t【Warning】 The identity drops below the safe limit. Instead, identity of 92% will be used for LAI adjustment.\n\n") if $iden<92;
($iden=96, print "\n\t\t\t\t【Warning】 The identity jumps above the safe limit. Instead, identity of 96% will be used for LAI adjustment.\n\n") if $iden>96;
}
elsif ($fuse==0 and $qquick==0) {
print "\n\t\t\t\t【Warning】 The identity drops below the safe limit under the unlock mode.\n\n" if $iden<92;
print "\n\t\t\t\t【Warning】 The identity jumps above the safe limit under the unlock mode.\n\n" if $iden>96;
}
print "$date\tCalculate LAI:\n";
#convert intact LTR information into bed format
`perl -nle 's/^\\s+//; my (\$id, \$age)=(split)[0,11]; \$age=4000000 if \$age eq \"NA\"; next unless \$id=~/:/; \$id=~s/\\.\\./\\t/g; \$id=~s/:/\\t/g; print "\$id\\t\$age"' $intact > $intact.bed`;
#get all LTR sequence annotated in the genome that are no shorter than 80bp
`awk '{if (\$6~/[0-9]/ && \$7-\$6+1>=80)print \$5"\\t"\$6"\\t"\$7}' $all > $all.bed`;
#calculate the LAI score
`perl $script_path/bin/LAI_calc3.pl -intact $intact.bed -all $all.bed -genome $genome -window $window -step $step -iden $iden -k_iden $iden_slope -totLTR $totLTR > $all.LAI`;
`rm $intact.bed $all.bed`;
my $info=`grep whole_genome $all.LAI`;
$info=~s/^\s+//;
my ($int, $total)=(split /\s+/, $info)[3,4];
($int, $total)=($int*100, $total*100); #convert to %
if (($int<0.1 or $total<5) and $fuse==1){
print "\n\t\t\t\t【Error】Intact LTR-RT content ($int%) is too low for accurate LAI calculation (min 0.1% required)\n" if $int<0.1;
print "\t\t\t\t【Error】 Total LTR sequence content ($total%) is too low for accurate LAI calculation (min 5% required)\n" if $total<5;
`rm $all.LAI`;
print "\n\t\t\t\tSorry, LAI is not applicable on the current genome assembly.\n\n";
} else {
print "\n\t\t\t\t【Warning】Intact LTR-RT content ($int%) is too low for accurate LAI calculation (min 0.1% preferred)\n" if $int<0.1;
print "\t\t\t\t【Warning】 Total LTR sequence content ($total%) is too low for accurate LAI calculation (min 5% preferred)\n" if $total<5;
print "\n\t\t\t\t\t\tDone!\n\n";
$date=`date`;
chomp ($date);
print "$date\tResult file: $all.LAI\n\n";
print "\t\t\t\tYou may use either raw_LAI or LAI for intraspecific comparison\n\t\t\t\tbut please use ONLY LAI for interspecific comparison\n\n";
}