From 4039eb7778fd9cbc60021e99a8693285e0fa2daf Mon Sep 17 00:00:00 2001 From: oushujun Date: Sun, 7 Jan 2024 01:03:19 -0500 Subject: [PATCH] make sure candidates have sufficient flanking sequence to extended (50bp) --- LTR_retriever | 2 +- bin/call_seq_by_list.pl | 14 +++++++++++--- bin/get_range.pl | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/LTR_retriever b/LTR_retriever index d828d51..c625c80 100755 --- a/LTR_retriever +++ b/LTR_retriever @@ -714,7 +714,7 @@ chomp ($date=`date`); print "$date\tModules 2-5: Start to analyze the structure of candidates... \t\t\t\tThe terminal motif, TSD, boundary, orientation, age, and superfamily will be identified in this step.\n\n"; my $index=$_[0]; -`perl $script_path/bin/get_range.pl $index.retriever.scn $index.ltrTE.stg1 -x`; +`perl $script_path/bin/get_range.pl $index.retriever.scn $index.ltrTE.stg1 -x -strict 1`; `cat $index.retriever.scn.extend |sort -fu > $index.retriever.scn.extend.unq; mv $index.retriever.scn.extend.unq $index.retriever.scn.extend`; `perl $script_path/bin/call_seq_by_list.pl $index.retriever.scn.extend -C $genome > $index.retriever.scn.extend.fa`; #full TE sequence with 50bp-extended on each side diff --git a/bin/call_seq_by_list.pl b/bin/call_seq_by_list.pl index 83a2d78..b5ecf51 100644 --- a/bin/call_seq_by_list.pl +++ b/bin/call_seq_by_list.pl @@ -7,6 +7,7 @@ eg2. perl call_seq_by_LOC.pl array_list -C your_database up_2000 >result ##call sequence of upper 2000 bp region in the list, from the provided database Update history: + v2.6 add -strict parameter to control inclusiveness of input range (2024/01/07) v2.5 do not output sequence with all Ns (2023/01/30) v2.4 output sequences without headers v2.3 output a list of entirely excluded sequences @@ -34,6 +35,7 @@ my $exclude=0; #0 for output sequence specified by list (default); 1 for exclude sequence specified by list my $coverage=1; #work with $exclude, if the excluded portion is too long (default 1, [0-1]), discard the entire sequence my $purge=0; #work with $exclude, switch on=1/off=0(default) to clean up aligned region and joint unaligned sequences +my $strict=0; #0=flexible, will adjust coordinate start and stop if they extend outside of the sequence; 1 will skip the entry if exceed. my $genome; my $i=0; @@ -56,6 +58,7 @@ $header=$ARGV[$i+1] if $para=~/^-header$/i; $coverage=$ARGV[$i+1] if $para=~/^-cov$/i; $purge=$ARGV[$i+1] if $para=~/^-purge$/i; + $strict=$ARGV[$i+1] if $para=~/^-strict$/i; $exclude=1 if $para=~/^-ex$/i; $i++; } @@ -123,12 +126,17 @@ $stop=$start+$length-1; } - $start=1 if $start<=0; next unless exists $genome{$chr}; -# next if $genome{$chr}=~/^\s+$/; next if length $genome{$chr} < 10; - $stop=length $genome{$chr} if $stop>=length $genome{$chr}; + + # adjust boundary if they exceed the sequence range + if ($strict == 0){ + $start=1 if $start<=0; + $stop=length $genome{$chr} if $stop>=length $genome{$chr}; + } else { + next if $start<=0 or $stop>=length $genome{$chr}; + } if ($exclude==0){ $seq=substr ($genome{$chr}, $start-1, $stop-$start+1) if exists $genome{$chr}; diff --git a/bin/get_range.pl b/bin/get_range.pl index 6c91f97..9490c88 100644 --- a/bin/get_range.pl +++ b/bin/get_range.pl @@ -160,7 +160,7 @@ $lLTR_end+=50; $rLTR_start-=50; $rLTR_end+=50; - print Extend "$chr:$element_start..$element_end\t$chr:$lLTR_start..$rLTR_end\n" if defined $chr; + print Extend "$chr:$element_start..$element_end\t$chr:$lLTR_start..$rLTR_end\n" if defined $chr and $lLTR_start > 0; } if ($full==1){