Skip to content

Commit

Permalink
make sure candidates have sufficient flanking sequence to extended (5…
Browse files Browse the repository at this point in the history
…0bp)
  • Loading branch information
oushujun committed Jan 7, 2024
1 parent 0925583 commit 4039eb7
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 5 deletions.
2 changes: 1 addition & 1 deletion LTR_retriever
Original file line number Diff line number Diff line change
Expand Up @@ -714,7 +714,7 @@ chomp ($date=`date`);
print "$date\tModules 2-5: Start to analyze the structure of candidates...
\t\t\t\tThe terminal motif, TSD, boundary, orientation, age, and superfamily will be identified in this step.\n\n";
my $index=$_[0];
`perl $script_path/bin/get_range.pl $index.retriever.scn $index.ltrTE.stg1 -x`;
`perl $script_path/bin/get_range.pl $index.retriever.scn $index.ltrTE.stg1 -x -strict 1`;
`cat $index.retriever.scn.extend |sort -fu > $index.retriever.scn.extend.unq; mv $index.retriever.scn.extend.unq $index.retriever.scn.extend`;
`perl $script_path/bin/call_seq_by_list.pl $index.retriever.scn.extend -C $genome > $index.retriever.scn.extend.fa`; #full TE sequence with 50bp-extended on each side

Expand Down
14 changes: 11 additions & 3 deletions bin/call_seq_by_list.pl
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
eg2. perl call_seq_by_LOC.pl array_list -C your_database up_2000 >result ##call sequence of upper 2000 bp region in the list, from the provided database
Update history:
v2.6 add -strict parameter to control inclusiveness of input range (2024/01/07)
v2.5 do not output sequence with all Ns (2023/01/30)
v2.4 output sequences without headers
v2.3 output a list of entirely excluded sequences
Expand Down Expand Up @@ -34,6 +35,7 @@
my $exclude=0; #0 for output sequence specified by list (default); 1 for exclude sequence specified by list
my $coverage=1; #work with $exclude, if the excluded portion is too long (default 1, [0-1]), discard the entire sequence
my $purge=0; #work with $exclude, switch on=1/off=0(default) to clean up aligned region and joint unaligned sequences
my $strict=0; #0=flexible, will adjust coordinate start and stop if they extend outside of the sequence; 1 will skip the entry if exceed.
my $genome;

my $i=0;
Expand All @@ -56,6 +58,7 @@
$header=$ARGV[$i+1] if $para=~/^-header$/i;
$coverage=$ARGV[$i+1] if $para=~/^-cov$/i;
$purge=$ARGV[$i+1] if $para=~/^-purge$/i;
$strict=$ARGV[$i+1] if $para=~/^-strict$/i;
$exclude=1 if $para=~/^-ex$/i;
$i++;
}
Expand Down Expand Up @@ -123,12 +126,17 @@
$stop=$start+$length-1;
}

$start=1 if $start<=0;

next unless exists $genome{$chr};
# next if $genome{$chr}=~/^\s+$/;
next if length $genome{$chr} < 10;
$stop=length $genome{$chr} if $stop>=length $genome{$chr};

# adjust boundary if they exceed the sequence range
if ($strict == 0){
$start=1 if $start<=0;
$stop=length $genome{$chr} if $stop>=length $genome{$chr};
} else {
next if $start<=0 or $stop>=length $genome{$chr};
}

if ($exclude==0){
$seq=substr ($genome{$chr}, $start-1, $stop-$start+1) if exists $genome{$chr};
Expand Down
2 changes: 1 addition & 1 deletion bin/get_range.pl
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@
$lLTR_end+=50;
$rLTR_start-=50;
$rLTR_end+=50;
print Extend "$chr:$element_start..$element_end\t$chr:$lLTR_start..$rLTR_end\n" if defined $chr;
print Extend "$chr:$element_start..$element_end\t$chr:$lLTR_start..$rLTR_end\n" if defined $chr and $lLTR_start > 0;
}

if ($full==1){
Expand Down

0 comments on commit 4039eb7

Please sign in to comment.