Skip to content

Commit

Permalink
enable strand sensitive outputs
Browse files Browse the repository at this point in the history
  • Loading branch information
oushujun committed Jan 6, 2024
1 parent b746912 commit fab6cde
Show file tree
Hide file tree
Showing 5 changed files with 28 additions and 16 deletions.
1 change: 1 addition & 0 deletions bin/LAI_calc3.pl
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@
s/^\s+//;
my ($chr, $from, $to)=(split)[0,1,2];
next unless exists $intact{$chr};
($from, $to) = ($to, $from) if $from > $to; # fix direction
my $len=$to-$from+1;
substr($intact{$chr}, $from-1, $len)="i" x $len; #substitute '0' with 'i' where intact LTR-RT is occurred
}
Expand Down
4 changes: 3 additions & 1 deletion bin/LTR.identifier.pl
Original file line number Diff line number Diff line change
Expand Up @@ -455,6 +455,7 @@ ()
$info[17]="NA" unless defined $info[17]; #order
$info[18]="unknown" unless defined $info[18]; #superfamily
$info[19]="NA" unless defined $info[19]; #age (ya)
$info[19]="0" if $info[19] eq "-0";

##TSD control, boundary control, MISC control, and reporting
my $internal=($ltr1_e+1)."..".($ltr2_s-1);
Expand All @@ -477,7 +478,8 @@ ()
}

#last four variables: strand/superfamily/order/age
my $defalse = "$chr:$ltr1_s..$ltr2_e\t$decision\tmotif:$motif\tTSD:$TSD\tIN:$internal\t$info[9]\t$info[12]\t$info[18]\t$info[17]\t$info[19]
my $locus = $info[12] eq '-' ? "$chr:$ltr2_e..$ltr1_s" : "$chr:$ltr1_s..$ltr2_e";
my $defalse = "$locus\t$decision\tmotif:$motif\tTSD:$TSD\tIN:$internal\t$info[9]\t$info[12]\t$info[18]\t$info[17]\t$info[19]
Adjust: $adjust\tlLTR: $ll\trLTR: $rl
Alignment regions: $s_start, $s_end, $q_start, $q_end
LTR coordinates: $ltr1_s, $ltr1_e, $ltr2_s, $ltr2_e
Expand Down
33 changes: 21 additions & 12 deletions bin/get_range.pl
Original file line number Diff line number Diff line change
Expand Up @@ -64,14 +64,21 @@
open FA, "<$ARGV[1]" or die "ERROR: $!";
while (<FA>){
s/>//;
$chr{"$2..$3"}=$1 if (/^(\S+).*\[([0-9]+),([0-9]+)\]/); #eg: >9311_chr01 (dbseq-nr 0) [101308,114181]
$chr{"$2..$3"}=$1 if (/^(\S+)_[0-9]+.*\[([0-9]+),([0-9]+)\]/); #eg: >gi.478805111.gb.AQOG01030080.1_1 (dbseq-nr 173) [4033,8637]
$chr{"$2..$3"}=$1 if (/^(\S+)\:([0-9]+)\.\.([0-9]+)\|([0-9]+)\.\.([0-9]+)/); #eg: >Chr1:106522..118080|106502..118100
$chr{"$2..$3"}=$1 if (/^(\S+)\|([0-9]+)\.\.([0-9]+)/);#eg: >Chr1|106522..118080
$chr{"$2..$3"}=$1 if (/^(\S+)\:([0-9]+)\.\.([0-9]+)\|(\S+)/);#eg: >Chr1:106522..118080|Chr1
$chr{"$2..$3"}=$1 if (/^(\S+):([0-9]+)..([0-9]+)\[[12]\]/); #eg: >gi.478805265.gb.AQOG01029926.1:10426..15413[1]
$chr{"$2..$3"}=$1 if (/(\S+)\:([0-9]+)\.\.([0-9]+)/); #eg: gi.478789307.gb.AQOG01045884.1:56716..59758 pass motif:AAAG TSD:TGAAG
$chr{"$2..$3"}=$1 if (/^(\S+)\:([0-9]+)\.\.([0-9]+)\|/);#eg: >Chr1:106522..118080|
my ($from, $to, $chr, $strand);
($from, $to, $chr) = ($2, $3, $1) if (/^(\S+).*\[([0-9]+),([0-9]+)\]/); #eg: >9311_chr01 (dbseq-nr 0) [101308,114181]
($from, $to, $chr) = ($2, $3, $1) if (/^(\S+)_[0-9]+.*\[([0-9]+),([0-9]+)\]/); #eg: >gi.478805111.gb.AQOG01030080.1_1 (dbseq-nr 173) [4033,8637]
($from, $to, $chr) = ($2, $3, $1) if (/^(\S+)\:([0-9]+)\.\.([0-9]+)\|([0-9]+)\.\.([0-9]+)/); #eg: >Chr1:106522..118080|106502..118100
($from, $to, $chr) = ($2, $3, $1) if (/^(\S+)\|([0-9]+)\.\.([0-9]+)/);#eg: >Chr1|106522..118080
($from, $to, $chr) = ($2, $3, $1) if (/^(\S+)\:([0-9]+)\.\.([0-9]+)\|(\S+)/);#eg: >Chr1:106522..118080|Chr1
($from, $to, $chr) = ($2, $3, $1) if (/^(\S+):([0-9]+)..([0-9]+)\[[12]\]/); #eg: >gi.478805265.gb.AQOG01029926.1:10426..15413[1]
($from, $to, $chr) = ($2, $3, $1) if (/(\S+)\:([0-9]+)\.\.([0-9]+)/); #eg: gi.478789307.gb.AQOG01045884.1:56716..59758 pass motif:AAAG
($from, $to, $chr) = ($2, $3, $1) if (/^(\S+)\:([0-9]+)\.\.([0-9]+)\|/);#eg: >Chr1:106522..118080|

# keep reverse direction to allow - strand inputs
if (defined $from) {
$strand = $from < $to ? "+" : "-";
($chr{"$from..$to"}, $chr{"$to..$from"}) = ([$chr, $strand], [$chr, $strand]) if defined $from;
}
}
}

Expand All @@ -84,7 +91,7 @@
chomp;
s/>//g;
s/\s+//g;
$chr{$i}=$_;
$chr{$i}=[$_, '+'];
$i++;
}
}
Expand All @@ -95,29 +102,31 @@
if (/^\s+$/){next}
if (/^$/){next}
s/^\s+//;
my ($element_start, $element_end, $element_length, $sequence, $lLTR_start, $lLTR_end, $lLTR_length, $rLTR_start, $rLTR_end, $rLTR_length, $lTSD_start, $lTSD_end, $lTSD_motif, $rTSD_start, $rTSD_end, $rTSD_motif, $PPT_start, $PPT_end, $PPT_motif, $PPT_strand, $PPT_offset, $PBS_start, $PBS_end, $PBS_strand, $tRNA, $tRNA_motif, $PBS_offset, $tRNA_offset, $PBS_tRNA_edist, $Protein_domain_hits, $similarity, $seq_ID, $chr);
my ($element_start, $element_end, $element_length, $sequence, $lLTR_start, $lLTR_end, $lLTR_length, $rLTR_start, $rLTR_end, $rLTR_length, $lTSD_start, $lTSD_end, $lTSD_motif, $rTSD_start, $rTSD_end, $rTSD_motif, $PPT_start, $PPT_end, $PPT_motif, $PPT_strand, $PPT_offset, $PBS_start, $PBS_end, $PBS_strand, $tRNA, $tRNA_motif, $PBS_offset, $tRNA_offset, $PBS_tRNA_edist, $Protein_domain_hits, $similarity, $seq_ID, $chr, $strand);

##This is for LTRharvest result analysis
##s(ret) e(ret) l(ret) s(lLTR) e(lLTR) l(lLTR) s(rLTR) e(rLTR) l(rLTR) sim(LTRs) seq-nr
#start end len lLTR_str lLTR_end lLTR_len rLTR_str rLTR_end rLTR_len similarity seqid chr direction TSD lTSD rTSD motif superfamily family age(ya)
#34 4594 4561 34 291 258 4335 4594 260 0.962 + CTCAC 29..33, 4595..4599 TG,TG,CA,CA
($element_start, $element_end, $element_length, $lLTR_start, $lLTR_end, $lLTR_length, $rLTR_start, $rLTR_end, $rLTR_length, $similarity, $seq_ID, $chr)=(split /\s+/, $_);
$strand = "+";

if ($genome==1) {
# if $ARGV[1] is a sequence file, then output all entriess in $ARGV[0], here we try to obtain $chr for LTRharvest entries from the sequence file
# use chr as primary sequence ID and seq_ID as secondary
if (!defined $chr or $chr =~ /^[NA|\.|\-|\+|\?]$/i){ # if $chr is missing, it could be replaced by direction (NA, -, ., +, ?)
if (exists $chr{$seq_ID}){
$chr=$chr{$seq_ID};
($chr, $strand)=($chr{$seq_ID}[0], $chr{$seq_ID}[1]);
} else {
$chr=$seq_ID;
}
}
} else {
# if $ARGV[1] is a list file, then only output entries in $ARGV[1]
$chr=$chr{"$element_start..$element_end"};
($chr, $strand)=($chr{"$element_start..$element_end"}[0], $chr{"$element_start..$element_end"}[1]);
next unless defined $chr;
}
($element_start, $element_end) = ($element_end, $element_start) if $strand eq '-'; # strand sensitive

next unless defined $lLTR_length and defined $rLTR_length;
my $long="NA";
Expand Down
3 changes: 2 additions & 1 deletion bin/gff2bed.pl
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
my ($method, $type, $TE_class, $class, $iden) = (undef, undef, undef, undef, 'NA');
my ($chr, $sequence_ontology, $element_start, $element_end, $score, $strand, $phase, $extra) = (split)[0,2,3,4,5,6,7,8];
next unless defined $chr and defined $element_start;
($element_start, $element_end) = ($element_end, $element_start) if $element_start > $element_end; # fix direction

# get class info for summary categories
$class = $sequence_ontology;
Expand Down Expand Up @@ -61,7 +62,7 @@
$type = $1 if $sequence_ontology =~ /^(.*)\/.*/ and $1 !~ /DNA|MITE/i;

# get assortive structural info
my $TE_ID = "$chr:$element_start..$element_end";
my $TE_ID = $strand eq '-' ? "$chr:$element_end..$element_start" : "$chr:$element_start..$element_end"; # strand sensitive for $TE_ID
$TE_ID = $1 if $extra =~ s/Name=(.*?);//i;
$TE_class = $1 if $extra =~ s/Classification=(.*?);//i;
$iden = $1 if $extra =~ s/ltr_identity=([0-9.e\-]+);//i or $extra =~ s/Identity=([0-9.e\-]+);//i;
Expand Down
3 changes: 1 addition & 2 deletions bin/make_gff3.pl
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@
$rTSD=~s/\.\./\t/;
$element_start=(split /\s+/, $lTSD)[0];
$element_end=(split /\s+/, $rTSD)[1];
my $id = "$chr:$lLTR_start..$rLTR_end";
my $id = $strand eq '-' ? "$chr:$rLTR_end..$lLTR_start" : "$chr:$lLTR_start..$rLTR_end"; # strand sensitive $id
my $so = "LTR_retrotransposon";
$so = "Copia_LTR_retrotransposon" if $supfam eq "Copia";
$so = "Gypsy_LTR_retrotransposon" if $supfam eq "Gypsy";
Expand All @@ -73,7 +73,6 @@
print GFF "##sequence-region $chr_ori 1 $seq_flag{$chr_ori}[1]\n"; #seqence length
delete $seq_flag{$chr_ori};
}
# my $info = "Name=$id;motif=$motif;tsd=$TSD;ltr_identity=$sim;Method=structural";
my $info = "Name=$id;Classification=LTR/$supfam";
my $info2 = "ltr_identity=$sim;Method=structural;motif=$motif;tsd=$TSD";
print GFF "$chr\t$annotator\trepeat_region\t$element_start\t$element_end\t.\t$strand\t.\tID=repeat_region_$i;$info;Sequence_ontology=$SO{'repeat_region'};$info2\n";
Expand Down

0 comments on commit fab6cde

Please sign in to comment.