diff --git a/bin/LAI_calc3.pl b/bin/LAI_calc3.pl index f7ce447..eff940d 100644 --- a/bin/LAI_calc3.pl +++ b/bin/LAI_calc3.pl @@ -77,6 +77,7 @@ s/^\s+//; my ($chr, $from, $to)=(split)[0,1,2]; next unless exists $intact{$chr}; + ($from, $to) = ($to, $from) if $from > $to; # fix direction my $len=$to-$from+1; substr($intact{$chr}, $from-1, $len)="i" x $len; #substitute '0' with 'i' where intact LTR-RT is occurred } diff --git a/bin/LTR.identifier.pl b/bin/LTR.identifier.pl index 7503e27..4e91a4c 100644 --- a/bin/LTR.identifier.pl +++ b/bin/LTR.identifier.pl @@ -455,6 +455,7 @@ () $info[17]="NA" unless defined $info[17]; #order $info[18]="unknown" unless defined $info[18]; #superfamily $info[19]="NA" unless defined $info[19]; #age (ya) + $info[19]="0" if $info[19] eq "-0"; ##TSD control, boundary control, MISC control, and reporting my $internal=($ltr1_e+1)."..".($ltr2_s-1); @@ -477,7 +478,8 @@ () } #last four variables: strand/superfamily/order/age - my $defalse = "$chr:$ltr1_s..$ltr2_e\t$decision\tmotif:$motif\tTSD:$TSD\tIN:$internal\t$info[9]\t$info[12]\t$info[18]\t$info[17]\t$info[19] + my $locus = $info[12] eq '-' ? "$chr:$ltr2_e..$ltr1_s" : "$chr:$ltr1_s..$ltr2_e"; + my $defalse = "$locus\t$decision\tmotif:$motif\tTSD:$TSD\tIN:$internal\t$info[9]\t$info[12]\t$info[18]\t$info[17]\t$info[19] Adjust: $adjust\tlLTR: $ll\trLTR: $rl Alignment regions: $s_start, $s_end, $q_start, $q_end LTR coordinates: $ltr1_s, $ltr1_e, $ltr2_s, $ltr2_e diff --git a/bin/get_range.pl b/bin/get_range.pl index d352949..6c91f97 100644 --- a/bin/get_range.pl +++ b/bin/get_range.pl @@ -64,14 +64,21 @@ open FA, "<$ARGV[1]" or die "ERROR: $!"; while (){ s/>//; - $chr{"$2..$3"}=$1 if (/^(\S+).*\[([0-9]+),([0-9]+)\]/); #eg: >9311_chr01 (dbseq-nr 0) [101308,114181] - $chr{"$2..$3"}=$1 if (/^(\S+)_[0-9]+.*\[([0-9]+),([0-9]+)\]/); #eg: >gi.478805111.gb.AQOG01030080.1_1 (dbseq-nr 173) [4033,8637] - $chr{"$2..$3"}=$1 if (/^(\S+)\:([0-9]+)\.\.([0-9]+)\|([0-9]+)\.\.([0-9]+)/); #eg: >Chr1:106522..118080|106502..118100 - $chr{"$2..$3"}=$1 if (/^(\S+)\|([0-9]+)\.\.([0-9]+)/);#eg: >Chr1|106522..118080 - $chr{"$2..$3"}=$1 if (/^(\S+)\:([0-9]+)\.\.([0-9]+)\|(\S+)/);#eg: >Chr1:106522..118080|Chr1 - $chr{"$2..$3"}=$1 if (/^(\S+):([0-9]+)..([0-9]+)\[[12]\]/); #eg: >gi.478805265.gb.AQOG01029926.1:10426..15413[1] - $chr{"$2..$3"}=$1 if (/(\S+)\:([0-9]+)\.\.([0-9]+)/); #eg: gi.478789307.gb.AQOG01045884.1:56716..59758 pass motif:AAAG TSD:TGAAG - $chr{"$2..$3"}=$1 if (/^(\S+)\:([0-9]+)\.\.([0-9]+)\|/);#eg: >Chr1:106522..118080| + my ($from, $to, $chr, $strand); + ($from, $to, $chr) = ($2, $3, $1) if (/^(\S+).*\[([0-9]+),([0-9]+)\]/); #eg: >9311_chr01 (dbseq-nr 0) [101308,114181] + ($from, $to, $chr) = ($2, $3, $1) if (/^(\S+)_[0-9]+.*\[([0-9]+),([0-9]+)\]/); #eg: >gi.478805111.gb.AQOG01030080.1_1 (dbseq-nr 173) [4033,8637] + ($from, $to, $chr) = ($2, $3, $1) if (/^(\S+)\:([0-9]+)\.\.([0-9]+)\|([0-9]+)\.\.([0-9]+)/); #eg: >Chr1:106522..118080|106502..118100 + ($from, $to, $chr) = ($2, $3, $1) if (/^(\S+)\|([0-9]+)\.\.([0-9]+)/);#eg: >Chr1|106522..118080 + ($from, $to, $chr) = ($2, $3, $1) if (/^(\S+)\:([0-9]+)\.\.([0-9]+)\|(\S+)/);#eg: >Chr1:106522..118080|Chr1 + ($from, $to, $chr) = ($2, $3, $1) if (/^(\S+):([0-9]+)..([0-9]+)\[[12]\]/); #eg: >gi.478805265.gb.AQOG01029926.1:10426..15413[1] + ($from, $to, $chr) = ($2, $3, $1) if (/(\S+)\:([0-9]+)\.\.([0-9]+)/); #eg: gi.478789307.gb.AQOG01045884.1:56716..59758 pass motif:AAAG + ($from, $to, $chr) = ($2, $3, $1) if (/^(\S+)\:([0-9]+)\.\.([0-9]+)\|/);#eg: >Chr1:106522..118080| + + # keep reverse direction to allow - strand inputs + if (defined $from) { + $strand = $from < $to ? "+" : "-"; + ($chr{"$from..$to"}, $chr{"$to..$from"}) = ([$chr, $strand], [$chr, $strand]) if defined $from; + } } } @@ -84,7 +91,7 @@ chomp; s/>//g; s/\s+//g; - $chr{$i}=$_; + $chr{$i}=[$_, '+']; $i++; } } @@ -95,29 +102,31 @@ if (/^\s+$/){next} if (/^$/){next} s/^\s+//; - my ($element_start, $element_end, $element_length, $sequence, $lLTR_start, $lLTR_end, $lLTR_length, $rLTR_start, $rLTR_end, $rLTR_length, $lTSD_start, $lTSD_end, $lTSD_motif, $rTSD_start, $rTSD_end, $rTSD_motif, $PPT_start, $PPT_end, $PPT_motif, $PPT_strand, $PPT_offset, $PBS_start, $PBS_end, $PBS_strand, $tRNA, $tRNA_motif, $PBS_offset, $tRNA_offset, $PBS_tRNA_edist, $Protein_domain_hits, $similarity, $seq_ID, $chr); + my ($element_start, $element_end, $element_length, $sequence, $lLTR_start, $lLTR_end, $lLTR_length, $rLTR_start, $rLTR_end, $rLTR_length, $lTSD_start, $lTSD_end, $lTSD_motif, $rTSD_start, $rTSD_end, $rTSD_motif, $PPT_start, $PPT_end, $PPT_motif, $PPT_strand, $PPT_offset, $PBS_start, $PBS_end, $PBS_strand, $tRNA, $tRNA_motif, $PBS_offset, $tRNA_offset, $PBS_tRNA_edist, $Protein_domain_hits, $similarity, $seq_ID, $chr, $strand); ##This is for LTRharvest result analysis ##s(ret) e(ret) l(ret) s(lLTR) e(lLTR) l(lLTR) s(rLTR) e(rLTR) l(rLTR) sim(LTRs) seq-nr #start end len lLTR_str lLTR_end lLTR_len rLTR_str rLTR_end rLTR_len similarity seqid chr direction TSD lTSD rTSD motif superfamily family age(ya) #34 4594 4561 34 291 258 4335 4594 260 0.962 + CTCAC 29..33, 4595..4599 TG,TG,CA,CA ($element_start, $element_end, $element_length, $lLTR_start, $lLTR_end, $lLTR_length, $rLTR_start, $rLTR_end, $rLTR_length, $similarity, $seq_ID, $chr)=(split /\s+/, $_); + $strand = "+"; if ($genome==1) { # if $ARGV[1] is a sequence file, then output all entriess in $ARGV[0], here we try to obtain $chr for LTRharvest entries from the sequence file # use chr as primary sequence ID and seq_ID as secondary if (!defined $chr or $chr =~ /^[NA|\.|\-|\+|\?]$/i){ # if $chr is missing, it could be replaced by direction (NA, -, ., +, ?) if (exists $chr{$seq_ID}){ - $chr=$chr{$seq_ID}; + ($chr, $strand)=($chr{$seq_ID}[0], $chr{$seq_ID}[1]); } else { $chr=$seq_ID; } } } else { # if $ARGV[1] is a list file, then only output entries in $ARGV[1] - $chr=$chr{"$element_start..$element_end"}; + ($chr, $strand)=($chr{"$element_start..$element_end"}[0], $chr{"$element_start..$element_end"}[1]); next unless defined $chr; } + ($element_start, $element_end) = ($element_end, $element_start) if $strand eq '-'; # strand sensitive next unless defined $lLTR_length and defined $rLTR_length; my $long="NA"; diff --git a/bin/gff2bed.pl b/bin/gff2bed.pl index 0d44126..269885d 100644 --- a/bin/gff2bed.pl +++ b/bin/gff2bed.pl @@ -22,6 +22,7 @@ my ($method, $type, $TE_class, $class, $iden) = (undef, undef, undef, undef, 'NA'); my ($chr, $sequence_ontology, $element_start, $element_end, $score, $strand, $phase, $extra) = (split)[0,2,3,4,5,6,7,8]; next unless defined $chr and defined $element_start; + ($element_start, $element_end) = ($element_end, $element_start) if $element_start > $element_end; # fix direction # get class info for summary categories $class = $sequence_ontology; @@ -61,7 +62,7 @@ $type = $1 if $sequence_ontology =~ /^(.*)\/.*/ and $1 !~ /DNA|MITE/i; # get assortive structural info - my $TE_ID = "$chr:$element_start..$element_end"; + my $TE_ID = $strand eq '-' ? "$chr:$element_end..$element_start" : "$chr:$element_start..$element_end"; # strand sensitive for $TE_ID $TE_ID = $1 if $extra =~ s/Name=(.*?);//i; $TE_class = $1 if $extra =~ s/Classification=(.*?);//i; $iden = $1 if $extra =~ s/ltr_identity=([0-9.e\-]+);//i or $extra =~ s/Identity=([0-9.e\-]+);//i; diff --git a/bin/make_gff3.pl b/bin/make_gff3.pl index e72ad85..9479fb0 100644 --- a/bin/make_gff3.pl +++ b/bin/make_gff3.pl @@ -60,7 +60,7 @@ $rTSD=~s/\.\./\t/; $element_start=(split /\s+/, $lTSD)[0]; $element_end=(split /\s+/, $rTSD)[1]; - my $id = "$chr:$lLTR_start..$rLTR_end"; + my $id = $strand eq '-' ? "$chr:$rLTR_end..$lLTR_start" : "$chr:$lLTR_start..$rLTR_end"; # strand sensitive $id my $so = "LTR_retrotransposon"; $so = "Copia_LTR_retrotransposon" if $supfam eq "Copia"; $so = "Gypsy_LTR_retrotransposon" if $supfam eq "Gypsy"; @@ -73,7 +73,6 @@ print GFF "##sequence-region $chr_ori 1 $seq_flag{$chr_ori}[1]\n"; #seqence length delete $seq_flag{$chr_ori}; } -# my $info = "Name=$id;motif=$motif;tsd=$TSD;ltr_identity=$sim;Method=structural"; my $info = "Name=$id;Classification=LTR/$supfam"; my $info2 = "ltr_identity=$sim;Method=structural;motif=$motif;tsd=$TSD"; print GFF "$chr\t$annotator\trepeat_region\t$element_start\t$element_end\t.\t$strand\t.\tID=repeat_region_$i;$info;Sequence_ontology=$SO{'repeat_region'};$info2\n";