Skip to content

Commit

Permalink
modified sort_blast_results so that it now can compress individual BL…
Browse files Browse the repository at this point in the history
…AST result files with global binary $SORTBIN

get_homologues.pl and get_homologues-est.pl now compress individual BLAST files by default ($COMPRESSBLAST=1)
added global $MAXSEQLENGTH to get_homologues-est.pl to warn of long sequences, which often cause downstream problems
  • Loading branch information
eead-csic-compbio committed Mar 13, 2018
1 parent f7572b9 commit da27ecd
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 27 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,6 @@
31012018: added the complete.cases(tab) check in plot_matrix_heatmap.sh
01022018: install.pl now can download bin.tar and extract binaries to cloned repos
05022018: increased min complete cases to 5 in plot_matrix_heatmap.sh
06032018: modified sort_blast_results so that it now can compress individual BLAST result files with global binary $SORTBIN
06032018: get_homologues.pl and get_homologues-est.pl now compress individual BLAST files by default ($COMPRESSBLAST=1)
13032018: added global $MAXSEQLENGTH to get_homologues-est.pl to warn of long sequences, which often cause downstream problems (thanks Alvaro Rodriguez)
39 changes: 32 additions & 7 deletions get_homologues-est.pl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env perl

# 2017 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
# 2018 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
# 1: http://www.eead.csic.es/compbio (Estacion Experimental Aula Dei/CSIC/Fundacion ARAID, Spain)
# 2: http://www.ccg.unam.mx/~vinuesa (Center for Genomic Sciences, UNAM, Mexico)

Expand Down Expand Up @@ -39,6 +39,7 @@
my $BATCHSIZE = 1000; # after HS_BLASTN bench

## global variables that control some algorithmic choices
my $MAXSEQLENGTH = 50_000; # used to warn about long sequences, which often cause downstream problems
my $NOFSAMPLESREPORT = 20; # number of genome samples used for the generation of pan/core genomes
my $MAXEVALUEBLASTSEARCH = 0.01; # required to reduce size of blast output files
my $MAXPFAMSEQS = 250; # default for -m cluster jobs, it is updated to 1000 with -m local
Expand All @@ -49,6 +50,9 @@
my $MINSEQLENGTH = 20; # min length for input sequences to be considered (~ primer or miRNA)
my $NOCLOUDINCORE = 1; # when calling -M -c -t X initial core/pan size excludes cloud genes, those with occup < X (alternative 0)
my $INCLUDEORDER = 0; # use implicit -I taxon order for -c composition analyses
my $KEEPSCNDHSPS = 1; # by default only the first BLAST hsp is kept for further calculations
my $COMPRESSBLAST = 1; # save disk space by compressing BLASTN results, requires gzip


## list of features/binaries required by this program (do not edit)
my @FEATURES2CHECK = ('EXE_BLASTN','EXE_FORMATDB','EXE_MCL','EXE_HMMPFAM',
Expand Down Expand Up @@ -672,10 +676,22 @@
$full_length_file{basename($infile).'.nucl'} = 1;
}

# calculate median sequence length
foreach $seq ( 0 .. $#{$fasta_ref} ){ push(@fasta_length,length($fasta_ref->[$seq][SEQ])) }
# calculate median sequence length and warn if too-long sequences are found
my ($total_long_seqs,$len) = (0,0);
foreach $seq ( 0 .. $#{$fasta_ref} )
{
$len = length($fasta_ref->[$seq][SEQ]);
push(@fasta_length,$len);

if($len > $MAXSEQLENGTH){ $total_long_seqs++ }
}
@fasta_length = sort {$a<=>$b} @fasta_length;
printf(" median length = %d",$fasta_length[int($#fasta_length/2)]);
printf(" median length = %d",$fasta_length[int($#fasta_length/2)]);

if($total_long_seqs > 0)
{
printf(" WARNING: %d seqs > $MAXSEQLENGTH b",$total_long_seqs)
}

print "\n";

Expand Down Expand Up @@ -1054,6 +1070,11 @@
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}
elsif($COMPRESSBLAST && -s $blastout.'.gz')
{
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}

$command = format_BLASTN_command("$newDIR/$new_infile",
$blastout,"$newDIR/$blastDBfile",
Expand Down Expand Up @@ -1086,22 +1107,26 @@
print "# done\n\n";

# concat blast output files to $blast_file (global var)
# compress BLAST files optionally
if(@tmp_blast_output_files)
{
print "# concatenating and sorting blast results...\n";
foreach $blastout (@tmp_blast_output_files)
{
if(!-e $blastout)
if(!-e $blastout && !-e $blastout.'.gz')
{
sleep($QUEUEWAIT); # give disk extra time
if(!-e $blastout)
if(!-e $blastout && !-e $blastout.'.gz')
{
die "# EXIT, $blastout does not exist, BLAST search might failed ".
"or hard drive is still writing it (please re-run)\n";
}
}
}
sort_blast_results($blast_file,1,@tmp_blast_output_files); # 1 -> secondary hsp are kept, as they jump over introns

# secondary hsp are kept, as they jump over introns
sort_blast_results($blast_file,$KEEPSCNDHSPS,$COMPRESSBLAST,@tmp_blast_output_files);

print "# done\n\n";
unlink(@to_be_deleted); # remove .queue files
}
Expand Down
42 changes: 30 additions & 12 deletions get_homologues.pl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env perl

# 2017 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
# 2018 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
# 1: http://www.eead.csic.es/compbio (Estacion Experimental Aula Dei/CSIC/Fundacion ARAID, Spain)
# 2: http://www.ccg.unam.mx/~vinuesa (Center for Genomic Sciences, UNAM, Mexico)

Expand Down Expand Up @@ -39,15 +39,16 @@
my $BATCHSIZE = 100;

## global variables that control some algorithmic choices
my $NOFSAMPLESREPORT = 10; # number of genome samples used for the generation of pan/core genomes
my $MAXEVALUEBLASTSEARCH = 0.01; # required to reduce size of blast output files
my $MAXPFAMSEQS = 250; # default for -m cluster jobs, it is updated to 1000 with -m local
my $MININTERGENESIZE = 200; # minimum length (nts) required for intergenic segments to be considered
my $NOFSAMPLESREPORT = 10; # number of genome samples used for the generation of pan/core genomes
my $MAXEVALUEBLASTSEARCH = 0.01; # required to reduce size of blast output files
my $MAXPFAMSEQS = 250; # default for -m cluster jobs, it is updated to 1000 with -m local
my $MININTERGENESIZE = 200; # minimum length (nts) required for intergenic segments to be considered
my $MAXINTERGENESIZE = 700;
my $INTERGENEFLANKORF = 180; # length in nts of intergene flanks borrowed from neighbor ORFs (/3 for amino acids)
my $PRINTCLUSTERSSCREEN = 0; # whether cluster names should be printed to screen
my $KEEPSCNDHSPS = 1; # by default only the first BLAST hsp is kept for further calculations
my $DISSABLEPRND = 1; # disable paranoid (PRND, -P) options due to no redistribution license;
my $INTERGENEFLANKORF = 180; # length in nts of intergene flanks borrowed from neighbor ORFs (/3 for amino acids)
my $PRINTCLUSTERSSCREEN = 0; # whether cluster names should be printed to screen
my $KEEPSCNDHSPS = 1; # by default only the first BLAST hsp is kept for further calculations
my $COMPRESSBLAST = 1; # set to 0 if gzip is not available
my $DISSABLEPRND = 1; # disable paranoid (PRND, -P) options due to no redistribution license;


## list of features/binaries required by this program (do not edit)
Expand Down Expand Up @@ -1383,6 +1384,11 @@
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}
elsif($COMPRESSBLAST && -s $blastout.'.gz')
{
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}

$command = format_DIAMONDblastp_command("$newDIR/$new_infile",
$blastout,$blastDBfile,$MAXEVALUEBLASTSEARCH,$psize{$infile});
Expand Down Expand Up @@ -1423,6 +1429,11 @@
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}
elsif($COMPRESSBLAST && -s $blastout.'.gz' && $current_files eq $previous_files)
{
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}

$command = format_DIAMONDblastp_command("$newDIR/$new_infile",
$blastout,$blastDBfile,$MAXEVALUEBLASTSEARCH,$n_of_sequences);
Expand Down Expand Up @@ -1471,11 +1482,17 @@
next if( ($bfile1 != $reference_proteome && $bfile2 != $reference_proteome) &&
($bfile1 != $bfile2) );#print "$bfile1 $bfile2 $new_infile $blastDBfile\n";
}

if(-s $blastout) # check previous BLAST runs
{
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}
elsif($COMPRESSBLAST && -s $blastout.'.gz')
{
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}

if($do_features)
{
Expand Down Expand Up @@ -1518,15 +1535,16 @@
print "# done\n\n";

# concatenate blast output files to $blast_file (global var)
# compress BLAST files optionally
if(@tmp_blast_output_files)
{
print "# concatenating and sorting BLAST/DIAMOND results...\n";
foreach $blastout (@tmp_blast_output_files)
{
if(!-e $blastout)
if(!-e $blastout && !-e $blastout.'.gz')
{
sleep($QUEUEWAIT); # give disk extra time
if(!-e $blastout)
if(!-e $blastout && !-e $blastout.'.gz')
{
die "# EXIT, $blastout does not exist, BLAST/DIAMOND search might failed ".
"or hard drive is still writing it (please re-run)\n";
Expand All @@ -1535,7 +1553,7 @@
}

# NxN BLAST jobs OR 2xN diamond jobs (for N input files)
sort_blast_results($blast_file,$KEEPSCNDHSPS,@tmp_blast_output_files);
sort_blast_results($blast_file,$KEEPSCNDHSPS,$COMPRESSBLAST,@tmp_blast_output_files);

print "# done\n\n";
unlink(@to_be_deleted); # remove .queue files
Expand Down
37 changes: 29 additions & 8 deletions lib/marfil_homology.pm
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ our @EXPORT = qw(
# executable software
our $SORTLIMITRAM = "500M"; # buffer size should fit in all computers
our $SORTBIN = "sort --buffer-size=$SORTLIMITRAM"; # sort is part of all Linux systems, otherwise edit
our $GZIPBIN = "gzip";

# all these defined in phyTools.pm:
our $BLASTP = $ENV{"EXE_BLASTP"};
Expand Down Expand Up @@ -722,15 +723,17 @@ sub pfam_parse
}

# sorts and merges partial BLAST/DIAMOND results using GNU sort
# three arguments:
# optionally also compresses results as a side-effect
# arguments:
# 1. String Variable: blast out file
# 2. boolean flag stating whether secondary hsps are to be conserved
# 3. array of sorted blast output filenames
# uses globals: $TMP_DIR , $SORTLIMITRAM
# Updated Mar2015
# 3. boolean flag stating whether BLAST outfiles are to be compressed
# 4. array of sorted blast output filenames
# uses globals: $TMP_DIR , $SORTLIMITRAM, $SORTBIN, $GZIPBIN
# Updated Mar2018
sub sort_blast_results
{
my ($sorted_outfile,$keep_secondary_hsps,@blast_outfiles) = @_;
my ($sorted_outfile,$keep_secondary_hsps,$compress_blast,@blast_outfiles) = @_;

my ($file,$size,$files,$root,$cleantmpfile,$tmpfile,$sortedtmpfile);
my (@tmpfiles,@roots,%cluster);
Expand All @@ -750,14 +753,32 @@ sub sort_blast_results
unshift(@{$cluster{$root}},$cleantmpfile);
}
else{ push(@{$cluster{$root}},$cleantmpfile); }

if($compress_blast)
{
if(!-s $file.'.gz')
{
system("$GZIPBIN $file");
if($? != 0)
{
die "# sort_blast_results : cannot compress $file\n";
}
}

open(ORIG,"$GZIPBIN -dc $file |") || die "# sort_blast_results : cannot read $file.gz\n";
}
else
{
open(ORIG,$file) || die "# sort_blast_results : cannot read $file\n";
}

# conserve secondary hsps attached to first one to avoid breaking ordered BLAST output
# 90695 5112 70.57 1043 176 17 1 940 248 1262 0.0 1358
# 90695 5112 24.88 422 223 16 183 544 229 616 2e-09 65.5
# 90695 5279 60.49 1030 286 16 1 939 248 1247 0.0 1136
my ($pqry,$psbj,$qry,$sbj) = ('-1','-1');
open(CLEANBLAST,">$cleantmpfile") || die "# sort_blast_results : cannot create $cleantmpfile\n";
open(ORIG,$file) || die "# sort_blast_results : cannot read $file\n";

while(<ORIG>)
{
chomp;
Expand All @@ -780,8 +801,8 @@ sub sort_blast_results
if($keep_secondary_hsps){ print CLEANBLAST "\n"; } # add last newline if required

close(CLEANBLAST);
push(@tmpfiles,$cleantmpfile);

push(@tmpfiles,$cleantmpfile);
}

foreach $root (@roots)
Expand Down

0 comments on commit da27ecd

Please sign in to comment.