Skip to content

Commit

Permalink
modified sort_blast_results so that it now can compress individual BL…
Browse files Browse the repository at this point in the history
…AST result files with global binary $SORTBIN

get_homologues.pl and get_homologues-est.pl now compress individual BLAST files by default ($COMPRESSBLAST=1)
added global $MAXSEQLENGTH to get_homologues-est.pl to warn of long sequences, which often cause downstream problems
  • Loading branch information
eead-csic-compbio committed Mar 13, 2018
1 parent f7572b9 commit da27ecd
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 27 deletions.
3 changes: 3 additions & 0 deletions CHANGES.txt
Original file line number Diff line number Diff line change
Expand Up @@ -248,3 +248,6 @@
31012018: added the complete.cases(tab) check in plot_matrix_heatmap.sh
01022018: install.pl now can download bin.tar and extract binaries to cloned repos
05022018: increased min complete cases to 5 in plot_matrix_heatmap.sh
06032018: modified sort_blast_results so that it now can compress individual BLAST result files with global binary $SORTBIN
06032018: get_homologues.pl and get_homologues-est.pl now compress individual BLAST files by default ($COMPRESSBLAST=1)
13032018: added global $MAXSEQLENGTH to get_homologues-est.pl to warn of long sequences, which often cause downstream problems (thanks Alvaro Rodriguez)
39 changes: 32 additions & 7 deletions get_homologues-est.pl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env perl

# 2017 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
# 2018 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
# 1: http://www.eead.csic.es/compbio (Estacion Experimental Aula Dei/CSIC/Fundacion ARAID, Spain)
# 2: http://www.ccg.unam.mx/~vinuesa (Center for Genomic Sciences, UNAM, Mexico)

Expand Down Expand Up @@ -39,6 +39,7 @@
my $BATCHSIZE = 1000; # after HS_BLASTN bench

## global variables that control some algorithmic choices
my $MAXSEQLENGTH = 50_000; # used to warn about long sequences, which often cause downstream problems
my $NOFSAMPLESREPORT = 20; # number of genome samples used for the generation of pan/core genomes
my $MAXEVALUEBLASTSEARCH = 0.01; # required to reduce size of blast output files
my $MAXPFAMSEQS = 250; # default for -m cluster jobs, it is updated to 1000 with -m local
Expand All @@ -49,6 +50,9 @@
my $MINSEQLENGTH = 20; # min length for input sequences to be considered (~ primer or miRNA)
my $NOCLOUDINCORE = 1; # when calling -M -c -t X initial core/pan size excludes cloud genes, those with occup < X (alternative 0)
my $INCLUDEORDER = 0; # use implicit -I taxon order for -c composition analyses
my $KEEPSCNDHSPS = 1; # by default only the first BLAST hsp is kept for further calculations
my $COMPRESSBLAST = 1; # save disk space by compressing BLASTN results, requires gzip


## list of features/binaries required by this program (do not edit)
my @FEATURES2CHECK = ('EXE_BLASTN','EXE_FORMATDB','EXE_MCL','EXE_HMMPFAM',
Expand Down Expand Up @@ -672,10 +676,22 @@
$full_length_file{basename($infile).'.nucl'} = 1;
}

# calculate median sequence length
foreach $seq ( 0 .. $#{$fasta_ref} ){ push(@fasta_length,length($fasta_ref->[$seq][SEQ])) }
# calculate median sequence length and warn if too-long sequences are found
my ($total_long_seqs,$len) = (0,0);
foreach $seq ( 0 .. $#{$fasta_ref} )
{
$len = length($fasta_ref->[$seq][SEQ]);
push(@fasta_length,$len);

if($len > $MAXSEQLENGTH){ $total_long_seqs++ }
}
@fasta_length = sort {$a<=>$b} @fasta_length;
printf(" median length = %d",$fasta_length[int($#fasta_length/2)]);
printf(" median length = %d",$fasta_length[int($#fasta_length/2)]);

if($total_long_seqs > 0)
{
printf(" WARNING: %d seqs > $MAXSEQLENGTH b",$total_long_seqs)
}

print "\n";

Expand Down Expand Up @@ -1054,6 +1070,11 @@
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}
elsif($COMPRESSBLAST && -s $blastout.'.gz')
{
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}

$command = format_BLASTN_command("$newDIR/$new_infile",
$blastout,"$newDIR/$blastDBfile",
Expand Down Expand Up @@ -1086,22 +1107,26 @@
print "# done\n\n";

# concat blast output files to $blast_file (global var)
# compress BLAST files optionally
if(@tmp_blast_output_files)
{
print "# concatenating and sorting blast results...\n";
foreach $blastout (@tmp_blast_output_files)
{
if(!-e $blastout)
if(!-e $blastout && !-e $blastout.'.gz')
{
sleep($QUEUEWAIT); # give disk extra time
if(!-e $blastout)
if(!-e $blastout && !-e $blastout.'.gz')
{
die "# EXIT, $blastout does not exist, BLAST search might failed ".
"or hard drive is still writing it (please re-run)\n";
}
}
}
sort_blast_results($blast_file,1,@tmp_blast_output_files); # 1 -> secondary hsp are kept, as they jump over introns

# secondary hsp are kept, as they jump over introns
sort_blast_results($blast_file,$KEEPSCNDHSPS,$COMPRESSBLAST,@tmp_blast_output_files);

print "# done\n\n";
unlink(@to_be_deleted); # remove .queue files
}
Expand Down
42 changes: 30 additions & 12 deletions get_homologues.pl
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env perl

# 2017 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
# 2018 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
# 1: http://www.eead.csic.es/compbio (Estacion Experimental Aula Dei/CSIC/Fundacion ARAID, Spain)
# 2: http://www.ccg.unam.mx/~vinuesa (Center for Genomic Sciences, UNAM, Mexico)

Expand Down Expand Up @@ -39,15 +39,16 @@
my $BATCHSIZE = 100;

## global variables that control some algorithmic choices
my $NOFSAMPLESREPORT = 10; # number of genome samples used for the generation of pan/core genomes
my $MAXEVALUEBLASTSEARCH = 0.01; # required to reduce size of blast output files
my $MAXPFAMSEQS = 250; # default for -m cluster jobs, it is updated to 1000 with -m local
my $MININTERGENESIZE = 200; # minimum length (nts) required for intergenic segments to be considered
my $NOFSAMPLESREPORT = 10; # number of genome samples used for the generation of pan/core genomes
my $MAXEVALUEBLASTSEARCH = 0.01; # required to reduce size of blast output files
my $MAXPFAMSEQS = 250; # default for -m cluster jobs, it is updated to 1000 with -m local
my $MININTERGENESIZE = 200; # minimum length (nts) required for intergenic segments to be considered
my $MAXINTERGENESIZE = 700;
my $INTERGENEFLANKORF = 180; # length in nts of intergene flanks borrowed from neighbor ORFs (/3 for amino acids)
my $PRINTCLUSTERSSCREEN = 0; # whether cluster names should be printed to screen
my $KEEPSCNDHSPS = 1; # by default only the first BLAST hsp is kept for further calculations
my $DISSABLEPRND = 1; # disable paranoid (PRND, -P) options due to no redistribution license;
my $INTERGENEFLANKORF = 180; # length in nts of intergene flanks borrowed from neighbor ORFs (/3 for amino acids)
my $PRINTCLUSTERSSCREEN = 0; # whether cluster names should be printed to screen
my $KEEPSCNDHSPS = 1; # by default only the first BLAST hsp is kept for further calculations
my $COMPRESSBLAST = 1; # set to 0 if gzip is not available
my $DISSABLEPRND = 1; # disable paranoid (PRND, -P) options due to no redistribution license;


## list of features/binaries required by this program (do not edit)
Expand Down Expand Up @@ -1383,6 +1384,11 @@
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}
elsif($COMPRESSBLAST && -s $blastout.'.gz')
{
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}

$command = format_DIAMONDblastp_command("$newDIR/$new_infile",
$blastout,$blastDBfile,$MAXEVALUEBLASTSEARCH,$psize{$infile});
Expand Down Expand Up @@ -1423,6 +1429,11 @@
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}
elsif($COMPRESSBLAST && -s $blastout.'.gz' && $current_files eq $previous_files)
{
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}

$command = format_DIAMONDblastp_command("$newDIR/$new_infile",
$blastout,$blastDBfile,$MAXEVALUEBLASTSEARCH,$n_of_sequences);
Expand Down Expand Up @@ -1471,11 +1482,17 @@
next if( ($bfile1 != $reference_proteome && $bfile2 != $reference_proteome) &&
($bfile1 != $bfile2) );#print "$bfile1 $bfile2 $new_infile $blastDBfile\n";
}

if(-s $blastout) # check previous BLAST runs
{
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}
elsif($COMPRESSBLAST && -s $blastout.'.gz')
{
if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
next;
}

if($do_features)
{
Expand Down Expand Up @@ -1518,15 +1535,16 @@
print "# done\n\n";

# concatenate blast output files to $blast_file (global var)
# compress BLAST files optionally
if(@tmp_blast_output_files)
{
print "# concatenating and sorting BLAST/DIAMOND results...\n";
foreach $blastout (@tmp_blast_output_files)
{
if(!-e $blastout)
if(!-e $blastout && !-e $blastout.'.gz')
{
sleep($QUEUEWAIT); # give disk extra time
if(!-e $blastout)
if(!-e $blastout && !-e $blastout.'.gz')
{
die "# EXIT, $blastout does not exist, BLAST/DIAMOND search might failed ".
"or hard drive is still writing it (please re-run)\n";
Expand All @@ -1535,7 +1553,7 @@
}

# NxN BLAST jobs OR 2xN diamond jobs (for N input files)
sort_blast_results($blast_file,$KEEPSCNDHSPS,@tmp_blast_output_files);
sort_blast_results($blast_file,$KEEPSCNDHSPS,$COMPRESSBLAST,@tmp_blast_output_files);

print "# done\n\n";
unlink(@to_be_deleted); # remove .queue files
Expand Down
37 changes: 29 additions & 8 deletions lib/marfil_homology.pm
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ our @EXPORT = qw(
# executable software
our $SORTLIMITRAM = "500M"; # buffer size should fit in all computers
our $SORTBIN = "sort --buffer-size=$SORTLIMITRAM"; # sort is part of all Linux systems, otherwise edit
our $GZIPBIN = "gzip";

# all these defined in phyTools.pm:
our $BLASTP = $ENV{"EXE_BLASTP"};
Expand Down Expand Up @@ -722,15 +723,17 @@ sub pfam_parse
}

# sorts and merges partial BLAST/DIAMOND results using GNU sort
# three arguments:
# optionally also compresses results as a side-effect
# arguments:
# 1. String Variable: blast out file
# 2. boolean flag stating whether secondary hsps are to be conserved
# 3. array of sorted blast output filenames
# uses globals: $TMP_DIR , $SORTLIMITRAM
# Updated Mar2015
# 3. boolean flag stating whether BLAST outfiles are to be compressed
# 4. array of sorted blast output filenames
# uses globals: $TMP_DIR , $SORTLIMITRAM, $SORTBIN, $GZIPBIN
# Updated Mar2018
sub sort_blast_results
{
my ($sorted_outfile,$keep_secondary_hsps,@blast_outfiles) = @_;
my ($sorted_outfile,$keep_secondary_hsps,$compress_blast,@blast_outfiles) = @_;

my ($file,$size,$files,$root,$cleantmpfile,$tmpfile,$sortedtmpfile);
my (@tmpfiles,@roots,%cluster);
Expand All @@ -750,14 +753,32 @@ sub sort_blast_results
unshift(@{$cluster{$root}},$cleantmpfile);
}
else{ push(@{$cluster{$root}},$cleantmpfile); }

if($compress_blast)
{
if(!-s $file.'.gz')
{
system("$GZIPBIN $file");
if($? != 0)
{
die "# sort_blast_results : cannot compress $file\n";
}
}

open(ORIG,"$GZIPBIN -dc $file |") || die "# sort_blast_results : cannot read $file.gz\n";
}
else
{
open(ORIG,$file) || die "# sort_blast_results : cannot read $file\n";
}

# conserve secondary hsps attached to first one to avoid breaking ordered BLAST output
# 90695 5112 70.57 1043 176 17 1 940 248 1262 0.0 1358
# 90695 5112 24.88 422 223 16 183 544 229 616 2e-09 65.5
# 90695 5279 60.49 1030 286 16 1 939 248 1247 0.0 1136
my ($pqry,$psbj,$qry,$sbj) = ('-1','-1');
open(CLEANBLAST,">$cleantmpfile") || die "# sort_blast_results : cannot create $cleantmpfile\n";
open(ORIG,$file) || die "# sort_blast_results : cannot read $file\n";

while(<ORIG>)
{
chomp;
Expand All @@ -780,8 +801,8 @@ sub sort_blast_results
if($keep_secondary_hsps){ print CLEANBLAST "\n"; } # add last newline if required

close(CLEANBLAST);
push(@tmpfiles,$cleantmpfile);

push(@tmpfiles,$cleantmpfile);
}

foreach $root (@roots)
Expand Down

0 comments on commit da27ecd

Please sign in to comment.