diff --git a/CHANGES.txt b/CHANGES.txt
index 836ecc3..d301e5f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -248,3 +248,6 @@
 31012018: added the complete.cases(tab) check in plot_matrix_heatmap.sh
 01022018: install.pl now can download bin.tar and extract binaries to cloned repos
 05022018: increased min complete cases to 5 in plot_matrix_heatmap.sh
+06032018: modified sort_blast_results so that it now can compress individual BLAST result files with global binary $GZIPBIN
+06032018: get_homologues.pl and get_homologues-est.pl now compress individual BLAST files by default ($COMPRESSBLAST=1)
+13032018: added global $MAXSEQLENGTH to get_homologues-est.pl to warn of long sequences, which often cause downstream problems (thanks Alvaro Rodriguez)
diff --git a/get_homologues-est.pl b/get_homologues-est.pl
index 8e6a0df..4d1bd50 100755
--- a/get_homologues-est.pl
+++ b/get_homologues-est.pl
@@ -1,6 +1,6 @@
 #!/usr/bin/env perl

-# 2017 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
+# 2018 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
 # 1: http://www.eead.csic.es/compbio (Estacion Experimental Aula Dei/CSIC/Fundacion ARAID, Spain)
 # 2: http://www.ccg.unam.mx/~vinuesa (Center for Genomic Sciences, UNAM, Mexico)

@@ -39,6 +39,7 @@
 my $BATCHSIZE = 1000; # after HS_BLASTN bench

 ## global variables that control some algorithmic choices
+my $MAXSEQLENGTH = 50_000;       # used to warn about long sequences, which often cause downstream problems
 my $NOFSAMPLESREPORT = 20;       # number of genome samples used for the generation of pan/core genomes
 my $MAXEVALUEBLASTSEARCH = 0.01; # required to reduce size of blast output files
 my $MAXPFAMSEQS = 250;           # default for -m cluster jobs, it is updated to 1000 with -m local
@@ -49,6 +50,9 @@
 my $MINSEQLENGTH = 20;           # min length for input sequences to be considered (~ primer or miRNA)
 my $NOCLOUDINCORE = 1;           # when calling -M -c -t X initial core/pan size excludes cloud genes, those with occup < X (alternative 0)
 my $INCLUDEORDER = 0;            # use implicit -I taxon order for -c composition analyses
+my $KEEPSCNDHSPS = 1;            # by default only the first BLAST hsp is kept for further calculations
+my $COMPRESSBLAST = 1;           # save disk space by compressing BLASTN results, requires gzip
+
 ## list of features/binaries required by this program (do not edit)
 my @FEATURES2CHECK = ('EXE_BLASTN','EXE_FORMATDB','EXE_MCL','EXE_HMMPFAM',
@@ -672,10 +676,22 @@
       $full_length_file{basename($infile).'.nucl'} = 1;
     }

-    # calculate median sequence length
-    foreach $seq ( 0 .. $#{$fasta_ref} ){ push(@fasta_length,length($fasta_ref->[$seq][SEQ])) }
+    # calculate median sequence length and warn if overly long sequences are found
+    my ($total_long_seqs,$len) = (0,0);
+    foreach $seq ( 0 .. $#{$fasta_ref} )
+    {
+      $len = length($fasta_ref->[$seq][SEQ]);
+      push(@fasta_length,$len);
+
+      if($len > $MAXSEQLENGTH){ $total_long_seqs++ }
+    }
     @fasta_length = sort {$a<=>$b} @fasta_length;
-    printf(" median length = %d",$fasta_length[int($#fasta_length/2)]);
+    printf(" median length = %d",$fasta_length[int($#fasta_length/2)]);
+
+    if($total_long_seqs > 0)
+    {
+      printf(" WARNING: %d seqs > $MAXSEQLENGTH b",$total_long_seqs)
+    }
     print "\n";

@@ -1054,6 +1070,11 @@
         if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
         next;
       }
+      elsif($COMPRESSBLAST && -s $blastout.'.gz')
+      {
+        if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
+        next;
+      }

       $command = format_BLASTN_command("$newDIR/$new_infile",
         $blastout,"$newDIR/$blastDBfile",
@@ -1086,22 +1107,26 @@
   print "# done\n\n";

   # concat blast output files to $blast_file (global var)
+  # compress BLAST files optionally
   if(@tmp_blast_output_files)
   {
     print "# concatenating and sorting blast results...\n";
     foreach $blastout (@tmp_blast_output_files)
     {
-      if(!-e $blastout)
+      if(!-e $blastout && !-e $blastout.'.gz')
       {
         sleep($QUEUEWAIT); # give disk extra time
-        if(!-e $blastout)
+        if(!-e $blastout && !-e $blastout.'.gz')
         {
           die "# EXIT, $blastout does not exist, BLAST search might failed ".
             "or hard drive is still writing it (please re-run)\n";
         }
       }
     }
-    sort_blast_results($blast_file,1,@tmp_blast_output_files); # 1 -> secondary hsp are kept, as they jump over introns
+
+    # secondary hsp are kept, as they jump over introns
+    sort_blast_results($blast_file,$KEEPSCNDHSPS,$COMPRESSBLAST,@tmp_blast_output_files);
+
     print "# done\n\n";
     unlink(@to_be_deleted); # remove .queue files
   }
diff --git a/get_homologues.pl b/get_homologues.pl
index c3630fc..1b5927b 100755
--- a/get_homologues.pl
+++ b/get_homologues.pl
@@ -1,6 +1,6 @@
 #!/usr/bin/env perl

-# 2017 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
+# 2018 Bruno Contreras-Moreira (1) and Pablo Vinuesa (2):
 # 1: http://www.eead.csic.es/compbio (Estacion Experimental Aula Dei/CSIC/Fundacion ARAID, Spain)
 # 2: http://www.ccg.unam.mx/~vinuesa (Center for Genomic Sciences, UNAM, Mexico)

@@ -39,15 +39,16 @@
 my $BATCHSIZE = 100;

 ## global variables that control some algorithmic choices
-my $NOFSAMPLESREPORT = 10; # number of genome samples used for the generation of pan/core genomes
-my $MAXEVALUEBLASTSEARCH = 0.01; # required to reduce size of blast output files
-my $MAXPFAMSEQS = 250; # default for -m cluster jobs, it is updated to 1000 with -m local
-my $MININTERGENESIZE = 200; # minimum length (nts) required for intergenic segments to be considered
+my $NOFSAMPLESREPORT = 10;       # number of genome samples used for the generation of pan/core genomes
+my $MAXEVALUEBLASTSEARCH = 0.01; # required to reduce size of blast output files
+my $MAXPFAMSEQS = 250;           # default for -m cluster jobs, it is updated to 1000 with -m local
+my $MININTERGENESIZE = 200;      # minimum length (nts) required for intergenic segments to be considered
 my $MAXINTERGENESIZE = 700;
-my $INTERGENEFLANKORF = 180; # length in nts of intergene flanks borrowed from neighbor ORFs (/3 for amino acids)
-my $PRINTCLUSTERSSCREEN = 0; # whether cluster names should be printed to screen
-my $KEEPSCNDHSPS = 1; # by default only the first BLAST hsp is kept for further calculations
-my $DISSABLEPRND = 1; # dissable paranoid (PRND, -P) options due to no redistribution license;
+my $INTERGENEFLANKORF = 180;     # length in nts of intergene flanks borrowed from neighbor ORFs (/3 for amino acids)
+my $PRINTCLUSTERSSCREEN = 0;     # whether cluster names should be printed to screen
+my $KEEPSCNDHSPS = 1;            # by default only the first BLAST hsp is kept for further calculations
+my $COMPRESSBLAST = 1;           # set to 0 if gzip is not available
+my $DISSABLEPRND = 1;            # dissable paranoid (PRND, -P) options due to no redistribution license;

 ## list of features/binaries required by this program (do not edit)
@@ -1383,6 +1384,11 @@
         if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
         next;
       }
+      elsif($COMPRESSBLAST && -s $blastout.'.gz')
+      {
+        if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
+        next;
+      }

       $command = format_DIAMONDblastp_command("$newDIR/$new_infile",
         $blastout,$blastDBfile,$MAXEVALUEBLASTSEARCH,$psize{$infile});
@@ -1423,6 +1429,11 @@
         if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
         next;
       }
+      elsif($COMPRESSBLAST && -s $blastout.'.gz' && $current_files eq $previous_files)
+      {
+        if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
+        next;
+      }

       $command = format_DIAMONDblastp_command("$newDIR/$new_infile",
         $blastout,$blastDBfile,$MAXEVALUEBLASTSEARCH,$n_of_sequences);
@@ -1471,11 +1482,17 @@
         next if( ($bfile1 != $reference_proteome && $bfile2 != $reference_proteome)
           && ($bfile1 != $bfile2) );#print "$bfile1 $bfile2 $new_infile $blastDBfile\n";
       }
+
       if(-s $blastout) # check previous BLAST runs
       {
         if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
         next;
       }
+      elsif($COMPRESSBLAST && -s $blastout.'.gz')
+      {
+        if(!-s $blast_file){ push(@tmp_blast_output_files,$blastout); }
+        next;
+      }

       if($do_features)
       {
@@ -1518,15 +1535,16 @@
   print "# done\n\n";

   # concatenate blast output files to $blast_file (global var)
+  # compress BLAST files optionally
   if(@tmp_blast_output_files)
   {
     print "# concatenating and sorting BLAST/DIAMOND results...\n";
     foreach $blastout (@tmp_blast_output_files)
     {
-      if(!-e $blastout)
+      if(!-e $blastout && !-e $blastout.'.gz')
       {
         sleep($QUEUEWAIT); # give disk extra time
-        if(!-e $blastout)
+        if(!-e $blastout && !-e $blastout.'.gz')
         {
           die "# EXIT, $blastout does not exist, BLAST/DIAMOND search might failed ".
             "or hard drive is still writing it (please re-run)\n";
@@ -1535,7 +1553,7 @@
     }

     # NxN BLAST jobs OR 2xN diamond jobs (for N input files)
-    sort_blast_results($blast_file,$KEEPSCNDHSPS,@tmp_blast_output_files);
+    sort_blast_results($blast_file,$KEEPSCNDHSPS,$COMPRESSBLAST,@tmp_blast_output_files);
     print "# done\n\n";
     unlink(@to_be_deleted); # remove .queue files
diff --git a/lib/marfil_homology.pm b/lib/marfil_homology.pm
index 55fc20b..f29ae01 100755
--- a/lib/marfil_homology.pm
+++ b/lib/marfil_homology.pm
@@ -103,6 +103,7 @@ our @EXPORT = qw(
 # executable software
 our $SORTLIMITRAM = "500M"; # buffer size should fit in all computers
 our $SORTBIN = "sort --buffer-size=$SORTLIMITRAM"; # sort is part of all Linux systems, otherwise edit
+our $GZIPBIN = "gzip";

 # all these defined in phyTools.pm:
 our $BLASTP = $ENV{"EXE_BLASTP"};
@@ -722,15 +723,17 @@ sub pfam_parse
 }

 # sorts and merges partial BLAST/DIAMOND results using GNU sort
-# three arguments:
+# optionally also compresses results as a side-effect
+# arguments:
 # 1. String Variable: blast out file
 # 2. boolean flag stating whether secondary hsps are to be conserved
-# 3. array of sorted blast output filenames
-# uses globals: $TMP_DIR , $SORTLIMITRAM
-# Updated Mar2015
+# 3. boolean flag stating whether BLAST outfiles are to be compressed
+# 4. array of sorted blast output filenames
+# uses globals: $TMP_DIR , $SORTLIMITRAM, $SORTBIN, $GZIPBIN
+# Updated Mar2018
 sub sort_blast_results
 {
-  my ($sorted_outfile,$keep_secondary_hsps,@blast_outfiles) = @_;
+  my ($sorted_outfile,$keep_secondary_hsps,$compress_blast,@blast_outfiles) = @_;
   my ($file,$size,$files,$root,$cleantmpfile,$tmpfile,$sortedtmpfile);
   my (@tmpfiles,@roots,%cluster);
@@ -750,6 +753,24 @@ sub sort_blast_results
       unshift(@{$cluster{$root}},$cleantmpfile);
     }
     else{ push(@{$cluster{$root}},$cleantmpfile); }
+
+    if($compress_blast)
+    {
+      if(!-s $file.'.gz')
+      {
+        system("$GZIPBIN $file");
+        if($? != 0)
+        {
+          die "# sort_blast_results : cannot compress $file\n";
+        }
+      }
+
+      open(ORIG,"$GZIPBIN -dc $file |") || die "# sort_blast_results : cannot read $file.gz\n";
+    }
+    else
+    {
+      open(ORIG,$file) || die "# sort_blast_results : cannot read $file\n";
+    }

     # conserve secondary hsps attached to first one to avoid breaking ordered BLAST output
     # 90695 5112 70.57 1043 176 17 1 940 248 1262 0.0 1358
@@ -757,7 +778,7 @@ sub sort_blast_results
     # 90695 5279 60.49 1030 286 16 1 939 248 1247 0.0 1136
     my ($pqry,$psbj,$qry,$sbj) = ('-1','-1');
     open(CLEANBLAST,">$cleantmpfile") || die "# sort_blast_results : cannot create $cleantmpfile\n";
-    open(ORIG,$file) || die "# sort_blast_results : cannot read $file\n";
+
     while(<ORIG>)
     {
       chomp;
@@ -780,8 +801,8 @@ sub sort_blast_results
     if($keep_secondary_hsps){ print CLEANBLAST "\n"; } # add last newline if required
     close(CLEANBLAST);
-
-    push(@tmpfiles,$cleantmpfile);
+
+    push(@tmpfiles,$cleantmpfile);
   }

   foreach $root (@roots)
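
Note on the new compression path: the snippet below is a minimal, standalone Perl sketch of the compress-then-stream pattern that sort_blast_results now applies to each partial BLAST output file. It is illustrative only, not the module code: the file name blast.out, the lexical filehandle and the column printing are assumptions (the module uses the ORIG bareword handle inside a loop over @blast_outfiles), and gzip is assumed to be on the PATH, as with the new $GZIPBIN global.

#!/usr/bin/env perl
# Sketch (hypothetical file names): gzip a partial BLAST outfile once,
# then read it back through a decompression pipe, so the uncompressed
# copy never has to be restored on disk.
use strict;
use warnings;

my $GZIPBIN = "gzip";        # same default as the new marfil_homology.pm global
my $compress_blast = 1;      # mirrors the $COMPRESSBLAST switch
my $file = "blast.out";      # hypothetical partial BLAST result (-outfmt 6 style)

my $fh;
if($compress_blast)
{
  # compress in place unless a non-empty .gz copy is already there
  if(!-s "$file.gz")
  {
    system("$GZIPBIN $file");
    die "# cannot compress $file\n" if($? != 0);
  }

  # stream the compressed copy through a pipe instead of unpacking it
  open($fh,"$GZIPBIN -dc $file.gz |") || die "# cannot read $file.gz\n";
}
else
{
  open($fh,"<",$file) || die "# cannot read $file\n";
}

while(<$fh>)
{
  chomp;
  my @col = split(/\t/);                  # query, subject, %identity, ...
  print "$col[0]\t$col[1]\n" if(@col > 1);
}
close($fh);

Reading through the gzip -dc pipe is what lets the per-pair BLAST files stay compressed on disk permanently, which is the disk-space saving described by the 06032018 CHANGES.txt entries.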