TransDecoder.Predict

#!/usr/bin/env perl

use strict;
use warnings;
use FindBin;
use Pod::Usage;
use Getopt::Long qw(:config posix_default no_ignore_case bundling pass_through);
use Data::Dumper;
use List::Util qw (min max);
use File::Basename;
use Carp;
use Digest::MD5;

use lib ("$FindBin::RealBin/PerlLib");

use POSIX qw(ceil);
use Gene_obj;
use Nuc_translator;
use Fasta_reader;
use Longest_orf;
use Pipeliner;
use DelimParser;


#my $VERSION = "__BLEEDING_EDGE__";
my $VERSION = "5.7.0";

my $RETAIN_LONG_ORFS_MIN_LENGTH = 1000000; # so essentially, off by default

srand(1234);

my $genetic_code = "Universal";
my $genetic_code_options = join("\n", &Nuc_translator::get_genetic_codes());

my $usage = <<__EOUSAGE__;

########################################################################################
#             ______                 ___                  __
#            /_  __/______ ____ ___ / _ \\___ _______  ___/ /__ ____
#             / / / __/ _ `/ _\\(_-</ // / -_) __/ _ \\/ _  / -_) __/
#            /_/ /_/ \\_,_/_//_/___/____/\\__/\\__/\\___/\\_,_/\\__/_/   .Predict
#
########################################################################################
#
#  Transdecoder.LongOrfs|http://transdecoder.github.io> - Transcriptome Protein Prediction
#
#
#  Required:
#
#   -t <string>                            transcripts.fasta
#
#  Common options:
#
#
#   --retain_long_orfs_mode <string>        'dynamic' or 'strict' (default: dynamic)
#                                        In dynamic mode, sets range according to 1%FDR in random sequence of same GC content.
#
# 
#   --retain_long_orfs_length <int>         under 'strict' mode, retain all ORFs found that are equal or longer than these many nucleotides even if no other evidence 
#                                         marks it as coding (default: 1000000) so essentially turned off by default.)
#
#   --retain_pfam_hits <string>            domain table output file from running hmmscan to search Pfam (see transdecoder.github.io for info)     
#                                        Any ORF with a pfam domain hit will be retained in the final output.
# 
#   --retain_blastp_hits <string>          blastp output in '-outfmt 6' format.
#                                        Any ORF with a blast match will be retained in the final output.
#
#   --single_best_only                     Retain only the single best orf per transcript (prioritized by homology then orf length)
#
#   --output_dir | -O  <string>            output directory from the TransDecoder.LongOrfs step (default: basename( -t val ) + ".transdecoder_dir")
#
#   --no_refine_starts                     start refinement identifies potential start codons for 5' partial ORFs using a PWM, process on by default.
#
##  Advanced options
#
#    -T <int>                            Top longest ORFs to train Markov Model (hexamer stats) (default: 500)
#                                        Note, 10x this value are first selected for removing redundancies,
#                                        and then this -T value of longest ORFs are selected from the non-redundant set.
#  --version                           show version ($VERSION)
#
#   --genetic_code | -G <string>                            genetic code (default: universal; see PerlDoc; options: Euplotes, Tetrahymena, Candida, Acetabularia, ...)
#                                              Genetic Codes (derived from: https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi)
$genetic_code_options
#
#
#########################################################################################


__EOUSAGE__
    ;


my $UTIL_DIR = "$FindBin::RealBin/util";
$ENV{PATH} = "$UTIL_DIR/bin:$ENV{PATH}";


my ($transcripts_file,$train_file);

my $top_ORFs_train = 500;
my $max_prot_length_for_training = 5000;

my $help;

my $verbose;
my $search_pfam = "";
my ($reuse,$pfam_out);


my $RETAIN_LONG_ORFS_MODE = 'dynamic';

#################################
# quantiles based on random orfs
#
# Quant  0.95    0.99    0.999     0.9999
# 80     1230.0  1728.0  2422.656  3240.7914
# 75     927.0   1257.0  1743.0    2278.6704
# 70     750.0   1002.0  1358.1    1681.77
# 65     645.0   829.59  1085.718  1383.5718
# 60     570.0   722.31  927.0     1158.231
# 55     513.0   633.0   796.983   950.8983
# 50     486.0   582.0   748.77    887.877
# 45     456.0   546.0   644.655   694.4931
# 40     431.7   501.0   589.794   612.2352
# 35     426.0   488.76  554.88    565.7904
# 30     406.5   469.5   510.9     518.19
# 25     404.4   448.56  465.39    467.739


# setting it to 0.999% mark:
my @GC_TO_MIN_LONG_ORF_LENGTH = ( [25, 465],
                                  [30, 510],
                                  [35, 555],
                                  [40, 590],
                                  [45, 645],
                                  [50, 749],
                                  [55, 797],
                                  [60, 927],
                                  [65, 1086],
                                  [70, 1358],
                                  [75, 1743],
                                  [80, 2422] );


my $retain_pfam_hits_file;
my $retain_blastp_hits_file;
my $cpu = 1;
my $MPI_DEBUG = 1;
my $single_best_flag = 0;

my $NO_REFINE_START_CODONS_FLAG = 0;

my $output_dir;

my $show_version_flag;


&GetOptions( 't=s' => \$transcripts_file,

             'h' => \$help,
             'v' => \$verbose,
             
             'T=i' => \$top_ORFs_train,

             'search_pfam=s' => \$search_pfam,
             'reuse' => \$reuse,

             'retain_long_orfs_mode=s' => \$RETAIN_LONG_ORFS_MODE,
             'retain_long_orfs_length=i' => \$RETAIN_LONG_ORFS_MIN_LENGTH,

             'debug' => \$MPI_DEBUG,
             
             'retain_pfam_hits=s' => \$retain_pfam_hits_file,
             'retain_blastp_hits=s' => \$retain_blastp_hits_file,
             'cpu=i' => \$cpu,

             'single_best_only' => \$single_best_flag,
             
             'genetic_code|G=s' => \$genetic_code,

             'no_refine_starts' => \$NO_REFINE_START_CODONS_FLAG,

             'version' => \$show_version_flag,

             'output_dir|O=s' => \$output_dir,
                          
             );


if ($help) {
    die $usage;
}

if ($show_version_flag) {
    print "TransDecoder.Predict $VERSION\n";
    exit(0);
}


if (@ARGV) {
    die "Error, don't understand options: @ARGV";
}


our $SEE = $verbose;

unless ($transcripts_file && -s $transcripts_file) {
    die $usage;
}


if ($genetic_code) {
    $genetic_code = " --genetic_code $genetic_code";
}


main: {


    if ($transcripts_file =~ /\.gz$/) {
        # create an uncompressed version locally and use that instead
        my $uncompressed_transcripts_file = basename($transcripts_file);
        $uncompressed_transcripts_file =~ s/\.gz$//;
        if (! -s $uncompressed_transcripts_file) {
            &process_cmd("gunzip -c $transcripts_file > $uncompressed_transcripts_file");
        }
        $transcripts_file = $uncompressed_transcripts_file;
    }
    
    my $workdir = $output_dir;
    unless ($workdir) {
        $workdir = basename($transcripts_file) . ".transdecoder_dir"; 
    }
    
    unless (-d $workdir) {
        die "Error, cannot find directory: $workdir,  be sure to first run TransDecoder.LongOrfs before TransDecoder.Predict\n\n";
    }

    my $checkpoints_dir = "$workdir/__checkpoints_TDpredict";
    if (! -d $checkpoints_dir) {
        mkdir($checkpoints_dir);
    }
    
    my $pipeliner = new Pipeliner('-verbose' => 2, '-checkpoint_dir' => $checkpoints_dir);
    
    my $prefix = "$workdir/longest_orfs";
    my $cds_file = "$prefix.cds";
    my $gff3_file = "$prefix.gff3";
    my $pep_file = "$prefix.pep";

    ## Train a Markov model based on user-provided file or longest candidate CDS sequences, score all candidates, and select the final set.
    
    my $top_cds_file = "$cds_file.top_${top_ORFs_train}_longest";
    {
        
        # to speed things up only check for redundancy up to x the number of entries we want
        my $red_num = $top_ORFs_train * 10;
        my $red_num_cds_longest_file = "$cds_file.top_longest_${red_num}";
        
        $pipeliner->add_commands(new Command("$UTIL_DIR/get_top_longest_fasta_entries.pl $cds_file $red_num $max_prot_length_for_training > $red_num_cds_longest_file",
                                             "get_longest_orfs.ok"));
        
        $pipeliner->add_commands(new Command("$UTIL_DIR/exclude_similar_proteins.pl $red_num_cds_longest_file > $red_num_cds_longest_file.nr", "nr.ok"));
        
        $pipeliner->add_commands(new Command("$UTIL_DIR/get_top_longest_fasta_entries.pl $red_num_cds_longest_file.nr $top_ORFs_train > $top_cds_file",
                                             "top_train_select.ok"));
        
    
        $pipeliner->run();
    
    }
    
        
    # get hexamer scores
    my $hexamer_scores_file = "$workdir/hexamer.scores";
    my $hexamer_checkpoint = "hexamer_scores_file.ok";
    my $base_freqs_file = "$workdir/base_freqs.dat";

    if ($RETAIN_LONG_ORFS_MODE eq 'dynamic') {
        $RETAIN_LONG_ORFS_MIN_LENGTH = &get_dynamic_retain_long_orf_length($base_freqs_file, \@GC_TO_MIN_LONG_ORF_LENGTH);
    }
    
    # score kmers for Markov model
    $pipeliner->add_commands(new Command("$UTIL_DIR/seq_n_baseprobs_to_loglikelihood_vals.pl $top_cds_file $base_freqs_file > $hexamer_scores_file",
                                         "hexamer_scores.ok") );
    
    # score all cds entries
    my $cds_scores_file = "$cds_file.scores";
    my $cds_scores_checkpoint = "$cds_scores_file.ok";
    $pipeliner->add_commands(new Command("$UTIL_DIR/score_CDS_likelihood_all_6_frames.pl $cds_file $hexamer_scores_file > $cds_scores_file",
                                         "score_cdss.ok"));

    $pipeliner->run();


    ##############
    ## select orfs
    
    ## Steps below may be rerun if input parameters are set differently (such as including pfam or blast results in a subsequent run).

    my @checkpoint_inputs = ($transcripts_file);
    if ($retain_pfam_hits_file) {
        push (@checkpoint_inputs, $retain_pfam_hits_file);
    }
    if ($retain_blastp_hits_file) {
        push (@checkpoint_inputs, $retain_blastp_hits_file);
    }
    my $big_checkpoint_string = join("^", @checkpoint_inputs);
    my $checkpoint_hashcode = Digest::MD5::md5_hex($big_checkpoint_string);
        
    my $select_cmd = "$UTIL_DIR/select_best_ORFs_per_transcript.pl --gff3_file $gff3_file --cds_scores $cds_scores_file "
        . " --min_length_auto_accept $RETAIN_LONG_ORFS_MIN_LENGTH ";

    if ($retain_pfam_hits_file) {
        $select_cmd .= " --pfam_hits $retain_pfam_hits_file ";
    }
    if ($retain_blastp_hits_file) {
        $select_cmd .= " --blast_hits $retain_blastp_hits_file ";
    }

    if ($single_best_flag) {
        $select_cmd .= " --single_best_orf ";
    }
    $select_cmd .= " > $cds_file.best_candidates.gff3 ";

    $pipeliner->add_commands(new Command($select_cmd, "select_best_orfs.$checkpoint_hashcode.ok", "Selecting best orfs"));
                
    
    my $final_output_prefix = basename($transcripts_file) . ".transdecoder";

    my $target_final_file = "$cds_file.best_candidates.gff3";
    
    ## start codon refinement
    unless ($NO_REFINE_START_CODONS_FLAG) {
        my $cmd = "$UTIL_DIR/train_start_PWM.pl --transcripts $transcripts_file --selected_orfs $top_cds_file --out_prefix $workdir/start_refinement";        
        $pipeliner->add_commands(new Command($cmd, "train_start_PWM.$checkpoint_hashcode.ok", "Training start codon pattern recognition"));
        
        my $revised_orf_gff3_file = "$target_final_file.revised_starts.gff3";
        $cmd = "$UTIL_DIR/start_codon_refinement.pl --transcripts $transcripts_file --gff3_file $target_final_file --workdir $workdir > $revised_orf_gff3_file";
        $pipeliner->add_commands(new Command($cmd, "revise_starts.$checkpoint_hashcode.ok", "Refining start codon selections."));
                
        $target_final_file = $revised_orf_gff3_file;
    }
    $pipeliner->add_commands(new Command("cp $target_final_file $final_output_prefix.gff3",
                                         "copy_to_final_outputfile.$checkpoint_hashcode.ok",
                                         "copying output to final output file: $final_output_prefix.gff3") );
    
    ## write final outputs:
    
    {
        ## make a BED file for viewing in IGV
        my $gff3_file = "$final_output_prefix.gff3";
        my $bed_file = $gff3_file;
        $bed_file =~ s/\.gff3$/\.bed/;
        my $cmd = "$UTIL_DIR/gff3_file_to_bed.pl $gff3_file > $bed_file";
        $pipeliner->add_commands(new Command($cmd, "make_final_bed.$checkpoint_hashcode.ok", "Making bed file: $bed_file"));
                        
        # make a peptide file:
        my $best_pep_file = $gff3_file;
        $best_pep_file =~ s/\.gff3$/\.pep/;
        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl --gff3 $gff3_file --fasta $transcripts_file $genetic_code > $best_pep_file";
        $pipeliner->add_commands(new Command($cmd, "make_final_pep.$checkpoint_hashcode.ok", "Making pep file: $best_pep_file"));
                
        # make a CDS file:
        my $best_cds_file = $best_pep_file;
        $best_cds_file =~ s/\.pep$/\.cds/;
        $cmd = "$UTIL_DIR/gff3_file_to_proteins.pl --gff3 $gff3_file --fasta $transcripts_file --seqType CDS $genetic_code > $best_cds_file";
        $pipeliner->add_commands(new Command($cmd, "make_final_cds.$checkpoint_hashcode.ok", "Making cds file: $best_cds_file"));
    }
    
    $pipeliner->run();
    
    print STDERR "transdecoder is finished.  See output files $final_output_prefix.\*\n\n\n";
    
    
    exit(0);
}


####
sub process_cmd {
	my ($cmd) = @_;

	print "CMD: $cmd\n";
	my $ret = system($cmd);

	if ($ret) {
		die "Error, cmd: $cmd died with ret $ret";
	}
	
	return;

}


####
sub parse_pfam_hits {
    my ($pfam_hits_file) = @_;
    
    my %has_pfam_hit;
    
    if (! -e $pfam_hits_file) {
        die "Error, cannot find pfam hits file: $pfam_hits_file";
    }
    
    print "PFAM output found and processing...\n";
    # capture those proteins having pfam hits
    open (my $fh, $pfam_hits_file) or die "Error, cannot open file: $pfam_hits_file";
    while (my $ln=<$fh>) {
        next if $ln=~/^\#/;
        my @x = split(/\s+/,$ln);
        next unless $x[3];  # domtbl
        my $orf_acc = $x[3];
        $has_pfam_hit{$orf_acc} = 1;
    }
    close $fh;
    
    
    return(%has_pfam_hit);
}

####
sub parse_blastp_hits_file {
    my ($blastp_file) = @_;

    unless (-e $blastp_file) {
        die "Error, cannot find file $blastp_file";
    }

    my %blastp_hits;

    open (my $fh, $blastp_file) or die "Error, cannot open file $blastp_file";
    while (<$fh>) {
        chomp;
        my @x = split(/\t/);
        my $id = $x[0];

        $blastp_hits{$id} = 1;
    }
    close $fh;

    return(%blastp_hits);
}


sub check_program() {
    my @paths;
    foreach my $prog (@_) {
        my $path = `which $prog`;
        unless ($path =~ /\w/) {
            die "Error, path to a required program ($prog) cannot be found\n\n"
        }
        chomp($path);
        $path = readlink($path) if -l $path;
        push( @paths, $path );
    }
    return @paths;
}


####
sub get_dynamic_retain_long_orf_length {
    my ($base_freqs_file, $GC_to_min_orf_length_aref) = @_;

    my $pct_GC = -1;

    open(my $fh, $base_freqs_file) or confess "Error, cannot open file: $base_freqs_file";
    while (<$fh>) {
        chomp;
        my ($base, $count, $ratio) = split(/\t/);
        if ($base eq 'C') {
            $pct_GC = 2 * $ratio * 100;
            last;
        }
    }
    close $fh;

    print STDERR "PCT_GC: $pct_GC\n";
    
    unless ($pct_GC > 0) {
        confess "Error, didn't parse percent C from file: $base_freqs_file";
    }


    foreach my $bin (@$GC_to_min_orf_length_aref) {
        my ($gc, $min_len) = @$bin;
        
        if ($pct_GC <= $gc) {
            return($min_len);
        }
    }
    
    # if we got here, then GC content must be crazy high!
    # in this case, rely entirely on Markov model for coding determination.
    return(1000000); #effectively infinity here.

}