sort_stats.pl

#!/usr/bin/env perl
## Pombert lab 2019

# Script identity; interpolated into the usage text and printed by -v/--version
my $version= '0.3b';
my $name = 'sort_stats.pl';
my $updated = '2024-05-28';

use strict;
use warnings;
use Math::Complex;
use Getopt::Long qw(GetOptions);

###################################################################################################
### Command line options
###################################################################################################

# Help text printed when the script is started without any argument.
# Interpolating heredoc: ${name}, ${version} and ${updated} are filled in
# from the script metadata, and '\\' renders as a single backslash.
my $usage = <<"OPTIONS";
NAME        ${name}
VERSION     ${version}
UPDATED     ${updated}
SYNOPSIS    Generates TSV/CSV tables from the .stats files generated by get_SNPs.pl

NOTE        The weighted columns correspond to: (variants found*100)/% of reads mapping against the reference.
            This simple weighting underestimates the number of variants, as the reads not mapping are the ones 
            that are the most different from the reference.

USAGE        ${name} \\
               -s *.stats \\
               -o output_name \\
               -f csv

OPTIONS:
-s (--stats)     Stats files obtained from get_SNPs.pl
-o (--output)    Desired table output name; extension will be appended
-f (--format)    Desired table format, tsv or csv [Default: tsv]
-v (--version)   Show script version
OPTIONS

# No arguments at all: show the help text and leave with a success status.
unless (@ARGV){
    print "\n$usage\n";
    exit(0);
}

# Option targets
my @stats;              # -s: one or more .stats files
my $output;             # -o: basename of the tables to create
my $format = 'tsv';     # -f: table format, tsv unless overridden
my $sc_version;         # -v: set when the version banner is requested

# GetOptions returns false when it meets an unknown or malformed option
# (it has already reported the details on STDERR). Stop there instead of
# carrying on with a partially populated set of options.
GetOptions(
    's|stats=s@{1,}' => \@stats,
    'o|output=s' => \$output,
    'f|format=s' => \$format,
    'v|version' => \$sc_version
) or die "\n$usage\n";

#########################################################################
### Version
#########################################################################

# -v / --version: print the script identity, then exit before any output
# file is created.
if ($sc_version){
    my @banner = (
        "Script:     $name",
        "Version:    $version",
        "Updated:    $updated",
    );
    # One leading blank line, one line per entry, one trailing blank line
    print "\n" . join("\n", @banner) . "\n\n";
    exit(0);
}

#########################################################################
### Initializing tables & creating headers for species/strains/isolates
#########################################################################

# Validate the options before any file is created: a missing output name
# would produce files called '.tsv', '.readmapping.tsv', ..., and an unknown
# format would leave the column separator undefined.
my %separator_for = (
    tsv => "\t",
    csv => ",",
);

die "No output name provided; please specify one with -o (--output)\n"
    unless (defined $output && length $output);
die "No stats file provided; please specify at least one with -s (--stats)\n"
    unless @stats;
die "Unknown table format '$format'; -f (--format) must be tsv or csv\n"
    unless exists $separator_for{$format};

# Output tables: the combined table plus one table per metric
my $outfile = $output.'.'.$format;
my $readmapping = $output.'.readmapping.'.$format;
my $totalvariants = $output.'.totalvariants.'.$format;
my $variantperkb = $output.'.variantperkb.'.$format;
my $weightedvariants = $output.'.weightedvariants.'.$format;

# Bareword handles are kept on purpose: the rest of the script prints to
# OUT, RM, TV, SK and WV by name.
open OUT, ">", $outfile or die "Can't create file $outfile: $!\n";
open RM, ">", $readmapping or die "Can't create file $readmapping: $!\n";
open TV, ">", $totalvariants or die "Can't create file $totalvariants: $!\n";
open SK, ">", $variantperkb or die "Can't create file $variantperkb: $!\n";
open WV, ">", $weightedvariants or die "Can't create file $weightedvariants: $!\n";

my %species = ();                       # species names already given a header column
my $type;                               # variant type taken from the stats file names
my $sep = $separator_for{$format};      # column separator of the requested format

print "NOTE - Species names are derived from the FASTQ files used for read mapping with get_SNPs.pl...\n";

# First pass, over the stats file NAMES only: each new species label gets one
# header cell in every table. File names that do not match the pattern are
# ignored. $type is overwritten by every matching name, so it ends up holding
# the type found in the last matching file name.
for my $stats_file (@stats){
    next unless ($stats_file =~ /fastq.([a-zA-Z_0-9\-]+).*\.(\w+).stats$/);
    my ($label, $variant_type) = ($1, $2);

    $type = $variant_type;
    next if (exists $species{$label});
    $species{$label} = $label;

    # The combined table holds five value columns per species, hence the
    # run of separators after the label
    print OUT "${sep}${label}${sep}${sep}${sep}${sep}${sep}";

    # The per-metric tables hold a single column per species
    for my $table (\*RM, \*TV, \*WV, \*SK){
        print {$table} "${sep}${label}";
    }

    print "Found species: $label\n";
}
print OUT "\n";

my $num = scalar (keys %species);   # number of species header columns written
my $col;                            # header cells for one species in the combined table

# Type-specific labels for the last three columns (total, per Kb, weighted)
my %labels_for = (
    both  => ['total SNPs+Indels', 'SNP+indel/KB', 'Weighted SNPs+Indels'],
    snp   => ['total SNPs', 'SNP/KB', 'Weighted SNPs'],
    indel => ['total indels', 'indel/KB', 'Weighted Indels'],
);

# $type is undefined when no stats file name matched the expected pattern,
# and may hold something other than both/snp/indel. Say so and fall back to
# generic labels rather than emitting a blank header with uninitialized-value
# warnings.
my $labels = (defined $type) ? $labels_for{$type} : undef;
unless ($labels){
    my $seen = (defined $type) ? "'$type'" : 'none';
    warn "WARNING - Unrecognized variant type in the stats file names (found: $seen); using generic column labels\n";
    $labels = ['total variants', 'variant/KB', 'Weighted variants'];
}

$col = $sep . join($sep, '% reference covered', '% read mapping against ref', @{$labels}) . $sep;

# Same block of header cells repeated once per species
my $head = $col x $num;

print OUT "$head\n";
print RM "\n";
print TV "\n";
print WV "\n";
print SK "\n";

#########################################################################
### Iterating through + parsing stats files
#########################################################################

# Running values used by the summary printed at the end of the script
my $species = undef;    # label of the table row currently being written
my $minc = 100;         # lowest sequencing breadth (%) seen so far
my $maxsnp = 0;         # highest variant total seen so far
my $maxsnpkb = 0;       # highest variants-per-Kb value seen so far
my $sumsnp = 0;         # sum of the variant totals
my $count = 0;          # number of 'Total number of ... found' lines parsed
my $snkbsum = 0;        # sum of the variants-per-Kb values

# @stats is consumed as it is processed. The explicit defined() keeps a file
# literally named '0' from ending the loop early.
while (defined(my $file = shift @stats)){

    # Three-argument open on a lexical handle, checked: an unreadable file
    # must not silently become an empty row.
    open my $in, '<', $file or die "Can't read file $file: $!\n";

    my $sp;         # species label, from the 'FASTQ file(s) used' line
    my $breadth;    # % of reference bases covered by at least one read
    my $snps;       # total number of variants
    my $snkb;       # average number of variants per Kb
    my $readmap;    # % of reads mapped (looks like samtools flagstat output - TODO confirm)

    while (my $line = <$in>){

        chomp $line;

        if ($line =~ /^FASTQ file\(s\) used: ([a-zA-Z_0-9\-]+).*.fastq/){
            $sp = $1;
        }
        elsif ($line =~ /^Sequencing breadth \(percentage of bases covered by at least one read\)\s+(\S+)\%/){
            $breadth = $1;
            $minc = $breadth if ($breadth < $minc);
        }
        elsif ($line =~ /^Total number of (SNPs|SNPs \+ indels|indels) found: (\d+)/){
            $snps = $2;
            $sumsnp += $snps;
            $count++;
            $maxsnp = $snps if ($snps > $maxsnp);
        }
        elsif ($line =~ /^Average number of (SNPs|SNPs \+ indels|indels) per Kb: (\S+)/){
            $snkb = $2;
            $snkbsum += $snkb;
            $maxsnpkb = $snkb if ($snkb > $maxsnpkb);
        }
        elsif ($line =~ /^\d+\s+\+\s+\d+ mapped \((\S+)\% : N\/A\)/){
            $readmap = $1;
        }
    }
    close $in;

    # Name the file and the fields that could not be parsed, so the problem
    # can be traced, instead of relying on anonymous uninitialized-value
    # warnings.
    my @missing;
    push (@missing, 'FASTQ name') unless defined $sp;
    push (@missing, 'sequencing breadth') unless defined $breadth;
    push (@missing, 'variant total') unless defined $snps;
    push (@missing, 'variants per Kb') unless defined $snkb;
    push (@missing, 'read mapping percentage') unless defined $readmap;
    if (@missing){
        warn "WARNING - $file: could not find ".join(', ', @missing)."; the matching cells are left empty\n";
    }

    # Weighted count = (variants * 100) / % of reads mapped; reported as 0
    # when it cannot be computed (no reads mapped, or a value is missing).
    my $weighted = 0;
    if (defined $snps && defined $readmap && $readmap != 0){
        $weighted = sprintf("%.0f", ($snps*100)/$readmap);
    }

    # Missing values are written as empty cells (foreach aliases the variables)
    for my $field ($sp, $breadth, $snps, $snkb, $readmap){
        $field = '' unless defined $field;
    }

    # Cell prefixes: a file for the species of the current row appends to that
    # row; any other species starts a new row labelled with its name. The very
    # first row needs no leading newline.
    my $out_lead;       # prefix for the combined table
    my $table_lead;     # prefix for the per-metric tables
    if (defined $species && $species eq $sp){
        $out_lead = "${sep}${sep}";     # skips the spacer column between species blocks
        $table_lead = $sep;
    }
    else {
        my $newline = (defined $species) ? "\n" : '';
        $species = $sp;
        $out_lead = "${newline}${species}${sep}";
        $table_lead = $out_lead;
    }

    print OUT $out_lead . join($sep, $breadth, $readmap, $snps, $snkb, $weighted);
    print RM "${table_lead}${readmap}";
    print TV "${table_lead}${snps}";
    print WV "${table_lead}${weighted}";
    print SK "${table_lead}${snkb}";
}
# Averages over the parsed stats files. $count is 0 when no file contained a
# 'Total number of ... found' line; report 0.00 then instead of dying on a
# division by zero.
if ($count == 0){
    warn "WARNING - No variant totals were found in the stats files; averages are reported as 0.00\n";
}
my $avsnp = sprintf ("%.2f", ($count ? $sumsnp/$count : 0));
my $avsnb = sprintf ("%.2f", ($count ? $snkbsum/$count : 0));

# NOTE(review): sqrt presumably assumes an all-vs-all comparison, i.e.
# N species yielding N x N stats files - confirm against get_SNPs.pl usage.
my $spp = sqrt($count);

# Terminate the last row of each per-metric table
print RM "\n";
print TV "\n";
print WV "\n";
print SK "\n";

# Summary footer of the combined table
print OUT "\n\n";
print OUT "Number of species found in node:${sep}$spp\n";
print OUT "Minimum genome coverage${sep}$minc \%\n";
print OUT "Max SNP total between species${sep}$maxsnp\n";
print OUT "Average SNP total between species${sep}$avsnp\n";
print OUT "Max SNP/kb total${sep}$maxsnpkb\n";
print OUT "Average SNP/kb between species${sep}$avsnb\n";
print OUT 'NOTE: The weighted columns corresponds to: (variants found*100)/% of reads mapping against the reference'."\n";

# Buffered write errors (e.g. a full disk) only surface at close, so every
# table is closed explicitly and the result checked.
my @tables = (
    [\*OUT, $outfile],
    [\*RM, $readmapping],
    [\*TV, $totalvariants],
    [\*SK, $variantperkb],
    [\*WV, $weightedvariants],
);
for my $table (@tables){
    my ($handle, $path) = @{$table};
    close $handle or die "Can't write to file $path: $!\n";
}