intersect_genome_snps.pl

#!/usr/bin/perl

=head1 NAME

 intersect_genome_snps.pl
This script extracts the SNP intersection from a subsets of mpileup file (preferably generated with extract_snp_freq.pl) and one or more intersecting genomes (mpileup files).
A report will be generated with the SNPs removed from the target mpileup file.
The criteria for extracting SNPs from the target file is having a unique SNP, and the SNP position must have minimum coverage in each of the intersecting genomes (if the position is not covered in all genomes, then there is lower confidence level that the SNP )
This should hopefully remove SNPs that are shared between 2 genomes, and will leave potential SNPs that came from the target intersect file   

=cut

=head1 SYPNOSIS

 intersect_genome_snps.pl [-h] --target <input_mpileup_file> --intersect <input_intersecting_mpileup file(s , comma separated) >  -o <output_file> [ -c <minimum coverage in intersecting genome(s) >

=head1 DESCRIPTION

 This script reads 2 files : subset of mpileup file from a target genome and a reference genomes with sought SNPs (could be generated by extract_snp_freq.pl), and anohter set of mpileup file(s) of the genome(s) that should be filtered (for example a target file of an inbred genome filtered from SNPs from a close relative, and a second mpileup file from anohter potential ancestor(s) that should be subtracted from the first set)  


The subset mpileup file should have an extra columns (#7,8) with the alternative allele from the target genome, and the allele frequency from main genome (both are extracted from VarScan files) 
This allows to plot the SNPs of the target genome X reference genome minus the non-unique SNPs, or positions with low coverage as designated from the intersecting genome(s).
This subtracting tage can be run multiple times, if you wish to look at each genome pair separately, each time removing non-specific SNPs of the reference species compared to one other genome.


=head2 I<Flags:>

=over

=item --target | t

B<input mpileup file>             input mpileup file of the target genome (mandatory)

=item --intersect | i

B<input mpileup file (s)>           input mpileup file(s) of a second genome (or more, comma separated lists are supported), to be removed from file #1 above 

=item --cov | c

B<coverage cutoff> cutoff for ignoring SNPs in positions that are not covered well in the intersecting genome(s) 
   
=item -o

B<output_file>            output file (mandatory)


=item -h

B<help>                   print this help doc

=back

=head1 AUTHOR

Naama Menda<nm249@cornell.edu>

=cut


use strict;
use warnings;
use File::Slurp;

use Getopt::Long;
use Pod::Usage;
use Carp qw /croak/;

my ( $target_mpileup, @intersect_mpileup, $out, $help); 
my $intersect_coverage = 1;

GetOptions (
    "target|t=s"    => \$target_mpileup,    # string
    "intersect|i=s"   => \@intersect_mpileup,
    "cov|c=i"       => \$intersect_coverage,
    "out|o=s"    => \$out,
    "help"  => \$help)   # flag
    or  pod2usage(-verbose  => 2);

if ($help || !$target_mpileup || !@intersect_mpileup  || !$out)  { pod2usage(-verbose  => 2); } 


open (TARGET, "<$target_mpileup") || die "Cannot open target mpileup file $target_mpileup.\n";

@intersect_mpileup = split(/,/,join(',',@intersect_mpileup));
my $err = $out . ".err" ;

################################################################################
#print STDERR " Target : $chr \t $pos \t allele = $target_allele ";
my %error;
my %target;
my ($chr , $prev_chr, $pos);

print STDERR " TARGET file $target_mpileup \n\n";

#parse the target mpileup subset file containing the candidate SNPs from the genome of interest
while ( <TARGET> ) {
    my $line = $_;
    chomp($line);
    my @tar = split ("\t", $line);
    $chr = $tar[0];
    $pos = $tar[1];
    my $target_allele = $tar[6];
    no warnings 'uninitialized' ;
    if ( $chr ne $prev_chr ) { print STDERR "$chr\n"; }
    $prev_chr = $chr;

    #############Write to out files only after checking the intersecting genomes 
    $target{ $chr . ":" . $pos }->{pos} = $pos ;
    $target{ $chr . ":" . $pos }->{chr} = $chr ; 
    $target{ $chr . ":" . $pos }->{line}=  $line ;
    $target{ $chr . ":" . $pos }->{target_allele}=  $target_allele ;
}

#look into the mpileup files of the intersecting genomes for surther filtering by
#looking at minimum coverage required, and if the allele in the target file is unique (must be unique when compared to all intersecting files)

foreach my $mpileup_file (@intersect_mpileup) {
    open (MPILEUP, "<$mpileup_file") || croak "Cannot open intersect mpileup file $mpileup_file.\n";
    my %mpile;
    print STDERR "Finding target-mpileup file matches in file $mpileup_file \n";
    #############
    while ( <MPILEUP> ) {
	my $line = $_;
	chomp($line);
	my @mp = split ("\t", $line);
	$chr = $mp[0];
	if ( $chr ne $prev_chr ) { print STDERR "$chr\n"; }  ;
	$prev_chr = $chr;

	$pos = $mp[1];
	my $allele = $mp[2];
	my $coverage = $mp[3];
	if (exists $target{ $chr . ":" . $pos } ) {
	    my $target_allele = $target{ $chr . ":" . $pos }->{target_allele} ;
	    if ( $target_allele && ( $coverage < $intersect_coverage || $allele eq  $target_allele ) ) {
		delete $target{ $chr . ":" . $pos } ;
		write_file( $err , { append => 1 }, ( join("\t" , ( $chr, $pos, $mpileup_file , $target_allele , $allele , $coverage ) ) , "\n" ) ) ;
	    }
	}
    }
}
# need to write to output file at the end, after looping through all genomes
foreach my $key (sort { ($target{$a}->{chr} cmp $target{$b}->{chr} ) ||  ( $target{$a}->{pos} <=> $target{$b}->{pos} ) } keys %target )   { 
    write_file($out, { append => 1} , ( $target{ $key }->{line} ,"\n")  );
}