-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathconvert_vcf_to_dadi_input.pl
114 lines (109 loc) · 2.77 KB
/
convert_vcf_to_dadi_input.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
#! /usr/bin/perl
use strict;
use warnings;
use Bio::SeqIO;
my ($file,$vcf,$list)=@ARGV;
# $file is the location of genome file; $list is like this:
# BEGIN
# sample1 population1
# sample2 population1
# sample3 population2
# END
# the sample name should be same as which in vcf file.
# Usage: perl convert_vcf_to_dadi_input.pl <genome file> <vcf file> <list file>
my %list;
open(IN,"< $list")||die"$!";
while (<IN>) {
chomp;
next if(/^\#/);
my @a=split(/\s+/);
$list{$a[0]}=$a[1];
}
close IN;
my %vcf;
my %pop;
my %record;
open(IN,"< $vcf");
while (<IN>) {
chomp;
next if(/^##/);
if(/^#/){
my @a=split(/\s+/);
for(my $i=9;$i<@a;$i++){
next if(!exists $list{$a[$i]});
#die"$a[$i] does not exists in $list\n" if(!exists $list{$a[$i]});
$record{$i}=$list{$a[$i]};
}
next;
}
my @a=split(/\s+/);
my ($chr,$pos,$ref,$alt)=($a[0],$a[1],$a[3],$a[4]);
next if($alt=~/,/);
$vcf{$chr}{$pos}{ref}=$ref;
$vcf{$chr}{$pos}{alt}=$alt;
#print "DEBUG\t$ref\t$alt\n";
foreach my $i(keys %record){
my $indv=$record{$i};
$pop{$indv}=1;
#print "$indv\t";
my $geno=$a[$i];
#print "$geno\n";
$geno=~/^(.)\/(.):/;
my $tempa=$1;
my $tempb=$2;
#print "$tempa $tempb\t";
my ($a1,$a2)=(0,0);
if($tempa eq "." || $tempb eq "."){
$a1=0;
$a2=0;
}
elsif ($tempa+$tempb == 1) {
$a1=1;
$a2=1;
}
elsif ($tempa+$tempb == 2) {
$a1=0;
$a2=2;
}
elsif ($tempa+$tempb == 0) {
$a1=2;
$a2=0;
}
$vcf{$chr}{$pos}{$indv}{a1}+=$a1;
$vcf{$chr}{$pos}{$indv}{a2}+=$a2;
}
#last;
}
close IN;
open(O,"> $list.data");
my $title="NAME\tOUT\tAllele1";
foreach my $pop(sort keys %pop){
$title.="\t$pop";
}
$title.="\tAllele2";
foreach my $pop(sort keys %pop){
$title.="\t$pop";
}
$title.="\tGene\tPostion\n";
print O "$title";
my $fa=Bio::SeqIO->new(-file=>$file,-format=>'fasta');
while(my $seq=$fa->next_seq){
my $id=$seq->id;
my $seq=$seq->seq;
foreach my $pos (sort {$a<=>$b} keys %{$vcf{$id}}){
my $ref=substr($seq,$pos-2,3);
my $line="$ref\t$ref\t$vcf{$id}{$pos}{ref}";
foreach my $pop(sort keys %pop){
my $num=$vcf{$id}{$pos}{$pop}{a1};
$line.="\t$num";
}
$line.="\t$vcf{$id}{$pos}{alt}";
foreach my $pop(sort keys %pop){
my $num=$vcf{$id}{$pos}{$pop}{a2};
$line.="\t$num";
}
$line.="\t$id\t$pos\n";
print O "$line";
}
}
close O;