-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathRun_gatk_preprocessing.sh
54 lines (40 loc) · 2.27 KB
/
Run_gatk_preprocessing.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
### Usage: Run_gatk_preprocessing.sh <sorted bam> <reference>
###: Author Madikay Senghore
###: Synposis: given a sam file. and a reference file, run the gatk snp calling pipeline
###: Date Wed 23 Mar 2022
### Load modules
module load bcftools/1.5-fasrc02
module load samtools/1.5-fasrc02
module load Java
module load gatk
i=$1
ref=$2
IFS='.' read -r -a f <<< "$i"
### ### Drop reads under 30 bases
samtools view -h $i | awk 'length($10) > 30 || $1 ~ /^@/' | samtools view -bS > $f.f.bam
samtools sort $f.f.bam -o $f.sf.bam
## classical mpileup and filter for know variant sites
bcftools mpileup -Ov -o $f.pile.vcf -f $ref $f.sf.bam
bcftools call -mv -Ov $f.pile.vcf -o $f.calls.vcf
bcftools view -M2 -v snps -i '%QUAL>=50' $f.calls.vcf > $f.confirmed_snps.vcf
gatk IndexFeatureFile -F $f.confirmed_snps.vcf
## Use gatk to add read groups to bam files
gatk AddOrReplaceReadGroups --INPUT $f.sf.bam --OUTPUT $f.s.f.rg.bam --RGLB library1 --RGPL illumina --RGPU unit1 --RGSM $f
### Add one liner to remove unmapped reads and reads less that 30 quality
samtools view -F 0x04 -q 30 -b $f.s.f.rg.bam > $f.s.f.rg.q30.bam
### Mark duplicates in the sorted bam files with read groups assigned
gatk --java-options "-Xmx16g" MarkDuplicates -I $f.s.f.rg.q30.bam -O $f.marked.bam -M $f.metrics
### Perform Base recalibration
gatk --java-options "-Xmx16g" BaseRecalibrator -I $f.marked.bam -R $2 --known-sites $f.confirmed_snps.vcf -O $f.recal.table
### Apply Base recalibration
gatk --java-options "-Xmx16g" ApplyBQSR -R $2 -I $f.marked.bam --bqsr-recal-file $f.recal.table -O $f.analysis.ready.bam
#In parallel run Haplotype caller
gatk --java-options "-Xmx16g" HaplotypeCaller -R $2 -I $f.analysis.ready.bam -O $f.g.vcf -ERC GVCF
# Genotype the vcf from haplottype caller
gatk --java-options "-Xmx4g" GenotypeGVCFs -R $2 -V $f.g.vcf -O $f.genotyped.vcf
### Use bcftools to filter biallelic SNP sites where ALT allele is majority (Threshold 80%)
#bcftools filter -i 'AD[0]/DP < 0.2 && QUAL>100 && DP>20' $f.genotyped.vcf | bcftools view -m2 -M2 -v snps -Ob -o $f.final.snps.bcf.gz
#bcftools index $f.final.snps.bcf.gz
#cat $2 | bcftools consensus $f.final.snps.bcf.gz > ../consensus/$f.consensus.fasta
#### Rename fasta file (Reference ID will vary)
#sed -i "s/>CP000730\.1/>$f/g" ../consensus/$f.consensus.fasta; done