-
Notifications
You must be signed in to change notification settings - Fork 0
/
2015_mapping_steps
87 lines (61 loc) · 3.89 KB
/
2015_mapping_steps
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
cd /n/holylfs05/LABS/hanage_lab/Lab/holyscratch01/madikay/staph/2015/reads
### Take all the unique IDs and store in a text file
for i in *gz; do IFS='S' read -r -a f <<< "$i"; echo $f ; done | uniq > unique_samples.list
###Create a directory for merged reads
mkdir ../merged_reads
## loop through the read file list and concatenate all forward and reverse reads in seperate files
## First check that files tally,
while read i; do echo "$i R1" ; ls $i*R1*; echo "$i R2" ; ls $i*R2*; done < unique_samples.list > merger_QC.list
### Then merge the files using the cat command
while read i; do sbatch -p serial_requeue -t 7200 --mem=4000 --wrap="cat $i*R1* > ../merged_reads/$i.R1.fq.gz"; sbatch -p serial_requeue -t 7200 --mem=4000 --wrap="cat $i*R2* > ../merged_reads/$i.R2.fq.gz"; done < unique_samples.list
##### QC steps
## Check read file sizes
cd ../merged_reads/
ls -lrt
module load fastqc/0.11.8-fasrc01
module load Java/1.8
# Run fastqc on all raw reads
module load fastqc/0.11.8-fasrc01
for i in *.fq.gz; do sbatch -p shared -t 7000 --mem=4000 --wrap="fastqc $i"; done
mkdir ../fastqc/
mv *html ../fastqc/
# map reads to reference using bwa
module load bwa/0.7.17-fasrc01
module load bcftools/1.5-fasrc02
module load samtools/1.5-fasrc02
# Map reads to reference
bwa index CP000730_USA300_TCH1516.fasta
for i in *R1.fq.gz; do IFS='.' read -r -a f <<< "$i"; sbatch -p shared -t 7200 --mem=4000 --wrap="bwa mem -p CP000730_USA300_TCH1516.fasta $i > $f.sam"; done
### Convert sam to bam and sort
for i in *.bam; do IFS='.' read -r -a f <<< "$i"; sbatch -p shared -t 7000 --mem=4000 --wrap="samtools view -bS $i | samtools sort -o $f.bam"; done
### run qualimap on bam files
#for i in *.bam; do sbatch -p shared -t 7000 --mem=4000 --wrap="qualimap bamqc -bam $i -outdir ../qualimap/"; done
for i in *.bam; do IFS='.' read -r -a f <<< "$i";sbatch -p shared -t 7000 --mem=4000 --wrap="qualimap bamqc -bam $i -outdir ../qualimap/$f"; done
### Load multiqc and run it
salloc -p test -c 1 -t 00-01:00 --mem=4000
which singularity
singularity --version
singularity pull docker://ewels/multiqc:1.9
singularity exec --cleanenv /n/singularity_images/informatics/multiqc/multiqc_1.9.sif multiqc
history | tail
### Filter the bam files
### 1) Rename bam files
for i in *.bam; do IFS='_' read -r -a f <<< "$i"; mv $i ${f[0]}_${f[1]}.bam; done
###Run the script for variant calling up to consensus
for i in *.bam; do sbatch -p shared -t 7000 --mem=16000 -c 4 --wrap="sh ../scripts/Run_gatk_preprocessing.sh $i ../merged_reads/CP000730_USA300_TCH1516.fasta"
###########################################################################################################
### Drop reads under 30 bases
for i in *.bam; do IFS='.' read -r -a f <<< "$i"; echo "samtools view -h $i | awk 'length($10) > 30 || \$1 ~ /^@/' | samtools view -bS > $f.sf.bam"; done > Drop_under30base_records.txt
while read -r i; do sbatch -p shared -t 7200 --mem=4000 --wrap="$i"; done < "Drop_under30base_records.txt"
### Add read groups
for i in *sf.bam; do
IFS='.' read -r -a f <<< "$i"; sbatch -p shared -t 7000 --mem=4000 --wrap="gatk AddOrReplaceReadGroups --INPUT $i --OUTPUT $f.s.f.rg.bam --RGLB library1 --RGPL illumina --RGPU unit1 --RGSM $f"; done
### Add one liner to remove unmapped reads and reads less that 30 quality
for i in *rg.bam; do
IFS='.' read -r -a f <<< "$i"; sbatch -p shared -t 7200 --mem=4000 --wrap="samtools view -F 0x04 -q 30 -b $i > $f.s.f.rg.q30.bam"; done
#Mark duplicates in the sorted bam files with read groups assigned
for i in *rg.q30.bam; do
IFS='.' read -r -a f <<< "$i"; sbatch -p shared -t 7200 --mem=4000 --wrap="gatk MarkDuplicates -I $i -O $f.marked.bam -M $f.metrics"; done
#Perform Base recalibration
for i in *.marked.bam; do
IFS='.marked' read -r -a f <<< "$i"; sbatch -p shared -t 7200 --mem=4000 --wrap="gatk BaseRecalibrator -I $i -R Citrobacter_rodentium_ICC168.fasta --known-sites $f_confirmed_snps.vcf -O $f.recal.table"; done