Typing instructions

HopkinsIDD · Jul 14, 2022 · 4d3673d · 4d3673d
1 parent 1e30310
commit 4d3673d
Show file tree

Hide file tree

Showing 3 changed files with 131 additions and 81 deletions.
diff --git a/instruction.md b/instruction.md
@@ -47,7 +47,7 @@ git clone https://github.com/HopkinsIDD/illumina-vc.git .
 
 ### Finding your raw data
 
-Some of the commands below require you to know where your raw Illumina data is stored. When you started your Illumina run, you selected a data folder in which to store the results. Figure out the path to your raw data and provide it below whenever the command calls for the **path-to-raw- data**.
+Some of the commands below require you to know where your raw Illumina data is stored. When you started your Illumina run, you selected a data folder in which to store the results. Figure out the path to your raw data and provide it below whenever the command calls for the **path-to-raw-data**.
 
 
 ### Determining the number of processors on your computer

diff --git a/scripts/map_genes.sh b/scripts/map_genes.sh
@@ -6,6 +6,7 @@ set -o pipefail
 ## ctxA -> cholera toxin; associated with pandemic lineages O1 and O139
 ## wbeO1 -> O1 antigen
 ## wbfO139 -> O139 antigen
+## toxR -> virulence regulator gene
 ## tcpA classical
 ## tcpA El Tor
 
@@ -17,17 +18,17 @@ set -o pipefail
 
 # ---------------- Input Paramters and Files  ---------------- #
 
+# sample name 
+SAMPLENAME=$1
+
 # path to project directory (create a new directory for typing purposes)
-DIR=$1
+DIR=$2
 DIR=${DIR%/} # remove trailing slash from directory name if necessary
 
 # path to FASTQ data
-FASTQ=$2
+FASTQ=$3
 FASTQ=${FASTQ%/} # remove trailing slash from directory name if necessary
 
-# sample name 
-SAMPLENAME=$3
-
 # indicate number of lanes to merge data over
 LANES=$4
 
@@ -52,12 +53,12 @@ for i in $(seq 1 $LANES); do LARRAY+=("L00"$i); done
 
 
 # ensure output directorys exists
-if [ ! -d $DIR/01_mapped ]; then
-	mkdir $DIR/01_mapped
+if [ ! -d $DIR/typing_mapped ]; then
+	mkdir $DIR/typing_mapped
 fi
 
-if [ ! -d $DIR/02_merged ]; then
-	mkdir $DIR/02_merged
+if [ ! -d $DIR/typing_merged ]; then
+	mkdir $DIR/typing_merged
 fi
 
 if [ ! -d $OUTPUT ]; then
@@ -74,6 +75,11 @@ if [ -f $OUTPUT/$SAMPLENAME\_typing_summary.txt ]; then
 	rm $OUTPUT/$SAMPLENAME\_typing_summary.txt
 fi
 
+
+printf "SAMPLENAME\tgene\treads_mapped\treads\tfrac_mapped\tmean_depth\tpos_covered\ttotal_pos\tfrac_covered\n" >> \
+	$OUTPUT/$SAMPLENAME\_typing_summary.txt
+
+
 # ---------------- Map reads to reference and calculate metrics ---------------- #
 
 # loop through all typing genes
@@ -82,18 +88,20 @@ date
 
 for ref in $REFDIR/*.fasta; do
 
+	# obtain the name of the gene
+	gene=${ref%%.fasta}
+	gene=${gene##*/}
+
 	if [ ! -f $ref.ann ]; then
 		bwa index $ref
-		touch check_tmp_typing/.$ref\_ann_check
-	elif [ ! -e check_tmp_typing/.$ref\_ann_check ]; then
+		touch check_tmp_typing/.$gene\_ann_check
+	elif [ ! -e check_tmp_typing/.$gene\_ann_check ]; then
 		rm $ref.ann
 		bwa index $ref
-		touch check_tmp_typing/.$ref\_ann_check
+		touch check_tmp_typing/.$gene\_ann_check
 	fi
 
-	# obtain the name of the gene
-	gene=${ref%%.fasta}
-	gene=${gene##*/}
+
 
 	printf "MAPPING READS To %s" "$gene"
 
@@ -104,82 +112,82 @@ for ref in $REFDIR/*.fasta; do
 
 		# align paired end reads to reference genome (required indexed reference)
 		# this will take a few minutes
-		if [ ! -f $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam ]; then
+		if [ ! -f $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam ]; then
 			bwa mem -M -t $NUM_CORES $ref $FASTQ/$SAMPLENAME\_$LANE\_R1_001.fastq.gz \
 			$FASTQ/$SAMPLENAME\_$LANE\_R2_001.fastq.gz > \
-			$DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam
+			$DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam
 			touch check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.sam.check
 		elif [ ! -e check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.sam.check ]; then
-			rm $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam
+			rm $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam
 			bwa mem -M -t $NUM_CORES $ref $FASTQ/$SAMPLENAME\_$LANE\_R1_001.fastq.gz \
 			$FASTQ/$SAMPLENAME\_$LANE\_R2_001.fastq.gz > \
-			$DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam
+			$DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam
 			touch check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.sam.check
 		fi
 
 		# convert SAM to BAM and fix read pairing information and flags
 		# this will take a minute
-		if [ ! -f $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam ]; then
-			samtools fixmate -O bam $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam \
-			$DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam
+		if [ ! -f $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam ]; then
+			samtools fixmate -O bam $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam \
+			$DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam
 			touch check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.bam.check
 		elif [ ! -e check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.bam.check ]; then
-			rm $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam
-			samtools fixmate -O bam $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam \
-			$DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam
+			rm $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam
+			samtools fixmate -O bam $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam \
+			$DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam
 			touch check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.bam.check			
 		fi
 
 		# sort BAM file
-		if [ ! -f $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam ]; then
-			samtools sort -T $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped -O bam \
-			-@ $NUM_CORES -o $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam \
-			$DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam
+		if [ ! -f $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam ]; then
+			samtools sort -T $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped -O bam \
+			-@ $NUM_CORES -o $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam \
+			$DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam
 			touch check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.bam.sort.check
 		elif [ ! -e check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.bam.sort.check ]; then
-			rm $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam
-			samtools sort -T $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped -O bam \
-			-@ $NUM_CORES -o $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam \
-			$DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam
+			rm $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam
+			samtools sort -T $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped -O bam \
+			-@ $NUM_CORES -o $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam \
+			$DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam
 			touch check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.bam.sort.check
 		fi
 
 	done
 
 	# create list of files to merge from different lanes
-	if [ ! -f $DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt ]; then
-		ls $DIR/01_mapped/$SAMPLENAME.$gene\_L00*.mapped.sorted.bam >> \
-		$DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt
+	if [ ! -f $DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt ]; then
+		ls $DIR/typing_mapped/$SAMPLENAME.$gene\_L00*.mapped.sorted.bam >> \
+		$DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt
 	else
-		rm $DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt
-		ls $DIR/01_mapped/$SAMPLENAME.$gene\_L00*.mapped.sorted.bam >> \
-		$DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt
+		rm $DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt
+		ls $DIR/typing_mapped/$SAMPLENAME.$gene\_L00*.mapped.sorted.bam >> \
+		$DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt
 	fi
 
 
 
 	# merge data from multiple lanes if needed
 	# this will take a minute
 	if [ "$LANES" -gt 1 ]; then
-		if [ ! -f $DIR/02_merged/$SAMPLENAME.$gene.merged.bam ]; then
-			samtools merge -f $DIR/02_merged/$SAMPLENAME.$gene.merged.bam \
-			-b $DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt
+		if [ ! -f $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam ]; then
+			samtools merge -f $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam \
+			-b $DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt
 			touch check_tmp_typing/.$SAMPLENAME.$gene.bam.merge.check
 		elif [ ! -e check_tmp_typing/.$SAMPLENAME.$gene.bam.merge.check ]; then
-			rm $DIR/02_merged/$SAMPLENAME.$gene.merged.bam
-			samtools merge -f $DIR/02_merged/$SAMPLENAME.$gene.merged.bam \
-			-b $DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt
+			rm $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam
+			samtools merge -f $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam \
+			-b $DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt
 			touch check_tmp_typing/.$SAMPLENAME.$gene.bam.merge.check
 		fi
 	else
-		if [ ! -f $DIR/02_merged/$SAMPLENAME.$gene.merged.bam ]; then
-			cp $DIR/01_mapped/$SAMPLENAME.$gene\_L001.mapped.sorted.bam \
-			$DIR/02_merged/$SAMPLENAME.$gene.merged.bam
+		if [ ! -f $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam ]; then
+			cp $DIR/typing_mapped/$SAMPLENAME.$gene\_L001.mapped.sorted.bam \
+			$DIR/typing_merged/$SAMPLENAME.$gene.merged.bam
 			touch check_tmp_typing/.$SAMPLENAME.$gene.bam.merge.check
 		elif [ ! -e check_tmp_typing/.$SAMPLENAME.$gene.bam.merge.check ]; then
-			rm $DIR/02_merged/$SAMPLENAME.$gene.merged.bam
-			cp $DIR/01_mapped/$SAMPLENAME.$gene\_L001.mapped.sorted.bam \
-			$DIR/02_merged/$SAMPLENAME.$gene.merged.bam
+			rm $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam
+			cp $DIR/typing_mapped/$SAMPLENAME.$gene\_L001.mapped.sorted.bam \
+			$DIR/typing_merged/$SAMPLENAME.$gene.merged.bam
 			touch check_tmp_typing/.$SAMPLENAME.$gene.bam.merge.check
 		fi
 	fi
@@ -189,32 +197,32 @@ for ref in $REFDIR/*.fasta; do
 	samtools faidx $ref
 
 	# generate a depth file//
-	if [ ! -f $DIR/02_merged/$SAMPLENAME.$gene.merged.depth ]; then
-		samtools depth -a $DIR/02_merged/$SAMPLENAME.$gene.merged.bam > \
-		$DIR/02_merged/$SAMPLENAME.$gene.merged.depth
+	if [ ! -f $DIR/typing_merged/$SAMPLENAME.$gene.merged.depth ]; then
+		samtools depth -a $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam > \
+		$DIR/typing_merged/$SAMPLENAME.$gene.merged.depth
 		touch check_tmp_typing/.$SAMPLENAME.$gene.depth.check
 	elif [ ! -e check_tmp_typing/.$SAMPLENAME.$gene.depth.check ]; then
-		rm $DIR/02_merged/$SAMPLENAME.$gene.merged.depth
-		samtools depth -a $DIR/02_merged/$SAMPLENAME.$gene.merged.bam > \
-		$DIR/02_merged/$SAMPLENAME.$gene.merged.depth
+		rm $DIR/typing_merged/$SAMPLENAME.$gene.merged.depth
+		samtools depth -a $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam > \
+		$DIR/typing_merged/$SAMPLENAME.$gene.merged.depth
 		touch check_tmp_typing/.$SAMPLENAME.$gene.depth.check
 	fi
 	# calculate number of reads mapped
-	reads_mapped=$(samtools view -c -F 4 $DIR/02_merged/$SAMPLENAME.$gene.merged.bam)
+	reads_mapped=$(samtools view -c -F 4 $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam)
 
 	# calculate fraction of reads mapped
 	# first calculate number of reads
-	reads=$(samtools view -c $DIR/02_merged/$SAMPLENAME.$gene.merged.bam)
+	reads=$(samtools view -c $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam)
 	frac_mapped=$(echo "scale=7; $reads_mapped / $reads" | bc)
 
 	# calculate mean depth
 	mean_depth=$(awk '{ total += $3 } END { print total/NR }' \
-		$DIR/02_merged/$SAMPLENAME.$gene.merged.depth)
+		$DIR/typing_merged/$SAMPLENAME.$gene.merged.depth)
 
 	# calculate percent of gene covered
 	pos_covered=$(awk -v MIN_COV=$MIN_COV 'BEGIN {count = 0} $3 > MIN_COV {count++} END {print count}' \
-		$DIR/02_merged/$SAMPLENAME.$gene.merged.depth)
-	total_pos=$(wc -l < $DIR/02_merged/$SAMPLENAME.$gene.merged.depth)
+		$DIR/typing_merged/$SAMPLENAME.$gene.merged.depth)
+	total_pos=$(wc -l < $DIR/typing_merged/$SAMPLENAME.$gene.merged.depth)
 	frac_covered=$(echo "scale=7; $pos_covered / $total_pos" | bc)
 
 	# dump all data into a file
@@ -224,26 +232,15 @@ for ref in $REFDIR/*.fasta; do
 
 
 	#remove all intermediate files
-	rm $DIR/01_mapped/$SAMPLENAME.$gene\_L00*.mapped.sam
-	rm $DIR/01_mapped/$SAMPLENAME.$gene\_L00*.mapped.bam
-	rm $DIR/01_mapped/$SAMPLENAME.$gene\_L00*.mapped.sorted.bam
-	rm $DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt
-	rm $DIR/02_merged/$SAMPLENAME.$gene.merged.bam 
-	rm $DIR/02_merged/$SAMPLENAME.$gene.merged.depth
+	rm $DIR/typing_mapped/$SAMPLENAME.$gene\_L00*.mapped.sam
+	rm $DIR/typing_mapped/$SAMPLENAME.$gene\_L00*.mapped.bam
+	rm $DIR/typing_mapped/$SAMPLENAME.$gene\_L00*.mapped.sorted.bam
+	rm $DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt
+	rm $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam 
+	rm $DIR/typing_merged/$SAMPLENAME.$gene.merged.depth
 
 done
 
 
-# ---------------- Quality check ---------------- #
-
-blank_count=$(awk -F'\t' 'BEGIN{count = 0} {for(i = 0; i <= NF; i++) if ($i == "") count++} END{print count}' \
-$OUTPUT/$SAMPLENAME\_typing_summary.txt)
-
-file_size=$(ls -lh $OUTPUT/$SAMPLENAME\_typing_summary.txt | awk '{print $5}')
-
-if [[ $blank_count != 0 || $file_size -eq 0 ]]; then
-	echo "WARNING: empty result or empty file"
-else
-	printf "%s typing summary created successfully" "$SAMPLENAME"
-
+echo "typing finished"
 date
diff --git a/typing_instruction.md b/typing_instruction.md
@@ -0,0 +1,53 @@
+## Vibrio cholerae typing from Illumina data
+
+This is a guide for taking reads generated on an Illumina sequencer and mapping them against specific genes (_ctxA_, _wbeO1_, _wbfO139_, _tcpA_ classical, _tcpA_ El Tor, _toxR_) for the purpose of determining sample serotype. 
+
+## Setting up your working directory
+
+This script assumpes the following:
+
+* That you have already set up your working directory as described in the [assembly instructions](https://github.com/HopkinsIDD/illumina-vc/blob/main/instruction.md). Make sure you know where the raw fastq files are stored and make sure they follow the [naming convention](https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/NamingConvention_FASTQ-files-swBS.htm)
+
+* That the `scripts` folder inside your working directory contains the `map_genes.sh` script.
+
+* That all six gene-specific reference FASTAs are contained in a directory called `typing`. If you cloned this git repository, these reference files can be found in `ref_genomes/typing`.
+
+Once you have the required files in your working directory, navigate to this directory and create a new folder where you will store your typing results:
+
+```
+cd my-project
+```
+
+```
+mkdir typing_results
+```
+
+## Running the typing script
+
+
+Run the typing script as follows:
+
+```
+bash scripts/map_genes.sh sample-name my-project path-to-raw-data num-lanes num-cores ref-dir output-dir
+```
+
+Where `sample-name` is the 'SampleName' prior to '_L00...' in your fastq files,`my-project` is the absolute path to your project directory, and `path-to-raw-data` is the path to your raw fastq files. `num-lanes` is the number of lanes you used to run this sample. `num-cores` should be less or equal to [the number of processors on your computer](https://github.com/HopkinsIDD/illumina-vc/blob/main/instruction.md/#determining-the-number-of-processors-on-your-computer).`ref-dir` should be the directory that contains the gene-specific FASTA files (usually `my-project/ref_genomes/typing`). `output-dir` will be the folder to store typing results (`my-project/typing_results`).
+
+
+
+
+## Evaluating typing results
+
+The typing script will produce one output file (`samplename_typing_summary.txt`). This file will look something like this:
+
+```
+SAMPLENAME      gene    reads_mapped    reads   frac_mapped     mean_depth      pos_covered     total_pos       frac_covered
+ERR3039943      ctxA    96      588182  .0001632        27.4582 777     777     1.0000000
+ERR3039943      tcpA_classical  87      588182  .0001479        9.74046 1048    1048    1.0000000
+ERR3039943      tcpA_eltor      96      588182  .0001632        28.3126 675     675     1.0000000
+ERR3039943      toxR    134     588182  .0002278        30.8923 891     891     1.0000000
+ERR3039943      wbeO1   3660    588185  .0062225        38.4333 21122   21122   1.0000000
+ERR3039943      wbfO139 1923    588187  .0032693        9.17084 46721   46721   1.0000000
+```
+
+These files may be easier to view in a spreadsheet software such as Excel.