diff --git a/instruction.md b/instruction.md index ff51e71..4c1e4df 100644 --- a/instruction.md +++ b/instruction.md @@ -47,7 +47,7 @@ git clone https://github.com/HopkinsIDD/illumina-vc.git . ### Finding your raw data -Some of the commands below require you to know where your raw Illumina data is stored. When you started your Illumina run, you selected a data folder in which to store the results. Figure out the path to your raw data and provide it below whenever the command calls for the **path-to-raw- data**. +Some of the commands below require you to know where your raw Illumina data is stored. When you started your Illumina run, you selected a data folder in which to store the results. Figure out the path to your raw data and provide it below whenever the command calls for the **path-to-raw-data**. ### Determining the number of processors on your computer diff --git a/scripts/map_genes.sh b/scripts/map_genes.sh index 7425c24..4700e45 100644 --- a/scripts/map_genes.sh +++ b/scripts/map_genes.sh @@ -6,6 +6,7 @@ set -o pipefail ## ctxA -> cholera toxin; associated with pandemic lineages O1 and O139 ## wbeO1 -> O1 antigen ## wbfO139 -> O139 antigen +## toxR -> virulence regulator gene ## tcpA classical ## tcpA El Tor @@ -17,17 +18,17 @@ set -o pipefail # ---------------- Input Paramters and Files ---------------- # +# sample name +SAMPLENAME=$1 + # path to project directory (create a new directory for typing purposes) -DIR=$1 +DIR=$2 DIR=${DIR%/} # remove trailing slash from directory name if necessary # path to FASTQ data -FASTQ=$2 +FASTQ=$3 FASTQ=${FASTQ%/} # remove trailing slash from directory name if necessary -# sample name -SAMPLENAME=$3 - # indicate number of lanes to merge data over LANES=$4 @@ -52,12 +53,12 @@ for i in $(seq 1 $LANES); do LARRAY+=("L00"$i); done # ensure output directorys exists -if [ ! -d $DIR/01_mapped ]; then - mkdir $DIR/01_mapped +if [ ! -d $DIR/typing_mapped ]; then + mkdir $DIR/typing_mapped fi -if [ ! -d $DIR/02_merged ]; then - mkdir $DIR/02_merged +if [ ! -d $DIR/typing_merged ]; then + mkdir $DIR/typing_merged fi if [ ! -d $OUTPUT ]; then @@ -74,6 +75,11 @@ if [ -f $OUTPUT/$SAMPLENAME\_typing_summary.txt ]; then rm $OUTPUT/$SAMPLENAME\_typing_summary.txt fi + +printf "SAMPLENAME\tgene\treads_mapped\treads\tfrac_mapped\tmean_depth\tpos_covered\ttotal_pos\tfrac_covered\n" >> \ + $OUTPUT/$SAMPLENAME\_typing_summary.txt + + # ---------------- Map reads to reference and calculate metrics ---------------- # # loop through all typing genes @@ -82,18 +88,20 @@ date for ref in $REFDIR/*.fasta; do + # obtain the name of the gene + gene=${ref%%.fasta} + gene=${gene##*/} + if [ ! -f $ref.ann ]; then bwa index $ref - touch check_tmp_typing/.$ref\_ann_check - elif [ ! -e check_tmp_typing/.$ref\_ann_check ]; then + touch check_tmp_typing/.$gene\_ann_check + elif [ ! -e check_tmp_typing/.$gene\_ann_check ]; then rm $ref.ann bwa index $ref - touch check_tmp_typing/.$ref\_ann_check + touch check_tmp_typing/.$gene\_ann_check fi - # obtain the name of the gene - gene=${ref%%.fasta} - gene=${gene##*/} + printf "MAPPING READS To %s" "$gene" @@ -104,56 +112,56 @@ for ref in $REFDIR/*.fasta; do # align paired end reads to reference genome (required indexed reference) # this will take a few minutes - if [ ! -f $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam ]; then + if [ ! -f $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam ]; then bwa mem -M -t $NUM_CORES $ref $FASTQ/$SAMPLENAME\_$LANE\_R1_001.fastq.gz \ $FASTQ/$SAMPLENAME\_$LANE\_R2_001.fastq.gz > \ - $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam + $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam touch check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.sam.check elif [ ! -e check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.sam.check ]; then - rm $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam + rm $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam bwa mem -M -t $NUM_CORES $ref $FASTQ/$SAMPLENAME\_$LANE\_R1_001.fastq.gz \ $FASTQ/$SAMPLENAME\_$LANE\_R2_001.fastq.gz > \ - $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam + $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam touch check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.sam.check fi # convert SAM to BAM and fix read pairing information and flags # this will take a minute - if [ ! -f $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam ]; then - samtools fixmate -O bam $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam \ - $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam + if [ ! -f $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam ]; then + samtools fixmate -O bam $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam \ + $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam touch check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.bam.check elif [ ! -e check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.bam.check ]; then - rm $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam - samtools fixmate -O bam $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam \ - $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam + rm $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam + samtools fixmate -O bam $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sam \ + $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam touch check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.bam.check fi # sort BAM file - if [ ! -f $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam ]; then - samtools sort -T $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped -O bam \ - -@ $NUM_CORES -o $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam \ - $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam + if [ ! -f $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam ]; then + samtools sort -T $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped -O bam \ + -@ $NUM_CORES -o $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam \ + $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam touch check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.bam.sort.check elif [ ! -e check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.bam.sort.check ]; then - rm $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam - samtools sort -T $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped -O bam \ - -@ $NUM_CORES -o $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam \ - $DIR/01_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam + rm $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam + samtools sort -T $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped -O bam \ + -@ $NUM_CORES -o $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.sorted.bam \ + $DIR/typing_mapped/$SAMPLENAME.$gene\_$LANE.mapped.bam touch check_tmp_typing/.$SAMPLENAME.$gene\_$LANE.bam.sort.check fi done # create list of files to merge from different lanes - if [ ! -f $DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt ]; then - ls $DIR/01_mapped/$SAMPLENAME.$gene\_L00*.mapped.sorted.bam >> \ - $DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt + if [ ! -f $DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt ]; then + ls $DIR/typing_mapped/$SAMPLENAME.$gene\_L00*.mapped.sorted.bam >> \ + $DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt else - rm $DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt - ls $DIR/01_mapped/$SAMPLENAME.$gene\_L00*.mapped.sorted.bam >> \ - $DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt + rm $DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt + ls $DIR/typing_mapped/$SAMPLENAME.$gene\_L00*.mapped.sorted.bam >> \ + $DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt fi @@ -161,25 +169,25 @@ for ref in $REFDIR/*.fasta; do # merge data from multiple lanes if needed # this will take a minute if [ "$LANES" -gt 1 ]; then - if [ ! -f $DIR/02_merged/$SAMPLENAME.$gene.merged.bam ]; then - samtools merge -f $DIR/02_merged/$SAMPLENAME.$gene.merged.bam \ - -b $DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt + if [ ! -f $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam ]; then + samtools merge -f $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam \ + -b $DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt touch check_tmp_typing/.$SAMPLENAME.$gene.bam.merge.check elif [ ! -e check_tmp_typing/.$SAMPLENAME.$gene.bam.merge.check ]; then - rm $DIR/02_merged/$SAMPLENAME.$gene.merged.bam - samtools merge -f $DIR/02_merged/$SAMPLENAME.$gene.merged.bam \ - -b $DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt + rm $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam + samtools merge -f $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam \ + -b $DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt touch check_tmp_typing/.$SAMPLENAME.$gene.bam.merge.check fi else - if [ ! -f $DIR/02_merged/$SAMPLENAME.$gene.merged.bam ]; then - cp $DIR/01_mapped/$SAMPLENAME.$gene\_L001.mapped.sorted.bam \ - $DIR/02_merged/$SAMPLENAME.$gene.merged.bam + if [ ! -f $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam ]; then + cp $DIR/typing_mapped/$SAMPLENAME.$gene\_L001.mapped.sorted.bam \ + $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam touch check_tmp_typing/.$SAMPLENAME.$gene.bam.merge.check elif [ ! -e check_tmp_typing/.$SAMPLENAME.$gene.bam.merge.check ]; then - rm $DIR/02_merged/$SAMPLENAME.$gene.merged.bam - cp $DIR/01_mapped/$SAMPLENAME.$gene\_L001.mapped.sorted.bam \ - $DIR/02_merged/$SAMPLENAME.$gene.merged.bam + rm $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam + cp $DIR/typing_mapped/$SAMPLENAME.$gene\_L001.mapped.sorted.bam \ + $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam touch check_tmp_typing/.$SAMPLENAME.$gene.bam.merge.check fi fi @@ -189,32 +197,32 @@ for ref in $REFDIR/*.fasta; do samtools faidx $ref # generate a depth file// - if [ ! -f $DIR/02_merged/$SAMPLENAME.$gene.merged.depth ]; then - samtools depth -a $DIR/02_merged/$SAMPLENAME.$gene.merged.bam > \ - $DIR/02_merged/$SAMPLENAME.$gene.merged.depth + if [ ! -f $DIR/typing_merged/$SAMPLENAME.$gene.merged.depth ]; then + samtools depth -a $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam > \ + $DIR/typing_merged/$SAMPLENAME.$gene.merged.depth touch check_tmp_typing/.$SAMPLENAME.$gene.depth.check elif [ ! -e check_tmp_typing/.$SAMPLENAME.$gene.depth.check ]; then - rm $DIR/02_merged/$SAMPLENAME.$gene.merged.depth - samtools depth -a $DIR/02_merged/$SAMPLENAME.$gene.merged.bam > \ - $DIR/02_merged/$SAMPLENAME.$gene.merged.depth + rm $DIR/typing_merged/$SAMPLENAME.$gene.merged.depth + samtools depth -a $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam > \ + $DIR/typing_merged/$SAMPLENAME.$gene.merged.depth touch check_tmp_typing/.$SAMPLENAME.$gene.depth.check fi # calculate number of reads mapped - reads_mapped=$(samtools view -c -F 4 $DIR/02_merged/$SAMPLENAME.$gene.merged.bam) + reads_mapped=$(samtools view -c -F 4 $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam) # calculate fraction of reads mapped # first calculate number of reads - reads=$(samtools view -c $DIR/02_merged/$SAMPLENAME.$gene.merged.bam) + reads=$(samtools view -c $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam) frac_mapped=$(echo "scale=7; $reads_mapped / $reads" | bc) # calculate mean depth mean_depth=$(awk '{ total += $3 } END { print total/NR }' \ - $DIR/02_merged/$SAMPLENAME.$gene.merged.depth) + $DIR/typing_merged/$SAMPLENAME.$gene.merged.depth) # calculate percent of gene covered pos_covered=$(awk -v MIN_COV=$MIN_COV 'BEGIN {count = 0} $3 > MIN_COV {count++} END {print count}' \ - $DIR/02_merged/$SAMPLENAME.$gene.merged.depth) - total_pos=$(wc -l < $DIR/02_merged/$SAMPLENAME.$gene.merged.depth) + $DIR/typing_merged/$SAMPLENAME.$gene.merged.depth) + total_pos=$(wc -l < $DIR/typing_merged/$SAMPLENAME.$gene.merged.depth) frac_covered=$(echo "scale=7; $pos_covered / $total_pos" | bc) # dump all data into a file @@ -224,26 +232,15 @@ for ref in $REFDIR/*.fasta; do #remove all intermediate files - rm $DIR/01_mapped/$SAMPLENAME.$gene\_L00*.mapped.sam - rm $DIR/01_mapped/$SAMPLENAME.$gene\_L00*.mapped.bam - rm $DIR/01_mapped/$SAMPLENAME.$gene\_L00*.mapped.sorted.bam - rm $DIR/01_mapped/$SAMPLENAME.$gene.lanebams.txt - rm $DIR/02_merged/$SAMPLENAME.$gene.merged.bam - rm $DIR/02_merged/$SAMPLENAME.$gene.merged.depth + rm $DIR/typing_mapped/$SAMPLENAME.$gene\_L00*.mapped.sam + rm $DIR/typing_mapped/$SAMPLENAME.$gene\_L00*.mapped.bam + rm $DIR/typing_mapped/$SAMPLENAME.$gene\_L00*.mapped.sorted.bam + rm $DIR/typing_mapped/$SAMPLENAME.$gene.lanebams.txt + rm $DIR/typing_merged/$SAMPLENAME.$gene.merged.bam + rm $DIR/typing_merged/$SAMPLENAME.$gene.merged.depth done -# ---------------- Quality check ---------------- # - -blank_count=$(awk -F'\t' 'BEGIN{count = 0} {for(i = 0; i <= NF; i++) if ($i == "") count++} END{print count}' \ -$OUTPUT/$SAMPLENAME\_typing_summary.txt) - -file_size=$(ls -lh $OUTPUT/$SAMPLENAME\_typing_summary.txt | awk '{print $5}') - -if [[ $blank_count != 0 || $file_size -eq 0 ]]; then - echo "WARNING: empty result or empty file" -else - printf "%s typing summary created successfully" "$SAMPLENAME" - +echo "typing finished" date diff --git a/typing_instruction.md b/typing_instruction.md new file mode 100644 index 0000000..925aa37 --- /dev/null +++ b/typing_instruction.md @@ -0,0 +1,53 @@ +## Vibrio cholerae typing from Illumina data + +This is a guide for taking reads generated on an Illumina sequencer and mapping them against specific genes (_ctxA_, _wbeO1_, _wbfO139_, _tcpA_ classical, _tcpA_ El Tor, _toxR_) for the purpose of determining sample serotype. + +## Setting up your working directory + +This script assumpes the following: + +* That you have already set up your working directory as described in the [assembly instructions](https://github.com/HopkinsIDD/illumina-vc/blob/main/instruction.md). Make sure you know where the raw fastq files are stored and make sure they follow the [naming convention](https://support.illumina.com/help/BaseSpace_OLH_009008/Content/Source/Informatics/BS/NamingConvention_FASTQ-files-swBS.htm) + +* That the `scripts` folder inside your working directory contains the `map_genes.sh` script. + +* That all six gene-specific reference FASTAs are contained in a directory called `typing`. If you cloned this git repository, these reference files can be found in `ref_genomes/typing`. + +Once you have the required files in your working directory, navigate to this directory and create a new folder where you will store your typing results: + +``` +cd my-project +``` + +``` +mkdir typing_results +``` + +## Running the typing script + + +Run the typing script as follows: + +``` +bash scripts/map_genes.sh sample-name my-project path-to-raw-data num-lanes num-cores ref-dir output-dir +``` + +Where `sample-name` is the 'SampleName' prior to '_L00...' in your fastq files,`my-project` is the absolute path to your project directory, and `path-to-raw-data` is the path to your raw fastq files. `num-lanes` is the number of lanes you used to run this sample. `num-cores` should be less or equal to [the number of processors on your computer](https://github.com/HopkinsIDD/illumina-vc/blob/main/instruction.md/#determining-the-number-of-processors-on-your-computer).`ref-dir` should be the directory that contains the gene-specific FASTA files (usually `my-project/ref_genomes/typing`). `output-dir` will be the folder to store typing results (`my-project/typing_results`). + + + + +## Evaluating typing results + +The typing script will produce one output file (`samplename_typing_summary.txt`). This file will look something like this: + +``` +SAMPLENAME gene reads_mapped reads frac_mapped mean_depth pos_covered total_pos frac_covered +ERR3039943 ctxA 96 588182 .0001632 27.4582 777 777 1.0000000 +ERR3039943 tcpA_classical 87 588182 .0001479 9.74046 1048 1048 1.0000000 +ERR3039943 tcpA_eltor 96 588182 .0001632 28.3126 675 675 1.0000000 +ERR3039943 toxR 134 588182 .0002278 30.8923 891 891 1.0000000 +ERR3039943 wbeO1 3660 588185 .0062225 38.4333 21122 21122 1.0000000 +ERR3039943 wbfO139 1923 588187 .0032693 9.17084 46721 46721 1.0000000 +``` + +These files may be easier to view in a spreadsheet software such as Excel.