Skip to content

Commit

Permalink
Monroe ONT Updates (#47)
Browse files Browse the repository at this point in the history
* added wdl fastq parser

* monroe ont redesign, dropped nanopolish updated to use medaka models
  • Loading branch information
k-florek authored Apr 16, 2021
1 parent b518df2 commit aa1aebf
Show file tree
Hide file tree
Showing 9 changed files with 110 additions and 130 deletions.
8 changes: 2 additions & 6 deletions staphb_toolkit/core/docker_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -25,12 +25,8 @@
"image": "docker://staphb/ariba",
"tag": "latest"
},
"artic-ncov2019-medaka": {
"image": "docker://staphb/artic-ncov2019-medaka",
"tag": "latest"
},
"artic-ncov2019-nanopolish": {
"image": "docker://staphb/artic-ncov2019-nanopolish",
"artic-ncov2019": {
"image": "docker://staphb/artic-ncov2019",
"tag": "latest"
},
"augur": {
Expand Down
42 changes: 42 additions & 0 deletions staphb_toolkit/core/wdl_fastq_input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env python3

import re
import os,sys

#search a directory and return a dictonary of paired fastq files named by fastq name
def pe_search(path):
if not os.path.isdir(path):
raise ValueError(path + " " + "not found.")
path = os.path.abspath(path)
fastqs = []
fastq_dict = {}
for root,dirs,files in os.walk(path):
for file in files:
if '.fastq' in file or '.fastq.gz' in file or '.fq.gz' in file or '.fq' in file:
fastqs.append(file)
fastqs.sort()
if not len(fastqs)%2 == 0:
print("There is an uneven number of Fastq files in: '"+path+"'")
sys.exit(1)

for readA, readB in zip(fastqs[0::2], fastqs[1::2]):
samplename = re.split("(_R1)|(_1)",readA)[0]
fastq_dict[samplename] = {"read_1":readA,"read_2":readB}
return(fastq_dict)

#search a directory and return a dictonary of fastq files named by fastq name
def se_search(path):
if not os.path.isdir(path):
raise ValueError(path + " " + "not found.")
path = os.path.abspath(path)
fastqs = []
fastq_dict = {}
for root,dirs,files in os.walk(path):
for file in files:
if '.fastq' in file or '.fastq.gz' in file or '.fq.gz' in file or '.fq' in file:
fastqs.append(file)
fastqs.sort()
for read in fastqs:
samplename = re.split("(.fastq)|(.fq)",read)[0]
fastq_dict[samplename] = read
return(fastq_dict)
31 changes: 11 additions & 20 deletions staphb_toolkit/toolkit_workflows.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,13 +73,12 @@ def error(self, message):
##monroe_ont_assembly----------------------------
##medaka nanopolish
subparser_monroe_ont_assembly = monroe_subparsers.add_parser('ont_assembly',help='Assembly SARS-CoV-2 genomes from ONT read data generated from ARTIC amplicons', add_help=False)
subparser_monroe_ont_assembly.add_argument('fast5_path', type=str,help="path to the location of the reads in a fast5 format")
subparser_monroe_ont_assembly.add_argument('--fastq_path', type=str,help="path to the location of the reads in a fastq format, needed if not performing bascalling")
subparser_monroe_ont_assembly.add_argument('--polish_method',type=str,choices=["medaka","nanopolish"],help="polishing method, default: medaka",default="medaka")
subparser_monroe_ont_assembly.add_argument('--summary', type=str,help="path to the location of the sequencing summary, only needed when using nanopolish from fastq data")
subparser_monroe_ont_assembly.add_argument('--fast5_path', type=str,help="path to the location of the reads in a fast5 format")
subparser_monroe_ont_assembly.add_argument('--fastq_path', type=str,help="path to the location of the reads in a fastq format")
subparser_monroe_ont_assembly.add_argument('--demultiplexed', default="false",action="store_const",const="true",help="flag if fastq files have already been demultiplexed")
subparser_monroe_ont_assembly.add_argument('--medaka_model',default="r941_min_high_g360",help="medaka model to use for polishing, default: r941_min_fast_g303")
subparser_monroe_ont_assembly.add_argument('--run_prefix', type=str,help="desired run prefix. Default \"artic_ncov19\"",default="artic_ncov19")
subparser_monroe_ont_assembly.add_argument('--ont_basecalling', default=False, action="store_true",help="perform high accuracy basecalling using GPU (only use if you have setup a GPU compatable device)")
subparser_monroe_ont_assembly.add_argument('--primers', type=str,choices=["V1", "V2", "V3"], help="indicate which ARTIC primers were used (V1, V2, or V3)",required=True)
subparser_monroe_ont_assembly.add_argument('--primers', type=str,choices=["V1", "V2", "V3"], help="indicate which ARTIC primers were used (V1, V2, or V3), default V3",default="V3")
subparser_monroe_ont_assembly.add_argument('--config','-c', type=str,help="Nextflow custom configuration.")
subparser_monroe_ont_assembly.add_argument('--output','-o',metavar="<output_path>",type=str,help="Path to ouput directory, default \"monroe_results\".",default="monroe_results")
subparser_monroe_ont_assembly.add_argument('--resume', default="", action="store_const",const="-resume",help="resume a previous run")
Expand Down Expand Up @@ -359,24 +358,16 @@ def error(self, message):

if args.monroe_command == 'ont_assembly':
#build command
if args.ont_basecalling:
read_paths = f"--basecalling --fast5_dir {args.fast5_path}"
elif args.fast5_path and args.fastq_path:
read_paths = f"--fast5_dir {args.fast5_path} --fastq_dir {args.fastq_path}"
if args.fast5_path:
read_paths = f"--fast5_dir {args.fast5_path}"
elif args.fastq_path:
read_paths = f"--fastq_dir {args.fastq_path}"
else:
subparser_monroe_ont_assembly.print_help()
print("Please provide path to both fastq and fast5 or perform basecalling.")
print("Please provide path to fast5 files to perform basecalling or fastq files, note: basecalling requires GPU")
sys.exit(1)

if args.polish_method == "nanopolish" and args.summary:
seq_summary = f"--sequencing_summary {args.summary}"
elif args.polish_method == "nanopolish" and not args.summary and not args.ont_basecalling:
print("Nanopolish requires a sequencing summary file generated during basecalling.")
sys.exit(1)
else:
seq_summary = ""

command = nextflow_path + f" {config} run {monroe_path}/monroe_ont_assembly.nf {profile} {args.resume} {read_paths} --pipe ont {seq_summary} --polishing {args.polish_method} --primers {args.primers} --outdir {args.output} --run_prefix {args.run_prefix} -with-trace {args.output}/logs/{exec_time}Monroe_trace.txt -with-report {args.output}/logs/{exec_time}Monroe_execution_report.html {work}"
command = nextflow_path + f" {config} run {monroe_path}/monroe_ont_assembly.nf {profile} {args.resume} {read_paths} --pipe ont --primers {args.primers} --outdir {args.output} --run_prefix {args.run_prefix} --medaka_model {args.medaka_model} --demultiplexed {args.demultiplexed} -with-trace {args.output}/logs/{exec_time}Monroe_trace.txt -with-report {args.output}/logs/{exec_time}Monroe_execution_report.html {work}"
#run command using nextflow in a subprocess
print("Starting the Monroe ONT assembly:")
child = pexpect.spawn(command)
Expand Down
5 changes: 1 addition & 4 deletions staphb_toolkit/workflows/monroe/configs/docker.config
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,9 @@ process {
stageInMode = 'copy'
}
withName:artic_guppyplex{
container = artic_nanopolish_container
container = artic_medaka_container
stageInMode = 'copy'
}
withName:artic_nanopolish_pipeline{
container = artic_nanopolish_container
}
withName:artic_medaka_pipeline{
container = artic_medaka_container
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ trimmomatic_container = 'staphb/trimmomatic:0.39'
bbtools_container = 'staphb/bbtools:38.76'
guppy_gpu_container = "genomicpariscentre/guppy-gpu"
guppy_cpu_container = "genomicpariscentre/guppy"
artic_nanopolish_container = "staphb/artic-ncov2019-nanopolish:1.1.0"
artic_medaka_container = "staphb/artic-ncov2019-medaka:1.1.0"
artic_medaka_container = "staphb/artic-ncov2019:1.3.0"
pangolin_container = "staphb/pangolin:2.3.0-pangolearn-2021-02-18"
theiagen_artic_medaka_container = "theiagen/artic-ncov2019:1.1.3"
16 changes: 5 additions & 11 deletions staphb_toolkit/workflows/monroe/configs/ont_user_config.config
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,9 @@

guppy_gpu_container = "genomicpariscentre/guppy-gpu"
guppy_cpu_container = "genomicpariscentre/guppy"
artic_nanopolish_container = "staphb/artic-ncov2019-nanopolish:1.1.0"
artic_medaka_container = "staphb/artic-ncov2019-medaka:1.1.0"
artic_medaka_container = "staphb/artic-ncov2019:1.3.0"
pangolin_container = "staphb/pangolin:2.3.0-pangolearn-2021-02-18"
samtools_container = 'staphb/samtools:1.10'
samtools_container = "staphb/samtools:1.10"

//#####################
//###Pipeline Params###
Expand All @@ -68,9 +67,9 @@ params.chunk_size = 500
params.min_length = 400
params.max_length = 700

//ARTIC Nanopolish/Medaka Pipeline Parameters
params.polishing = "medaka" // polishing mode "nanopolish" or "medaka"
//ARTIC Medaka Pipeline Parameters
params.normalise = 200
params.medaka_model = "r941_min_high_g360"

process {

Expand All @@ -91,14 +90,9 @@ process {
withName:artic_guppyplex{
cpus = 8
memory = '16 GB'
container = artic_nanopolish_container
container = artic_medaka_container
stageInMode = 'copy'
}
withName:artic_nanopolish_pipeline{
cpus = 8
memory = '16 GB'
container = artic_nanopolish_container
}
withName:artic_medaka_pipeline{
cpus = 8
memory = '16 GB'
Expand Down
5 changes: 1 addition & 4 deletions staphb_toolkit/workflows/monroe/configs/singularity.config
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,7 @@ process {
container = guppy_cpu_container
}
withName:artic_guppyplex{
container = artic_nanopolish_container
}
withName:artic_nanopolish_pipeline{
container = artic_nanopolish_container
container = artic_medaka_container
}
withName:artic_medaka_pipeline{
container = artic_medaka_container
Expand Down
3 changes: 1 addition & 2 deletions staphb_toolkit/workflows/monroe/monroe.config
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,7 @@ params.chunk_size = 500
params.min_length = 400
params.max_length = 700

//ARTIC Nanopolish/Medaka Pipeline Parameters
params.polishing = "medaka" // polishing mode "nanopolish" or "medaka"
//ARTIC Medaka Pipeline Parameters
params.normalise = 200

//#######################
Expand Down
127 changes: 46 additions & 81 deletions staphb_toolkit/workflows/monroe/monroe_ont_assembly.nf
Original file line number Diff line number Diff line change
Expand Up @@ -8,20 +8,15 @@
//starting parameters
params.fast5_dir = ""
params.fastq_dir = ""
params.sequencing_summary = ""
params.outdir = ""
params.primers = ""
params.primers = "V3"
params.run_prefix = "artic_ncov19"
params.pipe = ""

// Input channels
Channel
.value( "${params.primers}")
.ifEmpty { exit 1, "Primers used must be included." }
.set { polish_primers }
params.demultiplexed = false
params.medaka_model = "r941_min_high_g360"

// If we have fast5 files then start with basecalling
if(params.basecalling){
if(params.fast5_dir){
Channel
.fromPath( "${params.fast5_dir}")
.ifEmpty { exit 1, "Cannot find any fast5 files in: ${params.fast5_dir} Path must not end with /" }
Expand Down Expand Up @@ -50,38 +45,36 @@ if(params.basecalling){

// If we have already basecalled get fastqs and fast5s for polishing
else {
Channel
.fromPath( "${params.fastq_dir}/*.fastq*")
.ifEmpty { exit 1, "Cannot find any fastq files in: ${params.fastq_dir} Path must not end with /" }
.set { fastq_reads }

Channel
.fromPath( "${params.fast5_dir}")
.ifEmpty { exit 1, "Cannot find any fast5 files in: ${params.fast5_dir} Path must not end with /" }
.set { polish_fast5 }

if(params.polishing == "nanopolish"){
if(params.demultiplexed){
Channel
.fromPath( "${params.fastq_dir}/barcode*",type:'dir')
.ifEmpty { exit 1, "Cannot find any fastq files in: ${params.fastq_dir} Path must not end with /" }
.set { demultiplexed_reads }
}
else{
Channel
.fromPath( "${params.sequencing_summary}")
.ifEmpty { exit 1, "Cannot find sequencing summary in: ${params.sequencing_summary}" }
.set { sequencing_summary }
}
.fromPath( "${params.fastq_dir}/*.fastq*")
.ifEmpty { exit 1, "Cannot find any fastq files in: ${params.fastq_dir} Path must not end with /" }
.set { fastq_reads }
}
}

// Demultiplex fastqs
process guppy_demultiplexing {
publishDir "${params.outdir}/demultiplexing", mode: 'copy'
if(! params.demultiplexed){
// Demultiplex fastqs
process guppy_demultiplexing {
publishDir "${params.outdir}/demultiplexing", mode: 'copy'

input:
file(fastqs) from fastq_reads.collect()
input:
file(fastqs) from fastq_reads.collect()

output:
path("barcode*",type:'dir') into demultiplexed_reads
output:
path("barcode*",type:'dir') into demultiplexed_reads

script:
"""
guppy_barcoder -t ${task.cpus} --require_barcodes_both_ends -i . -s . ${params.demultiplexing_params} -q 0 -r
"""
script:
"""
guppy_barcoder -t ${task.cpus} --require_barcodes_both_ends -i . -s . ${params.demultiplexing_params} -q 0 -r
"""
}
}

// Run artic gupplyplex
Expand All @@ -105,58 +98,30 @@ process artic_guppyplex {
"""
}

// Run artic pipeline using nanopolish
if(params.polishing=="nanopolish"){
process artic_nanopolish_pipeline {
publishDir "${params.outdir}/pipeline_nanopolish", mode: 'copy'
errorStrategy 'ignore'

input:
val primers from polish_primers
tuple file(fastq), path(fast5path), file(sequencing_summary) from polish_files .combine(polish_fast5) .combine(sequencing_summary)

output:
file *.primrtrimmed.bam into alignment_file
file "*{.primertrimmed.bam,.vcf,.variants.tab}"
file "*.consensus.fasta" into consensus_fasta

script:
"""
# get samplename by dropping file extension
filename=${fastq}
samplename=\${filename%.*}
artic minion --normalise ${params.normalise} --threads ${task.cpus} --scheme-directory /artic-ncov2019/primer_schemes --fast5-directory ${fast5path} --sequencing-summary ${sequencing_summary} --read-file ${fastq} nCoV-2019/${primers} \$samplename
"""
}
}

// Run artic pipeline using medaka
else {
process artic_medaka_pipeline {
publishDir "${params.outdir}/pipeline_medaka", mode: 'copy'
publishDir "${params.outdir}/assemblies", mode: 'copy', pattern: '*.fasta'
errorStrategy 'ignore'
process artic_medaka_pipeline {
publishDir "${params.outdir}/pipeline_medaka", mode: 'copy'
publishDir "${params.outdir}/assemblies", mode: 'copy', pattern: '*.fasta'
errorStrategy 'ignore'

input:
val primers from polish_primers
file(fastq) from polish_files
input:
file(fastq) from polish_files

output:
file "*.primertrimmed.rg.sorted.bam" into alignment_file
file "*{.primertrimmed.rg,.primers.vcf,.vcf.gz,.trimmed.rg,.fail.vcf}*"
file "*.consensus.fasta" into consensus_fasta
output:
file "*.primertrimmed.rg.sorted.bam" into alignment_file
file "*{.primertrimmed.rg,.primers.vcf,.vcf.gz,.trimmed.rg,.fail.vcf}*"
file "*.consensus.fasta" into consensus_fasta

script:
"""
# get samplename by dropping file extension
filename=${fastq}
samplename=\${filename%.*}
script:
"""
# get samplename by dropping file extension
filename=${fastq}
samplename=\${filename%.*}
artic minion --medaka --normalise ${params.normalise} --threads ${task.cpus} --scheme-directory /artic-ncov2019/primer_schemes --read-file ${fastq} nCoV-2019/${primers} \$samplename
"""
}
artic minion --medaka --medaka-model ${params.medaka_model} --normalise ${params.normalise} --threads ${task.cpus} --scheme-directory /fieldbioinformatics/test-data/primer-schemes --read-file ${fastq} nCoV-2019/${params.primers} \$samplename
"""
}

//QC of read data
process samtools {
tag "$name"
Expand Down

0 comments on commit aa1aebf

Please sign in to comment.