Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

assembly.assemble_spades: added option to drop contigs shorter than given length. #889

Merged
merged 3 commits into from
Oct 22, 2018
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -352,6 +352,7 @@ def assemble_spades(
spades_opts='',
contigs_trusted=None, contigs_untrusted=None,
filter_contigs=False,
min_contig_len=0,
kmer_sizes=(55,65),
n_reads=10000000,
outReads=None,
Expand All @@ -377,6 +378,7 @@ def assemble_spades(
tools.spades.SpadesTool().assemble(reads_fwd=reads_fwd, reads_bwd=reads_bwd, reads_unpaired=reads_unpaired,
contigs_untrusted=contigs_untrusted, contigs_trusted=contigs_trusted,
contigs_out=out_fasta, filter_contigs=filter_contigs,
min_contig_len=min_contig_len,
kmer_sizes=kmer_sizes, mask_errors=mask_errors, max_kmer_sizes=max_kmer_sizes,
spades_opts=spades_opts, mem_limit_gb=mem_limit_gb,
threads=threads)
Expand All @@ -401,6 +403,8 @@ def parser_assemble_spades(parser=argparse.ArgumentParser()):
parser.add_argument('--outReads', default=None, help='Save the trimmomatic/prinseq/subsamp reads to a BAM file')
parser.add_argument('--filterContigs', dest='filter_contigs', default=False, action='store_true',
help='only output contigs SPAdes is sure of (drop lesser-quality contigs from output)')
parser.add_argument('--minContigLen', dest='min_contig_len', type=int, default=0,
help='only output contigs longer than this many bp')
parser.add_argument('--spadesOpts', dest='spades_opts', default='', help='(advanced) Extra flags to pass to the SPAdes assembler')
parser.add_argument('--memLimitGb', dest='mem_limit_gb', default=4, type=int, help='Max memory to use, in GB (default: %(default)s)')
util.cmd.common_args(parser, (('threads', None), ('loglevel', None), ('version', None), ('tmp_dir', None)))
Expand Down
3 changes: 3 additions & 0 deletions pipes/WDL/workflows/tasks/tasks_assembly.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ task assemble {

Int? trinity_n_reads=250000
Int? spades_n_reads=10000000
Int? spades_min_contig_len=0

String? assembler="trinity" # trinity, spades, or trinity-spades

Expand Down Expand Up @@ -35,6 +36,7 @@ task assemble {
${trim_clip_db} \
${sample_name}.assembly1-spades.fasta \
${'--nReads=' + spades_n_reads} \
${'--minContigLen=' + spades_min_contig_len} \
--memLimitGb $mem_in_gb \
--outReads=${sample_name}.subsamp.bam \
--loglevel=DEBUG
Expand All @@ -54,6 +56,7 @@ task assemble {
${sample_name}.assembly1-spades.fasta \
--contigsUntrusted=${sample_name}.assembly1-trinity.fasta \
${'--nReads=' + spades_n_reads} \
${'--minContigLen=' + spades_min_contig_len} \
--memLimitGb $mem_in_gb \
--loglevel=DEBUG

Expand Down
4 changes: 2 additions & 2 deletions test/unit/test_assembly.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,12 +154,12 @@ def test_assembly(self):
inBam = os.path.join(inDir, '..', 'G5012.3.subset.bam')
clipDb = os.path.join(inDir, 'clipDb.fasta')
with util.file.tempfname('.fasta') as outFasta:
assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, out_fasta=outFasta)
assembly.assemble_spades(in_bam=inBam, clip_db=clipDb, min_contig_len=180, out_fasta=outFasta)
self.assertGreater(os.path.getsize(outFasta), 0)
contig_lens = list(sorted(len(seq.seq) for seq in Bio.SeqIO.parse(outFasta, 'fasta')))
#import sys
#print('test_assembly_contigs_lens:', contig_lens, file=sys.stderr)
self.assertEqual(contig_lens, [168, 170, 177, 180, 184, 187, 190, 191, 194, 197, 211, 243, 244, 247, 288, 328, 348, 430])
self.assertEqual(contig_lens, [180, 184, 187, 190, 191, 194, 197, 211, 243, 244, 247, 288, 328, 348, 430])

def test_assembly_with_previously_assembled_contigs(self):
inDir = util.file.get_test_input_path(self)
Expand Down
26 changes: 24 additions & 2 deletions tools/spades.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@
import shlex
import tempfile

import Bio.SeqIO

import tools
import tools.samtools
import tools.picard
Expand Down Expand Up @@ -41,7 +43,7 @@ def execute(self, args): # pylint: disable=W0221

def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None,
contigs_untrusted=None, kmer_sizes=(55,65), mask_errors=False, max_kmer_sizes=1,
filter_contigs=False, mem_limit_gb=8, threads=None, spades_opts=''):
filter_contigs=False, min_contig_len=0, mem_limit_gb=8, threads=None, spades_opts=''):
'''Assemble contigs from RNA-seq reads and (optionally) pre-existing contigs.

Inputs:
Expand All @@ -57,6 +59,7 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
mask_errors: if True, if spades fails with an error for a kmer size, pretend it just produced no contigs for that kmer size
max_kmer_sizes: if this many kmer sizes succeed, do not try further ones
filter_contigs: if True, outputs only "long and reliable transcripts with rather high expression" (per SPAdes docs)
min_contig_len: drop contigs shorter than this many bp
mem_limit_gb: max memory to use, in gigabytes
threads: number of threads to use
spades_opts: additional options to pass to spades
Expand All @@ -72,6 +75,7 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
threads = util.misc.sanitize_thread_count(threads)

util.file.make_empty(contigs_out)
contigs_cumul_count = 0

if ((reads_fwd and reads_bwd and os.path.getsize(reads_fwd) > 0 and os.path.getsize(reads_bwd) > 0) or
(reads_unpaired and os.path.getsize(reads_unpaired) > 0)):
Expand Down Expand Up @@ -103,8 +107,26 @@ def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, conti
else:
raise

util.file.concat(transcripts_fname, contigs_out, append=True)
if min_contig_len:
transcripts = Bio.SeqIO.parse(transcripts_fname, 'fasta')
transcripts_sans_short = [r for r in transcripts if len(r.seq) >= min_contig_len]
transcripts_fname = os.path.join(spades_dir, 'transcripts_over_{}bp.fasta'.format(min_contig_len))
Bio.SeqIO.write(transcripts_sans_short, transcripts_fname, 'fasta')

contigs_cumul = os.path.join(spades_dir, 'contigs_cumul.{}.fasta'.format(contigs_cumul_count))
contigs_cumul_count += 1

util.file.concat(inputFilePaths=(contigs_out, transcripts_fname), outputFilePath=contigs_cumul, append=True)
shutil.copyfile(contigs_cumul, contigs_out)

if os.path.getsize(transcripts_fname):
kmer_sizes_succeeded += 1
if kmer_sizes_succeeded >= max_kmer_sizes:
break
# end: with util.file.tmp_dir('_spades') as spades_dir
# end: for kmer_size in util.misc.make_seq(kmer_sizes)
# if input non-empty
# end: def assemble(self, reads_fwd, reads_bwd, contigs_out, reads_unpaired=None, contigs_trusted=None, ...)
# end: class SpadesTool(tools.Tool)