Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add uchime2_denovo to close #92 #100

Open
wants to merge 11 commits into
base: dev
Choose a base branch
from
12 changes: 8 additions & 4 deletions q2_vsearch/_chimera.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,8 @@
from ._format import UchimeStatsFmt


_uchime_defaults = {'dn': 1.4,
_uchime_defaults = {'method': 'uchime',
'dn': 1.4,
'mindiffs': 3,
'mindiv': 0.8,
'minh': 0.28,
Expand Down Expand Up @@ -68,26 +69,29 @@ def _uchime_ref(sequences, table, reference_sequences, dn, mindiffs,

def uchime_denovo(sequences: DNAFASTAFormat,
table: biom.Table,
method: str = _uchime_defaults['method'],
dn: float = _uchime_defaults['dn'],
mindiffs: int = _uchime_defaults['mindiffs'],
mindiv: float = _uchime_defaults['mindiv'],
minh: float = _uchime_defaults['minh'],
xn: float = _uchime_defaults['xn']) \
-> (DNAFASTAFormat, DNAFASTAFormat, UchimeStatsFmt):
cmd, chimeras, nonchimeras, uchime_stats = \
_uchime_denovo(sequences, table, dn, mindiffs, mindiv, minh, xn)
_uchime_denovo(sequences, table, method,
dn, mindiffs, mindiv, minh, xn)
return chimeras, nonchimeras, uchime_stats


def _uchime_denovo(sequences, table, dn, mindiffs, mindiv, minh, xn):
def _uchime_denovo(sequences, table, method,
dn, mindiffs, mindiv, minh, xn):
# this function only exists to simplify testing
chimeras = DNAFASTAFormat()
nonchimeras = DNAFASTAFormat()
uchime_stats = UchimeStatsFmt()
with tempfile.NamedTemporaryFile() as fasta_with_sizes:
_fasta_with_sizes(str(sequences), fasta_with_sizes.name, table)
cmd = ['vsearch',
'--uchime_denovo', fasta_with_sizes.name,
'--' + method + '_denovo', fasta_with_sizes.name,
colinbrislawn marked this conversation as resolved.
Show resolved Hide resolved
'--uchimeout', str(uchime_stats),
'--nonchimeras', str(nonchimeras),
'--chimeras', str(chimeras),
Expand Down
31 changes: 21 additions & 10 deletions q2_vsearch/plugin_setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -368,7 +368,7 @@
'nonchimeras': 'The non-chimeric sequences.',
'stats': 'Summary statistics from chimera checking.'
},
name='Reference-based chimera filtering with vsearch.',
name='Reference-based chimera filtering.',
description=('Apply the vsearch uchime_ref method to identify chimeric '
'feature sequences. The results of this method can be used '
'to filter chimeric features from the corresponding feature '
Expand All @@ -382,6 +382,8 @@
'sequences': FeatureData[Sequence],
'table': FeatureTable[Frequency]},
parameters={
'method': qiime2.plugin.Str % qiime2.plugin.Choices(
['uchime', 'uchime2', 'uchime3']),
'dn': qiime2.plugin.Float % qiime2.plugin.Range(0., None),
'mindiffs': qiime2.plugin.Int % qiime2.plugin.Range(1, None),
'mindiv': qiime2.plugin.Float % qiime2.plugin.Range(0., None),
Expand All @@ -401,12 +403,21 @@
'abundances).'),
},
parameter_descriptions={
'method': ('Which algorithm to use.'),
colinbrislawn marked this conversation as resolved.
Show resolved Hide resolved
# 'abskew': ('The abundance skew is used to distinguish in a threeway '
# 'alignment which sequence is the chimera and which are '
# 'the parents. The parent sequences must be this many '
# 'times more abundant than the child sequence to be '
# 'flagged as chimeric.'),
colinbrislawn marked this conversation as resolved.
Show resolved Hide resolved
'dn': ('No vote pseudo-count, corresponding to the parameter n in '
'the chimera scoring function.'),
'mindiffs': 'Minimum number of differences per segment.',
'mindiv': 'Minimum divergence from closest parent.',
'mindiffs': 'Minimum number of differences per segment. '
'Ignored for uchime2 and uchime3.',
'mindiv': 'Minimum divergence from closest parent. '
'Ignored for uchime2 and uchime3.',
colinbrislawn marked this conversation as resolved.
Show resolved Hide resolved
'minh': ('Minimum score (h). Increasing this value tends to reduce '
'the number of false positives and to decrease sensitivity.'),
'the number of false positives and to decrease sensitivity. '
'Ignored for uchime2 and uchime3.'),
'xn': ('No vote weight, corresponding to the parameter beta in the '
'scoring function.'),
},
Expand All @@ -415,12 +426,12 @@
'nonchimeras': 'The non-chimeric sequences.',
'stats': 'Summary statistics from chimera checking.'
},
name='De novo chimera filtering with vsearch.',
description=('Apply the vsearch uchime_denovo method to identify chimeric '
'feature sequences. The results of this method can be used '
'to filter chimeric features from the corresponding feature '
'table. For more details, please refer to the vsearch '
'documentation.')
name='De novo chimera filtering.',
description=('Apply one of the vsearch uchime*_denovo methods to '
'identify chimeric feature sequences. '
'The results of these methods can be used to filter chimeric '
'features from the corresponding feature table. '
'For more details, please refer to the vsearch manual.')
)


Expand Down
3 changes: 3 additions & 0 deletions q2_vsearch/tests/test_chimera.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are your thoughts on adding tests for these new algorithm versions (beyond testing the command string)?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think there's a trivial test that shows both working.

I don't have an example in which these methods differ. Would you like me to try to find one?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's true that there is a test to which we pass "uchime3" as the method; however, we technically can't be sure that the underlying software is actually running this method unless the test checks behavior that differs between the methods.

I understand if it's too difficult to contrive input data that shows different expected behavior for the different algorithm methods, but if it is reasonably easy to do so, it would be best.

Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def test_uchime_denovo(self):

obs_chime = _read_seqs(chime)
exp_chime = [self.input_sequences_list[3]]
# >feature4 is the chimera!
self.assertEqual(obs_chime, exp_chime)

# sequences are reverse-sorted by abundance in output
Expand Down Expand Up @@ -105,8 +106,10 @@ def test_uchime_denovo_no_chimeras_alt_params(self):
with redirected_stdio(stderr=os.devnull):
cmd, chime, nonchime, stats = _uchime_denovo(
sequences=self.input_sequences, table=self.input_table,
method='uchime3',
dn=42.42, mindiffs=4, mindiv=0.5, minh=0.42, xn=9.0)
cmd = ' '.join(cmd)
self.assertTrue('--uchime3_denovo' in cmd)
self.assertTrue('--dn 42.42' in cmd)
self.assertTrue('--mindiffs 4' in cmd)
self.assertTrue('--mindiv 0.5' in cmd)
Expand Down
Loading