Skip to content

Commit

Permalink
Issue 937 (#967)
Browse files Browse the repository at this point in the history
* TST: tests to support API expansion on Table.partition

* API: expand partition API, fixes #937

* DOC: mention of expansion to Table.partition

* lint
  • Loading branch information
wasade authored May 9, 2024
1 parent 047b5e7 commit cdb7816
Show file tree
Hide file tree
Showing 3 changed files with 108 additions and 7 deletions.
1 change: 1 addition & 0 deletions ChangeLog.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ biom 2.1.15-dev

New features:

* Expand API for `Table.partition` to allow for passing `dict` mappings from ids to groups and vice versa, removal of empty vectors, and ignoring `None` partitions. See issue [#937](https://github.com/biocore/biom-format/issues/937)
* NumPy 2.0 support, see issue [#956](https://github.com/biocore/biom-format/issues/956)
* The optimized subsample without replacement method is now exposed as `biom.subsample`. Note that this method operates inplace on SciPy `csr_matrix` and `csc_matrix` objects. See issue [#958](https://github.com/biocore/biom-format/issues/958)

Expand Down
55 changes: 48 additions & 7 deletions biom/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2399,16 +2399,24 @@ def filter(self, ids_to_keep, axis='sample', invert=False, inplace=True):

return table

def partition(self, f, axis='sample'):
def partition(self, f, axis='sample', remove_empty=False,
ignore_none=False):
"""Yields partitions
Parameters
----------
f : function
f : function, dict
`f` is given the ID and metadata of the vector and must return
what partition the vector is part of.
what partition the vector is part of. If `dict`, a mapping of
either ID -> group, or group -> [list, of, ID] must be provided.
axis : {'sample', 'observation'}, optional
The axis to iterate over
remove_empty : bool, optional
If `True`, remove empty vectors from a partition. Default is
`False`.
ignore_none : bool, optional
If `True`, ignore partitions with the label `None`. Default is
`False`.
Returns
-------
Expand Down Expand Up @@ -2449,11 +2457,39 @@ def partition(self, f, axis='sample'):
O1 1.0
O2 42.0
"""
# we are not checking for whether the IDs are or are not present as
# that introduces complexity of `strict`. Deferring that for now.
if isinstance(f, dict):
test = list(f.values())[0]

if isinstance(test, (list, tuple)):
# group -> [list, of, ids]
mapping = {}
for grp, ids in f.items():
for id_ in ids:
mapping[id_] = grp

elif isinstance(test, str):
# id_ -> grp
mapping = f

else:
raise ValueError(f"Unable to handle a type of `{type(test)}` "
"with mapping")

def part_f(i, m):
return mapping.get(i)
else:
part_f = f

partitions = {}
# conversion of vector types is not necessary, vectors are not
# being passed to an arbitrary function
for vals, id_, md in self.iter(dense=False, axis=axis):
part = f(id_, md)
part = part_f(id_, md)

if part is None:
continue

# try to make it hashable...
if not isinstance(part, Hashable):
Expand Down Expand Up @@ -2485,9 +2521,14 @@ def partition(self, f, axis='sample'):
samp_md = md[:] if md is not None else None
indices = {'sample_index': self._sample_index.copy()}

yield part, Table(data, obs_ids, samp_ids, obs_md, samp_md,
self.table_id, type=self.type, validate=False,
**indices)
tab = Table(data, obs_ids, samp_ids, obs_md, samp_md,
self.table_id, type=self.type, validate=False,
**indices)

if remove_empty:
tab.remove_empty(inplace=True)

yield part, tab

def collapse(self, f, collapse_f=None, norm=True, min_group_size=1,
include_collapsed_metadata=True, one_to_many=False,
Expand Down
59 changes: 59 additions & 0 deletions biom/tests/test_table.py
Original file line number Diff line number Diff line change
Expand Up @@ -4297,6 +4297,65 @@ def test_extract_data_from_tsv_badvalue_complaint(self):
with self.assertRaisesRegex(TypeError, msg):
Table._extract_data_from_tsv(tsv, dtype=int)

def test_partition_remove_empty(self):
    """Partition with `remove_empty=True` and compare to trimmed tables"""
    counts = np.array([[0, 1, 2],
                       [3, 0, 0],
                       [4, 0, 0]])
    table = Table(counts, ['O1', 'O2', 'O3'], ['S1', 'S2', 'S3'])

    def is_s1(id_, md):
        # two partitions: True for S1, False for everything else
        return id_ == 'S1'

    # each expected table lists only the observations with nonzero
    # counts in that partition
    expected = {
        True: Table(np.array([[3], [4]]), ['O2', 'O3'], ['S1']),
        False: Table(np.array([[1, 2]]), ['O1'], ['S2', 'S3']),
    }
    observed = dict(table.partition(is_s1, remove_empty=True))
    self.assertEqual(observed, expected)

def test_partition_ignore_none(self):
    """Partition with `ignore_none=True`; only the `True` group is expected"""
    counts = np.array([[0, 1, 2],
                       [3, 0, 0],
                       [4, 0, 0]])
    table = Table(counts, ['O1', 'O2', 'O3'], ['S1', 'S2', 'S3'])

    def s1_or_none(id_, md):
        # S1 goes to the True partition, every other sample maps to None
        if id_ == 'S1':
            return True
        return None

    expected = {
        True: Table(np.array([[0], [3], [4]]),
                    ['O1', 'O2', 'O3'], ['S1']),
    }
    observed = dict(table.partition(s1_or_none, ignore_none=True))
    self.assertEqual(observed, expected)

def test_partition_dict_ids_to_groups(self):
    """Partition using a `dict` that maps each sample ID to its group"""
    obs_ids = ['O1', 'O2', 'O3']
    table = Table(np.array([[0, 1, 2],
                            [3, 0, 0],
                            [4, 0, 0]]),
                  obs_ids,
                  ['S1', 'S2', 'S3'])

    # ID -> group
    sample_to_group = {'S1': 'foo', 'S2': 'bar', 'S3': 'foo'}

    foo = Table(np.array([[0, 2], [3, 0], [4, 0]]),
                ['O1', 'O2', 'O3'],
                ['S1', 'S3'])
    bar = Table(np.array([[1], [0], [0]]),
                ['O1', 'O2', 'O3'],
                ['S2'])
    expected = {'foo': foo, 'bar': bar}

    observed = dict(table.partition(sample_to_group))
    self.assertEqual(observed, expected)

def test_partition_dict_groups_to_ids(self):
    """Partition using a `dict` that maps each group to a list of IDs"""
    obs_ids = ['O1', 'O2', 'O3']
    table = Table(np.array([[0, 1, 2],
                            [3, 0, 0],
                            [4, 0, 0]]),
                  obs_ids,
                  ['S1', 'S2', 'S3'])

    # group -> [list, of, ID]
    group_to_samples = {'foo': ['S1', 'S3'], 'bar': ['S2']}

    foo = Table(np.array([[0, 2], [3, 0], [4, 0]]),
                ['O1', 'O2', 'O3'],
                ['S1', 'S3'])
    bar = Table(np.array([[1], [0], [0]]),
                ['O1', 'O2', 'O3'],
                ['S2'])
    expected = {'foo': foo, 'bar': bar}

    observed = dict(table.partition(group_to_samples))
    self.assertEqual(observed, expected)

def test_bin_samples_by_metadata(self):
"""Yield tables binned by sample metadata"""
def f(id_, md):
Expand Down

0 comments on commit cdb7816

Please sign in to comment.