diff --git a/ChangeLog.md b/ChangeLog.md index 61e5d063..f3aacec9 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -6,6 +6,7 @@ biom 2.1.15-dev New features: +* Expand API for `Table.partition` to allow for passing `dict` mappings from ids to groups and vice versa, removal of empty vectors, and ignoring `None` partitions. See issue [#937](https://github.com/biocore/biom-format/issues/937) * NumPy 2.0 support, see issue [#956](https://github.com/biocore/biom-format/issues/956) * The optimized subsample without replacement method is now exposed as `biom.subsample`. Note that this method operates inplace on SciPy `csr_matrix` and `csc_matrix` objects. See issue [#958](https://github.com/biocore/biom-format/issues/958) diff --git a/biom/table.py b/biom/table.py index 58d20db0..16cebc50 100644 --- a/biom/table.py +++ b/biom/table.py @@ -2399,16 +2399,24 @@ def filter(self, ids_to_keep, axis='sample', invert=False, inplace=True): return table - def partition(self, f, axis='sample'): + def partition(self, f, axis='sample', remove_empty=False, + ignore_none=False): """Yields partitions Parameters ---------- - f : function + f : function, dict `f` is given the ID and metadata of the vector and must return - what partition the vector is part of. + what partition the vector is part of. If `dict`, a mapping of + either ID -> group, or group -> [list, of, ID] must be provided. axis : {'sample', 'observation'}, optional The axis to iterate over + remove_empty : bool, optional + If `True`, remove empty vectors from a partition. Default is + `False`. + ignore_none : bool, optional + If `True`, ignore partitions with the label `None`. Default is + `False`. Returns ------- @@ -2449,11 +2457,39 @@ def partition(self, f, axis='sample'): O1 1.0 O2 42.0 """ + # we are not checking for whether the IDs are or are not present as + # that introduces complexity of `strict`. Deferring that for now. 
+ if isinstance(f, dict): + test = list(f.values())[0] + + if isinstance(test, (list, tuple)): + # group -> [list, of, ids] + mapping = {} + for grp, ids in f.items(): + for id_ in ids: + mapping[id_] = grp + + elif isinstance(test, str): + # id_ -> grp + mapping = f + + else: + raise ValueError(f"Unable to handle a type of `{type(test)}` " + "with mapping") + + def part_f(i, m): + return mapping.get(i) + else: + part_f = f + partitions = {} # conversion of vector types is not necessary, vectors are not # being passed to an arbitrary function for vals, id_, md in self.iter(dense=False, axis=axis): - part = f(id_, md) + part = part_f(id_, md) + + if ignore_none and part is None: + continue # try to make it hashable... if not isinstance(part, Hashable): @@ -2485,9 +2521,14 @@ def partition(self, f, axis='sample'): samp_md = md[:] if md is not None else None indices = {'sample_index': self._sample_index.copy()} - yield part, Table(data, obs_ids, samp_ids, obs_md, samp_md, - self.table_id, type=self.type, validate=False, - **indices) + tab = Table(data, obs_ids, samp_ids, obs_md, samp_md, + self.table_id, type=self.type, validate=False, + **indices) + + if remove_empty: + tab.remove_empty(inplace=True) + + yield part, tab def collapse(self, f, collapse_f=None, norm=True, min_group_size=1, include_collapsed_metadata=True, one_to_many=False, diff --git a/biom/tests/test_table.py b/biom/tests/test_table.py index 52c53fc0..6011f394 100644 --- a/biom/tests/test_table.py +++ b/biom/tests/test_table.py @@ -4297,6 +4297,65 @@ def test_extract_data_from_tsv_badvalue_complaint(self): with self.assertRaisesRegex(TypeError, msg): Table._extract_data_from_tsv(tsv, dtype=int) + def test_partition_remove_empty(self): + t = Table(np.array([[0, 1, 2], + [3, 0, 0], + [4, 0, 0]]), + ['O1', 'O2', 'O3'], + ['S1', 'S2', 'S3']) + part_f = lambda i, m: i == 'S1' # noqa + obs = dict(t.partition(part_f, remove_empty=True)) + exp = {True: Table(np.array([[3, ], [4, ]]), ['O2', 'O3'], ['S1', ]), + False: 
Table(np.array([[1, 2]]), ['O1', ], ['S2', 'S3'])} + self.assertEqual(obs, exp) + + def test_partition_ignore_none(self): + t = Table(np.array([[0, 1, 2], + [3, 0, 0], + [4, 0, 0]]), + ['O1', 'O2', 'O3'], + ['S1', 'S2', 'S3']) + part_f = lambda i, m: True if i == 'S1' else None # noqa + obs = dict(t.partition(part_f, ignore_none=True)) + exp = {True: Table(np.array([[0, ], [3, ], [4, ]]), + ['O1', 'O2', 'O3'], ['S1', ])} + self.assertEqual(obs, exp) + + def test_partition_dict_ids_to_groups(self): + t = Table(np.array([[0, 1, 2], + [3, 0, 0], + [4, 0, 0]]), + ['O1', 'O2', 'O3'], + ['S1', 'S2', 'S3']) + by_dict = {'S1': 'foo', + 'S2': 'bar', + 'S3': 'foo'} + exp = {'foo': Table(np.array([[0, 2], [3, 0], [4, 0]]), + ['O1', 'O2', 'O3'], + ['S1', 'S3']), + 'bar': Table(np.array([[1, ], [0, ], [0, ]]), + ['O1', 'O2', 'O3'], + ['S2', ])} + obs = dict(t.partition(by_dict)) + self.assertEqual(obs, exp) + + def test_partition_dict_groups_to_ids(self): + t = Table(np.array([[0, 1, 2], + [3, 0, 0], + [4, 0, 0]]), + ['O1', 'O2', 'O3'], + ['S1', 'S2', 'S3']) + by_dict_group = {'foo': ['S1', 'S3'], + 'bar': ['S2', ]} + exp = {'foo': Table(np.array([[0, 2], [3, 0], [4, 0]]), + ['O1', 'O2', 'O3'], + ['S1', 'S3']), + 'bar': Table(np.array([[1, ], [0, ], [0, ]]), + ['O1', 'O2', 'O3'], + ['S2', ])} + obs = dict(t.partition(by_dict_group)) + self.assertEqual(obs, exp) + def test_bin_samples_by_metadata(self): """Yield tables binned by sample metadata""" def f(id_, md):