Issue 937 (#967)

* TST: tests to support API expansion on Table.partition
* API: expand partition API, fixes #937
* DOC: mention of expansion to Table.partition
* lint
biocore · May 9, 2024 · cdb7816 · cdb7816
1 parent 047b5e7
commit cdb7816
Show file tree

Hide file tree

Showing 3 changed files with 108 additions and 7 deletions.
diff --git a/ChangeLog.md b/ChangeLog.md
@@ -6,6 +6,7 @@ biom 2.1.15-dev
 
 New features:
 
+* Expand API for `Table.partition` to allow for passing `dict` mappings from ids to groups and vice versa, removal of empty vectors, and ignoring `None` partitions. See issue [#937](https://github.com/biocore/biom-format/issues/937).
 * NumPy 2.0 support, see issue [#956](https://github.com/biocore/biom-format/issues/956)
 * The optimized subsample without replacement method is now exposed as `biom.subsample`. Note that this method operates inplace on SciPy `csr_matrix` and `csc_matrix` objects. See issue [#958](https://github.com/biocore/biom-format/issues/958)
 

diff --git a/biom/table.py b/biom/table.py
@@ -2399,16 +2399,24 @@ def filter(self, ids_to_keep, axis='sample', invert=False, inplace=True):
 
         return table
 
-    def partition(self, f, axis='sample'):
+    def partition(self, f, axis='sample', remove_empty=False,
+                  ignore_none=False):
         """Yields partitions
 
         Parameters
         ----------
-        f : function
+        f : function, dict
             `f` is given the ID and metadata of the vector and must return
-            what partition the vector is part of.
+            what partition the vector is part of. If `dict`, a mapping of
+            either ID -> group, or group -> [list, of, ID] must be provided.
         axis : {'sample', 'observation'}, optional
             The axis to iterate over
+        remove_empty : bool, optional
+            If `True`, remove empty vectors from a partition. Default is
+            `False`.
+        ignore_none : bool, optional
+            If `True`, ignore partitions with the label `None`. Default is
+            `False`.
 
         Returns
         -------
@@ -2449,11 +2457,39 @@ def partition(self, f, axis='sample'):
         O1  1.0
         O2  42.0
         """
+        # we are not checking for whether the IDs are or are not present as
+        # that introduces complexity of `strict`. Deferring that for now.
+        if isinstance(f, dict):
+            test = list(f.values())[0]
+
+            if isinstance(test, (list, tuple)):
+                # group -> [list, of, ids]
+                mapping = {}
+                for grp, ids in f.items():
+                    for id_ in ids:
+                        mapping[id_] = grp
+
+            elif isinstance(test, str):
+                # id_ -> grp
+                mapping = f
+
+            else:
+                raise ValueError(f"Unable to handle a type of `{type(test)}` "
+                                 "with mapping")
+
+            def part_f(i, m):
+                return mapping.get(i)
+        else:
+            part_f = f
+
         partitions = {}
         # conversion of vector types is not necessary, vectors are not
         # being passed to an arbitrary function
         for vals, id_, md in self.iter(dense=False, axis=axis):
-            part = f(id_, md)
+            part = part_f(id_, md)
+
+            if part is None:
+                continue
 
             # try to make it hashable...
             if not isinstance(part, Hashable):
@@ -2485,9 +2521,14 @@ def partition(self, f, axis='sample'):
                 samp_md = md[:] if md is not None else None
                 indices = {'sample_index': self._sample_index.copy()}
 
-            yield part, Table(data, obs_ids, samp_ids, obs_md, samp_md,
-                              self.table_id, type=self.type, validate=False,
-                              **indices)
+            tab = Table(data, obs_ids, samp_ids, obs_md, samp_md,
+                        self.table_id, type=self.type, validate=False,
+                        **indices)
+
+            if remove_empty:
+                tab.remove_empty(inplace=True)
+
+            yield part, tab
 
     def collapse(self, f, collapse_f=None, norm=True, min_group_size=1,
                  include_collapsed_metadata=True, one_to_many=False,

diff --git a/biom/tests/test_table.py b/biom/tests/test_table.py
@@ -4297,6 +4297,65 @@ def test_extract_data_from_tsv_badvalue_complaint(self):
         with self.assertRaisesRegex(TypeError, msg):
             Table._extract_data_from_tsv(tsv, dtype=int)
 
+    def test_partition_remove_empty(self):
+        t = Table(np.array([[0, 1, 2],
+                            [3, 0, 0],
+                            [4, 0, 0]]),
+                  ['O1', 'O2', 'O3'],
+                  ['S1', 'S2', 'S3'])
+        part_f = lambda i, m: i == 'S1'  # noqa
+        obs = dict(t.partition(part_f, remove_empty=True))
+        exp = {True: Table(np.array([[3, ], [4, ]]), ['O2', 'O3'], ['S1', ]),
+               False: Table(np.array([[1, 2]]), ['O1', ], ['S2', 'S3'])}
+        self.assertEqual(obs, exp)
+
+    def test_partition_ignore_none(self):
+        t = Table(np.array([[0, 1, 2],
+                            [3, 0, 0],
+                            [4, 0, 0]]),
+                  ['O1', 'O2', 'O3'],
+                  ['S1', 'S2', 'S3'])
+        part_f = lambda i, m: True if i == 'S1' else None  # noqa
+        obs = dict(t.partition(part_f, ignore_none=True))
+        exp = {True: Table(np.array([[0, ], [3, ], [4, ]]),
+                           ['O1', 'O2', 'O3'], ['S1', ])}
+        self.assertEqual(obs, exp)
+
+    def test_partition_dict_ids_to_groups(self):
+        t = Table(np.array([[0, 1, 2],
+                            [3, 0, 0],
+                            [4, 0, 0]]),
+                  ['O1', 'O2', 'O3'],
+                  ['S1', 'S2', 'S3'])
+        by_dict = {'S1': 'foo',
+                   'S2': 'bar',
+                   'S3': 'foo'}
+        exp = {'foo': Table(np.array([[0, 2], [3, 0], [4, 0]]),
+                            ['O1', 'O2', 'O3'],
+                            ['S1', 'S3']),
+               'bar': Table(np.array([[1, ], [0, ], [0, ]]),
+                            ['O1', 'O2', 'O3'],
+                            ['S2', ])}
+        obs = dict(t.partition(by_dict))
+        self.assertEqual(obs, exp)
+
+    def test_partition_dict_groups_to_ids(self):
+        t = Table(np.array([[0, 1, 2],
+                            [3, 0, 0],
+                            [4, 0, 0]]),
+                  ['O1', 'O2', 'O3'],
+                  ['S1', 'S2', 'S3'])
+        by_dict_group = {'foo': ['S1', 'S3'],
+                         'bar': ['S2', ]}
+        exp = {'foo': Table(np.array([[0, 2], [3, 0], [4, 0]]),
+                            ['O1', 'O2', 'O3'],
+                            ['S1', 'S3']),
+               'bar': Table(np.array([[1, ], [0, ], [0, ]]),
+                            ['O1', 'O2', 'O3'],
+                            ['S2', ])}
+        obs = dict(t.partition(by_dict_group))
+        self.assertEqual(obs, exp)
+
     def test_bin_samples_by_metadata(self):
         """Yield tables binned by sample metadata"""
         def f(id_, md):