Add float_datatype parameter for SingleCells compartment load and merge performance flexibility #248

Merged: 7 commits, Dec 22, 2022
27 changes: 21 additions & 6 deletions pycytominer/cyto_utils/cells.py
@@ -429,13 +429,17 @@ def get_sql_table_col_names(self, table):

return meta_cols, feat_cols

def load_compartment(self, compartment):
def load_compartment(self, compartment, float_datatype: type = np.float64):
"""Creates the compartment dataframe.

Parameters
----------
compartment : str
The compartment to process.
float_datatype: type, default np.float64
NumPy floating point datatype to use for load_compartment and the resulting
dataframes. Please note: datatypes other than np.float64 are experimentally
unverified.

Returns
-------
@@ -448,8 +452,8 @@ def load_compartment(self, compartment):
meta_cols, feat_cols = self.get_sql_table_col_names(compartment)
num_meta, num_feats = len(meta_cols), len(feat_cols)

# Use pre-allocated np.array for data
feats = np.empty(shape=(num_cells, num_feats), dtype=np.float64)
# Use pre-allocated np.array for feature data
feats = np.empty(shape=(num_cells, num_feats), dtype=float_datatype)
# Use pre-allocated pd.DataFrame for metadata
metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))
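
As a quick illustration of the new parameter (a minimal sketch, not taken from the PR; the connection string is a placeholder and assumes a `SingleCells` object built from a CellProfiler-style SQLite export):

```python
import numpy as np
from pycytominer.cyto_utils.cells import SingleCells

# placeholder connection string; point this at a real single-cell SQLite output
sc = SingleCells("sqlite:///example_single_cells.sqlite")

# default behavior is unchanged: feature columns are loaded as float64
cells_f64 = sc.load_compartment(compartment="cells")

# opt into float32 to roughly halve the memory of the pre-allocated feature array;
# per the docstring, dtypes other than np.float64 are experimentally unverified
cells_f32 = sc.load_compartment(compartment="cells", float_datatype=np.float32)
```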

@@ -661,6 +665,7 @@ def merge_single_cells(
single_cell_normalize: bool = False,
normalize_args: Optional[Dict] = None,
platemap: Optional[Union[str, pd.DataFrame]] = None,
float_datatype: type = np.float64,
**kwargs,
):
"""Given the linking columns, merge single cell data. Normalization is also supported.
@@ -681,6 +686,10 @@ def merge_single_cells(
Additional arguments passed as input to pycytominer.normalize().
platemap: str or pd.DataFrame, default None
optional platemap filepath str or pd.DataFrame to be used with results via annotate
float_datatype: type, default np.float64
NumPy floating point datatype to use for load_compartment and the resulting
dataframes. Please note: datatypes other than np.float64 are experimentally
unverified.

Returns
-------
@@ -714,7 +723,9 @@ def merge_single_cells(
]

if isinstance(sc_df, str):
sc_df = self.load_compartment(compartment=left_compartment)
sc_df = self.load_compartment(
compartment=left_compartment, float_datatype=float_datatype
)

if compute_subsample:
# Sample cells proportionally by self.strata
@@ -729,15 +740,19 @@ def merge_single_cells(
).reindex(sc_df.columns, axis="columns")

sc_df = sc_df.merge(
self.load_compartment(compartment=right_compartment),
self.load_compartment(
compartment=right_compartment, float_datatype=float_datatype
),
left_on=self.merge_cols + [left_link_col],
right_on=self.merge_cols + [right_link_col],
suffixes=merge_suffix,
)

else:
sc_df = sc_df.merge(
self.load_compartment(compartment=right_compartment),
self.load_compartment(
compartment=right_compartment, float_datatype=float_datatype
),
left_on=self.merge_cols + [left_link_col],
right_on=self.merge_cols + [right_link_col],
suffixes=merge_suffix,
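
From the caller's side, the new argument is simply forwarded through `merge_single_cells` (again a hedged sketch with a placeholder connection string, not code from the PR):

```python
import numpy as np
from pycytominer.cyto_utils.cells import SingleCells

# placeholder connection string; point this at a real single-cell SQLite output
sc = SingleCells("sqlite:///example_single_cells.sqlite")

# float_datatype is forwarded to every load_compartment() call made during the
# merge, so all compartment feature columns are loaded with the requested dtype
merged_default = sc.merge_single_cells()                       # float64 features
merged_f32 = sc.merge_single_cells(float_datatype=np.float32)  # float32 features
```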
60 changes: 60 additions & 0 deletions pycytominer/tests/test_cyto_utils/test_cells.py
@@ -3,6 +3,7 @@
import random
import tempfile

import numpy as np
import pandas as pd
import pytest
from pycytominer import aggregate, annotate, normalize
@@ -280,6 +281,23 @@ def test_load_compartment():
check_dtype=False,
)

# test using non-default float_datatype
loaded_compartment_df = AP.load_compartment(
compartment="cells", float_datatype=np.float32
)
pd.testing.assert_frame_equal(
loaded_compartment_df,
CELLS_DF.astype(
# cast any float type columns to float32 for expected comparison
{
colname: np.float32
for colname in CELLS_DF.columns
if pd.api.types.is_float(CELLS_DF[colname].dtype)
}
).reindex(columns=loaded_compartment_df.columns),
check_dtype=False,
)


def test_sc_count_sql_table():
# Iterate over initialized compartments
@@ -416,6 +434,48 @@ def test_merge_single_cells():
traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
)

# use non-default float_datatype
sc_merged_df = AP.merge_single_cells(float_datatype=np.float32)

# ensure metadata have same types for comparisons
meta_types = {
colname: "int64"
for colname in [
"Metadata_ObjectNumber",
"Metadata_ObjectNumber_cells",
"Metadata_Cytoplasm_Parent_Nuclei",
"Metadata_Cytoplasm_Parent_Cells",
"Metadata_ObjectNumber_cytoplasm",
"Metadata_Site",
]
}
# apply type changes as per meta_types
manual_merge = manual_merge.astype(meta_types)
sc_merged_df = sc_merged_df.astype(meta_types)

# similar to the assert above, we test non-default float dtype specification
pd.testing.assert_frame_equal(
left=manual_merge.astype(
# cast any float type columns to float32 for expected comparisons
{
colname: np.float32
for colname in manual_merge.columns
if pd.api.types.is_float(manual_merge[colname].dtype)
# note: pd.api.types.is_integer sometimes is unable to detect int64
or manual_merge[colname].dtype == "int64"
and colname not in meta_types.keys()
}
)
.sort_values(by=manual_merge.columns.tolist(), ascending=True)
.reset_index(drop=True),
# use manual_merge's column order for sc_merged_df
right=sc_merged_df[manual_merge.columns]
# use manual_merge's column order for sorting values
.sort_values(by=manual_merge.columns.tolist(), ascending=True).reset_index(
drop=True
),
)
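
A brief aside on the dtype-detection note in the comprehension above (this snippet is illustrative only and not part of the PR): pandas' scalar checkers do not recognize dtype objects, which is why the test also compares the dtype string directly.

```python
import pandas as pd

int_col = pd.Series([1, 2, 3], dtype="int64")
float_col = pd.Series([1.0, 2.0], dtype="float64")

# the scalar checkers inspect values, not dtype objects, so they return False here
pd.api.types.is_integer(int_col.dtype)        # False
pd.api.types.is_float(float_col.dtype)        # False

# the *_dtype variants are the dtype-aware checks
pd.api.types.is_integer_dtype(int_col.dtype)  # True
pd.api.types.is_float_dtype(float_col.dtype)  # True
```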


def test_merge_single_cells_subsample():
