From fea3a73c4c9e239ac9daaf2ffb72915c1b65104d Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Fri, 16 Dec 2022 13:49:34 -0700
Subject: [PATCH 1/6] add float_datatype parameter to load and merge ops

---
 pycytominer/cyto_utils/cells.py               | 27 +++++++--
 .../tests/test_cyto_utils/test_cells.py       | 60 +++++++++++++++++++
 2 files changed, 81 insertions(+), 6 deletions(-)

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index 08029fe6..4921820e 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -429,13 +429,17 @@ def get_sql_table_col_names(self, table):
 
         return meta_cols, feat_cols
 
-    def load_compartment(self, compartment):
+    def load_compartment(self, compartment, float_datatype: type = np.float64):
         """Creates the compartment dataframe.
 
         Parameters
         ----------
         compartment : str
             The compartment to process.
+        float_datatype: type, default np.float64
+            Numpy floating point datatype to use for load_compartment and resulting
+            dataframes. Please note: using any besides np.float64 are experimentally
+            unverified.
 
         Returns
         -------
@@ -448,8 +452,8 @@ def load_compartment(self, compartment):
         meta_cols, feat_cols = self.get_sql_table_col_names(compartment)
         num_meta, num_feats = len(meta_cols), len(feat_cols)
 
-        # Use pre-allocated np.array for data
-        feats = np.empty(shape=(num_cells, num_feats), dtype=np.float64)
+        # Use pre-allocated np.array for feature data
+        feats = np.empty(shape=(num_cells, num_feats), dtype=float_datatype)
         # Use pre-allocated pd.DataFrame for metadata
         metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))
 
@@ -661,6 +665,7 @@ def merge_single_cells(
         single_cell_normalize: bool = False,
         normalize_args: Optional[Dict] = None,
         platemap: Optional[Union[str, pd.DataFrame]] = None,
+        float_datatype: type = np.float64,
         **kwargs,
     ):
         """Given the linking columns, merge single cell data. Normalization is also supported.
@@ -681,6 +686,10 @@ def merge_single_cells(
             Additional arguments passed as input to pycytominer.normalize().
         platemap: str or pd.DataFrame, default None
             optional platemap filepath str or pd.DataFrame to be used with results via annotate
+        float_datatype: type, default np.float64
+            Numpy floating point datatype to use for load_compartment and resulting
+            dataframes. Please note: using any besides np.float64 are experimentally
+            unverified.
 
         Returns
         -------
@@ -714,7 +723,9 @@ def merge_single_cells(
                 ]
 
                 if isinstance(sc_df, str):
-                    sc_df = self.load_compartment(compartment=left_compartment)
+                    sc_df = self.load_compartment(
+                        compartment=left_compartment, float_datatype=float_datatype
+                    )
 
                     if compute_subsample:
                         # Sample cells proportionally by self.strata
@@ -729,7 +740,9 @@ def merge_single_cells(
                         ).reindex(sc_df.columns, axis="columns")
 
                     sc_df = sc_df.merge(
-                        self.load_compartment(compartment=right_compartment),
+                        self.load_compartment(
+                            compartment=right_compartment, float_datatype=float_datatype
+                        ),
                         left_on=self.merge_cols + [left_link_col],
                         right_on=self.merge_cols + [right_link_col],
                         suffixes=merge_suffix,
@@ -737,7 +750,9 @@ def merge_single_cells(
 
                 else:
                     sc_df = sc_df.merge(
-                        self.load_compartment(compartment=right_compartment),
+                        self.load_compartment(
+                            compartment=right_compartment, float_datatype=float_datatype
+                        ),
                         left_on=self.merge_cols + [left_link_col],
                         right_on=self.merge_cols + [right_link_col],
                         suffixes=merge_suffix,
diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index 295db959..fb80861c 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -3,6 +3,7 @@
 import random
 import tempfile
 
+import numpy as np
 import pandas as pd
 import pytest
 from pycytominer import aggregate, annotate, normalize
@@ -280,6 +281,23 @@ def test_load_compartment():
         check_dtype=False,
     )
 
+    # test using non-default float_datatype
+    loaded_compartment_df = AP.load_compartment(
+        compartment="cells", float_datatype=np.float32
+    )
+    pd.testing.assert_frame_equal(
+        loaded_compartment_df,
+        CELLS_DF.astype(
+            # cast any float type columns to float32 for expected comparison
+            {
+                colname: np.float32
+                for colname in CELLS_DF.columns
+                if pd.api.types.is_float(CELLS_DF[colname].dtype)
+            }
+        ).reindex(columns=loaded_compartment_df.columns),
+        check_dtype=False,
+    )
+
 
 def test_sc_count_sql_table():
     # Iterate over initialized compartments
@@ -416,6 +434,48 @@ def test_merge_single_cells():
         traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
     )
 
+    # use non-default float_datatype
+    sc_merged_df = AP.merge_single_cells(float_datatype=np.float32)
+
+    # ensure metadata have same types for comparisons
+    meta_types = {
+        colname: "int64"
+        for colname in [
+            "Metadata_ObjectNumber",
+            "Metadata_ObjectNumber_cells",
+            "Metadata_Cytoplasm_Parent_Nuclei",
+            "Metadata_Cytoplasm_Parent_Cells",
+            "Metadata_ObjectNumber_cytoplasm",
+            "Metadata_Site",
+        ]
+    }
+    # apply type changes as per meta_types
+    manual_merge = manual_merge.astype(meta_types)
+    sc_merged_df = sc_merged_df.astype(meta_types)
+
+    # similar to the assert above, we test non-default float dtype specification
+    pd.testing.assert_frame_equal(
+        left=manual_merge.astype(
+            # cast any float type columns to float32 for expected comparisons
+            {
+                colname: np.float32
+                for colname in manual_merge.columns
+                if pd.api.types.is_float(manual_merge[colname].dtype)
+                # note: pd.api.types.is_integer sometimes is unable to detect int64
+                or manual_merge[colname].dtype == "int64"
+                and colname not in meta_types.keys()
+            }
+        )
+        .sort_values(by=manual_merge.columns.tolist(), ascending=True)
+        .reset_index(drop=True),
+        # use manual_merge's column order for sc_merged_df
+        right=sc_merged_df[manual_merge.columns]
+        # use manual_merge's column order for sorting values
+        .sort_values(by=manual_merge.columns.tolist(), ascending=True).reset_index(
+            drop=True
+        ),
+    )
+
 
 def test_merge_single_cells_subsample():
 

From db55da4188c4b0601e1bf91125495c1f9504ff3f Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Mon, 19 Dec 2022 15:08:52 -0700
Subject: [PATCH 2/6] move to singlecells default_datatype_float attr

---
 pycytominer/cyto_utils/cells.py | 36 +++++++++++++++++----------------
 1 file changed, 19 insertions(+), 17 deletions(-)

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index 4921820e..888fe05e 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -69,6 +69,14 @@ class SingleCells(object):
         Name of the fields of view feature.
     object_feature : str, default "Metadata_ObjectNumber"
         Object number feature.
+    default_datatype_float : type, default np.float64
+        Numpy floating point datatype to use for load_compartment and resulting
+        dataframes. This parameter may be used to assist with performance-related
+        issues by reducing the memory required for floating-point data.
+        For example, using np.float32 instead of np.float64 for this parameter
+        will reduce memory consumed by float columns by roughly 50%.
+        Please note: using any datatype besides np.float64 is experimentally
+        unverified.
 
     Notes
     -----
@@ -105,6 +113,7 @@ def __init__(
         fields_of_view="all",
         fields_of_view_feature="Metadata_Site",
         object_feature="Metadata_ObjectNumber",
+        default_datatype_float=np.float64,
     ):
         """Constructor method"""
         # Check compartments specified
@@ -139,6 +148,7 @@ def __init__(
         self.compartment_linking_cols = compartment_linking_cols
         self.fields_of_view_feature = fields_of_view_feature
         self.object_feature = object_feature
+        self.default_datatype_float = default_datatype_float
 
         # Confirm that the compartments and linking cols are formatted properly
         assert_linking_cols_complete(
@@ -429,17 +439,16 @@ def get_sql_table_col_names(self, table):
 
         return meta_cols, feat_cols
 
-    def load_compartment(self, compartment, float_datatype: type = np.float64):
+    def load_compartment(self, compartment):
         """Creates the compartment dataframe.
 
+        Note: the pre-allocated feature array uses the floating point
+        datatype given by the default_datatype_float attribute.
+
         Parameters
         ----------
         compartment : str
             The compartment to process.
-        float_datatype: type, default np.float64
-            Numpy floating point datatype to use for load_compartment and resulting
-            dataframes. Please note: using any besides np.float64 are experimentally
-            unverified.
 
         Returns
         -------
@@ -453,7 +462,9 @@ def load_compartment(self, compartment, float_datatype: type = np.float64):
         num_meta, num_feats = len(meta_cols), len(feat_cols)
 
         # Use pre-allocated np.array for feature data
-        feats = np.empty(shape=(num_cells, num_feats), dtype=float_datatype)
+        feats = np.empty(
+            shape=(num_cells, num_feats), dtype=self.default_datatype_float
+        )
         # Use pre-allocated pd.DataFrame for metadata
         metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))
 
@@ -665,7 +676,6 @@ def merge_single_cells(
         single_cell_normalize: bool = False,
         normalize_args: Optional[Dict] = None,
         platemap: Optional[Union[str, pd.DataFrame]] = None,
-        float_datatype: type = np.float64,
         **kwargs,
     ):
         """Given the linking columns, merge single cell data. Normalization is also supported.
@@ -686,10 +696,6 @@ def merge_single_cells(
             Additional arguments passed as input to pycytominer.normalize().
         platemap: str or pd.DataFrame, default None
             optional platemap filepath str or pd.DataFrame to be used with results via annotate
-        float_datatype: type, default np.float64
-            Numpy floating point datatype to use for load_compartment and resulting
-            dataframes. Please note: using any besides np.float64 are experimentally
-            unverified.
 
         Returns
         -------
@@ -723,9 +729,7 @@ def merge_single_cells(
                 ]
 
                 if isinstance(sc_df, str):
-                    sc_df = self.load_compartment(
-                        compartment=left_compartment, float_datatype=float_datatype
-                    )
+                    sc_df = self.load_compartment(compartment=left_compartment)
 
                     if compute_subsample:
                         # Sample cells proportionally by self.strata
@@ -740,9 +744,7 @@ def merge_single_cells(
                         ).reindex(sc_df.columns, axis="columns")
 
                     sc_df = sc_df.merge(
-                        self.load_compartment(
-                            compartment=right_compartment, float_datatype=float_datatype
-                        ),
+                        self.load_compartment(compartment=right_compartment),
                         left_on=self.merge_cols + [left_link_col],
                         right_on=self.merge_cols + [right_link_col],
                         suffixes=merge_suffix,

From d55a0cccdc0c84124f2c258bd014162f6d0b80ee Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Mon, 19 Dec 2022 15:53:37 -0700
Subject: [PATCH 3/6] remove float_datatype

---
 pycytominer/cyto_utils/cells.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index 888fe05e..48e037d4 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -753,7 +753,7 @@ def merge_single_cells(
                 else:
                     sc_df = sc_df.merge(
                         self.load_compartment(
-                            compartment=right_compartment, float_datatype=float_datatype
+                            compartment=right_compartment
                         ),
                         left_on=self.merge_cols + [left_link_col],
                         right_on=self.merge_cols + [right_link_col],

From 028962277fae9f664c8970b1f6f98187db96e1b3 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Mon, 19 Dec 2022 15:53:59 -0700
Subject: [PATCH 4/6] update testing for load_compartment

---
 .../tests/test_cyto_utils/test_cells.py       | 99 ++++++++-----------
 1 file changed, 42 insertions(+), 57 deletions(-)

diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index fb80861c..7eb8fcd3 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -281,21 +281,40 @@ def test_load_compartment():
         check_dtype=False,
     )
 
-    # test using non-default float_datatype
-    loaded_compartment_df = AP.load_compartment(
-        compartment="cells", float_datatype=np.float32
-    )
+    # test load_compartment with non-default default_datatype_float
+    # create new SingleCells based on AP
+    float32_loaded_compartment_df = SingleCells(
+        sql_file=TMP_SQLITE_FILE, default_datatype_float=np.float32
+    ).load_compartment(compartment="cells")
+
+    # for uniformly handling metadata types for both dataframes
+    metadata_types = {"ObjectNumber": "int64"}
+
+    # create deep copy of CELLS_DF with manually re-typed float columns as float32
+    cells_df_for_compare = CELLS_DF.copy(deep=True).astype(
+        # cast any float type columns to float32 for expected comparison
+        {
+            colname: np.float32
+            for colname in CELLS_DF.columns
+            # check for only columns which are of float type
+            if pd.api.types.is_float(CELLS_DF[colname].dtype)
+            # check for columns which are of 'int64' type
+            # note: pd.api.types.is_integer sometimes is unable to detect int64
+            or CELLS_DF[colname].dtype == "int64"
+            # avoid recasting the metadata_types
+            and colname not in metadata_types.keys()
+        }
+        # use float32_loaded_compartment_df column order for comparison below
+    )[float32_loaded_compartment_df.columns]
+
+    # cast metadata types in the same way for comparisons
+    float32_loaded_compartment_df = float32_loaded_compartment_df.astype(metadata_types)
+    cells_df_for_compare = cells_df_for_compare.astype(metadata_types)
+
+    # perform comparison of dataframes
     pd.testing.assert_frame_equal(
-        loaded_compartment_df,
-        CELLS_DF.astype(
-            # cast any float type columns to float32 for expected comparison
-            {
-                colname: np.float32
-                for colname in CELLS_DF.columns
-                if pd.api.types.is_float(CELLS_DF[colname].dtype)
-            }
-        ).reindex(columns=loaded_compartment_df.columns),
-        check_dtype=False,
+        float32_loaded_compartment_df,
+        cells_df_for_compare,
     )
 
 
@@ -434,49 +453,6 @@ def test_merge_single_cells():
         traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
     )
 
-    # use non-default float_datatype
-    sc_merged_df = AP.merge_single_cells(float_datatype=np.float32)
-
-    # ensure metadata have same types for comparisons
-    meta_types = {
-        colname: "int64"
-        for colname in [
-            "Metadata_ObjectNumber",
-            "Metadata_ObjectNumber_cells",
-            "Metadata_Cytoplasm_Parent_Nuclei",
-            "Metadata_Cytoplasm_Parent_Cells",
-            "Metadata_ObjectNumber_cytoplasm",
-            "Metadata_Site",
-        ]
-    }
-    # apply type changes as per meta_types
-    manual_merge = manual_merge.astype(meta_types)
-    sc_merged_df = sc_merged_df.astype(meta_types)
-
-    # similar to the assert above, we test non-default float dtype specification
-    pd.testing.assert_frame_equal(
-        left=manual_merge.astype(
-            # cast any float type columns to float32 for expected comparisons
-            {
-                colname: np.float32
-                for colname in manual_merge.columns
-                if pd.api.types.is_float(manual_merge[colname].dtype)
-                # note: pd.api.types.is_integer sometimes is unable to detect int64
-                or manual_merge[colname].dtype == "int64"
-                and colname not in meta_types.keys()
-            }
-        )
-        .sort_values(by=manual_merge.columns.tolist(), ascending=True)
-        .reset_index(drop=True),
-        # use manual_merge's column order for sc_merged_df
-        right=sc_merged_df[manual_merge.columns]
-        # use manual_merge's column order for sorting values
-        .sort_values(by=manual_merge.columns.tolist(), ascending=True).reset_index(
-            drop=True
-        ),
-    )
-
-
 def test_merge_single_cells_subsample():
 
     for subsample_frac in [0.1, 0.5, 0.9]:
@@ -1083,3 +1059,12 @@ def test_load_non_canonical_image_table():
         result.sort_index(axis="columns").drop("Metadata_Site_Count", axis="columns"),
         sc_aggregated_df,
     )
+
+def test_singlecells_default_datatype():
+    """
+    Testing various use of SingleCells class attribute
+    default_datatype_float with non-default options.
+    """
+
+
+

From 15128cb67047bfd3991242efe51aad79e3df538c Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 20 Dec 2022 07:45:36 -0700
Subject: [PATCH 5/6] remove empty test block

---
 pycytominer/tests/test_cyto_utils/test_cells.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index e1db1cca..6bddb800 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -1080,11 +1080,3 @@ def test_load_non_canonical_image_table():
         sc_aggregated_df,
     )
 
-def test_singlecells_default_datatype():
-    """
-    Testing various use of SingleCells class attribute
-    default_datatype_float with non-default options.
-    """
-
-
-

From 6801c2975b191a7395d09ea7682367122a8e35a6 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 20 Dec 2022 08:07:17 -0700
Subject: [PATCH 6/6] improve readability for astype dictionary in test

---
 .../tests/test_cyto_utils/test_cells.py       | 33 ++++++++++---------
 1 file changed, 18 insertions(+), 15 deletions(-)

diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index 6bddb800..f6461167 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -303,22 +303,25 @@ def test_load_compartment():
     # for uniformly handling metadata types for both dataframes
     metadata_types = {"ObjectNumber": "int64"}
 
+    # updated column datatypes for manual comparisons with CELLS_DF
+    cells_df_comparison_types = {
+        colname: np.float32
+        for colname in CELLS_DF.columns
+        # avoid recasting the metadata_types
+        if colname not in metadata_types
+        and (
+            # dtype-level checks: pd.api.types.is_float only tests scalars
+            pd.api.types.is_float_dtype(CELLS_DF[colname].dtype)
+            or CELLS_DF[colname].dtype == "int64"
+        )
+    }
+
     # create deep copy of CELLS_DF with manually re-typed float columns as float32
-    cells_df_for_compare = CELLS_DF.copy(deep=True).astype(
-        # cast any float type columns to float32 for expected comparison
-        {
-            colname: np.float32
-            for colname in CELLS_DF.columns
-            # check for only columns which are of float type
-            if pd.api.types.is_float(CELLS_DF[colname].dtype)
-            # check for columns which are of 'int64' type
-            # note: pd.api.types.is_integer sometimes is unable to detect int64
-            or CELLS_DF[colname].dtype == "int64"
-            # avoid recasting the metadata_types
-            and colname not in metadata_types.keys()
-        }
-        # use float32_loaded_compartment_df column order for comparison below
-    )[float32_loaded_compartment_df.columns]
+    # and cast any float type columns to float32 for expected comparison
+    cells_df_for_compare = CELLS_DF.copy(deep=True).astype(cells_df_comparison_types)[
+        # use float32_loaded_compartment_df column order for comparison
+        float32_loaded_compartment_df.columns
+    ]
 
     # cast metadata types in the same way for comparisons
     float32_loaded_compartment_df = float32_loaded_compartment_df.astype(metadata_types)