From 422f3a4aac98a60df6f5cd8f569f0b9f19da9d78 Mon Sep 17 00:00:00 2001
From: Dave Bunten <ekgto445@gmail.com>
Date: Mon, 3 Oct 2022 20:46:28 +0000
Subject: [PATCH 1/6] reduce memory use by merge_single_cells

---
 pycytominer/cyto_utils/cells.py               | 62 +++++++++++----
 .../tests/test_cyto_utils/test_cells.py       | 78 +++++++++++++------
 2 files changed, 98 insertions(+), 42 deletions(-)

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index 969f44e1..6a212282 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -652,6 +652,7 @@ def merge_single_cells(
         single_cell_normalize: bool = False,
         normalize_args: Optional[Dict] = None,
         platemap: Optional[Union[str, pd.DataFrame]] = None,
+        chunksize: Optional[int] = None,
         **kwargs,
     ):
         """Given the linking columns, merge single cell data. Normalization is also supported.
@@ -672,6 +673,10 @@ def merge_single_cells(
             Additional arguments passed as input to pycytominer.normalize().
         platemap: str or pd.DataFrame, default None
             optional platemap filepath str or pd.DataFrame to be used with results via annotate
+        chunksize: int, default None
+            chunksize for merge and concatenation operations to help address performance issues
+            note: if set to None, will infer a chunksize which is the roughly 1/3 the row length
+            of first component df.
 
         Returns
         -------
@@ -681,7 +686,7 @@ def merge_single_cells(
         """
 
         # Load the single cell dataframe by merging on the specific linking columns
-        sc_df = ""
+        sc_df = pd.DataFrame()
         linking_check_cols = []
         merge_suffix_rename = []
         for left_compartment in self.compartment_linking_cols:
@@ -704,9 +709,13 @@ def merge_single_cells(
                     left_compartment
                 ]
 
-                if isinstance(sc_df, str):
+                if sc_df.empty:
                     sc_df = self.load_compartment(compartment=left_compartment)
 
+                    # if chunksize was not set,
+                    if chunksize is None:
+                        chunksize = round(len(sc_df) / 3)
+
                     if compute_subsample:
                         # Sample cells proportionally by self.strata
                         self.get_subsample(df=sc_df, rename_col=False)
@@ -719,20 +728,25 @@ def merge_single_cells(
                             sc_df, how="left", on=subset_logic_df.columns.tolist()
                         ).reindex(sc_df.columns, axis="columns")
 
-                    sc_df = sc_df.merge(
-                        self.load_compartment(compartment=right_compartment),
-                        left_on=self.merge_cols + [left_link_col],
-                        right_on=self.merge_cols + [right_link_col],
-                        suffixes=merge_suffix,
-                    )
-
-                else:
-                    sc_df = sc_df.merge(
-                        self.load_compartment(compartment=right_compartment),
-                        left_on=self.merge_cols + [left_link_col],
-                        right_on=self.merge_cols + [right_link_col],
-                        suffixes=merge_suffix,
-                    )
+                # perform a segmented merge using pd.concat and
+                # chunksize to help constrain memory
+                sc_df = pd.concat(
+                    [
+                        self.load_compartment(compartment=right_compartment).merge(
+                            right=right,
+                            # note: we reverse left and right for join key merge order reference
+                            left_on=self.merge_cols + [right_link_col],
+                            right_on=self.merge_cols + [left_link_col],
+                            # note: we reverse left and right for join keys
+                            suffixes=reversed(merge_suffix),
+                            how="inner",
+                        )
+                        for right in [
+                            sc_df[i : i + chunksize]
+                            for i in range(0, sc_df.shape[0], chunksize)
+                        ]
+                    ]
+                )
 
                 linking_check_cols.append(linking_check)
 
@@ -759,8 +773,18 @@ def merge_single_cells(
             self.load_image()
             self.load_image_data = True
 
+        # perform a segmented merge using pd.concat and
+        # chunksize to help constrain memory
         sc_df = (
-            self.image_df.merge(sc_df, on=self.merge_cols, how="right")
+            pd.concat(
+                [
+                    self.image_df.merge(right=right, on=self.merge_cols, how="right")
+                    for right in [
+                        sc_df[i : i + chunksize]
+                        for i in range(0, sc_df.shape[0], chunksize)
+                    ]
+                ]
+            )
             # pandas rename performance may be improved using copy=False, inplace=False
             # reference: https://ryanlstevens.github.io/2022-05-06-pandasColumnRenaming/
             .rename(
@@ -769,6 +793,10 @@ def merge_single_cells(
                 self.full_merge_suffix_rename, axis="columns", copy=False, inplace=False
             )
         )
+
+        # reset the repeated index left by the chunked concat merges above (inplace to conserve memory)
+        sc_df.reset_index(inplace=True, drop=True)
+
         if single_cell_normalize:
             # Infering features is tricky with non-canonical data
             if normalize_args is None:
diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index db347d7e..06943ed5 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -273,6 +273,10 @@ def test_get_sql_table_col_names():
 
 
 def test_merge_single_cells():
+    """
+    Testing various SingleCells.merge_single_cells functionality
+    """
+
     sc_merged_df = AP.merge_single_cells()
 
     # Assert that the image data was merged
@@ -300,21 +304,20 @@ def test_merge_single_cells():
     )
 
     # Confirm that the merge correctly reversed the object number (opposite from Parent)
-    assert (
-        sc_merged_df.Metadata_ObjectNumber_cytoplasm.tolist()[::-1]
-        == sc_merged_df.Metadata_ObjectNumber.tolist()
-    )
-    assert (
-        manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1]
-        == sc_merged_df.Metadata_ObjectNumber.tolist()
-    )
-    assert (
-        manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1]
-        == sc_merged_df.Metadata_ObjectNumber.tolist()
-    )
-    assert (
-        manual_merge.Metadata_ObjectNumber_cells.tolist()
-        == sc_merged_df.Metadata_ObjectNumber.tolist()
+    assert_cols = [
+        "Metadata_ObjectNumber",
+        "Metadata_ObjectNumber_cytoplasm",
+        "Metadata_ObjectNumber_cells",
+    ]
+    # check that we have the same data using same cols, sort and a reset index
+    pd.testing.assert_frame_equal(
+        left=manual_merge[assert_cols]
+        .sort_values(by=assert_cols, ascending=True)
+        .reset_index(drop=True),
+        right=sc_merged_df[assert_cols]
+        .sort_values(by=assert_cols, ascending=True)
+        .reset_index(drop=True),
+        check_dtype=False,
     )
 
     # Confirm the merge and adding merge options
@@ -335,9 +338,14 @@ def test_merge_single_cells():
                     manual_merge, method=method, samples=samples, features=features
                 )
 
+                # compare data using identical column order, sorting, and reset index
                 pd.testing.assert_frame_equal(
-                    norm_method_df.sort_index(axis=1),
-                    manual_merge_normalize.sort_index(axis=1),
+                    norm_method_df[norm_method_df.columns]
+                    .sort_values(by="Cells_a")
+                    .reset_index(drop=True),
+                    manual_merge_normalize[norm_method_df.columns]
+                    .sort_values(by="Cells_a")
+                    .reset_index(drop=True),
                     check_dtype=False,
                 )
 
@@ -345,9 +353,26 @@ def test_merge_single_cells():
     new_sc_merge_df = AP_NEW.merge_single_cells()
 
     assert sum(new_sc_merge_df.columns.str.startswith("New")) == 4
-    assert (
-        NEW_COMPARTMENT_DF.ObjectNumber.tolist()[::-1]
-        == new_sc_merge_df.Metadata_ObjectNumber_new.tolist()
+
+    assert_cols = [
+        "New_a",
+        "New_b",
+        "New_c",
+        "New_d",
+        "Metadata_ObjectNumber_new",
+    ]
+    # compare data using identical column order, sorting, and reset index
+    # note: we rename NEW_COMPARTMENT_DF to match new_sc_merge_df's ObjectNumber colname
+    pd.testing.assert_frame_equal(
+        left=NEW_COMPARTMENT_DF.rename(
+            columns={"ObjectNumber": "Metadata_ObjectNumber_new"}
+        )[assert_cols]
+        .sort_values(by=assert_cols)
+        .reset_index(drop=True),
+        right=new_sc_merge_df[assert_cols]
+        .sort_values(by=assert_cols)
+        .reset_index(drop=True),
+        check_dtype=False,
     )
 
     norm_new_method_df = AP_NEW.merge_single_cells(
@@ -471,7 +496,6 @@ def test_merge_single_cells_cytominer_database_test_file():
         f"{os.path.dirname(__file__)}/../test_data/cytominer_database_example_data/test_SQ00014613.parquet",
     )
     sql_url = f"sqlite:///{sql_path}"
-    print(sql_url)
 
     # build SingleCells from database
     sc_p = SingleCells(
@@ -493,8 +517,8 @@ def test_merge_single_cells_cytominer_database_test_file():
     # note: pd.DataFrame datatypes sometimes appear automatically changed on-read, so we cast
     # the result_file dataframe using the base dataframe's types.
     pd.testing.assert_frame_equal(
-        pd.read_csv(csv_path).astype(merged_sc.dtypes.to_dict()),
-        pd.read_csv(result_file).astype(merged_sc.dtypes.to_dict()),
+        pd.read_csv(csv_path).astype(merged_sc.dtypes.to_dict())[merged_sc.columns],
+        pd.read_csv(result_file).astype(merged_sc.dtypes.to_dict())[merged_sc.columns],
     )
 
     # test parquet output from merge_single_cells
@@ -507,8 +531,12 @@ def test_merge_single_cells_cytominer_database_test_file():
     # note: pd.DataFrame datatypes sometimes appear automatically changed on-read, so we cast
     # the result_file dataframe using the base dataframe's types.
     pd.testing.assert_frame_equal(
-        pd.read_parquet(parquet_path).astype(merged_sc.dtypes.to_dict()),
-        pd.read_parquet(result_file).astype(merged_sc.dtypes.to_dict()),
+        pd.read_parquet(parquet_path).astype(merged_sc.dtypes.to_dict())[
+            merged_sc.columns
+        ],
+        pd.read_parquet(result_file).astype(merged_sc.dtypes.to_dict())[
+            merged_sc.columns
+        ],
     )
 
     # test parquet output from merge_single_cells with annotation meta

From e0062a4f0744308f972c4e5b0443ee6d5193ac97 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Thu, 6 Oct 2022 13:39:22 -0600
Subject: [PATCH 2/6] add comments for chunksize

---
 pycytominer/cyto_utils/cells.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index 6a212282..1f24da10 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -712,7 +712,8 @@ def merge_single_cells(
                 if sc_df.empty:
                     sc_df = self.load_compartment(compartment=left_compartment)
 
-                    # if chunksize was not set,
+                    # if chunksize was not set, set it to roughly
+                    # one third the size of our initial compartment
                     if chunksize is None:
                         chunksize = round(len(sc_df) / 3)
 

From cc68ec58b3a2c7fd2a4f33abd62ec0b4a09607db Mon Sep 17 00:00:00 2001
From: Dave Bunten <ekgto445@gmail.com>
Date: Fri, 7 Oct 2022 12:53:06 -0600
Subject: [PATCH 3/6] Apply renaming suggestions from code review

Co-authored-by: Gregory Way <gregory.way@gmail.com>
---
 pycytominer/cyto_utils/cells.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index 1f24da10..6d3d2460 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -652,7 +652,7 @@ def merge_single_cells(
         single_cell_normalize: bool = False,
         normalize_args: Optional[Dict] = None,
         platemap: Optional[Union[str, pd.DataFrame]] = None,
-        chunksize: Optional[int] = None,
+        sc_merge_chunksize: Optional[int] = None,
         **kwargs,
     ):
         """Given the linking columns, merge single cell data. Normalization is also supported.
@@ -734,7 +734,7 @@ def merge_single_cells(
                 sc_df = pd.concat(
                     [
                         self.load_compartment(compartment=right_compartment).merge(
-                            right=right,
+                            right=right_chunk,
                             # note: we reverse left and right for join key merge order reference
                             left_on=self.merge_cols + [right_link_col],
                             right_on=self.merge_cols + [left_link_col],
@@ -742,7 +742,7 @@ def merge_single_cells(
                             suffixes=reversed(merge_suffix),
                             how="inner",
                         )
-                        for right in [
+                        for right_chunk in [
                             sc_df[i : i + chunksize]
                             for i in range(0, sc_df.shape[0], chunksize)
                         ]

From 65d2a4206acf180e9abb14da99c0fe4c070f1ddb Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Fri, 7 Oct 2022 13:43:51 -0600
Subject: [PATCH 4/6] rename chunksize and chunk vars; test spacing

---
 pycytominer/cyto_utils/cells.py               | 26 ++++++++++---------
 .../tests/test_cyto_utils/test_cells.py       |  1 +
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index 6d3d2460..ce937fb8 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -673,8 +673,8 @@ def merge_single_cells(
             Additional arguments passed as input to pycytominer.normalize().
         platemap: str or pd.DataFrame, default None
             optional platemap filepath str or pd.DataFrame to be used with results via annotate
-        chunksize: int, default None
-            chunksize for merge and concatenation operations to help address performance issues
+        sc_merge_chunksize: int, default None
+            Chunksize for merge and concatenation operations to help address performance issues
             note: if set to None, will infer a chunksize which is the roughly 1/3 the row length
             of first component df.
 
@@ -714,8 +714,8 @@ def merge_single_cells(
 
                     # if chunksize was not set, set it to roughly
                     # one third the size of our initial compartment
-                    if chunksize is None:
-                        chunksize = round(len(sc_df) / 3)
+                    if sc_merge_chunksize is None:
+                        sc_merge_chunksize = round(len(sc_df) / 3)
 
                     if compute_subsample:
                         # Sample cells proportionally by self.strata
@@ -730,7 +730,7 @@ def merge_single_cells(
                         ).reindex(sc_df.columns, axis="columns")
 
                 # perform a segmented merge using pd.concat and
-                # chunksize to help constrain memory
+                # sc_merge_chunksize to help constrain memory
                 sc_df = pd.concat(
                     [
                         self.load_compartment(compartment=right_compartment).merge(
@@ -743,8 +743,8 @@ def merge_single_cells(
                             how="inner",
                         )
                         for right_chunk in [
-                            sc_df[i : i + chunksize]
-                            for i in range(0, sc_df.shape[0], chunksize)
+                            sc_df[i : i + sc_merge_chunksize]
+                            for i in range(0, sc_df.shape[0], sc_merge_chunksize)
                         ]
                     ]
                 )
@@ -775,14 +775,16 @@ def merge_single_cells(
             self.load_image_data = True
 
         # perform a segmented merge using pd.concat and
-        # chunksize to help constrain memory
+        # sc_merge_chunksize to help constrain memory
         sc_df = (
             pd.concat(
                 [
-                    self.image_df.merge(right=right, on=self.merge_cols, how="right")
-                    for right in [
-                        sc_df[i : i + chunksize]
-                        for i in range(0, sc_df.shape[0], chunksize)
+                    self.image_df.merge(
+                        right=right_chunk, on=self.merge_cols, how="right"
+                    )
+                    for right_chunk in [
+                        sc_df[i : i + sc_merge_chunksize]
+                        for i in range(0, sc_df.shape[0], sc_merge_chunksize)
                     ]
                 ]
             )
diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index 06943ed5..901a7d00 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -309,6 +309,7 @@ def test_merge_single_cells():
         "Metadata_ObjectNumber_cytoplasm",
         "Metadata_ObjectNumber_cells",
     ]
+
     # check that we have the same data using same cols, sort and a reset index
     pd.testing.assert_frame_equal(
         left=manual_merge[assert_cols]

From 298bf3aab28e4ffed53512debc91dbaba9a11a3c Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Mon, 10 Oct 2022 15:55:37 -0600
Subject: [PATCH 5/6] add optional float dtype specification

---
 pycytominer/cyto_utils/cells.py               | 19 +++++++++++++++----
 .../tests/test_cyto_utils/test_cells.py       | 18 ++++++++++++++++++
 2 files changed, 33 insertions(+), 4 deletions(-)

diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index ce937fb8..b5172019 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -419,13 +419,16 @@ def get_sql_table_col_names(self, table):
 
         return meta_cols, feat_cols
 
-    def load_compartment(self, compartment):
+    def load_compartment(self, compartment, float_datatype: type = np.float64):
         """Creates the compartment dataframe.
 
         Parameters
         ----------
         compartment : str
             The compartment to process.
+        float_datatype: type, default np.float64
+            Numpy floating point datatype to use for load_compartment and resulting dataframes.
+            Please note: datatypes other than np.float64 are experimentally unverified.
 
         Returns
         -------
@@ -439,7 +442,7 @@ def load_compartment(self, compartment):
         num_meta, num_feats = len(meta_cols), len(feat_cols)
 
         # Use pre-allocated np.array for data
-        feats = np.empty(shape=(num_cells, num_feats), dtype=np.float64)
+        feats = np.empty(shape=(num_cells, num_feats), dtype=float_datatype)
         # Use pre-allocated pd.DataFrame for metadata
         metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))
 
@@ -653,6 +656,7 @@ def merge_single_cells(
         normalize_args: Optional[Dict] = None,
         platemap: Optional[Union[str, pd.DataFrame]] = None,
         sc_merge_chunksize: Optional[int] = None,
+        float_datatype: type = np.float64,
         **kwargs,
     ):
         """Given the linking columns, merge single cell data. Normalization is also supported.
@@ -677,6 +681,9 @@ def merge_single_cells(
             Chunksize for merge and concatenation operations to help address performance issues
             note: if set to None, will infer a chunksize which is the roughly 1/3 the row length
             of first component df.
+        float_datatype: type, default np.float64
+            Numpy floating point datatype to use for load_compartment and resulting dataframes.
+            Please note: datatypes other than np.float64 are experimentally unverified.
 
         Returns
         -------
@@ -710,7 +717,9 @@ def merge_single_cells(
                 ]
 
                 if sc_df.empty:
-                    sc_df = self.load_compartment(compartment=left_compartment)
+                    sc_df = self.load_compartment(
+                        compartment=left_compartment, float_datatype=float_datatype
+                    )
 
                     # if chunksize was not set, set it to roughly
                     # one third the size of our initial compartment
@@ -733,7 +742,9 @@ def merge_single_cells(
                 # sc_merge_chunksize to help constrain memory
                 sc_df = pd.concat(
                     [
-                        self.load_compartment(compartment=right_compartment).merge(
+                        self.load_compartment(
+                            compartment=right_compartment, float_datatype=float_datatype
+                        ).merge(
                             right=right_chunk,
                             # note: we reverse left and right for join key merge order reference
                             left_on=self.merge_cols + [right_link_col],
diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index 901a7d00..4f4f9c95 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -3,6 +3,7 @@
 import random
 import tempfile
 
+import numpy as np
 import pandas as pd
 import pytest
 from pycytominer import aggregate, annotate, normalize
@@ -255,6 +256,23 @@ def test_load_compartment():
         check_dtype=False,
     )
 
+    # test using non-default float_datatype
+    loaded_compartment_df = AP.load_compartment(
+        compartment="cells", float_datatype=np.float32
+    )
+    pd.testing.assert_frame_equal(
+        loaded_compartment_df,
+        CELLS_DF.astype(
+            # cast any float type columns to float32 for expected comparison
+            {
+                colname: np.float32
+                for colname in CELLS_DF.columns
+                if pd.api.types.is_float(CELLS_DF[colname].dtype)
+            }
+        ).reindex(columns=loaded_compartment_df.columns),
+        check_dtype=False,
+    )
+
 
 def test_sc_count_sql_table():
     # Iterate over initialized compartments

From b4634e271d498c5c70486a57939ced903f23f7c2 Mon Sep 17 00:00:00 2001
From: d33bs <dave.bunten@cuanschutz.edu>
Date: Tue, 11 Oct 2022 09:49:29 -0600
Subject: [PATCH 6/6] test merge_single_cells non-default float dtype

---
 .../tests/test_cyto_utils/test_cells.py       | 22 +++++++++++++++++++
 1 file changed, 22 insertions(+)

diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index 4f4f9c95..e766a202 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -339,6 +339,28 @@ def test_merge_single_cells():
         check_dtype=False,
     )
 
+    # use non-default float_datatype
+    sc_merged_df = AP.merge_single_cells(float_datatype=np.float32)
+
+    # similar to the assert above, we test non-default float dtype specification
+    pd.testing.assert_frame_equal(
+        left=manual_merge[assert_cols]
+        .astype(
+            # cast any float type columns to float32 for expected comparison
+            {
+                colname: np.float32
+                for colname in manual_merge.columns
+                if pd.api.types.is_float(manual_merge[colname].dtype)
+            }
+        )
+        .sort_values(by=assert_cols, ascending=True)
+        .reset_index(drop=True),
+        right=sc_merged_df[assert_cols]
+        .sort_values(by=assert_cols, ascending=True)
+        .reset_index(drop=True),
+        check_dtype=False,
+    )
+
     # Confirm the merge and adding merge options
     for method in ["standardize", "robustize"]:
         for samples in ["all", "Metadata_ImageNumber == 'x'"]: