From 422f3a4aac98a60df6f5cd8f569f0b9f19da9d78 Mon Sep 17 00:00:00 2001 From: Dave Bunten Date: Mon, 3 Oct 2022 20:46:28 +0000 Subject: [PATCH 1/6] reduce memory use by merge_single_cells --- pycytominer/cyto_utils/cells.py | 62 +++++++++++---- .../tests/test_cyto_utils/test_cells.py | 78 +++++++++++++------ 2 files changed, 98 insertions(+), 42 deletions(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index 969f44e1..6a212282 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -652,6 +652,7 @@ def merge_single_cells( single_cell_normalize: bool = False, normalize_args: Optional[Dict] = None, platemap: Optional[Union[str, pd.DataFrame]] = None, + chunksize: Optional[int] = None, **kwargs, ): """Given the linking columns, merge single cell data. Normalization is also supported. @@ -672,6 +673,10 @@ def merge_single_cells( Additional arguments passed as input to pycytominer.normalize(). platemap: str or pd.DataFrame, default None optional platemap filepath str or pd.DataFrame to be used with results via annotate + chunksize: int, default None + chunksize for merge and concatenation operations to help address performance issues + note: if set to None, will infer a chunksize which is the roughly 1/3 the row length + of first component df. 
Returns ------- @@ -681,7 +686,7 @@ def merge_single_cells( """ # Load the single cell dataframe by merging on the specific linking columns - sc_df = "" + sc_df = pd.DataFrame() linking_check_cols = [] merge_suffix_rename = [] for left_compartment in self.compartment_linking_cols: @@ -704,9 +709,13 @@ def merge_single_cells( left_compartment ] - if isinstance(sc_df, str): + if sc_df.empty: sc_df = self.load_compartment(compartment=left_compartment) + # if chunksize was not set, + if chunksize is None: + chunksize = round(len(sc_df) / 3) + if compute_subsample: # Sample cells proportionally by self.strata self.get_subsample(df=sc_df, rename_col=False) @@ -719,20 +728,25 @@ def merge_single_cells( sc_df, how="left", on=subset_logic_df.columns.tolist() ).reindex(sc_df.columns, axis="columns") - sc_df = sc_df.merge( - self.load_compartment(compartment=right_compartment), - left_on=self.merge_cols + [left_link_col], - right_on=self.merge_cols + [right_link_col], - suffixes=merge_suffix, - ) - - else: - sc_df = sc_df.merge( - self.load_compartment(compartment=right_compartment), - left_on=self.merge_cols + [left_link_col], - right_on=self.merge_cols + [right_link_col], - suffixes=merge_suffix, - ) + # perform a segmented merge using pd.concat and + # chunksize to help constrain memory + sc_df = pd.concat( + [ + self.load_compartment(compartment=right_compartment).merge( + right=right, + # note: we reverse left and right for join key merge order reference + left_on=self.merge_cols + [right_link_col], + right_on=self.merge_cols + [left_link_col], + # note: we reverse left and right for join keys + suffixes=reversed(merge_suffix), + how="inner", + ) + for right in [ + sc_df[i : i + chunksize] + for i in range(0, sc_df.shape[0], chunksize) + ] + ] + ) linking_check_cols.append(linking_check) @@ -759,8 +773,18 @@ def merge_single_cells( self.load_image() self.load_image_data = True + # perform a segmented merge using pd.concat and + # chunksize to help constrain memory sc_df 
= ( - self.image_df.merge(sc_df, on=self.merge_cols, how="right") + pd.concat( + [ + self.image_df.merge(right=right, on=self.merge_cols, how="right") + for right in [ + sc_df[i : i + chunksize] + for i in range(0, sc_df.shape[0], chunksize) + ] + ] + ) # pandas rename performance may be improved using copy=False, inplace=False # reference: https://ryanlstevens.github.io/2022-05-06-pandasColumnRenaming/ .rename( @@ -769,6 +793,10 @@ def merge_single_cells( self.full_merge_suffix_rename, axis="columns", copy=False, inplace=False ) ) + + # reset the index to address above concat merges and memory conservation (inplace) + sc_df.reset_index(inplace=True, drop=True) + if single_cell_normalize: # Infering features is tricky with non-canonical data if normalize_args is None: diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index db347d7e..06943ed5 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -273,6 +273,10 @@ def test_get_sql_table_col_names(): def test_merge_single_cells(): + """ + Testing various SingleCells.merge_single_cells functionality + """ + sc_merged_df = AP.merge_single_cells() # Assert that the image data was merged @@ -300,21 +304,20 @@ def test_merge_single_cells(): ) # Confirm that the merge correctly reversed the object number (opposite from Parent) - assert ( - sc_merged_df.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] - == sc_merged_df.Metadata_ObjectNumber.tolist() - ) - assert ( - manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] - == sc_merged_df.Metadata_ObjectNumber.tolist() - ) - assert ( - manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] - == sc_merged_df.Metadata_ObjectNumber.tolist() - ) - assert ( - manual_merge.Metadata_ObjectNumber_cells.tolist() - == sc_merged_df.Metadata_ObjectNumber.tolist() + assert_cols = [ + "Metadata_ObjectNumber", + "Metadata_ObjectNumber_cytoplasm", + 
"Metadata_ObjectNumber_cells", + ] + # check that we have the same data using same cols, sort and a reset index + pd.testing.assert_frame_equal( + left=manual_merge[assert_cols] + .sort_values(by=assert_cols, ascending=True) + .reset_index(drop=True), + right=sc_merged_df[assert_cols] + .sort_values(by=assert_cols, ascending=True) + .reset_index(drop=True), + check_dtype=False, ) # Confirm the merge and adding merge options @@ -335,9 +338,14 @@ def test_merge_single_cells(): manual_merge, method=method, samples=samples, features=features ) + # compare data using identical column order, sorting, and reset index pd.testing.assert_frame_equal( - norm_method_df.sort_index(axis=1), - manual_merge_normalize.sort_index(axis=1), + norm_method_df[norm_method_df.columns] + .sort_values(by="Cells_a") + .reset_index(drop=True), + manual_merge_normalize[norm_method_df.columns] + .sort_values(by="Cells_a") + .reset_index(drop=True), check_dtype=False, ) @@ -345,9 +353,26 @@ def test_merge_single_cells(): new_sc_merge_df = AP_NEW.merge_single_cells() assert sum(new_sc_merge_df.columns.str.startswith("New")) == 4 - assert ( - NEW_COMPARTMENT_DF.ObjectNumber.tolist()[::-1] - == new_sc_merge_df.Metadata_ObjectNumber_new.tolist() + + assert_cols = [ + "New_a", + "New_b", + "New_c", + "New_d", + "Metadata_ObjectNumber_new", + ] + # compare data using identical column order, sorting, and reset index + # note: we rename NEW_COMPARTMENT_DF to match new_sc_merge_df's ObjectNumber colname + pd.testing.assert_frame_equal( + left=NEW_COMPARTMENT_DF.rename( + columns={"ObjectNumber": "Metadata_ObjectNumber_new"} + )[assert_cols] + .sort_values(by=assert_cols) + .reset_index(drop=True), + right=new_sc_merge_df[assert_cols] + .sort_values(by=assert_cols) + .reset_index(drop=True), + check_dtype=False, ) norm_new_method_df = AP_NEW.merge_single_cells( @@ -471,7 +496,6 @@ def test_merge_single_cells_cytominer_database_test_file(): 
f"{os.path.dirname(__file__)}/../test_data/cytominer_database_example_data/test_SQ00014613.parquet", ) sql_url = f"sqlite:///{sql_path}" - print(sql_url) # build SingleCells from database sc_p = SingleCells( @@ -493,8 +517,8 @@ def test_merge_single_cells_cytominer_database_test_file(): # note: pd.DataFrame datatypes sometimes appear automatically changed on-read, so we cast # the result_file dataframe using the base dataframe's types. pd.testing.assert_frame_equal( - pd.read_csv(csv_path).astype(merged_sc.dtypes.to_dict()), - pd.read_csv(result_file).astype(merged_sc.dtypes.to_dict()), + pd.read_csv(csv_path).astype(merged_sc.dtypes.to_dict())[merged_sc.columns], + pd.read_csv(result_file).astype(merged_sc.dtypes.to_dict())[merged_sc.columns], ) # test parquet output from merge_single_cells @@ -507,8 +531,12 @@ def test_merge_single_cells_cytominer_database_test_file(): # note: pd.DataFrame datatypes sometimes appear automatically changed on-read, so we cast # the result_file dataframe using the base dataframe's types. 
pd.testing.assert_frame_equal( - pd.read_parquet(parquet_path).astype(merged_sc.dtypes.to_dict()), - pd.read_parquet(result_file).astype(merged_sc.dtypes.to_dict()), + pd.read_parquet(parquet_path).astype(merged_sc.dtypes.to_dict())[ + merged_sc.columns + ], + pd.read_parquet(result_file).astype(merged_sc.dtypes.to_dict())[ + merged_sc.columns + ], ) # test parquet output from merge_single_cells with annotation meta From e0062a4f0744308f972c4e5b0443ee6d5193ac97 Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 6 Oct 2022 13:39:22 -0600 Subject: [PATCH 2/6] add comments for chunksize --- pycytominer/cyto_utils/cells.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index 6a212282..1f24da10 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -712,7 +712,8 @@ def merge_single_cells( if sc_df.empty: sc_df = self.load_compartment(compartment=left_compartment) - # if chunksize was not set, + # if chunksize was not set, set it to roughly + # one third the size of our initial compartment if chunksize is None: chunksize = round(len(sc_df) / 3) From cc68ec58b3a2c7fd2a4f33abd62ec0b4a09607db Mon Sep 17 00:00:00 2001 From: Dave Bunten Date: Fri, 7 Oct 2022 12:53:06 -0600 Subject: [PATCH 3/6] Apply renaming suggestions from code review Co-authored-by: Gregory Way --- pycytominer/cyto_utils/cells.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index 1f24da10..6d3d2460 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -652,7 +652,7 @@ def merge_single_cells( single_cell_normalize: bool = False, normalize_args: Optional[Dict] = None, platemap: Optional[Union[str, pd.DataFrame]] = None, - chunksize: Optional[int] = None, + sc_merge_chunksize: Optional[int] = None, **kwargs, ): """Given the linking columns, merge single cell data. 
Normalization is also supported. @@ -734,7 +734,7 @@ def merge_single_cells( sc_df = pd.concat( [ self.load_compartment(compartment=right_compartment).merge( - right=right, + right=right_chunk, # note: we reverse left and right for join key merge order reference left_on=self.merge_cols + [right_link_col], right_on=self.merge_cols + [left_link_col], @@ -742,7 +742,7 @@ def merge_single_cells( suffixes=reversed(merge_suffix), how="inner", ) - for right in [ + for right_chunk in [ sc_df[i : i + chunksize] for i in range(0, sc_df.shape[0], chunksize) ] From 65d2a4206acf180e9abb14da99c0fe4c070f1ddb Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 7 Oct 2022 13:43:51 -0600 Subject: [PATCH 4/6] rename chunksize and chunk vars; test spacing --- pycytominer/cyto_utils/cells.py | 26 ++++++++++--------- .../tests/test_cyto_utils/test_cells.py | 1 + 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index 6d3d2460..ce937fb8 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -673,8 +673,8 @@ def merge_single_cells( Additional arguments passed as input to pycytominer.normalize(). platemap: str or pd.DataFrame, default None optional platemap filepath str or pd.DataFrame to be used with results via annotate - chunksize: int, default None - chunksize for merge and concatenation operations to help address performance issues + sc_merge_chunksize: int, default None + Chunksize for merge and concatenation operations to help address performance issues note: if set to None, will infer a chunksize which is the roughly 1/3 the row length of first component df. 
@@ -714,8 +714,8 @@ def merge_single_cells( # if chunksize was not set, set it to roughly # one third the size of our initial compartment - if chunksize is None: - chunksize = round(len(sc_df) / 3) + if sc_merge_chunksize is None: + sc_merge_chunksize = round(len(sc_df) / 3) if compute_subsample: # Sample cells proportionally by self.strata @@ -730,7 +730,7 @@ def merge_single_cells( ).reindex(sc_df.columns, axis="columns") # perform a segmented merge using pd.concat and - # chunksize to help constrain memory + # sc_merge_chunksize to help constrain memory sc_df = pd.concat( [ self.load_compartment(compartment=right_compartment).merge( @@ -743,8 +743,8 @@ def merge_single_cells( how="inner", ) for right_chunk in [ - sc_df[i : i + chunksize] - for i in range(0, sc_df.shape[0], chunksize) + sc_df[i : i + sc_merge_chunksize] + for i in range(0, sc_df.shape[0], sc_merge_chunksize) ] ] ) @@ -775,14 +775,16 @@ def merge_single_cells( self.load_image_data = True # perform a segmented merge using pd.concat and - # chunksize to help constrain memory + # sc_merge_chunksize to help constrain memory sc_df = ( pd.concat( [ - self.image_df.merge(right=right, on=self.merge_cols, how="right") - for right in [ - sc_df[i : i + chunksize] - for i in range(0, sc_df.shape[0], chunksize) + self.image_df.merge( + right=right_chunk, on=self.merge_cols, how="right" + ) + for right_chunk in [ + sc_df[i : i + sc_merge_chunksize] + for i in range(0, sc_df.shape[0], sc_merge_chunksize) ] ] ) diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index 06943ed5..901a7d00 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -309,6 +309,7 @@ def test_merge_single_cells(): "Metadata_ObjectNumber_cytoplasm", "Metadata_ObjectNumber_cells", ] + # check that we have the same data using same cols, sort and a reset index pd.testing.assert_frame_equal( left=manual_merge[assert_cols] 
From 298bf3aab28e4ffed53512debc91dbaba9a11a3c Mon Sep 17 00:00:00 2001 From: d33bs Date: Mon, 10 Oct 2022 15:55:37 -0600 Subject: [PATCH 5/6] add optional float dtype specification --- pycytominer/cyto_utils/cells.py | 19 +++++++++++++++---- .../tests/test_cyto_utils/test_cells.py | 18 ++++++++++++++++++ 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index ce937fb8..b5172019 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -419,13 +419,16 @@ def get_sql_table_col_names(self, table): return meta_cols, feat_cols - def load_compartment(self, compartment): + def load_compartment(self, compartment, float_datatype: type = np.float64): """Creates the compartment dataframe. Parameters ---------- compartment : str The compartment to process. + float_datatype: type, default np.float64 + Numpy floating point datatype to use for load_compartment and resulting dataframes. + Please note: datatypes other than np.float64 are experimentally unverified. Returns ------- @@ -439,7 +442,7 @@ def load_compartment(self, compartment): num_meta, num_feats = len(meta_cols), len(feat_cols) # Use pre-allocated np.array for data - feats = np.empty(shape=(num_cells, num_feats), dtype=np.float64) + feats = np.empty(shape=(num_cells, num_feats), dtype=float_datatype) # Use pre-allocated pd.DataFrame for metadata metas = pd.DataFrame(columns=meta_cols, index=range(num_cells)) @@ -653,6 +656,7 @@ def merge_single_cells( normalize_args: Optional[Dict] = None, platemap: Optional[Union[str, pd.DataFrame]] = None, sc_merge_chunksize: Optional[int] = None, + float_datatype: type = np.float64, **kwargs, ): """Given the linking columns, merge single cell data. Normalization is also supported. 
@@ -677,6 +681,9 @@ def merge_single_cells( Chunksize for merge and concatenation operations to help address performance issues note: if set to None, will infer a chunksize which is the roughly 1/3 the row length of first component df. + float_datatype: type, default np.float64 + Numpy floating point datatype to use for load_compartment and resulting dataframes. + Please note: datatypes other than np.float64 are experimentally unverified. Returns ------- @@ -710,7 +717,9 @@ def merge_single_cells( ] if sc_df.empty: - sc_df = self.load_compartment(compartment=left_compartment) + sc_df = self.load_compartment( + compartment=left_compartment, float_datatype=float_datatype + ) # if chunksize was not set, set it to roughly # one third the size of our initial compartment @@ -733,7 +742,9 @@ def merge_single_cells( # sc_merge_chunksize to help constrain memory sc_df = pd.concat( [ - self.load_compartment(compartment=right_compartment).merge( + self.load_compartment( + compartment=right_compartment, float_datatype=float_datatype + ).merge( right=right_chunk, # note: we reverse left and right for join key merge order reference left_on=self.merge_cols + [right_link_col], diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index 901a7d00..4f4f9c95 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -3,6 +3,7 @@ import random import tempfile +import numpy as np import pandas as pd import pytest from pycytominer import aggregate, annotate, normalize @@ -255,6 +256,23 @@ def test_load_compartment(): check_dtype=False, ) + # test using non-default float_datatype + loaded_compartment_df = AP.load_compartment( + compartment="cells", float_datatype=np.float32 + ) + pd.testing.assert_frame_equal( + loaded_compartment_df, + CELLS_DF.astype( + # cast any float type columns to float32 for expected comparison + { + colname: np.float32 + for colname in CELLS_DF.columns 
+ if pd.api.types.is_float(CELLS_DF[colname].dtype) + } + ).reindex(columns=loaded_compartment_df.columns), + check_dtype=False, + ) + def test_sc_count_sql_table(): # Iterate over initialized compartments From b4634e271d498c5c70486a57939ced903f23f7c2 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 11 Oct 2022 09:49:29 -0600 Subject: [PATCH 6/6] test merge_single_cells non-default float dtype --- .../tests/test_cyto_utils/test_cells.py | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index 4f4f9c95..e766a202 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -339,6 +339,28 @@ def test_merge_single_cells(): check_dtype=False, ) + # use non-default float_datatype + sc_merged_df = AP.merge_single_cells(float_datatype=np.float32) + + # similar to the assert above, we test non-default float dtype specification + pd.testing.assert_frame_equal( + left=manual_merge[assert_cols] + .astype( + # cast any float type columns to float32 for expected comparison + { + colname: np.float32 + for colname in manual_merge.columns + if pd.api.types.is_float(manual_merge[colname].dtype) + } + ) + .sort_values(by=assert_cols, ascending=True) + .reset_index(drop=True), + right=sc_merged_df[assert_cols] + .sort_values(by=assert_cols, ascending=True) + .reset_index(drop=True), + check_dtype=False, + ) + # Confirm the merge and adding merge options for method in ["standardize", "robustize"]: for samples in ["all", "Metadata_ImageNumber == 'x'"]: