From fea3a73c4c9e239ac9daaf2ffb72915c1b65104d Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 16 Dec 2022 13:49:34 -0700 Subject: [PATCH 1/6] add float_datatype parameter to load and merge ops --- pycytominer/cyto_utils/cells.py | 27 +++++++-- .../tests/test_cyto_utils/test_cells.py | 60 +++++++++++++++++++ 2 files changed, 81 insertions(+), 6 deletions(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index 08029fe6..4921820e 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -429,13 +429,17 @@ def get_sql_table_col_names(self, table): return meta_cols, feat_cols - def load_compartment(self, compartment): + def load_compartment(self, compartment, float_datatype: type = np.float64): """Creates the compartment dataframe. Parameters ---------- compartment : str The compartment to process. + float_datatype: type, default np.float64 + Numpy floating point datatype to use for load_compartment and resulting + dataframes. Please note: using any besides np.float64 are experimentally + unverified. Returns ------- @@ -448,8 +452,8 @@ def load_compartment(self, compartment): meta_cols, feat_cols = self.get_sql_table_col_names(compartment) num_meta, num_feats = len(meta_cols), len(feat_cols) - # Use pre-allocated np.array for data - feats = np.empty(shape=(num_cells, num_feats), dtype=np.float64) + # Use pre-allocated np.array for feature data + feats = np.empty(shape=(num_cells, num_feats), dtype=float_datatype) # Use pre-allocated pd.DataFrame for metadata metas = pd.DataFrame(columns=meta_cols, index=range(num_cells)) @@ -661,6 +665,7 @@ def merge_single_cells( single_cell_normalize: bool = False, normalize_args: Optional[Dict] = None, platemap: Optional[Union[str, pd.DataFrame]] = None, + float_datatype: type = np.float64, **kwargs, ): """Given the linking columns, merge single cell data. Normalization is also supported. @@ -681,6 +686,10 @@ def merge_single_cells( Additional arguments passed as input to pycytominer.normalize(). platemap: str or pd.DataFrame, default None optional platemap filepath str or pd.DataFrame to be used with results via annotate + float_datatype: type, default np.float64 + Numpy floating point datatype to use for load_compartment and resulting + dataframes. Please note: using any besides np.float64 are experimentally + unverified. Returns ------- @@ -714,7 +723,9 @@ def merge_single_cells( ] if isinstance(sc_df, str): - sc_df = self.load_compartment(compartment=left_compartment) + sc_df = self.load_compartment( + compartment=left_compartment, float_datatype=float_datatype + ) if compute_subsample: # Sample cells proportionally by self.strata @@ -729,7 +740,9 @@ def merge_single_cells( ).reindex(sc_df.columns, axis="columns") sc_df = sc_df.merge( - self.load_compartment(compartment=right_compartment), + self.load_compartment( + compartment=right_compartment, float_datatype=float_datatype + ), left_on=self.merge_cols + [left_link_col], right_on=self.merge_cols + [right_link_col], suffixes=merge_suffix, @@ -737,7 +750,9 @@ def merge_single_cells( else: sc_df = sc_df.merge( - self.load_compartment(compartment=right_compartment), + self.load_compartment( + compartment=right_compartment, float_datatype=float_datatype + ), left_on=self.merge_cols + [left_link_col], right_on=self.merge_cols + [right_link_col], suffixes=merge_suffix, diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index 295db959..fb80861c 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -3,6 +3,7 @@ import random import tempfile +import numpy as np import pandas as pd import pytest from pycytominer import aggregate, annotate, normalize @@ -280,6 +281,23 @@ def test_load_compartment(): check_dtype=False, ) + # test using non-default float_datatype + loaded_compartment_df = AP.load_compartment( + compartment="cells", float_datatype=np.float32 + ) + pd.testing.assert_frame_equal( + loaded_compartment_df, + CELLS_DF.astype( + # cast any float type columns to float32 for expected comparison + { + colname: np.float32 + for colname in CELLS_DF.columns + if pd.api.types.is_float(CELLS_DF[colname].dtype) + } + ).reindex(columns=loaded_compartment_df.columns), + check_dtype=False, + ) + def test_sc_count_sql_table(): # Iterate over initialized compartments @@ -416,6 +434,48 @@ def test_merge_single_cells(): traditional_norm_df.loc[:, new_compartment_cols].abs().describe(), ) + # use non-default float_datatype + sc_merged_df = AP.merge_single_cells(float_datatype=np.float32) + + # ensure metadata have same types for comparisons + meta_types = { + colname: "int64" + for colname in [ + "Metadata_ObjectNumber", + "Metadata_ObjectNumber_cells", + "Metadata_Cytoplasm_Parent_Nuclei", + "Metadata_Cytoplasm_Parent_Cells", + "Metadata_ObjectNumber_cytoplasm", + "Metadata_Site", + ] + } + # apply type changes as per meta_types + manual_merge = manual_merge.astype(meta_types) + sc_merged_df = sc_merged_df.astype(meta_types) + + # similar to the assert above, we test non-default float dtype specification + pd.testing.assert_frame_equal( + left=manual_merge.astype( + # cast any float type columns to float32 for expected comparisons + { + colname: np.float32 + for colname in manual_merge.columns + if pd.api.types.is_float(manual_merge[colname].dtype) + # note: pd.api.types.is_integer sometimes is unable to detect int64 + or manual_merge[colname].dtype == "int64" + and colname not in meta_types.keys() + } + ) + .sort_values(by=manual_merge.columns.tolist(), ascending=True) + .reset_index(drop=True), + # use manual_merge's column order for sc_merged_df + right=sc_merged_df[manual_merge.columns] + # use manual_merge's column order for sorting values + .sort_values(by=manual_merge.columns.tolist(), ascending=True).reset_index( + drop=True + ), + ) + def test_merge_single_cells_subsample(): From db55da4188c4b0601e1bf91125495c1f9504ff3f Mon Sep 17 00:00:00 2001 From: d33bs Date: Mon, 19 Dec 2022 15:08:52 -0700 Subject: [PATCH 2/6] move to singlecells default_datatype_float attr --- pycytominer/cyto_utils/cells.py | 36 +++++++++++++++++---------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index 4921820e..888fe05e 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -69,6 +69,14 @@ class SingleCells(object): Name of the fields of view feature. object_feature : str, default "Metadata_ObjectNumber" Object number feature. + default_datatype_float: type + Numpy floating point datatype to use for load_compartment and resulting + dataframes. This parameter may be used to assist with performance-related + issues by reducing the memory required for floating-point data. + For example, using np.float32 instead of np.float64 for this parameter + will reduce memory consumed by float columns by roughly 50%. + Please note: using any besides np.float64 are experimentally + unverified. Notes ----- @@ -105,6 +113,7 @@ def __init__( fields_of_view="all", fields_of_view_feature="Metadata_Site", object_feature="Metadata_ObjectNumber", + default_datatype_float=np.float64, ): """Constructor method""" # Check compartments specified @@ -139,6 +148,7 @@ def __init__( self.compartment_linking_cols = compartment_linking_cols self.fields_of_view_feature = fields_of_view_feature self.object_feature = object_feature + self.default_datatype_float = default_datatype_float # Confirm that the compartments and linking cols are formatted properly assert_linking_cols_complete( @@ -429,17 +439,16 @@ def get_sql_table_col_names(self, table): return meta_cols, feat_cols - def load_compartment(self, compartment, float_datatype: type = np.float64): + def load_compartment(self, compartment): """Creates the compartment dataframe. + Note: makes use of default_datatype_float attribute + for setting a default floating point datatype. + Parameters ---------- compartment : str The compartment to process. - float_datatype: type, default np.float64 - Numpy floating point datatype to use for load_compartment and resulting - dataframes. Please note: using any besides np.float64 are experimentally - unverified. Returns ------- @@ -453,7 +462,9 @@ def load_compartment(self, compartment, float_datatype: type = np.float64): num_meta, num_feats = len(meta_cols), len(feat_cols) # Use pre-allocated np.array for feature data - feats = np.empty(shape=(num_cells, num_feats), dtype=float_datatype) + feats = np.empty( + shape=(num_cells, num_feats), dtype=self.default_datatype_float + ) # Use pre-allocated pd.DataFrame for metadata metas = pd.DataFrame(columns=meta_cols, index=range(num_cells)) @@ -665,7 +676,6 @@ def merge_single_cells( single_cell_normalize: bool = False, normalize_args: Optional[Dict] = None, platemap: Optional[Union[str, pd.DataFrame]] = None, - float_datatype: type = np.float64, **kwargs, ): """Given the linking columns, merge single cell data. Normalization is also supported. @@ -686,10 +696,6 @@ def merge_single_cells( Additional arguments passed as input to pycytominer.normalize(). platemap: str or pd.DataFrame, default None optional platemap filepath str or pd.DataFrame to be used with results via annotate - float_datatype: type, default np.float64 - Numpy floating point datatype to use for load_compartment and resulting - dataframes. Please note: using any besides np.float64 are experimentally - unverified. Returns ------- @@ -723,9 +729,7 @@ def merge_single_cells( ] if isinstance(sc_df, str): - sc_df = self.load_compartment( - compartment=left_compartment, float_datatype=float_datatype - ) + sc_df = self.load_compartment(compartment=left_compartment) if compute_subsample: # Sample cells proportionally by self.strata @@ -740,9 +744,7 @@ def merge_single_cells( ).reindex(sc_df.columns, axis="columns") sc_df = sc_df.merge( - self.load_compartment( - compartment=right_compartment, float_datatype=float_datatype - ), + self.load_compartment(compartment=right_compartment), left_on=self.merge_cols + [left_link_col], right_on=self.merge_cols + [right_link_col], suffixes=merge_suffix, From d55a0cccdc0c84124f2c258bd014162f6d0b80ee Mon Sep 17 00:00:00 2001 From: d33bs Date: Mon, 19 Dec 2022 15:53:37 -0700 Subject: [PATCH 3/6] remove float_datatype --- pycytominer/cyto_utils/cells.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index 888fe05e..48e037d4 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -753,7 +753,7 @@ def merge_single_cells( else: sc_df = sc_df.merge( self.load_compartment( - compartment=right_compartment, float_datatype=float_datatype + compartment=right_compartment ), left_on=self.merge_cols + [left_link_col], right_on=self.merge_cols + [right_link_col], From 028962277fae9f664c8970b1f6f98187db96e1b3 Mon Sep 17 00:00:00 2001 From: d33bs Date: Mon, 19 Dec 2022 15:53:59 -0700 Subject: [PATCH 4/6] update testing for load_compartment --- .../tests/test_cyto_utils/test_cells.py | 99 ++++++++----------- 1 file changed, 42 insertions(+), 57 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index fb80861c..7eb8fcd3 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -281,21 +281,40 @@ def test_load_compartment(): check_dtype=False, ) - # test using non-default float_datatype - loaded_compartment_df = AP.load_compartment( - compartment="cells", float_datatype=np.float32 - ) + # test load_compartment with non-default default_datatype_float + # create new SingleCells based on AP + float32_loaded_compartment_df = SingleCells( + sql_file=TMP_SQLITE_FILE, default_datatype_float=np.float32 + ).load_compartment(compartment="cells") + + # for uniformly handling metadata types for both dataframes + metadata_types = {"ObjectNumber": "int64"} + + # create deep copy of CELLS_DF with manually re-typed float columns as float32 + cells_df_for_compare = CELLS_DF.copy(deep=True).astype( + # cast any float type columns to float32 for expected comparison + { + colname: np.float32 + for colname in CELLS_DF.columns + # check for only columns which are of float type + if pd.api.types.is_float(CELLS_DF[colname].dtype) + # check for columns which are of 'int64' type + # note: pd.api.types.is_integer sometimes is unable to detect int64 + or CELLS_DF[colname].dtype == "int64" + # avoid recasting the metadata_types + and colname not in metadata_types.keys() + } + # use float32_loaded_compartment_df column order for comparison below + )[float32_loaded_compartment_df.columns] + + # cast metadata types in the same way for comparisons + float32_loaded_compartment_df = float32_loaded_compartment_df.astype(metadata_types) + cells_df_for_compare = cells_df_for_compare.astype(metadata_types) + + # perform comparison of dataframes pd.testing.assert_frame_equal( - loaded_compartment_df, - CELLS_DF.astype( - # cast any float type columns to float32 for expected comparison - { - colname: np.float32 - for colname in CELLS_DF.columns - if pd.api.types.is_float(CELLS_DF[colname].dtype) - } - ).reindex(columns=loaded_compartment_df.columns), - check_dtype=False, + float32_loaded_compartment_df, + cells_df_for_compare, ) @@ -434,49 +453,6 @@ def test_merge_single_cells(): traditional_norm_df.loc[:, new_compartment_cols].abs().describe(), ) - # use non-default float_datatype - sc_merged_df = AP.merge_single_cells(float_datatype=np.float32) - - # ensure metadata have same types for comparisons - meta_types = { - colname: "int64" - for colname in [ - "Metadata_ObjectNumber", - "Metadata_ObjectNumber_cells", - "Metadata_Cytoplasm_Parent_Nuclei", - "Metadata_Cytoplasm_Parent_Cells", - "Metadata_ObjectNumber_cytoplasm", - "Metadata_Site", - ] - } - # apply type changes as per meta_types - manual_merge = manual_merge.astype(meta_types) - sc_merged_df = sc_merged_df.astype(meta_types) - - # similar to the assert above, we test non-default float dtype specification - pd.testing.assert_frame_equal( - left=manual_merge.astype( - # cast any float type columns to float32 for expected comparisons - { - colname: np.float32 - for colname in manual_merge.columns - if pd.api.types.is_float(manual_merge[colname].dtype) - # note: pd.api.types.is_integer sometimes is unable to detect int64 - or manual_merge[colname].dtype == "int64" - and colname not in meta_types.keys() - } - ) - .sort_values(by=manual_merge.columns.tolist(), ascending=True) - .reset_index(drop=True), - # use manual_merge's column order for sc_merged_df - right=sc_merged_df[manual_merge.columns] - # use manual_merge's column order for sorting values - .sort_values(by=manual_merge.columns.tolist(), ascending=True).reset_index( - drop=True - ), - ) - - def test_merge_single_cells_subsample(): for subsample_frac in [0.1, 0.5, 0.9]: @@ -1083,3 +1059,12 @@ def test_load_non_canonical_image_table(): result.sort_index(axis="columns").drop("Metadata_Site_Count", axis="columns"), sc_aggregated_df, ) + +def test_singlecells_default_datatype(): + """ + Testing various use of SingleCells class attribute + default_datatype_float with non-default options. + """ + + + From 15128cb67047bfd3991242efe51aad79e3df538c Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 20 Dec 2022 07:45:36 -0700 Subject: [PATCH 5/6] remove empty test block --- pycytominer/tests/test_cyto_utils/test_cells.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index e1db1cca..6bddb800 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -1080,11 +1080,3 @@ def test_load_non_canonical_image_table(): sc_aggregated_df, ) -def test_singlecells_default_datatype(): - """ - Testing various use of SingleCells class attribute - default_datatype_float with non-default options. - """ - - - From 6801c2975b191a7395d09ea7682367122a8e35a6 Mon Sep 17 00:00:00 2001 From: d33bs Date: Tue, 20 Dec 2022 08:07:17 -0700 Subject: [PATCH 6/6] improve readability for astype dictionary in test --- .../tests/test_cyto_utils/test_cells.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index 6bddb800..f6461167 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -303,22 +303,25 @@ def test_load_compartment(): # for uniformly handling metadata types for both dataframes metadata_types = {"ObjectNumber": "int64"} + # updated column datatypes for manual comparisons with CELLS_DF + cells_df_comparison_types = { + colname: np.float32 + for colname in CELLS_DF.columns + # check for only columns which are of float type + if pd.api.types.is_float(CELLS_DF[colname].dtype) + # check for columns which are of 'int64' type + # note: pd.api.types.is_integer sometimes is unable to detect int64 + or CELLS_DF[colname].dtype == "int64" + # avoid recasting the metadata_types + and colname not in metadata_types.keys() + } + # create deep copy of CELLS_DF with manually re-typed float columns as float32 - cells_df_for_compare = CELLS_DF.copy(deep=True).astype( - # cast any float type columns to float32 for expected comparison - { - colname: np.float32 - for colname in CELLS_DF.columns - # check for only columns which are of float type - if pd.api.types.is_float(CELLS_DF[colname].dtype) - # check for columns which are of 'int64' type - # note: pd.api.types.is_integer sometimes is unable to detect int64 - or CELLS_DF[colname].dtype == "int64" - # avoid recasting the metadata_types - and colname not in metadata_types.keys() - } - # use float32_loaded_compartment_df column order for comparison below - )[float32_loaded_compartment_df.columns] + # and cast any float type columns to float32 for expected comparison + cells_df_for_compare = CELLS_DF.copy(deep=True).astype(cells_df_comparison_types)[ + # use float32_loaded_compartment_df column order for comparison + float32_loaded_compartment_df.columns + ] # cast metadata types in the same way for comparisons float32_loaded_compartment_df = float32_loaded_compartment_df.astype(metadata_types)