diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py
index e26d7928..cd5c9621 100644
--- a/pycytominer/cyto_utils/cells.py
+++ b/pycytominer/cyto_utils/cells.py
@@ -69,6 +69,14 @@ class SingleCells(object):
         Name of the fields of view feature.
     object_feature : str, default "Metadata_ObjectNumber"
         Object number feature.
+    default_datatype_float : type, default np.float64
+        Numpy floating point datatype to use for load_compartment and resulting
+        dataframes. This parameter may be used to assist with performance-related
+        issues by reducing the memory required for floating-point data.
+        For example, using np.float32 instead of np.float64 for this parameter
+        will reduce memory consumed by float columns by roughly 50%.
+        Please note: using any datatype other than np.float64 is experimentally
+        unverified.
 
     Notes
     -----
@@ -105,6 +113,7 @@ def __init__(
         fields_of_view="all",
         fields_of_view_feature="Metadata_Site",
         object_feature="Metadata_ObjectNumber",
+        default_datatype_float=np.float64,
     ):
         """Constructor method"""
         # Check compartments specified
@@ -139,6 +148,7 @@ def __init__(
         self.compartment_linking_cols = compartment_linking_cols
         self.fields_of_view_feature = fields_of_view_feature
         self.object_feature = object_feature
+        self.default_datatype_float = default_datatype_float
 
         # Confirm that the compartments and linking cols are formatted properly
         assert_linking_cols_complete(
@@ -436,6 +446,9 @@ def split_column_categories(self, col_names):
     def load_compartment(self, compartment):
         """Creates the compartment dataframe.
 
+        Note: makes use of default_datatype_float attribute
+        for setting a default floating point datatype.
+
         Parameters
         ----------
         compartment : str
@@ -455,8 +468,10 @@ def load_compartment(self, compartment):
         meta_cols, feat_cols = self.split_column_categories(col_names)
         num_meta, num_feats = len(meta_cols), len(feat_cols)
 
-        # Use pre-allocated np.array for data
-        feats = np.empty(shape=(num_cells, num_feats), dtype=np.float64)
+        # Use pre-allocated np.array for feature data
+        feats = np.empty(
+            shape=(num_cells, num_feats), dtype=self.default_datatype_float
+        )
 
         # Use pre-allocated pd.DataFrame for metadata
         metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))
@@ -748,7 +763,9 @@ def merge_single_cells(
 
                 else:
                     sc_df = sc_df.merge(
-                        self.load_compartment(compartment=right_compartment),
+                        self.load_compartment(
+                            compartment=right_compartment
+                        ),
                         left_on=self.merge_cols + [left_link_col],
                         right_on=self.merge_cols + [right_link_col],
                         suffixes=merge_suffix,
diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py
index c9ec8ed8..f6461167 100644
--- a/pycytominer/tests/test_cyto_utils/test_cells.py
+++ b/pycytominer/tests/test_cyto_utils/test_cells.py
@@ -3,6 +3,7 @@
 import random
 import tempfile
 
+import numpy as np
 import pandas as pd
 import pytest
 from pycytominer import aggregate, annotate, normalize
@@ -293,6 +294,45 @@ def test_load_compartment():
         check_dtype=False,
     )
 
+    # test load_compartment with non-default default_datatype_float
+    # create new SingleCells based on AP
+    float32_loaded_compartment_df = SingleCells(
+        sql_file=TMP_SQLITE_FILE, default_datatype_float=np.float32
+    ).load_compartment(compartment="cells")
+
+    # for uniformly handling metadata types for both dataframes
+    metadata_types = {"ObjectNumber": "int64"}
+
+    # updated column datatypes for manual comparisons with CELLS_DF
+    cells_df_comparison_types = {
+        colname: np.float32
+        for colname in CELLS_DF.columns
+        # check for only columns which are of float type
+        if pd.api.types.is_float_dtype(CELLS_DF[colname].dtype)
+        # check for columns which are of 'int64' type
+        # note: is_float/is_integer in pd.api.types test scalars, not dtypes
+        or CELLS_DF[colname].dtype == "int64"
+        # avoid recasting the metadata_types
+        and colname not in metadata_types
+    }
+
+    # create deep copy of CELLS_DF with manually re-typed float columns as float32
+    # and cast any float type columns to float32 for expected comparison
+    cells_df_for_compare = CELLS_DF.copy(deep=True).astype(cells_df_comparison_types)[
+        # use float32_loaded_compartment_df column order for comparison
+        float32_loaded_compartment_df.columns
+    ]
+
+    # cast metadata types in the same way for comparisons
+    float32_loaded_compartment_df = float32_loaded_compartment_df.astype(metadata_types)
+    cells_df_for_compare = cells_df_for_compare.astype(metadata_types)
+
+    # perform comparison of dataframes
+    pd.testing.assert_frame_equal(
+        float32_loaded_compartment_df,
+        cells_df_for_compare,
+    )
+
 
 def test_sc_count_sql_table():
     # Iterate over initialized compartments
@@ -436,14 +476,6 @@ def test_merge_single_cells():
             traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
         )
 
-
-def test_merge_single_cells_subset():
-    sc_merged_df = AP_SUBSET.merge_single_cells()
-    assert (sc_merged_df.shape[1]) == 13
-    non_meta_cols = [x for x in sc_merged_df.columns if "Metadata" not in x]
-    assert len(non_meta_cols) == len([x for x in non_meta_cols if x in SUBSET_FEATURES])
-
-
 def test_merge_single_cells_subsample():
 
     for subsample_frac in [0.1, 0.5, 0.9]:
@@ -1050,3 +1082,4 @@ def test_load_non_canonical_image_table():
         result.sort_index(axis="columns").drop("Metadata_Site_Count", axis="columns"),
         sc_aggregated_df,
     )
+