Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add float_datatype parameter for SingleCells compartment load and merge performance flexibility #248

Merged
merged 7 commits into from
Dec 22, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions pycytominer/cyto_utils/cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,14 @@ class SingleCells(object):
Name of the fields of view feature.
object_feature : str, default "Metadata_ObjectNumber"
Object number feature.
default_datatype_float : type, default np.float64
Numpy floating point datatype to use for load_compartment and resulting
dataframes. This parameter may be used to assist with performance-related
issues by reducing the memory required for floating-point data.
For example, using np.float32 instead of np.float64 for this parameter
will reduce memory consumed by float columns by roughly 50%.
Please note: using any datatype besides np.float64 is experimentally
unverified.

Notes
-----
Expand Down Expand Up @@ -105,6 +113,7 @@ def __init__(
fields_of_view="all",
fields_of_view_feature="Metadata_Site",
object_feature="Metadata_ObjectNumber",
default_datatype_float=np.float64,
):
"""Constructor method"""
# Check compartments specified
Expand Down Expand Up @@ -139,6 +148,7 @@ def __init__(
self.compartment_linking_cols = compartment_linking_cols
self.fields_of_view_feature = fields_of_view_feature
self.object_feature = object_feature
self.default_datatype_float = default_datatype_float

# Confirm that the compartments and linking cols are formatted properly
assert_linking_cols_complete(
Expand Down Expand Up @@ -436,6 +446,9 @@ def split_column_categories(self, col_names):
def load_compartment(self, compartment):
"""Creates the compartment dataframe.

Note: makes use of default_datatype_float attribute
for setting a default floating point datatype.

Parameters
----------
compartment : str
Expand All @@ -455,8 +468,10 @@ def load_compartment(self, compartment):
meta_cols, feat_cols = self.split_column_categories(col_names)
num_meta, num_feats = len(meta_cols), len(feat_cols)

# Use pre-allocated np.array for data
feats = np.empty(shape=(num_cells, num_feats), dtype=np.float64)
# Use pre-allocated np.array for feature data
feats = np.empty(
shape=(num_cells, num_feats), dtype=self.default_datatype_float
)
# Use pre-allocated pd.DataFrame for metadata
metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))

Expand Down Expand Up @@ -748,7 +763,9 @@ def merge_single_cells(

else:
sc_df = sc_df.merge(
self.load_compartment(compartment=right_compartment),
self.load_compartment(
compartment=right_compartment
),
left_on=self.merge_cols + [left_link_col],
right_on=self.merge_cols + [right_link_col],
suffixes=merge_suffix,
Expand Down
49 changes: 41 additions & 8 deletions pycytominer/tests/test_cyto_utils/test_cells.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import random
import tempfile

import numpy as np
import pandas as pd
import pytest
from pycytominer import aggregate, annotate, normalize
Expand Down Expand Up @@ -293,6 +294,45 @@ def test_load_compartment():
check_dtype=False,
)

# test load_compartment with non-default default_datatype_float
# create new SingleCells based on AP
float32_loaded_compartment_df = SingleCells(
sql_file=TMP_SQLITE_FILE, default_datatype_float=np.float32
).load_compartment(compartment="cells")

# for uniformly handling metadata types for both dataframes
metadata_types = {"ObjectNumber": "int64"}

# updated column datatypes for manual comparisons with CELLS_DF
cells_df_comparison_types = {
colname: np.float32
for colname in CELLS_DF.columns
# check for only columns which are of float type
if pd.api.types.is_float(CELLS_DF[colname].dtype)
# check for columns which are of 'int64' type
# note: pd.api.types.is_integer tests scalar values rather than dtypes, so the dtype is compared directly
or CELLS_DF[colname].dtype == "int64"
# avoid recasting the metadata_types
and colname not in metadata_types.keys()
}

# create deep copy of CELLS_DF with manually re-typed float columns as float32
# and cast any float type columns to float32 for expected comparison
cells_df_for_compare = CELLS_DF.copy(deep=True).astype(cells_df_comparison_types)[
# use float32_loaded_compartment_df column order for comparison
float32_loaded_compartment_df.columns
]

# cast metadata types in the same way for comparisons
float32_loaded_compartment_df = float32_loaded_compartment_df.astype(metadata_types)
cells_df_for_compare = cells_df_for_compare.astype(metadata_types)

# perform comparison of dataframes
pd.testing.assert_frame_equal(
float32_loaded_compartment_df,
cells_df_for_compare,
)


def test_sc_count_sql_table():
# Iterate over initialized compartments
Expand Down Expand Up @@ -436,14 +476,6 @@ def test_merge_single_cells():
traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
)


def test_merge_single_cells_subset():
    """Check the column layout of the merged subset single-cell dataframe.

    The merged output of AP_SUBSET must have exactly 13 columns, and every
    column whose name does not contain "Metadata" must be listed in
    SUBSET_FEATURES.
    """
    merged = AP_SUBSET.merge_single_cells()
    n_columns = merged.shape[1]
    assert n_columns == 13

    # Columns without "Metadata" in their name are treated as feature columns
    feature_cols = [col for col in merged.columns if "Metadata" not in col]
    retained = [col for col in feature_cols if col in SUBSET_FEATURES]
    assert len(feature_cols) == len(retained)


def test_merge_single_cells_subsample():

for subsample_frac in [0.1, 0.5, 0.9]:
Expand Down Expand Up @@ -1050,3 +1082,4 @@ def test_load_non_canonical_image_table():
result.sort_index(axis="columns").drop("Metadata_Site_Count", axis="columns"),
sc_aggregated_df,
)