Add float_datatype parameter for SingleCells compartment load and merge performance flexibility #248

Merged: 7 commits, Dec 22, 2022
27 changes: 21 additions & 6 deletions pycytominer/cyto_utils/cells.py
@@ -429,13 +429,17 @@ def get_sql_table_col_names(self, table):

return meta_cols, feat_cols

def load_compartment(self, compartment):
def load_compartment(self, compartment, float_datatype: type = np.float64):
"""Creates the compartment dataframe.

Parameters
----------
compartment : str
The compartment to process.
float_datatype: type, default np.float64
NumPy floating point datatype to use for load_compartment and the resulting
dataframes. Please note: datatypes other than np.float64 are experimentally
unverified.

Returns
-------
@@ -448,8 +452,8 @@ def load_compartment(self, compartment):
meta_cols, feat_cols = self.get_sql_table_col_names(compartment)
num_meta, num_feats = len(meta_cols), len(feat_cols)

# Use pre-allocated np.array for data
feats = np.empty(shape=(num_cells, num_feats), dtype=np.float64)
# Use pre-allocated np.array for feature data
feats = np.empty(shape=(num_cells, num_feats), dtype=float_datatype)
# Use pre-allocated pd.DataFrame for metadata
metas = pd.DataFrame(columns=meta_cols, index=range(num_cells))
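
As a quick illustration of the new parameter (a minimal sketch, not taken from the PR; the connection string is a placeholder and assumes a `SingleCells` object built from a CellProfiler-style SQLite export):

```python
import numpy as np
from pycytominer.cyto_utils.cells import SingleCells

# placeholder connection string; point this at a real single-cell SQLite output
sc = SingleCells("sqlite:///example_single_cells.sqlite")

# default behavior is unchanged: feature columns are loaded as float64
cells_f64 = sc.load_compartment(compartment="cells")

# opt into float32 to roughly halve the memory of the pre-allocated feature array;
# per the docstring, dtypes other than np.float64 are experimentally unverified
cells_f32 = sc.load_compartment(compartment="cells", float_datatype=np.float32)
```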

@@ -661,6 +665,7 @@ def merge_single_cells(
single_cell_normalize: bool = False,
normalize_args: Optional[Dict] = None,
platemap: Optional[Union[str, pd.DataFrame]] = None,
float_datatype: type = np.float64,
**kwargs,
):
"""Given the linking columns, merge single cell data. Normalization is also supported.
@@ -681,6 +686,10 @@ def merge_single_cells(
Additional arguments passed as input to pycytominer.normalize().
platemap: str or pd.DataFrame, default None
optional platemap filepath str or pd.DataFrame to be used with results via annotate
float_datatype: type, default np.float64
NumPy floating point datatype to use for load_compartment and the resulting
dataframes. Please note: datatypes other than np.float64 are experimentally
unverified.

Returns
-------
@@ -714,7 +723,9 @@ def merge_single_cells(
]

if isinstance(sc_df, str):
sc_df = self.load_compartment(compartment=left_compartment)
sc_df = self.load_compartment(
compartment=left_compartment, float_datatype=float_datatype
)

if compute_subsample:
# Sample cells proportionally by self.strata
@@ -729,15 +740,19 @@ def merge_single_cells(
).reindex(sc_df.columns, axis="columns")

sc_df = sc_df.merge(
self.load_compartment(compartment=right_compartment),
self.load_compartment(
compartment=right_compartment, float_datatype=float_datatype
),
left_on=self.merge_cols + [left_link_col],
right_on=self.merge_cols + [right_link_col],
suffixes=merge_suffix,
)

else:
sc_df = sc_df.merge(
self.load_compartment(compartment=right_compartment),
self.load_compartment(
compartment=right_compartment, float_datatype=float_datatype
),
left_on=self.merge_cols + [left_link_col],
right_on=self.merge_cols + [right_link_col],
suffixes=merge_suffix,
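
From the caller's side, the new argument is simply forwarded through `merge_single_cells` (again a hedged sketch with a placeholder connection string, not code from the PR):

```python
import numpy as np
from pycytominer.cyto_utils.cells import SingleCells

# placeholder connection string; point this at a real single-cell SQLite output
sc = SingleCells("sqlite:///example_single_cells.sqlite")

# float_datatype is forwarded to every load_compartment() call made during the
# merge, so all compartment feature columns are loaded with the requested dtype
merged_default = sc.merge_single_cells()                       # float64 features
merged_f32 = sc.merge_single_cells(float_datatype=np.float32)  # float32 features
```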
60 changes: 60 additions & 0 deletions pycytominer/tests/test_cyto_utils/test_cells.py
@@ -3,6 +3,7 @@
import random
import tempfile

import numpy as np
import pandas as pd
import pytest
from pycytominer import aggregate, annotate, normalize
@@ -280,6 +281,23 @@ def test_load_compartment():
check_dtype=False,
)

# test using non-default float_datatype
loaded_compartment_df = AP.load_compartment(
compartment="cells", float_datatype=np.float32
)
pd.testing.assert_frame_equal(
loaded_compartment_df,
CELLS_DF.astype(
# cast any float type columns to float32 for expected comparison
{
colname: np.float32
for colname in CELLS_DF.columns
if pd.api.types.is_float(CELLS_DF[colname].dtype)
}
).reindex(columns=loaded_compartment_df.columns),
check_dtype=False,
)


def test_sc_count_sql_table():
# Iterate over initialized compartments
@@ -416,6 +434,48 @@ def test_merge_single_cells():
traditional_norm_df.loc[:, new_compartment_cols].abs().describe(),
)

# use non-default float_datatype
sc_merged_df = AP.merge_single_cells(float_datatype=np.float32)

# ensure metadata have same types for comparisons
meta_types = {
colname: "int64"
for colname in [
"Metadata_ObjectNumber",
"Metadata_ObjectNumber_cells",
"Metadata_Cytoplasm_Parent_Nuclei",
"Metadata_Cytoplasm_Parent_Cells",
"Metadata_ObjectNumber_cytoplasm",
"Metadata_Site",
]
}
# apply type changes as per meta_types
manual_merge = manual_merge.astype(meta_types)
sc_merged_df = sc_merged_df.astype(meta_types)

# similar to the assert above, we test non-default float dtype specification
pd.testing.assert_frame_equal(
left=manual_merge.astype(
# cast any float type columns to float32 for expected comparisons
{
colname: np.float32
for colname in manual_merge.columns
if pd.api.types.is_float(manual_merge[colname].dtype)
# note: pd.api.types.is_integer sometimes is unable to detect int64
or manual_merge[colname].dtype == "int64"
and colname not in meta_types.keys()
}
)
.sort_values(by=manual_merge.columns.tolist(), ascending=True)
.reset_index(drop=True),
# use manual_merge's column order for sc_merged_df
right=sc_merged_df[manual_merge.columns]
# use manual_merge's column order for sorting values
.sort_values(by=manual_merge.columns.tolist(), ascending=True).reset_index(
drop=True
),
)
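
A brief aside on the dtype-detection note in the comprehension above (this snippet is illustrative only and not part of the PR): pandas' scalar checkers do not recognize dtype objects, which is why the test also compares the dtype string directly.

```python
import pandas as pd

int_col = pd.Series([1, 2, 3], dtype="int64")
float_col = pd.Series([1.0, 2.0], dtype="float64")

# the scalar checkers inspect values, not dtype objects, so they return False here
pd.api.types.is_integer(int_col.dtype)        # False
pd.api.types.is_float(float_col.dtype)        # False

# the *_dtype variants are the dtype-aware checks
pd.api.types.is_integer_dtype(int_col.dtype)  # True
pd.api.types.is_float_dtype(float_col.dtype)  # True
```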


def test_merge_single_cells_subsample():
