-
Notifications
You must be signed in to change notification settings - Fork 36
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.
Already on GitHub? Sign in to your account
Reduce memory consumption of SingleCells.merge_single_cells #234
Changes from all commits
422f3a4
e0062a4
cc68ec5
65d2a42
298bf3a
b4634e2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -3,6 +3,7 @@ | |
import random | ||
import tempfile | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import pytest | ||
from pycytominer import aggregate, annotate, normalize | ||
|
@@ -255,6 +256,23 @@ def test_load_compartment(): | |
check_dtype=False, | ||
) | ||
|
||
# test using non-default float_datatype | ||
loaded_compartment_df = AP.load_compartment( | ||
compartment="cells", float_datatype=np.float32 | ||
) | ||
pd.testing.assert_frame_equal( | ||
loaded_compartment_df, | ||
CELLS_DF.astype( | ||
# cast any float type columns to float32 for expected comparison | ||
{ | ||
colname: np.float32 | ||
for colname in CELLS_DF.columns | ||
if pd.api.types.is_float(CELLS_DF[colname].dtype) | ||
} | ||
).reindex(columns=loaded_compartment_df.columns), | ||
check_dtype=False, | ||
) | ||
|
||
|
||
def test_sc_count_sql_table(): | ||
# Iterate over initialized compartments | ||
|
@@ -273,6 +291,10 @@ def test_get_sql_table_col_names(): | |
|
||
|
||
def test_merge_single_cells(): | ||
""" | ||
Testing various SingleCells.merge_single_cells functionality | ||
""" | ||
|
||
sc_merged_df = AP.merge_single_cells() | ||
|
||
# Assert that the image data was merged | ||
|
@@ -300,21 +322,43 @@ def test_merge_single_cells(): | |
) | ||
|
||
# Confirm that the merge correctly reversed the object number (opposite from Parent) | ||
assert ( | ||
sc_merged_df.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] | ||
== sc_merged_df.Metadata_ObjectNumber.tolist() | ||
) | ||
assert ( | ||
manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] | ||
== sc_merged_df.Metadata_ObjectNumber.tolist() | ||
) | ||
assert ( | ||
manual_merge.Metadata_ObjectNumber_cytoplasm.tolist()[::-1] | ||
== sc_merged_df.Metadata_ObjectNumber.tolist() | ||
assert_cols = [ | ||
"Metadata_ObjectNumber", | ||
"Metadata_ObjectNumber_cytoplasm", | ||
"Metadata_ObjectNumber_cells", | ||
] | ||
|
||
# check that we have the same data using same cols, sort and a reset index | ||
pd.testing.assert_frame_equal( | ||
left=manual_merge[assert_cols] | ||
.sort_values(by=assert_cols, ascending=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am a bit concerned that sorting is giving us undue confidence of equivalent results. Am I reading this wrong? How should I be thinking about this? Can you add an explicit check that the object number is the same? Oh, but maybe this code block is doing exactly that (and more!)? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Great comment, thank you! The chunked merges made the data inconsistent with the existing tests. The check here now looks at columns "Metadata_ObjectNumber", "Metadata_ObjectNumber_cytoplasm", "Metadata_ObjectNumber_cells", sorted by the same columns in the same order for data equivalency and does include an Metadata_ObjectNumber check as a result. Would users of |
||
.reset_index(drop=True), | ||
right=sc_merged_df[assert_cols] | ||
.sort_values(by=assert_cols, ascending=True) | ||
.reset_index(drop=True), | ||
check_dtype=False, | ||
) | ||
assert ( | ||
manual_merge.Metadata_ObjectNumber_cells.tolist() | ||
== sc_merged_df.Metadata_ObjectNumber.tolist() | ||
|
||
# use non-default float_datatype | ||
sc_merged_df = AP.merge_single_cells(float_datatype=np.float32) | ||
|
||
# similar to the assert above, we test non-default float dtype specification | ||
pd.testing.assert_frame_equal( | ||
left=manual_merge[assert_cols] | ||
.astype( | ||
# cast any float type columns to float32 for expected comparison | ||
{ | ||
colname: np.float32 | ||
for colname in manual_merge.columns | ||
if pd.api.types.is_float(manual_merge[colname].dtype) | ||
} | ||
) | ||
.sort_values(by=assert_cols, ascending=True) | ||
.reset_index(drop=True), | ||
right=sc_merged_df[assert_cols] | ||
.sort_values(by=assert_cols, ascending=True) | ||
.reset_index(drop=True), | ||
check_dtype=False, | ||
) | ||
|
||
# Confirm the merge and adding merge options | ||
|
@@ -335,19 +379,41 @@ def test_merge_single_cells(): | |
manual_merge, method=method, samples=samples, features=features | ||
) | ||
|
||
# compare data using identical column order, sorting, and reset index | ||
pd.testing.assert_frame_equal( | ||
norm_method_df.sort_index(axis=1), | ||
manual_merge_normalize.sort_index(axis=1), | ||
norm_method_df[norm_method_df.columns] | ||
.sort_values(by="Cells_a") | ||
.reset_index(drop=True), | ||
manual_merge_normalize[norm_method_df.columns] | ||
.sort_values(by="Cells_a") | ||
.reset_index(drop=True), | ||
check_dtype=False, | ||
) | ||
|
||
# Test non-canonical compartment merging | ||
new_sc_merge_df = AP_NEW.merge_single_cells() | ||
|
||
assert sum(new_sc_merge_df.columns.str.startswith("New")) == 4 | ||
assert ( | ||
NEW_COMPARTMENT_DF.ObjectNumber.tolist()[::-1] | ||
== new_sc_merge_df.Metadata_ObjectNumber_new.tolist() | ||
|
||
assert_cols = [ | ||
"New_a", | ||
"New_b", | ||
"New_c", | ||
"New_d", | ||
"Metadata_ObjectNumber_new", | ||
] | ||
# compare data using identical column order, sorting, and reset index | ||
# note: we rename NEW_COMPARTMENT_DF to match new_sc_merge_df's ObjectNumber colname | ||
pd.testing.assert_frame_equal( | ||
left=NEW_COMPARTMENT_DF.rename( | ||
columns={"ObjectNumber": "Metadata_ObjectNumber_new"} | ||
)[assert_cols] | ||
.sort_values(by=assert_cols) | ||
.reset_index(drop=True), | ||
right=new_sc_merge_df[assert_cols] | ||
.sort_values(by=assert_cols) | ||
.reset_index(drop=True), | ||
check_dtype=False, | ||
) | ||
|
||
norm_new_method_df = AP_NEW.merge_single_cells( | ||
|
@@ -471,7 +537,6 @@ def test_merge_single_cells_cytominer_database_test_file(): | |
f"{os.path.dirname(__file__)}/../test_data/cytominer_database_example_data/test_SQ00014613.parquet", | ||
) | ||
sql_url = f"sqlite:///{sql_path}" | ||
print(sql_url) | ||
|
||
# build SingleCells from database | ||
sc_p = SingleCells( | ||
|
@@ -493,8 +558,8 @@ def test_merge_single_cells_cytominer_database_test_file(): | |
# note: pd.DataFrame datatypes sometimes appear automatically changed on-read, so we cast | ||
# the result_file dataframe using the base dataframe's types. | ||
pd.testing.assert_frame_equal( | ||
pd.read_csv(csv_path).astype(merged_sc.dtypes.to_dict()), | ||
pd.read_csv(result_file).astype(merged_sc.dtypes.to_dict()), | ||
pd.read_csv(csv_path).astype(merged_sc.dtypes.to_dict())[merged_sc.columns], | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why is this necessary now? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thank you for calling this out! For inner merges, Pandas "...preserve[s] the order of the left keys." (reference: pd.merge(..., how)). The column specification here is provided to account for the now differently ordered columns due to the "right" and "left" being swapped within the compartment merges. I also felt that accounting for exact column ordering over time may be unwieldy, so this may provide benefit towards development velocity. That said, we may need a change; do you think we should enforce strict column order within There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @gwaybio - just wanted to follow up here with some thoughts about I wrote the following as an example of what we could do here: import pandas as pd
# create an example dataframe with mixed order columns
df = pd.DataFrame(
{
"Image_Metadata_1": [0],
"TableNumber": [0],
"ImageNumber": [0],
"Cytoplasm_Data_1": [0],
"Nuclei_Data_1": [0],
"Cells_Data_1": [0],
"Mito_Data_1": [0],
"Actin_Data_1": [0],
"Image_Data_1": [0],
}
)
# print df columns as a python list, representing new order
print("Initial order:")
print(df.columns.tolist())
def custom_sort(value: str):
    """
    Sort key for Pycytominer merge_single_cells pd.dataframe columns.

    Returns an integer rank: TableNumber/ImageNumber first, then any
    column containing "metadata", then compartment-prefixed columns
    (cells, cytoplasm, nuclei, image), and all remaining columns last.
    """
    # compare case-insensitively to sidestep capitalization differences
    name = value.lower()

    # exact-match columns that must lead the ordering (ranked by position)
    leading = ["tablenumber", "imagenumber"]
    # compartment prefixes that follow metadata columns, in fixed order
    compartments = ["cells", "cytoplasm", "nuclei", "image"]

    # exact matches come first, ranked by their list position
    if name in leading:
        return leading.index(name)

    # any metadata column sorts directly after the leading columns
    if "metadata" in name:
        return len(leading)

    # compartment-prefixed columns follow, in fixed compartment order
    for offset, prefix in enumerate(compartments, start=len(leading) + 1):
        if name.startswith(prefix):
            return offset

    # everything else sorts to the very end
    return len(leading) + len(compartments) + 1
# inner sorted alphabetizes any columns which may not be part of custom_sort
# outer sort provides pycytominer-specific column sort order
df = df[sorted(sorted(df.columns), key=custom_sort)]
# print df columns as a python list, representing new order
print("\nSorted order:")
print(df.columns.tolist()) Which has printed output: Initial order:
['Image_Metadata_1', 'TableNumber', 'ImageNumber', 'Cytoplasm_Data_1', 'Nuclei_Data_1', 'Cells_Data_1', 'Mito_Data_1', 'Actin_Data_1', 'Image_Data_1']
Sorted order:
['TableNumber', 'ImageNumber', 'Image_Metadata_1', 'Cells_Data_1', 'Cytoplasm_Data_1', 'Nuclei_Data_1', 'Image_Data_1', 'Actin_Data_1', 'Mito_Data_1'] |
||
pd.read_csv(result_file).astype(merged_sc.dtypes.to_dict())[merged_sc.columns], | ||
) | ||
|
||
# test parquet output from merge_single_cells | ||
|
@@ -507,8 +572,12 @@ def test_merge_single_cells_cytominer_database_test_file(): | |
# note: pd.DataFrame datatypes sometimes appear automatically changed on-read, so we cast | ||
# the result_file dataframe using the base dataframe's types. | ||
pd.testing.assert_frame_equal( | ||
pd.read_parquet(parquet_path).astype(merged_sc.dtypes.to_dict()), | ||
pd.read_parquet(result_file).astype(merged_sc.dtypes.to_dict()), | ||
pd.read_parquet(parquet_path).astype(merged_sc.dtypes.to_dict())[ | ||
merged_sc.columns | ||
], | ||
pd.read_parquet(result_file).astype(merged_sc.dtypes.to_dict())[ | ||
merged_sc.columns | ||
], | ||
) | ||
|
||
# test parquet output from merge_single_cells with annotation meta | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
this doesn't look blacked to me, please confirm
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I ran black on this, and `--check` doesn't seem to flag it (may be profile-related). I'm seeing that there's a list end bumping up against the comment; I'll add a newline there with the new changes. Are you seeing anything else that looks stylistically out of place which I may change?