From ec7a79d46b6819def6d0c2d2737f39baa61cb65d Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 23 Feb 2023 22:54:12 -0500 Subject: [PATCH 01/82] Init --- pycytominer/cyto_utils/cell_locations.py | 167 ++++++++++++++++++ .../test_cyto_utils/test_cell_locations.py | 60 +++++++ .../cell_locations_example_data/.gitignore | 1 + .../BR00126114_subset.sqlite | Bin 0 -> 20480 bytes .../load_data_with_illum_subset.parquet | Bin 0 -> 27012 bytes .../test_cell_locations.sh | 55 ++++++ 6 files changed, 283 insertions(+) create mode 100644 pycytominer/cyto_utils/cell_locations.py create mode 100644 pycytominer/tests/test_cyto_utils/test_cell_locations.py create mode 100644 pycytominer/tests/test_data/cell_locations_example_data/.gitignore create mode 100644 pycytominer/tests/test_data/cell_locations_example_data/BR00126114_subset.sqlite create mode 100644 pycytominer/tests/test_data/cell_locations_example_data/load_data_with_illum_subset.parquet create mode 100644 pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py new file mode 100644 index 00000000..c1a2ae83 --- /dev/null +++ b/pycytominer/cyto_utils/cell_locations.py @@ -0,0 +1,167 @@ +""" +Utility function to augment a Parquet file with X,Y locations of cells in each image +""" + +import pandas as pd +import sqlite3 + + +class CellLocation: + """This class holds all the functions augment a Parquet files with X,Y locations + of cells in each image. + + In the Parquet file, + - Each row is single multi-channel image + - Each such image is indexed by 3 columns: `Metadata_Plate`,`Metadata_Well`,`Metadata_Site` + + The SQLite file contains at least two tables + - `Nuclei`, which has the single-cell-level readouts, including location information + - `Image`, which has the image-level readouts, as well metadata to link to the Parquet file + + In the `Nuclei` table, + - Each row is a cell + - Each cell has at least 3 columns: `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y`, `ImageNumber` + + In the `Image` table, + - Each row is an image + - Each image has at least 3 columns: `Metadata_Plate`,`Metadata_Well`,`Metadata_Site` + + + The methods in this class do the following + - Read the Parquet file + - Read the SQLite file + - For each image in the Parquet file, find the corresponding image in the SQLite file + - For each cell in the corresponding image, find the X,Y location + - Add the X,Y locations of all cells to the Parquet file in the corresponding row, packed into a single column + + + Attributes + ---------- + parquet_file : str + Path to the Parquet file + + sqlite_file : str + Path to the SQLite file + + image_column : default = 'ImageNumber' + Name of the column in the Parquet file that links to the SQLite file + + object_column : default = 'ObjectNumber' + Name of the column in the SQLite file that identifies each cell + + cell_x_loc : default = 'Nuclei_Location_Center_X' + Name of the column in the SQLite file that contains the X location of each cell + + cell_y_loc : default = 'Nuclei_Location_Center_Y' + Name of the column in the SQLite file that contains the Y location of each cell + + Methods + ------- + load_data() + Load the Parquet file into a Pandas DataFrame + + load_sqlite() + Load the required columns from the `Image` and `Nuclei` tables in the SQLite file into a Pandas DataFrame + + """ + + def __init__( + self, + parquet_file: str, + sqlite_file: str, + image_column: str = "ImageNumber", + 
object_column: str = "ObjectNumber", + image_index=["Metadata_Plate", "Metadata_Well", "Metadata_Site"], + cell_x_loc: str = "Nuclei_Location_Center_X", + cell_y_loc: str = "Nuclei_Location_Center_Y", + ): + self.parquet_file = parquet_file + self.sqlite_file = sqlite_file + self.image_column = image_column + self.object_column = object_column + self.image_index = image_index + self.cell_x_loc = cell_x_loc + self.cell_y_loc = cell_y_loc + + def load_data(self): + """Load the Parquet file into a Pandas DataFrame + + Returns + ------- + Pandas DataFrame + The Parquet file loaded into a Pandas DataFrame + """ + df = pd.read_parquet(self.parquet_file) + + # verify that the image index columns are present in the Parquet file + + if not all(elem in df.columns for elem in self.image_index): + raise ValueError( + f"Image index columns {self.image_index} are not present in the Parquet file" + ) + + return df + + def load_sqlite(self): + """Load the required columns from the `Image` and `Nuclei` tables in the SQLite file into a Pandas DataFrame + + Returns + ------- + Pandas DataFrame + The required columns from the `Image` and `Nuclei` tables in the SQLite file loaded into a Pandas DataFrame + """ + # Load the required columns from the SQLite file + + nuclei_query = f"SELECT {self.image_column},{self.object_column},{self.cell_x_loc},{self.cell_y_loc} FROM Nuclei;" + + image_index_str = ", ".join(self.image_index) + + image_query = f"SELECT {self.image_column},{image_index_str} FROM Image;" + + conn = sqlite3.connect(self.sqlite_file) + + nuclei_df = pd.read_sql_query(nuclei_query, conn) + + image_df = pd.read_sql_query(image_query, conn) + + conn.close() + + # Merge the Image and Nuclei tables + merged_df = pd.merge(image_df, nuclei_df, on=self.image_column, how="inner") + + # Cast the cell location columns to float + merged_df[self.cell_x_loc] = merged_df[self.cell_x_loc].astype(float) + merged_df[self.cell_y_loc] = merged_df[self.cell_y_loc].astype(float) + + # Group and nest the X,Y locations of all cells in each image + merged_df = ( + merged_df.groupby(self.image_index) + .agg( + {self.object_column: list, self.cell_x_loc: list, self.cell_y_loc: list} + ) + .reset_index() + ) + + return merged_df + + def add_cell_location(self): + """Add the X,Y locations of all cells to the Parquet file in the corresponding row, packed into a single column + + Returns + ------- + Pandas DataFrame + The Parquet file with the X,Y locations of all cells packed into a single column + """ + # Load the data + data_df = self.load_data() + sqlite_df = self.load_sqlite() + + # Merge the data and SQLite tables + merged_df = pd.merge( + data_df, + sqlite_df, + on=["Metadata_Plate", "Metadata_Well", "Metadata_Site"], + how="left", + ) + + return merged_df diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py new file mode 100644 index 00000000..d43c06d4 --- /dev/null +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -0,0 +1,60 @@ +"""This tests the output from CellLocation class""" + +import os + +from pycytominer.cyto_utils.cell_locations import CellLocation + +# setting the file locations +example_project_dir = os.path.join( + os.path.dirname(__file__), "..", "test_data", "cell_locations_example_data" +) + +parquet_file = os.path.join(example_project_dir, "load_data_with_illum_subset.parquet") + +sqlite_file = os.path.join(example_project_dir, "BR00126114_subset.sqlite") + +cell_loc_obj = CellLocation( + 
parquet_file=parquet_file, + sqlite_file=sqlite_file, +) + +# load the data +cell_loc = cell_loc_obj.add_cell_location() + +# test the data +def test_shape_and_columns(): + # check the shape of the data + assert cell_loc.shape == (2, 28) + + # verify that the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are present + assert "Nuclei_Location_Center_X" in cell_loc.columns + assert "Nuclei_Location_Center_Y" in cell_loc.columns + + +def test_values(): + # verify that the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are correct + assert cell_loc["Nuclei_Location_Center_X"].values[0] == [ + 943.512129380054, + 65.5980176211454, + 790.798319327731, + 798.1744, + 657.246344206974, + 778.97604035309, + 322.763649425287, + 718.11819235226, + 109.785065590313, + 325.727799227799, + ] + + assert cell_loc["Nuclei_Location_Center_Y"].values[0] == [ + 182.789757412399, + 294.24449339207, + 338.886554621849, + 387.1376, + 402.2272215973, + 406.378310214376, + 413.334051724138, + 469.506373117034, + 474.240161453078, + 497.608108108108, + ] diff --git a/pycytominer/tests/test_data/cell_locations_example_data/.gitignore b/pycytominer/tests/test_data/cell_locations_example_data/.gitignore new file mode 100644 index 00000000..355a6663 --- /dev/null +++ b/pycytominer/tests/test_data/cell_locations_example_data/.gitignore @@ -0,0 +1 @@ +!*sqlite diff --git a/pycytominer/tests/test_data/cell_locations_example_data/BR00126114_subset.sqlite b/pycytominer/tests/test_data/cell_locations_example_data/BR00126114_subset.sqlite new file mode 100644 index 0000000000000000000000000000000000000000..0958abb3248b438f13ecbc763ab3c4577ad79e5d GIT binary patch literal 20480 zcmeI2%WoT16vpjIOhSo0eME{%sbnG%C(6y}p8GoYk_~N@1yXplN!4Z*Il-W!G?gmD zqPrme2-fhIutVya1snFP*&@L=9`f*vQVEGg7H1TXC&$NU&i$S1#pwHX9&SGykNRJ} zc=>!d?%US&R=d6MQNQ16wdV2a;B|V;;-Y+=9_`bMH@wfciv7D?B+PbyYvtef{_1^; z3ps><5D)@FKnMr{As_^VfDjM@Lg4=+aPsc#+|upa?eFsO@Zf0l={JW*qi5B*^Wf3e zz1^+;?!EilTm9uKxV(C8`QzuquaNotVDxgizq|ExSM`@aJNSBZIIab&uk3HXI2?|j zy*S=~FghNOUhY3V`_?51dsV{P$;`~$($Z4b+x866!pqdR!CJ&QILw|3IW=9cc+SoNuBUHjLk{6o7r>|Rj|oJO>sE?uUx-suKVI9E1%7?{Fhe#J5F*40U;m+ zgn$qb0zyCt2mv7=1cZPP5CT`4z+#rs{YP4p2}Dla)1-_ti|s~~6kS|M`TW1Q@M|l7 zo&S`-%7^)f`OV%Ry`OtO^p1O5z4y9*c3*dY?7r$g?5-~SjYK(wfDjM@LO=)z0U;m+ zgn$qb0#}K^>SDHM6ls+x6gq|!eYDAwSI%$|T`-<~^oiZzR`$fGq*8K4ipCoxP;_3| z7z<;9w`?dx69+fT#HbT`JdfvOpgS;&0m5*rAGFw7dnu9Wt3lxvurr;9_ z_R4}zc`TZ&_1d{)l2gXv#uThJ$s4cjpj#HS%Gp#{6QYaA(1sP62nfzu&xu3ugM~5? 
zAYH%`J&;(^!6lGIz;o2W;2tRq-YFCD#F+|E4yYBp_W@vi;hYa>nIpv%SFdGzcmUPP z8egzsPRUHp$|VMdNT+{;x3edN!4PP0^cn?|wuG+P6~<`_C|Ku=4%kviG}t<93}J`arP+)LQ{jaE*g5Ph#&|dkX3B;v zKG4_~sGBTMScGgWV#I>6Pq7!%N>(<6t>O!xq9#Tdi^pKG7cl^rtW8Gqpj{ROG-nG> zhD@?iYvfVHl4D}ej*Ly|;!jod|BE!_S_lXMAs_^VfDjM@LO=)z0U;m+guvA%u$Xn; zzm@0b763Q^4D|nIltll(+M|^2gn$qb0zyCt2mv7=1cZPP_%8%j7d!cljSU0wxMl!o z>i?ZQM`o%U0Gj%Ll^E*=fTsRmC34*W(A58txV~XR%>dBU|Eq$&ZUAWN|5ZU(GXOO8 z|Ei#^8vvU6f0YQmd)ffd)c>nQNY2v+fTsRG84SVOP%{8D_5Vppf(>HY0MOL`k+{A= zz&&LEXzKq}L9QDBn)-iLFw_hHP5r+txJJHi0I2K#NV|KN+^GTJT>q~cvULMMQ~xg$ W*Wv8182}pke^rp`27ou||Nj6_Q4{k3 literal 0 HcmV?d00001 diff --git a/pycytominer/tests/test_data/cell_locations_example_data/load_data_with_illum_subset.parquet b/pycytominer/tests/test_data/cell_locations_example_data/load_data_with_illum_subset.parquet new file mode 100644 index 0000000000000000000000000000000000000000..9324e8b4b0f8a32bdae9ceb44d5172fd16996174 GIT binary patch literal 27012 zcmeHQYiwKP8Mf2(LRU&#VdIvp%Pcq>YE%0-PKZ0E?f4p-#EG56H^w~QK1Xqph>2kH-mrU_LD?R~%FbMcH)%o$XV0L*<3x9 zP1RG$c|KLo8HS9*#@F@6H}s5~Io7GlvnW_+7}CMVDfrUq;QplHjEMHA=@2~QtSklQ zA$V-U4-K$S4#~jom4HQaJWp_K8=uuPFCSt)9ztMpB~&Q)8%#5Z5Da(xAw39U@xw+z zqqmv#DkO)7nA?YGNEV7s3vC*o(KBxzW^NB75}^Vr3X@?(G{}jI#{fXBvc8tdC25U! zOiVLLzclvw(=i#X~}bq^#y?Z;{UhQ>C0FlxX_7 z-gsNjd~h%G+`Xu2yNcUO+?&qsAh%PUn^{{CG!ryBe{!a+&QL(1^NrKYty5H;yGo?$ zY&T4a?R+k4q_ndq+(u=iwwZ;TrZ&$t3cqKR6+WeIWKWprdrNQpkDmFnfw_4<>f5H; zw&JMTP8l8(wH@J26m4gV^-3FkOkvGh;MP%~NtrZK-Nf_eV>F!PUxXy(yW-&OqH%wylPc{C+k@M+RD=6UymY@Yk1ZUouW>^TMN zeDw%(^%0tNG_`k?zBlWzd$x|IWD7n`y2d)UEwXjw-Uh8B^|69|-oC{AZH#6gP5oWv z@6A5rd$y0JWD7n`y2d_l+hzNx`yR0~7R+u4ZYSN?VFJ zLAA)^HKjvO^iGG@QBxpk_1^|0~f!_4bb%#BHG%^qUwsonz? z?V~?vSoCjqTNa4}uE3(}9P|E<1}qx%MI?%*VbQHwnMJ46eNl5>B9Q2v0Q150fJ6fl z(Ma^)h)kk0Y7$A)H-ScfUtm5C4QMo=5sgOA{Y<8jW?m%Das?LM{0Z}U;v2`Jmxl~5 z4T-bs7xkabJinjQ>(l2h%2VrPQqNo)vpUAC>>@jn9Csw`Q)9EaRBe4NSIU*^ti>-( z(#K9TQVb zc<7B>MmUew`V^9+qls{);L|@s^9?u8CZ0|vi{)%?Gs)*Ra(vR7Op;Rt@9B;Igkw{$ z>6zcXrXSgbDB67S?3WKt=uAgyndMw5Ww7fECmtTs9*TTeH!}h_Ns{_lB0U9<2`W>x zNA;9xniFu!BzfOgJgU|q6_yOjRC!O7l=Xy_iIh`2%4CgPa2sW+Tyt{~-#$@;NxRFh zEGWwq2Qp=x0I+;Ek zu4MXlxRL44`NBKl5Fc{~oRnkwe;kdu6V7}wcfi4_yZSs3b2lGjVeaD77tCFKXn?t! 
z_qv(ECMj$CI=dK@hy^p)UE6<~4z^|X-=>3jzVLqOe*6C*5d5__hiTpF0S9P}A3k9m zW{w$u^~C);fMYsRO_j5$n(5P#r_DlaI8%YQB+51O<4p(E-iY;rp#Ba$Y9hc%+jmeke9$fm|}{rIYV z2k~NZkECPVyn}QxxmVK3YxJGuOBs5EfV>0W0f3agzw)J*@5K0dD+FqmUZQ}QcsCTy zEWJeG33me_^+;-@OLd?@k9I&0x4MsCdk5DxCQ96K7VZn?CzM%wXot?EL=ZjUq_of0zQ?3y*5-fB2^uT6;_hgd#_eEP)a_Q&W+nhZs5%6Cw1JmnfhMEtOe% zXot?EL=e5;ByF8VS*^7Wn(zw!q#X&YUUW&16>BcCsv%3PwOp!RStH9ZzL;K1t!?2d zth$w2TdOp%vW?tYtyn4JQ^zIiC94@$L*j6)NMkSAFO8e8=u9WLaCk0ySvVwQx;1IR z%U)RGI4%T#@p_!&?3}Z}c{#4YxeDZ&)2T{t;6sm;{x~-$+`}iSg{1pAuE9Y$_{K_{ zb~k*%J;(Vfkl*>Jzr?R6d=Y!h7TicrN9yVFY;msWO#699J{6r>rmgsayq(%b*yKjo3$AiS4@<6KJ5$V(kpe6KvYozLDF0SHeKSO{xI(^HVRN5n~pgrV&mb7j1TMOifLQm zsYEnr71CKg>+?b#;bs3yrI6bS@aa-;BjE$R5R)qqiFo4mIak1ukEpP3WB%xPttjDg z1%e5vC*kv2;}J(F4c~wXah6iiNG;)YtjD7)kLCQUvfLQ!tbu$1Z`rw=E`^sftizg% zIfcCC$a*S@n3jrYk90Xw&rI`5oj2m8Djy#Y3F*;@H69HwXMG;Yj`@WK>efbn7VO&~ zWm{~m)+q+|H7n{`3(q z3w^`XE^I94?;|f3cEyWGpr5>$&y@J`PJM52JWx$oz0*sA-}HkQ1aH_g;oJ$21-yOr zI{|ORzY`9t^7ezjg1uxPqKwDNK84_2jPlm>R^R#wf?;WO7kpq3+}}bl7FX0`f#gH2 zb4cC^83jWppId@Camu3_pQW+!pmwHYP?5SAG2AEtHj3xJdD2^nIgn}aJ}r~iCjdzZ%vv9}P1p}Ypf?%Z*VwN)`7=W9T$j)c4 z<5#P1M;5pMmbZj73uuq+m8o6YLx?rt zni8?NwxPVEW|>#nL#xjM3RLRTjIofu5i~Dg`O2>K5#l;9uOj;JSl+R?H+?X6g8Ix7 zd-yH+j{1;Rr|8qPt$PafnV%o;-aZ~>oJ!5tSu8*9cI=>i=xZjz*F`9QVO66KiR>%* zM)<0Xfr;ZR7r^qCB~5v4`w~{RmY4WS?CF{Z>UZUKkUN5cZtht?MTSE~=|a26H;3h65P-@Gxc@bc`&V#<-wFvIKl`}jxqz>jshdHDaA{s*>C2h{)o literal 0 HcmV?d00001 diff --git a/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh b/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh new file mode 100644 index 00000000..e5aae623 --- /dev/null +++ b/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh @@ -0,0 +1,55 @@ +# Download SQLite file +aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/BR00126114.sqlite . + +# Download LoadData CSV file +aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum.parquet . + +# Write a SQL query tp select rows of the `Image` table in the SQLite file where `ImageNumber` is 1 or 2. +# Only select the columns: `Metadata_Plate`, `Metadata_Well`, `Metadata_Site`, `ImageNumber` + +sqlite3 -header -csv BR00126114.sqlite "SELECT Metadata_Plate, Metadata_Well, Metadata_Site, ImageNumber FROM Image WHERE ImageNumber = 1 OR ImageNumber = 2;" > image_query.csv + + +# Write a SQL query tp select rows of the `Nuclei` table in the SQLite file where `ImageNumber` is 1 or 2. 
+# Only select the columns: `ImageNumber`, `ObjectNumber`, `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y` + +sqlite3 -header -csv BR00126114.sqlite "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 1 LIMIT 10;" > nuclei_query_1.csv +sqlite3 -header -csv BR00126114.sqlite "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 2 LIMIT 10;" > nuclei_query_2.csv + +csvstack nuclei_query_1.csv nuclei_query_2.csv > nuclei_query.csv + +# Create a text file with the following SQL commands: + +cat << EOF > create_tables.sql +.mode csv +.import image_query.csv Image +.import nuclei_query.csv Nuclei +EOF + +cat create_tables.sql + +# run the SQL commands in the text file to create the SQLite file + +sqlite3 BR00126114_subset.sqlite < create_tables.sql + +# Print the contents of the `Image` table in the SQLite file + +sqlite3 BR00126114_subset.sqlite "SELECT * FROM Image;" + +# Print the contents of the `Nuclei` table in the SQLite file + +sqlite3 BR00126114_subset.sqlite "SELECT * FROM Nuclei;" + +cat << EOF > create_parquet.py +import pandas as pd +load_data = pd.read_parquet("load_data_with_illum.parquet") +load_data = load_data.astype({"Metadata_Plate": str, "Metadata_Well": str, "Metadata_Site": str}) +image_query = pd.read_csv("image_query.csv") +image_query = image_query[["Metadata_Plate", "Metadata_Well", "Metadata_Site"]] +image_query = image_query.astype({"Metadata_Plate": str, "Metadata_Well": str, "Metadata_Site": str}) +merged_df = image_query.merge(load_data, on=["Metadata_Plate", "Metadata_Well", "Metadata_Site"]) +merged_df.to_parquet("load_data_with_illum_subset.parquet") +EOF + +python create_parquet.py + From 2ee9909ffc436b0f27a92c803ff5486e166020ab Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 23 Feb 2023 23:17:36 -0500 Subject: [PATCH 02/82] cmdline tool --- pycytominer/cyto_utils/cell_locations_cmd.py | 59 ++++++++++++++++++++ 1 file changed, 59 insertions(+) create mode 100644 pycytominer/cyto_utils/cell_locations_cmd.py diff --git a/pycytominer/cyto_utils/cell_locations_cmd.py b/pycytominer/cyto_utils/cell_locations_cmd.py new file mode 100644 index 00000000..7b526927 --- /dev/null +++ b/pycytominer/cyto_utils/cell_locations_cmd.py @@ -0,0 +1,59 @@ +# This is a command line interface for pycytominer/cyto_utils/cell_locations.py + +import argparse +from pycytominer.cyto_utils.cell_locations import CellLocation + + +if __name__ == "__main__": + parser = argparse.ArgumentParser( + description="Utility function to augment a Parquet file with X,Y locations of cells in each image" + ) + parser.add_argument( + "--input_parquet_file", + help="Path to the input Parquet file", + required=True, + ) + parser.add_argument( + "--sqlite_file", + help="Path to the SQLite file", + required=True, + ) + parser.add_argument( + "--output_parquet_file", + help="Path to the output Parquet file", + required=True, + ) + parser.add_argument( + "--image_column", + help="Name of the column in the Parquet file that links to the SQLite file", + default="ImageNumber", + ) + parser.add_argument( + "--object_column", + help="Name of the column in the SQLite file that identifies each cell", + default="ObjectNumber", + ) + parser.add_argument( + "--cell_x_loc", + help="Name of the column in the SQLite file that contains the X location of each cell", + default="Nuclei_Location_Center_X", + ) + parser.add_argument( + "--cell_y_loc", + help="Name of the column in the 
SQLite file that contains the Y location of each cell", + default="Nuclei_Location_Center_Y", + ) + args = parser.parse_args() + + cell_loc_obj = CellLocation( + parquet_file=args.input_parquet_file, + sqlite_file=args.sqlite_file, + image_column=args.image_column, + object_column=args.object_column, + cell_x_loc=args.cell_x_loc, + cell_y_loc=args.cell_y_loc, + ) + + cell_loc = cell_loc_obj.add_cell_location() + + cell_loc.to_parquet(args.output_parquet_file, index=False) From ed132a9f0d9fe456c8cafe60aef1fe80756bf9df Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 23 Feb 2023 23:26:45 -0500 Subject: [PATCH 03/82] Cleanup --- pycytominer/cyto_utils/cell_locations.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index c1a2ae83..7fba4843 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -133,6 +133,9 @@ def load_sqlite(self): merged_df[self.cell_x_loc] = merged_df[self.cell_x_loc].astype(float) merged_df[self.cell_y_loc] = merged_df[self.cell_y_loc].astype(float) + # Cast the object column to int + merged_df[self.object_column] = merged_df[self.object_column].astype(int) + # Group and nest the X,Y locations of all cells in each image merged_df = ( merged_df.groupby(self.image_index) @@ -160,7 +163,7 @@ def add_cell_location(self): merged_df = pd.merge( data_df, sqlite_df, - on=["Metadata_Plate", "Metadata_Well", "Metadata_Site"], + on=self.image_index, how="left", ) From 8662db1583748b61142c57be90baf5fc9145d7b8 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Fri, 24 Feb 2023 22:06:03 -0500 Subject: [PATCH 04/82] use fire --- pycytominer/cyto_utils/cell_locations.py | 26 ++++++-- pycytominer/cyto_utils/cell_locations_cmd.py | 60 +------------------ .../test_cyto_utils/test_cell_locations.py | 3 +- 3 files changed, 26 insertions(+), 63 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 7fba4843..7a4f1175 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -37,8 +37,11 @@ class CellLocation: Attributes ---------- - parquet_file : str - Path to the Parquet file + parquet_file_input : str + Path to the input Parquet file + + parquet_file_output : str + Path to the output Parquet file sqlite_file : str Path to the SQLite file @@ -63,11 +66,15 @@ class CellLocation: load_sqlite() Load the required columns from the `Image` and `Nuclei` tables in the SQLite file into a Pandas DataFrame + run() + Augment the Parquet file and save it + """ def __init__( self, - parquet_file: str, + parquet_file_input: str, + parquet_file_output: str, sqlite_file: str, image_column: str = "ImageNumber", object_column: str = "ObjectNumber", @@ -75,7 +82,8 @@ def __init__( cell_x_loc: str = "Nuclei_Location_Center_X", cell_y_loc: str = "Nuclei_Location_Center_Y", ): - self.parquet_file = parquet_file + self.parquet_file_input = parquet_file_input + self.parquet_file_output = parquet_file_output self.sqlite_file = sqlite_file self.image_column = image_column self.object_column = object_column @@ -91,7 +99,7 @@ def load_data(self): Pandas DataFrame The Parquet file loaded into a Pandas DataFrame """ - df = pd.read_parquet(self.parquet_file) + df = pd.read_parquet(self.parquet_file_input) # verify that the image index columns are present in the Parquet file @@ -168,3 +176,11 @@ def add_cell_location(self): ) return merged_df + + def 
run(self): + """Augment the Parquet file and save it""" + # Add the cell location + merged_df = self.add_cell_location() + + # Save the data + merged_df.to_parquet(self.parquet_file_output) diff --git a/pycytominer/cyto_utils/cell_locations_cmd.py b/pycytominer/cyto_utils/cell_locations_cmd.py index 7b526927..754449e2 100644 --- a/pycytominer/cyto_utils/cell_locations_cmd.py +++ b/pycytominer/cyto_utils/cell_locations_cmd.py @@ -1,59 +1,5 @@ -# This is a command line interface for pycytominer/cyto_utils/cell_locations.py - -import argparse from pycytominer.cyto_utils.cell_locations import CellLocation +import fire - -if __name__ == "__main__": - parser = argparse.ArgumentParser( - description="Utility function to augment a Parquet file with X,Y locations of cells in each image" - ) - parser.add_argument( - "--input_parquet_file", - help="Path to the input Parquet file", - required=True, - ) - parser.add_argument( - "--sqlite_file", - help="Path to the SQLite file", - required=True, - ) - parser.add_argument( - "--output_parquet_file", - help="Path to the output Parquet file", - required=True, - ) - parser.add_argument( - "--image_column", - help="Name of the column in the Parquet file that links to the SQLite file", - default="ImageNumber", - ) - parser.add_argument( - "--object_column", - help="Name of the column in the SQLite file that identifies each cell", - default="ObjectNumber", - ) - parser.add_argument( - "--cell_x_loc", - help="Name of the column in the SQLite file that contains the X location of each cell", - default="Nuclei_Location_Center_X", - ) - parser.add_argument( - "--cell_y_loc", - help="Name of the column in the SQLite file that contains the Y location of each cell", - default="Nuclei_Location_Center_Y", - ) - args = parser.parse_args() - - cell_loc_obj = CellLocation( - parquet_file=args.input_parquet_file, - sqlite_file=args.sqlite_file, - image_column=args.image_column, - object_column=args.object_column, - cell_x_loc=args.cell_x_loc, - cell_y_loc=args.cell_y_loc, - ) - - cell_loc = cell_loc_obj.add_cell_location() - - cell_loc.to_parquet(args.output_parquet_file, index=False) +if __name__ == '__main__': + fire.Fire(CellLocation) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index d43c06d4..6547e337 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -14,13 +14,14 @@ sqlite_file = os.path.join(example_project_dir, "BR00126114_subset.sqlite") cell_loc_obj = CellLocation( - parquet_file=parquet_file, + parquet_file_input=parquet_file, sqlite_file=sqlite_file, ) # load the data cell_loc = cell_loc_obj.add_cell_location() + # test the data def test_shape_and_columns(): # check the shape of the data From 80698e66da0e86fff6ec58f3ce062adf93bcad36 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Fri, 24 Feb 2023 22:09:19 -0500 Subject: [PATCH 05/82] output is optional --- pycytominer/cyto_utils/cell_locations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 7a4f1175..979d3417 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -74,8 +74,8 @@ class CellLocation: def __init__( self, parquet_file_input: str, - parquet_file_output: str, - sqlite_file: str, + sqlite_file: str = str, + parquet_file_output: str = None, image_column: str = 
"ImageNumber", object_column: str = "ObjectNumber", image_index=["Metadata_Plate", "Metadata_Well", "Metadata_Site"], From 2da6618de92583d525d8eddb0c84c10760d09f58 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 25 Feb 2023 08:20:36 -0500 Subject: [PATCH 06/82] fix sqlite fixture --- .../BR00126114_subset.sqlite | Bin 20480 -> 12288 bytes .../test_cell_locations.sh | 4 ++++ 2 files changed, 4 insertions(+) diff --git a/pycytominer/tests/test_data/cell_locations_example_data/BR00126114_subset.sqlite b/pycytominer/tests/test_data/cell_locations_example_data/BR00126114_subset.sqlite index 0958abb3248b438f13ecbc763ab3c4577ad79e5d..efeabe0204b6b3b443051f0ba7849ba9ad8601ba 100644 GIT binary patch delta 104 zcmZozz}S#5L0XW7fq{V;h+%+fqK+|8P%mDV7bwKUcaecVkMH7UL4ia*=0+)|$#-}q u7?~%3IQOBS3SCp^Tcr@HI*=~L&Pb02^F_{YKbTNhUs78aKAfA(AY zA<3(&3;Eug>nJU)KUo-lyYc77SL;uHdNf&od^1=)eKdS+uyp#}aC-e@xm(%X+8UgU{{`MVNAE4Zd*fjL*1`6z{j2XEZ0|g+ z?p(RB^WMYpr(k|~H2ET3JNj&L|M=T^`23ZN@U^Y>Od5>MC`CZRjP%xR!3_+5LRkKlNF{2m8Jo2QK;snG0uoK zdZ7m*mB{e9%|O;_tqtDRtXdmHo0ErMxzOFEJ{#ehjm;u>z9tQW5l+vp(evFQAsd1p z=Cyz@iV$R*X4XQDv^T-?gatfZo7H&yfWPB&Ej0_f!R(eLm(VVRb8c{^< zmXst5BB*4HXWxCUXKcv~epoAtC=g3hv@O<~z@%dB?d9$!A(~bVikhf$X&9>+kc{~; zX2O`s=2B;bWGtZ2R54zLgshDgdqsj51+xL-5SSq-iD!4TjA6w<#%|Xu!no92!wVV8 sXdl^PAd!KFNN};ypeK9_zNzF&OdR5+07#CeJ*=7%OT^h9!?o?d0H&t?YXATM diff --git a/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh b/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh index e5aae623..547fb954 100644 --- a/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh +++ b/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh @@ -32,6 +32,10 @@ cat create_tables.sql sqlite3 BR00126114_subset.sqlite < create_tables.sql +# Print the list of tables in the SQLite file + +sqlite3 BR00126114_subset.sqlite ".tables" + # Print the contents of the `Image` table in the SQLite file sqlite3 BR00126114_subset.sqlite "SELECT * FROM Image;" From ba54ab2127024adf22c2e0574dbb7f97d243d3a4 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 25 Feb 2023 08:20:52 -0500 Subject: [PATCH 07/82] refactor --- pycytominer/cyto_utils/cell_locations.py | 184 ++++++++++++------ .../test_cyto_utils/test_cell_locations.py | 10 +- 2 files changed, 128 insertions(+), 66 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 979d3417..58293973 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -1,5 +1,5 @@ """ -Utility function to augment a Parquet file with X,Y locations of cells in each image +Utility function to augment a metadata file with X,Y locations of cells in each image """ import pandas as pd @@ -7,16 +7,16 @@ class CellLocation: - """This class holds all the functions augment a Parquet files with X,Y locations + """This class holds all the functions augment a metadata files with X,Y locations of cells in each image. 
- In the Parquet file, + In the metadata file, which is either a CSV or a Parquet file, - Each row is single multi-channel image - - Each such image is indexed by 3 columns: `Metadata_Plate`,`Metadata_Well`,`Metadata_Site` + - Each image is indexed by multiple columns, e.g., `Metadata_Plate`,`Metadata_Well`,`Metadata_Site` - The SQLite file contains at least two tables + The single_cell SQLite file contains at least two tables - `Nuclei`, which has the single-cell-level readouts, including location information - - `Image`, which has the image-level readouts, as well metadata to link to the Parquet file + - `Image`, which has the image-level readouts, as well metadata to link to the metadata file In the `Nuclei` table, - Each row is a cell @@ -24,101 +24,164 @@ class CellLocation: In the `Image` table, - Each row is an image - - Each image has at least 3 columns: `Metadata_Plate`,`Metadata_Well`,`Metadata_Site` - + - Each image has at least the same columns as the images in the metadata file are indexed by, e.g., `Metadata_Plate`,`Metadata_Well`,`Metadata_Site` The methods in this class do the following - - Read the Parquet file - - Read the SQLite file - - For each image in the Parquet file, find the corresponding image in the SQLite file + - Read the metadata file + - Read the single_cell file + - For each image in the metadata file, find the corresponding image in the single_cell file - For each cell in the corresponding image, find the X,Y location - - Add the X,Y locations of all cells to the Parquet file in the corresponding row, packed into a single column + - Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column Attributes ---------- - parquet_file_input : str - Path to the input Parquet file + metadata_file_input : str + Path to the input metadata file - parquet_file_output : str - Path to the output Parquet file + augmented_metadata_file_output : str + Path to the output file - sqlite_file : str - Path to the SQLite file + single_cell_file : str + Path to the single_cell file image_column : default = 'ImageNumber' - Name of the column in the Parquet file that links to the SQLite file + Name of the column in the metadata file that links to the single_cell file object_column : default = 'ObjectNumber' - Name of the column in the SQLite file that identifies each cell + Name of the column in the single_cell file that identifies each cell cell_x_loc : default = 'Nuclei_Location_Center_X' - Name of the column in the SQLite file that contains the X location of each cell + Name of the column in the single_cell file that contains the X location of each cell cell_y_loc : default = 'Nuclei_Location_Center_Y' - Name of the column in the SQLite file that contains the Y location of each cell + Name of the column in the single_cell file that contains the Y location of each cell Methods ------- - load_data() - Load the Parquet file into a Pandas DataFrame + load_metadata() + Load the metadata file into a Pandas DataFrame - load_sqlite() - Load the required columns from the `Image` and `Nuclei` tables in the SQLite file into a Pandas DataFrame + load_single_cell() + Load the required columns from the `Image` and `Nuclei` tables in the single_cell file into a Pandas DataFrame - run() - Augment the Parquet file and save it + add_cell_location() + Augment the metadata file and optionally save it to a file """ def __init__( self, - parquet_file_input: str, - sqlite_file: str = str, - parquet_file_output: str = None, + metadata_file_input: str, + 
single_cell_file_input: str = str, + augmented_metadata_file_output: str = None, image_column: str = "ImageNumber", object_column: str = "ObjectNumber", image_index=["Metadata_Plate", "Metadata_Well", "Metadata_Site"], cell_x_loc: str = "Nuclei_Location_Center_X", cell_y_loc: str = "Nuclei_Location_Center_Y", ): - self.parquet_file_input = parquet_file_input - self.parquet_file_output = parquet_file_output - self.sqlite_file = sqlite_file + self.metadata_file_input = metadata_file_input + self.augmented_metadata_file_output = augmented_metadata_file_output + self.single_cell_file_input = single_cell_file_input self.image_column = image_column self.object_column = object_column self.image_index = image_index self.cell_x_loc = cell_x_loc self.cell_y_loc = cell_y_loc - def load_data(self): - """Load the Parquet file into a Pandas DataFrame + def load_metadata(self): + """Load the metadata file into a Pandas DataFrame Returns ------- Pandas DataFrame - The Parquet file loaded into a Pandas DataFrame + The metadata file loaded into a Pandas DataFrame """ - df = pd.read_parquet(self.parquet_file_input) - # verify that the image index columns are present in the Parquet file + # verify that the metadata file is a CSV or a Parquet file + + if not ( + self.metadata_file_input.endswith(".csv") + or self.metadata_file_input.endswith(".parquet") + ): + raise ValueError("Metadata file must be a CSV or a Parquet file") + + # load the metadata file into a Pandas DataFrame + + if self.metadata_file_input.endswith(".csv"): + df = pd.read_csv(self.metadata_file_input) + else: + df = pd.read_parquet(self.metadata_file_input) + + # verify that the image index columns are present in the metadata file if not all(elem in df.columns for elem in self.image_index): raise ValueError( - f"Image index columns {self.image_index} are not present in the Parquet file" + f"Image index columns {self.image_index} are not present in the metadata file" ) return df - def load_sqlite(self): - """Load the required columns from the `Image` and `Nuclei` tables in the SQLite file into a Pandas DataFrame + def load_single_cell(self): + """Load the required columns from the `Image` and `Nuclei` tables in the single_cell file into a Pandas DataFrame Returns ------- Pandas DataFrame - The required columns from the `Image` and `Nuclei` tables in the SQLite file loaded into a Pandas DataFrame + The required columns from the `Image` and `Nuclei` tables in the single_cell file loaded into a Pandas DataFrame """ - # Load the required columns from the SQLite file + + # Verify that the Image and Nuclei tables are present in the single_cell file + + conn = sqlite3.connect(self.single_cell_file_input) + + c = conn.cursor() + + c.execute("SELECT name FROM sqlite_master WHERE type='table';") + + tables = c.fetchall() + + tables = [x[0] for x in tables] + + if not ("Image" in tables and "Nuclei" in tables): + raise ValueError( + "Image and Nuclei tables are not present in the single_cell file" + ) + + # Verify that the required columns are present in the single_cell file + + c.execute("PRAGMA table_info(Nuclei);") + + nuclei_columns = c.fetchall() + + nuclei_columns = [x[1] for x in nuclei_columns] + + if not ( + self.image_column in nuclei_columns + and self.object_column in nuclei_columns + and self.cell_x_loc in nuclei_columns + and self.cell_y_loc in nuclei_columns + ): + raise ValueError( + f"Required columns are not present in the Nuclei table in the SQLite file" + ) + + c.execute("PRAGMA table_info(Image);") + + image_columns = c.fetchall() + 
+ image_columns = [x[1] for x in image_columns] + + if not ( + self.image_column in image_columns + and all(elem in image_columns for elem in self.image_index) + ): + raise ValueError( + f"Required columns are not present in the Image table in the SQLite file" + ) + + # Load the required columns from the single_cell file nuclei_query = f"SELECT {self.image_column},{self.object_column},{self.cell_x_loc},{self.cell_y_loc} FROM Nuclei;" @@ -126,8 +189,6 @@ def load_sqlite(self): image_query = f"SELECT {self.image_column},{image_index_str} FROM Image;" - conn = sqlite3.connect(self.sqlite_file) - nuclei_df = pd.read_sql_query(nuclei_query, conn) image_df = pd.read_sql_query(image_query, conn) @@ -156,7 +217,8 @@ def load_sqlite(self): return merged_df def add_cell_location(self): - """Add the X,Y locations of all cells to the Parquet file in the corresponding row, packed into a single column + """Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column. + Optionally, save the augmented metadata file as a Parquet file. Returns ------- @@ -164,23 +226,21 @@ def add_cell_location(self): The Parquet file with the X,Y locations of all cells packed into a single column """ # Load the data - data_df = self.load_data() - sqlite_df = self.load_sqlite() + metadata_df = self.load_metadata() + single_cell_df = self.load_single_cell() - # Merge the data and SQLite tables - merged_df = pd.merge( - data_df, - sqlite_df, + # Merge the data and single_cell tables + augmented_metadata_df = pd.merge( + metadata_df, + single_cell_df, on=self.image_index, how="left", ) - return merged_df - - def run(self): - """Augment the Parquet file and save it""" - # Add the cell location - merged_df = self.add_cell_location() - - # Save the data - merged_df.to_parquet(self.parquet_file_output) + # If self.augmented_metadata_file_output) is not None, save the data + if self.augmented_metadata_file_output is not None: + augmented_metadata_df.to_parquet( + self.augmented_metadata_file_output, index=False + ) + else: + return augmented_metadata_df diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 6547e337..14470595 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -9,13 +9,15 @@ os.path.dirname(__file__), "..", "test_data", "cell_locations_example_data" ) -parquet_file = os.path.join(example_project_dir, "load_data_with_illum_subset.parquet") +metadata_file_input = os.path.join( + example_project_dir, "load_data_with_illum_subset.parquet" +) -sqlite_file = os.path.join(example_project_dir, "BR00126114_subset.sqlite") +single_cell_file = os.path.join(example_project_dir, "BR00126114_subset.sqlite") cell_loc_obj = CellLocation( - parquet_file_input=parquet_file, - sqlite_file=sqlite_file, + metadata_file_input=metadata_file_input, + single_cell_file_input=single_cell_file, ) # load the data From 43b4f582e419769a6f8c3d0cfcecea01b803fafd Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 26 Feb 2023 08:25:38 -0500 Subject: [PATCH 08/82] files are optional --- pycytominer/cyto_utils/cell_locations.py | 76 ++++++++++--------- .../test_cyto_utils/test_cell_locations.py | 40 +++++++--- 2 files changed, 71 insertions(+), 45 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 58293973..cb53022d 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ 
b/pycytominer/cyto_utils/cell_locations.py @@ -7,12 +7,12 @@ class CellLocation: - """This class holds all the functions augment a metadata files with X,Y locations - of cells in each image. + """This class holds all the functions augment a metadata file with X,Y + locations of cells in each image. In the metadata file, which is either a CSV or a Parquet file, - Each row is single multi-channel image - - Each image is indexed by multiple columns, e.g., `Metadata_Plate`,`Metadata_Well`,`Metadata_Site` + - Each image is indexed by multiple columns, e.g., `Metadata_Plate`, `Metadata_Well`,`Metadata_Site` The single_cell SQLite file contains at least two tables - `Nuclei`, which has the single-cell-level readouts, including location information @@ -36,14 +36,14 @@ class CellLocation: Attributes ---------- - metadata_file_input : str - Path to the input metadata file + metadata_input : str or Pandas DataFrame + Path to the input metadata file or a Pandas DataFrame - augmented_metadata_file_output : str - Path to the output file + single_cell_file : str or sqlite3.Connection + Path to the single_cell file or a sqlite3.Connection object - single_cell_file : str - Path to the single_cell file + augmented_metadata_output : str + Path to the output file. If None, the metadata file is not saved to disk image_column : default = 'ImageNumber' Name of the column in the metadata file that links to the single_cell file @@ -72,18 +72,18 @@ class CellLocation: def __init__( self, - metadata_file_input: str, - single_cell_file_input: str = str, - augmented_metadata_file_output: str = None, + metadata_input: str or pd.DataFrame, + single_cell_input: str or sqlite3.Connection, + augmented_metadata_output: str = None, image_column: str = "ImageNumber", object_column: str = "ObjectNumber", image_index=["Metadata_Plate", "Metadata_Well", "Metadata_Site"], cell_x_loc: str = "Nuclei_Location_Center_X", cell_y_loc: str = "Nuclei_Location_Center_Y", ): - self.metadata_file_input = metadata_file_input - self.augmented_metadata_file_output = augmented_metadata_file_output - self.single_cell_file_input = single_cell_file_input + self.metadata_input = metadata_input + self.augmented_metadata_output = augmented_metadata_output + self.single_cell_input = single_cell_input self.image_column = image_column self.object_column = object_column self.image_index = image_index @@ -91,30 +91,33 @@ def __init__( self.cell_y_loc = cell_y_loc def load_metadata(self): - """Load the metadata file into a Pandas DataFrame + """Load the metadata into a Pandas DataFrame Returns ------- Pandas DataFrame - The metadata file loaded into a Pandas DataFrame + The metadata loaded into a Pandas DataFrame """ - # verify that the metadata file is a CSV or a Parquet file + if not isinstance(self.metadata_input, pd.DataFrame): + # verify that the metadata file is a CSV or a Parquet file - if not ( - self.metadata_file_input.endswith(".csv") - or self.metadata_file_input.endswith(".parquet") - ): - raise ValueError("Metadata file must be a CSV or a Parquet file") + if not ( + self.metadata_input.endswith(".csv") + or self.metadata_input.endswith(".parquet") + ): + raise ValueError("Metadata file must be a CSV or a Parquet file") - # load the metadata file into a Pandas DataFrame + # load the metadata file into a Pandas DataFrame - if self.metadata_file_input.endswith(".csv"): - df = pd.read_csv(self.metadata_file_input) + if self.metadata_input.endswith(".csv"): + df = pd.read_csv(self.metadata_input) + else: + df = pd.read_parquet(self.metadata_input) 
else: - df = pd.read_parquet(self.metadata_file_input) + df = self.metadata_input - # verify that the image index columns are present in the metadata file + # verify that the image index columns are present in the metadata object if not all(elem in df.columns for elem in self.image_index): raise ValueError( @@ -124,17 +127,20 @@ def load_metadata(self): return df def load_single_cell(self): - """Load the required columns from the `Image` and `Nuclei` tables in the single_cell file into a Pandas DataFrame + """Load the required columns from the `Image` and `Nuclei` tables in the single_cell file or sqlite3.Connection object into a Pandas DataFrame Returns ------- Pandas DataFrame - The required columns from the `Image` and `Nuclei` tables in the single_cell file loaded into a Pandas DataFrame + The required columns from the `Image` and `Nuclei` tables loaded into a Pandas DataFrame """ - # Verify that the Image and Nuclei tables are present in the single_cell file + if isinstance(self.single_cell_input, str): + conn = sqlite3.connect(self.single_cell_input) + else: + conn = self.single_cell_input - conn = sqlite3.connect(self.single_cell_file_input) + # Verify that the Image and Nuclei tables are present in single_cell c = conn.cursor() @@ -237,10 +243,10 @@ def add_cell_location(self): how="left", ) - # If self.augmented_metadata_file_output) is not None, save the data - if self.augmented_metadata_file_output is not None: + # If self.augmented_metadata_output) is not None, save the data + if self.augmented_metadata_output is not None: augmented_metadata_df.to_parquet( - self.augmented_metadata_file_output, index=False + self.augmented_metadata_output, index=False ) else: return augmented_metadata_df diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 14470595..c4645626 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -1,31 +1,50 @@ """This tests the output from CellLocation class""" import os - +import pandas as pd from pycytominer.cyto_utils.cell_locations import CellLocation +import pytest +import sqlite3 # setting the file locations example_project_dir = os.path.join( os.path.dirname(__file__), "..", "test_data", "cell_locations_example_data" ) -metadata_file_input = os.path.join( +metadata_input = os.path.join( example_project_dir, "load_data_with_illum_subset.parquet" ) -single_cell_file = os.path.join(example_project_dir, "BR00126114_subset.sqlite") +single_cell_input = os.path.join(example_project_dir, "BR00126114_subset.sqlite") + +cell_loc_obj1 = CellLocation( + metadata_input=metadata_input, + single_cell_input=single_cell_input, +) -cell_loc_obj = CellLocation( - metadata_file_input=metadata_file_input, - single_cell_file_input=single_cell_file, +cell_loc_obj2 = CellLocation( + metadata_input=pd.read_parquet(metadata_input), + single_cell_input=sqlite3.connect(single_cell_input), ) # load the data -cell_loc = cell_loc_obj.add_cell_location() +cell_loc1 = cell_loc_obj1.add_cell_location() + +cell_loc2 = cell_loc_obj2.add_cell_location() + + +@pytest.mark.parametrize("cell_loc", [cell_loc1, cell_loc2]) +def test_shape_and_columns(cell_loc): + # check the shape of the data + assert cell_loc.shape == (2, 28) + + # verify that the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are present + assert "Nuclei_Location_Center_X" in cell_loc.columns + assert "Nuclei_Location_Center_Y" in cell_loc.columns 
-# test the data -def test_shape_and_columns(): +@pytest.mark.parametrize("cell_loc", [cell_loc1, cell_loc2]) +def test_shape_and_columns(cell_loc): # check the shape of the data assert cell_loc.shape == (2, 28) @@ -34,7 +53,8 @@ def test_shape_and_columns(): assert "Nuclei_Location_Center_Y" in cell_loc.columns -def test_values(): +@pytest.mark.parametrize("cell_loc", [cell_loc1, cell_loc2]) +def test_values(cell_loc): # verify that the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are correct assert cell_loc["Nuclei_Location_Center_X"].values[0] == [ 943.512129380054, From 63c1b07fa657a23dd32fda94396ef51fd6c02221 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 26 Feb 2023 08:39:05 -0500 Subject: [PATCH 09/82] Typo --- pycytominer/cyto_utils/cell_locations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index cb53022d..0ed47081 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -39,7 +39,7 @@ class CellLocation: metadata_input : str or Pandas DataFrame Path to the input metadata file or a Pandas DataFrame - single_cell_file : str or sqlite3.Connection + single_cell_input : str or sqlite3.Connection Path to the single_cell file or a sqlite3.Connection object augmented_metadata_output : str From 2b4928022761801233a03728443607e7ffc96dff Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 26 Feb 2023 09:40:18 -0500 Subject: [PATCH 10/82] cleanup --- .../test_cyto_utils/test_cell_locations.py | 38 +++++++++++++++++-- 1 file changed, 34 insertions(+), 4 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index c4645626..87938857 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -6,7 +6,7 @@ import pytest import sqlite3 -# setting the file locations +# local files example_project_dir = os.path.join( os.path.dirname(__file__), "..", "test_data", "cell_locations_example_data" ) @@ -17,23 +17,53 @@ single_cell_input = os.path.join(example_project_dir, "BR00126114_subset.sqlite") +# inputs are files cell_loc_obj1 = CellLocation( metadata_input=metadata_input, single_cell_input=single_cell_input, ) +# inputs are in-memory objects cell_loc_obj2 = CellLocation( metadata_input=pd.read_parquet(metadata_input), single_cell_input=sqlite3.connect(single_cell_input), ) +# inputs are S3 paths + +# don't run this test if running on GitHub Actions +# because the S3 bucket is not public + +if "GITHUB_WORKFLOW" in os.environ: + pytest.skip("Skipping S3 test", allow_module_level=True) + + +example_s3_project_dir = "s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/{workspace_folder}/2021_08_23_Batch12/BR00126114/" + +metadata_input_s3 = os.path.join( + example_s3_project_dir.format(workspace_folder="load_data_csv"), + "load_data_with_illum_subset.parquet", +) + +single_cell_input_s3 = os.path.join(example_project_dir, "BR00126114_subset.sqlite") + +cell_loc_obj3 = CellLocation( + metadata_input=metadata_input_s3, + single_cell_input=single_cell_input_s3, +) + +cell_loc3 = cell_loc_obj3.add_cell_location() + + # load the data cell_loc1 = cell_loc_obj1.add_cell_location() cell_loc2 = cell_loc_obj2.add_cell_location() +cell_loc_l = [cell_loc1, cell_loc2] + -@pytest.mark.parametrize("cell_loc", [cell_loc1, cell_loc2]) 
+@pytest.mark.parametrize("cell_loc", cell_loc_l) def test_shape_and_columns(cell_loc): # check the shape of the data assert cell_loc.shape == (2, 28) @@ -43,7 +73,7 @@ def test_shape_and_columns(cell_loc): assert "Nuclei_Location_Center_Y" in cell_loc.columns -@pytest.mark.parametrize("cell_loc", [cell_loc1, cell_loc2]) +@pytest.mark.parametrize("cell_loc", cell_loc_l) def test_shape_and_columns(cell_loc): # check the shape of the data assert cell_loc.shape == (2, 28) @@ -53,7 +83,7 @@ def test_shape_and_columns(cell_loc): assert "Nuclei_Location_Center_Y" in cell_loc.columns -@pytest.mark.parametrize("cell_loc", [cell_loc1, cell_loc2]) +@pytest.mark.parametrize("cell_loc", cell_loc_l) def test_values(cell_loc): # verify that the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are correct assert cell_loc["Nuclei_Location_Center_X"].values[0] == [ From 69323b09f902c0bcff234a6fed347ae736ede960 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 26 Feb 2023 09:59:15 -0500 Subject: [PATCH 11/82] drop comments --- pycytominer/tests/test_cyto_utils/test_cell_locations.py | 8 -------- 1 file changed, 8 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 87938857..7728cc7b 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -29,14 +29,6 @@ single_cell_input=sqlite3.connect(single_cell_input), ) -# inputs are S3 paths - -# don't run this test if running on GitHub Actions -# because the S3 bucket is not public - -if "GITHUB_WORKFLOW" in os.environ: - pytest.skip("Skipping S3 test", allow_module_level=True) - example_s3_project_dir = "s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/{workspace_folder}/2021_08_23_Batch12/BR00126114/" From 7c36fb5a588c02633e3d147013a17e4e8739dbcd Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 26 Feb 2023 09:59:31 -0500 Subject: [PATCH 12/82] create test fixtures --- pycytominer/tests/test_cyto_utils/conftest.py | 135 ++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 pycytominer/tests/test_cyto_utils/conftest.py diff --git a/pycytominer/tests/test_cyto_utils/conftest.py b/pycytominer/tests/test_cyto_utils/conftest.py new file mode 100644 index 00000000..68bf7c78 --- /dev/null +++ b/pycytominer/tests/test_cyto_utils/conftest.py @@ -0,0 +1,135 @@ +""" +conftest.py for pytest +""" + +import os +import pandas as pd +import pathlib +import pytest +import sqlite3 +from pycytominer.cyto_utils.cell_locations import CellLocation + + +@pytest.fixture(name="data_dir_cell_locations") +def fixture_data_dir_cell_locations() -> str: + """ + Provide a data directory for cell_locations test data + """ + + return f"{pathlib.Path(__file__).parent}/test_data/cell_locations_example_data" + + +@pytest.fixture(name="metadata_input_file") +def fixture_metadata_input_file(data_dir_cell_locations: str) -> str: + """ + Provide a metadata input file for cell_locations test data + """ + return os.path.join(data_dir_cell_locations, "load_data_with_illum_subset.parquet") + + +@pytest.fixture(name="single_cell_input_file") +def fixture_single_cell_input_file(data_dir_cell_locations: str) -> str: + """ + Provide a single cell input file for cell_locations test data + """ + return os.path.join(data_dir_cell_locations, "BR00126114_subset.parquet") + + +@pytest.fixture(name="metadata_input_file_s3") +def 
fixture_metadata_input_file_s3(data_dir_cell_locations: str) -> str: + """ + Provide a metadata input file for cell_locations test data + """ + return "s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum_subset.parquet" + + +@pytest.fixture(name="single_cell_input_file_s3") +def fixture_single_cell_input_file_s3(data_dir_cell_locations: str) -> str: + """ + Provide a single cell input file for cell_locations test data + """ + return "s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/BR00126114_subset.sqlite" + + +@pytest.fixture(name="metadata_input_dataframe") +def fixture_metadata_input_dataframe(metadata_input_file: str) -> pd.DataFrame: + """ + Provide a metadata input file for cell_locations test data + """ + return pd.read_parquet(metadata_input_file) + + +@pytest.fixture(name="single_cell_input_connection") +def fixture_single_cell_input_connection( + single_cell_input_file: str, +) -> sqlite3.Connection: + """ + Provide a single cell input file for cell_locations test data + """ + return sqlite3.connect(single_cell_input_file) + + +@pytest.fixture(name="cell_loc_obj1") +def fixture_cell_loc_obj1( + metadata_input_file: str, + single_cell_input_file: str, +) -> CellLocation: + """ + Provide a CellLocation object with file inputs + """ + return CellLocation( + metadata_input=metadata_input_file, + single_cell_input=single_cell_input_file, + ) + + +@pytest.fixture(name="cell_loc_obj2") +def fixture_cell_loc_obj2( + metadata_input_dataframe: pd.DataFrame, + single_cell_input_connection: sqlite3.Connection, +) -> CellLocation: + """ + Provide a CellLocation object with in-memory inputs + """ + return CellLocation( + metadata_input=metadata_input_dataframe, + single_cell_input=single_cell_input_connection, + ) + + +@pytest.fixture(name="cell_loc_obj3") +def fixture_cell_loc_obj3( + metadata_input_file_s3: str, + single_cell_input_file_s3: str, +) -> CellLocation: + """ + Provide a CellLocation object with s3 inputs + """ + return CellLocation( + metadata_input=metadata_input_file_s3, + single_cell_input=single_cell_input_file_s3, + ) + + +@pytest.fixture(name="cell_loc1") +def fixture_cell_loc1(cell_loc_obj1: CellLocation) -> pd.DataFrame: + """ + Provide the output of running CellLocation.add_cell_location + """ + return cell_loc_obj1.add_cell_location() + + +@pytest.fixture(name="cell_loc2") +def fixture_cell_loc2(cell_loc_obj2: CellLocation) -> pd.DataFrame: + """ + Provide the output of running CellLocation.add_cell_location + """ + return cell_loc_obj2.add_cell_location() + + +@pytest.fixture(name="cell_loc3") +def fixture_cell_loc3(cell_loc_obj3: CellLocation) -> pd.DataFrame: + """ + Provide the output of running CellLocation.add_cell_location + """ + return cell_loc_obj3.add_cell_location() From 612fc9caca8c55a5eef22e5a21b298d69b9974b4 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 26 Feb 2023 10:56:41 -0500 Subject: [PATCH 13/82] checks --- pycytominer/cyto_utils/cell_locations.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 0ed47081..921eb483 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -2,6 +2,7 @@ Utility function to augment a metadata file with X,Y locations of cells in each image """ +import os import pandas as pd import sqlite3 @@ -136,6 +137,11 @@ def load_single_cell(self): """ 
if isinstance(self.single_cell_input, str): + # check if the single_cell file is a SQLite file + + if not self.single_cell_input.endswith(".sqlite"): + raise ValueError("single_cell file must be a SQLite file") + conn = sqlite3.connect(self.single_cell_input) else: conn = self.single_cell_input From 216212608ed06cc6ff0adf5f8380dc276f2560ae Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 26 Feb 2023 10:56:55 -0500 Subject: [PATCH 14/82] fix paths --- pycytominer/tests/test_cyto_utils/conftest.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/conftest.py b/pycytominer/tests/test_cyto_utils/conftest.py index 68bf7c78..5e272591 100644 --- a/pycytominer/tests/test_cyto_utils/conftest.py +++ b/pycytominer/tests/test_cyto_utils/conftest.py @@ -16,7 +16,9 @@ def fixture_data_dir_cell_locations() -> str: Provide a data directory for cell_locations test data """ - return f"{pathlib.Path(__file__).parent}/test_data/cell_locations_example_data" + return ( + f"{pathlib.Path(__file__).parent.parent}/test_data/cell_locations_example_data" + ) @pytest.fixture(name="metadata_input_file") @@ -32,7 +34,7 @@ def fixture_single_cell_input_file(data_dir_cell_locations: str) -> str: """ Provide a single cell input file for cell_locations test data """ - return os.path.join(data_dir_cell_locations, "BR00126114_subset.parquet") + return os.path.join(data_dir_cell_locations, "BR00126114_subset.sqlite") @pytest.fixture(name="metadata_input_file_s3") From d64eb11d81a24633c3a473d5a74ac7cae03caedc Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 26 Feb 2023 10:57:31 -0500 Subject: [PATCH 15/82] Use fixtures --- .../test_cyto_utils/test_cell_locations.py | 65 ++++--------------- 1 file changed, 11 insertions(+), 54 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 7728cc7b..62d70810 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -4,59 +4,12 @@ import pandas as pd from pycytominer.cyto_utils.cell_locations import CellLocation import pytest -import sqlite3 -# local files -example_project_dir = os.path.join( - os.path.dirname(__file__), "..", "test_data", "cell_locations_example_data" -) -metadata_input = os.path.join( - example_project_dir, "load_data_with_illum_subset.parquet" -) +@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) +def test_shape_and_columns(cell_loc, request): + cell_loc = request.getfixturevalue(cell_loc) -single_cell_input = os.path.join(example_project_dir, "BR00126114_subset.sqlite") - -# inputs are files -cell_loc_obj1 = CellLocation( - metadata_input=metadata_input, - single_cell_input=single_cell_input, -) - -# inputs are in-memory objects -cell_loc_obj2 = CellLocation( - metadata_input=pd.read_parquet(metadata_input), - single_cell_input=sqlite3.connect(single_cell_input), -) - - -example_s3_project_dir = "s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/{workspace_folder}/2021_08_23_Batch12/BR00126114/" - -metadata_input_s3 = os.path.join( - example_s3_project_dir.format(workspace_folder="load_data_csv"), - "load_data_with_illum_subset.parquet", -) - -single_cell_input_s3 = os.path.join(example_project_dir, "BR00126114_subset.sqlite") - -cell_loc_obj3 = CellLocation( - metadata_input=metadata_input_s3, - single_cell_input=single_cell_input_s3, -) - -cell_loc3 = cell_loc_obj3.add_cell_location() - - -# load 
the data -cell_loc1 = cell_loc_obj1.add_cell_location() - -cell_loc2 = cell_loc_obj2.add_cell_location() - -cell_loc_l = [cell_loc1, cell_loc2] - - -@pytest.mark.parametrize("cell_loc", cell_loc_l) -def test_shape_and_columns(cell_loc): # check the shape of the data assert cell_loc.shape == (2, 28) @@ -65,8 +18,10 @@ def test_shape_and_columns(cell_loc): assert "Nuclei_Location_Center_Y" in cell_loc.columns -@pytest.mark.parametrize("cell_loc", cell_loc_l) -def test_shape_and_columns(cell_loc): +@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) +def test_shape_and_columns(cell_loc, request): + cell_loc = request.getfixturevalue(cell_loc) + # check the shape of the data assert cell_loc.shape == (2, 28) @@ -75,8 +30,10 @@ def test_shape_and_columns(cell_loc): assert "Nuclei_Location_Center_Y" in cell_loc.columns -@pytest.mark.parametrize("cell_loc", cell_loc_l) -def test_values(cell_loc): +@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) +def test_values(cell_loc, request): + cell_loc = request.getfixturevalue(cell_loc) + # verify that the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are correct assert cell_loc["Nuclei_Location_Center_X"].values[0] == [ 943.512129380054, From 6e1afa898c7f840f17ea627733b88bd56e5efbaa Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 26 Feb 2023 11:05:34 -0500 Subject: [PATCH 16/82] yield --- pycytominer/tests/test_cyto_utils/conftest.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pycytominer/tests/test_cyto_utils/conftest.py b/pycytominer/tests/test_cyto_utils/conftest.py index 5e272591..e406aeec 100644 --- a/pycytominer/tests/test_cyto_utils/conftest.py +++ b/pycytominer/tests/test_cyto_utils/conftest.py @@ -68,7 +68,9 @@ def fixture_single_cell_input_connection( """ Provide a single cell input file for cell_locations test data """ - return sqlite3.connect(single_cell_input_file) + conn = sqlite3.connect(single_cell_input_file) + yield conn + conn.close() @pytest.fixture(name="cell_loc_obj1") From 3cf8bdd22317cd8c9f001e0e410bc8f1d319571f Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 26 Feb 2023 14:37:49 -0500 Subject: [PATCH 17/82] download file if needed --- pycytominer/cyto_utils/cell_locations.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 921eb483..dbdc58e6 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -5,6 +5,8 @@ import os import pandas as pd import sqlite3 +import boto3 +import tempfile class CellLocation: @@ -142,6 +144,25 @@ def load_single_cell(self): if not self.single_cell_input.endswith(".sqlite"): raise ValueError("single_cell file must be a SQLite file") + # if the single_cell file is an S3 path, download it to a temporary file + if self.single_cell_input.startswith("s3://"): + s3 = boto3.resource("s3") + + temp_dir = tempfile.mkdtemp() + + # get the bucket name and key from the S3 path + bucket_name = self.single_cell_input.split("/")[2] + key = "/".join(self.single_cell_input.split("/")[3:]) + + # get the file name from the key + file_name = key.split("/")[-1] + + # the the full path to the temporary file + self.single_cell_input = os.path.join(temp_dir, file_name) + + # save the single_cell file to the temporary directory + s3.Bucket(bucket_name).download_file(key, self.single_cell_input) + conn = sqlite3.connect(self.single_cell_input) else: conn = 
self.single_cell_input From f1b581a64ad0e5bd244ce687df60e64ad98eba01 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 26 Feb 2023 14:37:56 -0500 Subject: [PATCH 18/82] typo --- pycytominer/tests/test_cyto_utils/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/conftest.py b/pycytominer/tests/test_cyto_utils/conftest.py index e406aeec..26a20a77 100644 --- a/pycytominer/tests/test_cyto_utils/conftest.py +++ b/pycytominer/tests/test_cyto_utils/conftest.py @@ -38,7 +38,7 @@ def fixture_single_cell_input_file(data_dir_cell_locations: str) -> str: @pytest.fixture(name="metadata_input_file_s3") -def fixture_metadata_input_file_s3(data_dir_cell_locations: str) -> str: +def fixture_metadata_input_file_s3() -> str: """ Provide a metadata input file for cell_locations test data """ @@ -46,7 +46,7 @@ def fixture_metadata_input_file_s3(data_dir_cell_locations: str) -> str: @pytest.fixture(name="single_cell_input_file_s3") -def fixture_single_cell_input_file_s3(data_dir_cell_locations: str) -> str: +def fixture_single_cell_input_file_s3() -> str: """ Provide a single cell input file for cell_locations test data """ From 36c9dca7c8b274dcf04996c5494a38ddd01a3685 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 26 Feb 2023 16:37:41 -0500 Subject: [PATCH 19/82] Update req --- requirements.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/requirements.txt b/requirements.txt index 5626a1e2..7c56f3f7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,3 +5,6 @@ scikit-learn>=0.21.2 sqlalchemy>=1.3.6,<2 pyarrow>=8.0.0 pytest>=5.0.1 +fsspec>=2023.1.0 +s3fs>=0.4.2 +boto3>=1.26.79 From 2df741cd63193661c9d8e676efb53ec77781bf9a Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Feb 2023 06:40:18 -0500 Subject: [PATCH 20/82] use boto3 session --- pycytominer/cyto_utils/cell_locations.py | 31 ++++++++++++++++++------ 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index dbdc58e6..d85e5489 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -7,6 +7,7 @@ import sqlite3 import boto3 import tempfile +import shutil class CellLocation: @@ -146,10 +147,6 @@ def load_single_cell(self): # if the single_cell file is an S3 path, download it to a temporary file if self.single_cell_input.startswith("s3://"): - s3 = boto3.resource("s3") - - temp_dir = tempfile.mkdtemp() - # get the bucket name and key from the S3 path bucket_name = self.single_cell_input.split("/")[2] key = "/".join(self.single_cell_input.split("/")[3:]) @@ -157,13 +154,27 @@ def load_single_cell(self): # get the file name from the key file_name = key.split("/")[-1] - # the the full path to the temporary file - self.single_cell_input = os.path.join(temp_dir, file_name) + # create a temporary directory + temp_dir = tempfile.mkdtemp() + + # create a temporary file + temp_single_cell_input = os.path.join(temp_dir, file_name) + + # create a boto3 session + s3_session = boto3.session.Session() + + # create a boto3 client + s3_client = s3_session.client("s3") # save the single_cell file to the temporary directory - s3.Bucket(bucket_name).download_file(key, self.single_cell_input) + s3_client.download_file(bucket_name, key, temp_single_cell_input) + + # connect to the single_cell file + conn = sqlite3.connect(temp_single_cell_input) - conn = sqlite3.connect(self.single_cell_input) + else: + # connect to the single_cell 
file + conn = sqlite3.connect(self.single_cell_input) else: conn = self.single_cell_input @@ -228,6 +239,10 @@ def load_single_cell(self): conn.close() + # if the single_cell file was downloaded from S3, delete the temporary directory + if "temp_dir" in locals(): + shutil.rmtree(temp_dir) + # Merge the Image and Nuclei tables merged_df = pd.merge(image_df, nuclei_df, on=self.image_column, how="inner") From adf635ab36eac86abb4e362c348948d052add49c Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Feb 2023 06:42:19 -0500 Subject: [PATCH 21/82] Test s3 locations --- pycytominer/tests/test_cyto_utils/test_cell_locations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 62d70810..b04d99e3 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -6,7 +6,7 @@ import pytest -@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) +@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) def test_shape_and_columns(cell_loc, request): cell_loc = request.getfixturevalue(cell_loc) @@ -18,7 +18,7 @@ def test_shape_and_columns(cell_loc, request): assert "Nuclei_Location_Center_Y" in cell_loc.columns -@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) +@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) def test_shape_and_columns(cell_loc, request): cell_loc = request.getfixturevalue(cell_loc) @@ -30,7 +30,7 @@ def test_shape_and_columns(cell_loc, request): assert "Nuclei_Location_Center_Y" in cell_loc.columns -@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) +@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) def test_values(cell_loc, request): cell_loc = request.getfixturevalue(cell_loc) From bc2f01dfa0252bba5f3aa04cd8488f8dc7fe5de9 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Feb 2023 06:50:58 -0500 Subject: [PATCH 22/82] skip s3 test --- pycytominer/tests/test_cyto_utils/test_cell_locations.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index b04d99e3..58fd549b 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -6,7 +6,8 @@ import pytest -@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) +# @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) +@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) def test_shape_and_columns(cell_loc, request): cell_loc = request.getfixturevalue(cell_loc) @@ -18,7 +19,8 @@ def test_shape_and_columns(cell_loc, request): assert "Nuclei_Location_Center_Y" in cell_loc.columns -@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) +# @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) +@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) def test_shape_and_columns(cell_loc, request): cell_loc = request.getfixturevalue(cell_loc) @@ -30,7 +32,8 @@ def test_shape_and_columns(cell_loc, request): assert "Nuclei_Location_Center_Y" in cell_loc.columns -@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) +# @pytest.mark.parametrize("cell_loc", 
["cell_loc1", "cell_loc2", "cell_loc3"]) +@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) def test_values(cell_loc, request): cell_loc = request.getfixturevalue(cell_loc) From 36de960203c5266cd1a661418462eb0ed45dbdd9 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Feb 2023 18:30:10 -0500 Subject: [PATCH 23/82] dtypes --- pycytominer/cyto_utils/cell_locations.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index d85e5489..5ca8043e 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -113,11 +113,12 @@ def load_metadata(self): raise ValueError("Metadata file must be a CSV or a Parquet file") # load the metadata file into a Pandas DataFrame - if self.metadata_input.endswith(".csv"): - df = pd.read_csv(self.metadata_input) + df = pd.read_csv(self.metadata_input, dtype=str) else: df = pd.read_parquet(self.metadata_input) + # cast all columns to string + df = df.astype(str) else: df = self.metadata_input @@ -253,6 +254,10 @@ def load_single_cell(self): # Cast the object column to int merged_df[self.object_column] = merged_df[self.object_column].astype(int) + # Cast the image index columns to str + for col in self.image_index: + merged_df[col] = merged_df[col].astype(str) + # Group and nest the X,Y locations of all cells in each image merged_df = ( merged_df.groupby(self.image_index) From 61855789e4d870312bc7636b2cd45a4887af5963 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Feb 2023 18:30:49 -0500 Subject: [PATCH 24/82] Add mike's snipper --- pycytominer/cyto_utils/cell_locations.py | 29 ++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 5ca8043e..1ff39ccb 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -8,6 +8,7 @@ import boto3 import tempfile import shutil +import collections class CellLocation: @@ -131,6 +132,34 @@ def load_metadata(self): return df + def _convert_to_per_row_dict(self, df): + output_df_list = collections.defaultdict(list) + for (plate, well, site, image_number), cell_df in df.groupby( + ["Metadata_Plate", "Metadata_Well", "Metadata_Site", "ImageNumber"] + ): + output_df_list["Metadata_Plate"].append(plate) + output_df_list["Metadata_Well"].append(well) + output_df_list["Metadata_Site"].append(site) + output_df_list["ImageNumber"].append(image_number) + + cell_dict = cell_df.to_dict(orient="list") + row_cell_dicts = [] + for object_number, location_center_x, location_center_y in zip( + cell_dict["ObjectNumber"], + cell_dict["Location_Center_X"], + cell_dict["Location_Center_Y"], + ): + row_cell_dicts.append( + { + "ObjectNumber": object_number, + "Location_Center_X": location_center_x, + "Location_Center_Y": location_center_y, + } + ) + output_df_list["CellCenters"].append(row_cell_dicts) + + return pd.DataFrame(output_df_list) + def load_single_cell(self): """Load the required columns from the `Image` and `Nuclei` tables in the single_cell file or sqlite3.Connection object into a Pandas DataFrame From 6070491aaf6387fa217da0f1e44f7324d44940d4 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 1 Mar 2023 19:19:53 -0500 Subject: [PATCH 25/82] use mike's format --- pycytominer/cyto_utils/cell_locations.py | 35 ++++++++++++++++-------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git 
a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 1ff39ccb..40054871 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -132,6 +132,17 @@ def load_metadata(self): return df + def _download_s3(self, uri): + """ + Download a file from S3, save it to a temporary directory, and return the path to the file + """ + s3 = boto3.resource("s3") + bucket, key = uri.replace("s3://", "").split("/", 1) + tmp_dir = tempfile.mkdtemp() + tmp_file = os.path.join(tmp_dir, os.path.basename(key)) + s3.Bucket(bucket).download_file(key, tmp_file) + return tmp_file + def _convert_to_per_row_dict(self, df): output_df_list = collections.defaultdict(list) for (plate, well, site, image_number), cell_df in df.groupby( @@ -146,14 +157,14 @@ def _convert_to_per_row_dict(self, df): row_cell_dicts = [] for object_number, location_center_x, location_center_y in zip( cell_dict["ObjectNumber"], - cell_dict["Location_Center_X"], - cell_dict["Location_Center_Y"], + cell_dict["Nuclei_Location_Center_X"], + cell_dict["Nuclei_Location_Center_Y"], ): row_cell_dicts.append( { "ObjectNumber": object_number, - "Location_Center_X": location_center_x, - "Location_Center_Y": location_center_y, + "Nuclei_Location_Center_X": location_center_x, + "Nuclei_Location_Center_Y": location_center_y, } ) output_df_list["CellCenters"].append(row_cell_dicts) @@ -288,13 +299,15 @@ def load_single_cell(self): merged_df[col] = merged_df[col].astype(str) # Group and nest the X,Y locations of all cells in each image - merged_df = ( - merged_df.groupby(self.image_index) - .agg( - {self.object_column: list, self.cell_x_loc: list, self.cell_y_loc: list} - ) - .reset_index() - ) + # merged_df = ( + # merged_df.groupby(self.image_index) + # .agg( + # {self.object_column: list, self.cell_x_loc: list, self.cell_y_loc: list} + # ) + # .reset_index() + # ) + + merged_df = self._convert_to_per_row_dict(merged_df) return merged_df From 55a4f748c362ad1a2e2d248808e780cd2df017b3 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 1 Mar 2023 19:29:12 -0500 Subject: [PATCH 26/82] tests pass --- .../test_cyto_utils/test_cell_locations.py | 33 +++++++++---------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 58fd549b..570b1190 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -12,24 +12,14 @@ def test_shape_and_columns(cell_loc, request): cell_loc = request.getfixturevalue(cell_loc) # check the shape of the data - assert cell_loc.shape == (2, 28) + assert cell_loc.shape == (2, 27) # verify that the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are present - assert "Nuclei_Location_Center_X" in cell_loc.columns - assert "Nuclei_Location_Center_Y" in cell_loc.columns + # assert "Nuclei_Location_Center_X" in cell_loc.columns + # assert "Nuclei_Location_Center_Y" in cell_loc.columns - -# @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) -@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) -def test_shape_and_columns(cell_loc, request): - cell_loc = request.getfixturevalue(cell_loc) - - # check the shape of the data - assert cell_loc.shape == (2, 28) - - # verify that the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are present - assert "Nuclei_Location_Center_X" in 
cell_loc.columns - assert "Nuclei_Location_Center_Y" in cell_loc.columns + assert "Nuclei_Location_Center_X" in cell_loc["CellCenters"][0][0].keys() + assert "Nuclei_Location_Center_Y" in cell_loc["CellCenters"][0][0].keys() # @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) @@ -37,8 +27,11 @@ def test_shape_and_columns(cell_loc, request): def test_values(cell_loc, request): cell_loc = request.getfixturevalue(cell_loc) - # verify that the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are correct - assert cell_loc["Nuclei_Location_Center_X"].values[0] == [ + # get the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns + observed_x = [x["Nuclei_Location_Center_X"] for x in cell_loc.CellCenters[0]] + observed_y = [x["Nuclei_Location_Center_Y"] for x in cell_loc.CellCenters[0]] + + expected_x = [ 943.512129380054, 65.5980176211454, 790.798319327731, @@ -51,7 +44,7 @@ def test_values(cell_loc, request): 325.727799227799, ] - assert cell_loc["Nuclei_Location_Center_Y"].values[0] == [ + expected_y = [ 182.789757412399, 294.24449339207, 338.886554621849, @@ -63,3 +56,7 @@ def test_values(cell_loc, request): 474.240161453078, 497.608108108108, ] + + # verify that the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are correct + assert observed_x == expected_x + assert observed_y == expected_y From 471614341223c5bf80d2c8d3f842d2498d384831 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 2 Mar 2023 23:28:48 -0500 Subject: [PATCH 27/82] use method --- pycytominer/cyto_utils/cell_locations.py | 28 ++++-------------------- 1 file changed, 4 insertions(+), 24 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 40054871..25c1137e 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -188,27 +188,7 @@ def load_single_cell(self): # if the single_cell file is an S3 path, download it to a temporary file if self.single_cell_input.startswith("s3://"): - # get the bucket name and key from the S3 path - bucket_name = self.single_cell_input.split("/")[2] - key = "/".join(self.single_cell_input.split("/")[3:]) - - # get the file name from the key - file_name = key.split("/")[-1] - - # create a temporary directory - temp_dir = tempfile.mkdtemp() - - # create a temporary file - temp_single_cell_input = os.path.join(temp_dir, file_name) - - # create a boto3 session - s3_session = boto3.session.Session() - - # create a boto3 client - s3_client = s3_session.client("s3") - - # save the single_cell file to the temporary directory - s3_client.download_file(bucket_name, key, temp_single_cell_input) + temp_single_cell_input = self._download_s3(self.single_cell_input) # connect to the single_cell file conn = sqlite3.connect(temp_single_cell_input) @@ -280,9 +260,9 @@ def load_single_cell(self): conn.close() - # if the single_cell file was downloaded from S3, delete the temporary directory - if "temp_dir" in locals(): - shutil.rmtree(temp_dir) + # if the single_cell file was downloaded from S3, delete the temporary file + if "temp_single_cell_input" in locals(): + os.remove(temp_single_cell_input) # Merge the Image and Nuclei tables merged_df = pd.merge(image_df, nuclei_df, on=self.image_column, how="inner") From a77364ff91be42177401ad1b8907d8278b68f404 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Fri, 3 Mar 2023 00:38:56 -0500 Subject: [PATCH 28/82] add alternatives --- 
pycytominer/cyto_utils/cell_locations.py | 49 +++++++++++++++++++----- 1 file changed, 40 insertions(+), 9 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 25c1137e..d400ee19 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -143,23 +143,58 @@ def _download_s3(self, uri): s3.Bucket(bucket).download_file(key, tmp_file) return tmp_file + def _convert_to_per_row_dict_old(self, df): + merged_df = ( + merged_df.groupby(self.image_index) + .agg( + {self.object_column: list, self.cell_x_loc: list, self.cell_y_loc: list} + ) + .reset_index() + ) + + def _convert_to_per_row_dict_list(self, df): + df = ( + df.groupby(self.image_index + [self.image_column]) + .apply( + lambda x: pd.Series( + { + "CellCenters": x[ + [self.cell_x_loc, self.cell_y_loc, self.object_column] + ] + .values.astype(int) + .tolist() + } + ) + ) + .reset_index() + ) + def _convert_to_per_row_dict(self, df): + # define a dictionary to store the output output_df_list = collections.defaultdict(list) + + # iterate over each group of cells in the merged DataFrame for (plate, well, site, image_number), cell_df in df.groupby( ["Metadata_Plate", "Metadata_Well", "Metadata_Site", "ImageNumber"] ): + # add the image index columns to the output dictionary output_df_list["Metadata_Plate"].append(plate) output_df_list["Metadata_Well"].append(well) output_df_list["Metadata_Site"].append(site) + # add the image number to the output dictionary output_df_list["ImageNumber"].append(image_number) + # convert the cell DataFrame to a dictionary of lists cell_dict = cell_df.to_dict(orient="list") + + # iterate over each cell in the cell DataFrame and add it to the output dictionary row_cell_dicts = [] for object_number, location_center_x, location_center_y in zip( cell_dict["ObjectNumber"], cell_dict["Nuclei_Location_Center_X"], cell_dict["Nuclei_Location_Center_Y"], ): + # add the cell information to the output dictionary row_cell_dicts.append( { "ObjectNumber": object_number, @@ -167,8 +202,13 @@ def _convert_to_per_row_dict(self, df): "Nuclei_Location_Center_Y": location_center_y, } ) + output_df_list["CellCenters"].append(row_cell_dicts) + # The type of the CellCenters column is as follows: + # ListType(list>)] + + # convert the output dictionary to a Pandas DataFrame return pd.DataFrame(output_df_list) def load_single_cell(self): @@ -278,15 +318,6 @@ def load_single_cell(self): for col in self.image_index: merged_df[col] = merged_df[col].astype(str) - # Group and nest the X,Y locations of all cells in each image - # merged_df = ( - # merged_df.groupby(self.image_index) - # .agg( - # {self.object_column: list, self.cell_x_loc: list, self.cell_y_loc: list} - # ) - # .reset_index() - # ) - merged_df = self._convert_to_per_row_dict(merged_df) return merged_df From 7fdd393fdefb4f5651043324d8fd97a8b5fdbe6e Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 4 Mar 2023 08:34:22 -0500 Subject: [PATCH 29/82] refactor mike's code --- pycytominer/cyto_utils/cell_locations.py | 67 +++++++----------------- 1 file changed, 18 insertions(+), 49 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index d400ee19..8944359a 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -143,71 +143,40 @@ def _download_s3(self, uri): s3.Bucket(bucket).download_file(key, tmp_file) return tmp_file - def _convert_to_per_row_dict_old(self, df): - 
merged_df = ( - merged_df.groupby(self.image_index) - .agg( - {self.object_column: list, self.cell_x_loc: list, self.cell_y_loc: list} - ) - .reset_index() - ) - - def _convert_to_per_row_dict_list(self, df): - df = ( - df.groupby(self.image_index + [self.image_column]) - .apply( - lambda x: pd.Series( - { - "CellCenters": x[ - [self.cell_x_loc, self.cell_y_loc, self.object_column] - ] - .values.astype(int) - .tolist() - } - ) - ) - .reset_index() - ) - def _convert_to_per_row_dict(self, df): # define a dictionary to store the output output_df_list = collections.defaultdict(list) # iterate over each group of cells in the merged DataFrame - for (plate, well, site, image_number), cell_df in df.groupby( - ["Metadata_Plate", "Metadata_Well", "Metadata_Site", "ImageNumber"] - ): - # add the image index columns to the output dictionary - output_df_list["Metadata_Plate"].append(plate) - output_df_list["Metadata_Well"].append(well) - output_df_list["Metadata_Site"].append(site) - # add the image number to the output dictionary - output_df_list["ImageNumber"].append(image_number) - - # convert the cell DataFrame to a dictionary of lists + group_cols = self.image_index + [self.image_column] + + for group_values, cell_df in df.groupby(group_cols): + # add the image-level information to the output dictionary + for key, value in zip(group_cols, group_values): + output_df_list[key].append(value) + + # convert the cell DataFrame to a dictionary cell_dict = cell_df.to_dict(orient="list") - # iterate over each cell in the cell DataFrame and add it to the output dictionary + # iterate over each cell in the cell DataFrame row_cell_dicts = [] - for object_number, location_center_x, location_center_y in zip( - cell_dict["ObjectNumber"], - cell_dict["Nuclei_Location_Center_X"], - cell_dict["Nuclei_Location_Center_Y"], + for object_column, cell_x_loc, cell_y_loc in zip( + cell_dict[self.object_column], + cell_dict[self.cell_x_loc], + cell_dict[self.cell_y_loc], ): - # add the cell information to the output dictionary + # add the cell information to a dictionary row_cell_dicts.append( { - "ObjectNumber": object_number, - "Nuclei_Location_Center_X": location_center_x, - "Nuclei_Location_Center_Y": location_center_y, + self.object_column: object_column, + self.cell_x_loc: cell_x_loc, + self.cell_y_loc: cell_y_loc, } ) + # add the cell-level information to the output dictionary output_df_list["CellCenters"].append(row_cell_dicts) - # The type of the CellCenters column is as follows: - # ListType(list>)] - # convert the output dictionary to a Pandas DataFrame return pd.DataFrame(output_df_list) From 5595abcce5a9c28b0fe080dcec485cbe46f06203 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 4 Mar 2023 08:35:36 -0500 Subject: [PATCH 30/82] cleanup --- pycytominer/tests/test_cyto_utils/test_cell_locations.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 570b1190..378e542c 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -15,9 +15,6 @@ def test_shape_and_columns(cell_loc, request): assert cell_loc.shape == (2, 27) # verify that the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are present - # assert "Nuclei_Location_Center_X" in cell_loc.columns - # assert "Nuclei_Location_Center_Y" in cell_loc.columns - assert "Nuclei_Location_Center_X" in cell_loc["CellCenters"][0][0].keys() assert 
"Nuclei_Location_Center_Y" in cell_loc["CellCenters"][0][0].keys() From 8ad1ccfcbd047798f1f008af41a5f4893d28b7bd Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 4 Mar 2023 10:21:47 -0500 Subject: [PATCH 31/82] better tests --- .../test_cyto_utils/test_cell_locations.py | 64 +++++++++++-------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 378e542c..27922087 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -4,15 +4,19 @@ import pandas as pd from pycytominer.cyto_utils.cell_locations import CellLocation import pytest +import sqlite3 # @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) -def test_shape_and_columns(cell_loc, request): +def test_shape_and_columns(cell_loc, metadata_input_dataframe, request): cell_loc = request.getfixturevalue(cell_loc) # check the shape of the data - assert cell_loc.shape == (2, 27) + assert cell_loc.shape == ( + metadata_input_dataframe.shape[0], + metadata_input_dataframe.shape[1] + 2, + ) # verify that the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are present assert "Nuclei_Location_Center_X" in cell_loc["CellCenters"][0][0].keys() @@ -21,38 +25,42 @@ def test_shape_and_columns(cell_loc, request): # @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) -def test_values(cell_loc, request): +def test_values(cell_loc, metadata_input_dataframe, single_cell_input_file, request): cell_loc = request.getfixturevalue(cell_loc) + # if we restrict the columns of cell_loc to the ones in metadata_input_dataframe, we should get the same dataframe + assert ( + cell_loc[metadata_input_dataframe.columns] + .reset_index(drop=True) + .equals(metadata_input_dataframe.reset_index(drop=True)) + ) + + nuclei_query = f"SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;" + + conn = sqlite3.connect(single_cell_input_file) + + nuclei_df = pd.read_sql_query(nuclei_query, conn) + + conn.close() + # get the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns + # for the rows in cell_loc that have ImageNumber == 1 + + cell_loc_row1 = cell_loc[cell_loc["ImageNumber"] == "1"] + + # get the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns + # for the rows in nuclei_df that have ImageNumber == 1 + + nuclei_df_row1 = nuclei_df[nuclei_df["ImageNumber"] == "1"] + observed_x = [x["Nuclei_Location_Center_X"] for x in cell_loc.CellCenters[0]] observed_y = [x["Nuclei_Location_Center_Y"] for x in cell_loc.CellCenters[0]] - expected_x = [ - 943.512129380054, - 65.5980176211454, - 790.798319327731, - 798.1744, - 657.246344206974, - 778.97604035309, - 322.763649425287, - 718.11819235226, - 109.785065590313, - 325.727799227799, - ] - - expected_y = [ - 182.789757412399, - 294.24449339207, - 338.886554621849, - 387.1376, - 402.2272215973, - 406.378310214376, - 413.334051724138, - 469.506373117034, - 474.240161453078, - 497.608108108108, - ] + expected_x = nuclei_df_row1["Nuclei_Location_Center_X"].tolist() + expected_x = [float(x) for x in expected_x] + + expected_y = nuclei_df_row1["Nuclei_Location_Center_Y"].tolist() + expected_y = [float(x) for x in expected_y] # verify 
that the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns are correct assert observed_x == expected_x From 4016815787344dccd672d66359b750ce8f8a10ac Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 5 Mar 2023 18:26:55 -0500 Subject: [PATCH 32/82] cleanup --- pycytominer/tests/test_cyto_utils/test_cell_locations.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 27922087..05345dbe 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -43,11 +43,6 @@ def test_values(cell_loc, metadata_input_dataframe, single_cell_input_file, requ conn.close() - # get the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns - # for the rows in cell_loc that have ImageNumber == 1 - - cell_loc_row1 = cell_loc[cell_loc["ImageNumber"] == "1"] - # get the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns # for the rows in nuclei_df that have ImageNumber == 1 From 492f29b5280d1fe7743e02d6a248dd894ad24383 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 5 Mar 2023 18:31:07 -0500 Subject: [PATCH 33/82] overwrite is an option, other cleanup --- pycytominer/cyto_utils/cell_locations.py | 133 ++++++++++++++++++----- 1 file changed, 106 insertions(+), 27 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 8944359a..a3c72512 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -6,8 +6,8 @@ import pandas as pd import sqlite3 import boto3 +import botocore import tempfile -import shutil import collections @@ -64,12 +64,6 @@ class CellLocation: Methods ------- - load_metadata() - Load the metadata file into a Pandas DataFrame - - load_single_cell() - Load the required columns from the `Image` and `Nuclei` tables in the single_cell file into a Pandas DataFrame - add_cell_location() Augment the metadata file and optionally save it to a file @@ -80,22 +74,98 @@ def __init__( metadata_input: str or pd.DataFrame, single_cell_input: str or sqlite3.Connection, augmented_metadata_output: str = None, + overwrite: bool = False, image_column: str = "ImageNumber", object_column: str = "ObjectNumber", image_index=["Metadata_Plate", "Metadata_Well", "Metadata_Site"], cell_x_loc: str = "Nuclei_Location_Center_X", cell_y_loc: str = "Nuclei_Location_Center_Y", ): - self.metadata_input = metadata_input - self.augmented_metadata_output = augmented_metadata_output - self.single_cell_input = single_cell_input + self.metadata_input = self._expanduser(metadata_input) + self.augmented_metadata_output = self._expanduser(augmented_metadata_output) + self.single_cell_input = self._expanduser(single_cell_input) + self.overwrite = overwrite self.image_column = image_column self.object_column = object_column self.image_index = image_index self.cell_x_loc = cell_x_loc self.cell_y_loc = cell_y_loc - def load_metadata(self): + def _expanduser(self, obj): + """Expand the user home directory in a path""" + if obj is not None and isinstance(obj, str) and not obj.startswith("s3://"): + return os.path.expanduser(obj) + else: + return obj + + def _parse_s3_path(self, s3_path): + """Parse an S3 path into a bucket and key + + Parameters + ---------- + s3_path : str + The S3 path + + Returns + ------- + str + The bucket + str + The key + """ + + s3_path 
= s3_path.replace("s3://", "") + + bucket = s3_path.split("/")[0] + + key = "/".join(s3_path.split("/")[1:]) + + return bucket, key + + def _s3_file_exists(self, s3_path): + """Check if a file exists on S3 + + Parameters + ---------- + s3_path : str + The path to the file on S3 + + Returns + ------- + bool + True if the file exists on S3, False otherwise + """ + + s3 = boto3.resource("s3") + + bucket, key = self._parse_s3_path(s3_path) + + try: + s3.Object(bucket, key).load() + except botocore.exceptions.ClientError as e: + if e.response["Error"]["Code"] == "404": + return False + else: + raise + else: + return True + + def _download_s3(self, uri): + """ + Download a file from S3, save it to a temporary directory, and return the path to the file + """ + s3 = boto3.resource("s3") + + bucket, key = self._parse_s3_path(uri) + + tmp_dir = tempfile.mkdtemp() + tmp_file = os.path.join(tmp_dir, os.path.basename(key)) + + s3.Bucket(bucket).download_file(key, tmp_file) + + return tmp_file + + def _load_metadata(self): """Load the metadata into a Pandas DataFrame Returns @@ -132,17 +202,6 @@ def load_metadata(self): return df - def _download_s3(self, uri): - """ - Download a file from S3, save it to a temporary directory, and return the path to the file - """ - s3 = boto3.resource("s3") - bucket, key = uri.replace("s3://", "").split("/", 1) - tmp_dir = tempfile.mkdtemp() - tmp_file = os.path.join(tmp_dir, os.path.basename(key)) - s3.Bucket(bucket).download_file(key, tmp_file) - return tmp_file - def _convert_to_per_row_dict(self, df): # define a dictionary to store the output output_df_list = collections.defaultdict(list) @@ -180,7 +239,7 @@ def _convert_to_per_row_dict(self, df): # convert the output dictionary to a Pandas DataFrame return pd.DataFrame(output_df_list) - def load_single_cell(self): + def _load_single_cell(self): """Load the required columns from the `Image` and `Nuclei` tables in the single_cell file or sqlite3.Connection object into a Pandas DataFrame Returns @@ -298,11 +357,30 @@ def add_cell_location(self): Returns ------- Pandas DataFrame - The Parquet file with the X,Y locations of all cells packed into a single column + Either a data frame or the path to a Parquet file with the X,Y locations of all cells packed into a single column """ + + # If self.augmented_metadata_output is not None and it is a str and the file already exists, there is nothing to do + if ( + self.augmented_metadata_output is not None + and isinstance(self.augmented_metadata_output, str) + and self.overwrite is False + and ( + ( + self.augmented_metadata_output.startswith("s3://") + and self._s3_file_exists(self.augmented_metadata_output) + ) + or ( + not self.augmented_metadata_output.startswith("s3://") + and os.path.exists(self.augmented_metadata_output) + ) + ) + ): + return self.augmented_metadata_output + # Load the data - metadata_df = self.load_metadata() - single_cell_df = self.load_single_cell() + metadata_df = self._load_metadata() + single_cell_df = self._load_single_cell() # Merge the data and single_cell tables augmented_metadata_df = pd.merge( @@ -312,10 +390,11 @@ def add_cell_location(self): how="left", ) - # If self.augmented_metadata_output) is not None, save the data + # If self.augmented_metadata_output is not None, save the data if self.augmented_metadata_output is not None: augmented_metadata_df.to_parquet( self.augmented_metadata_output, index=False ) + return self.augmented_metadata_output else: return augmented_metadata_df From ea21a45a58f990cc1c46bea948afcdc9f451434d Mon Sep 17 
00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Mar 2023 14:18:06 -0400 Subject: [PATCH 34/82] Update pycytominer/cyto_utils/cell_locations.py Co-authored-by: Gregory Way --- pycytominer/cyto_utils/cell_locations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index a3c72512..364854ad 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -77,7 +77,7 @@ def __init__( overwrite: bool = False, image_column: str = "ImageNumber", object_column: str = "ObjectNumber", - image_index=["Metadata_Plate", "Metadata_Well", "Metadata_Site"], + image_index: List = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"], cell_x_loc: str = "Nuclei_Location_Center_X", cell_y_loc: str = "Nuclei_Location_Center_Y", ): From 487ef2d1dd9461fc3a3238bce9a94b708e6f5938 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Mar 2023 15:19:39 -0400 Subject: [PATCH 35/82] add docs --- README.md | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index f8a4132b..f5aa3a05 100644 --- a/README.md +++ b/README.md @@ -45,11 +45,12 @@ Since the project is actively being developed, with new features added regularly # Example: pip install git+git://github.com/cytomining/pycytominer@2aa8638d7e505ab510f1d5282098dd59bb2cb470 ``` + ### CSV collation If running your images on a cluster, unless you have a MySQL or similar large database set up then you will likely end up with lots of different folders from the different cluster runs (often one per well or one per site), each one containing an `Image.csv`, `Nuclei.csv`, etc. In order to look at full plates, therefore, we first need to collate all of these CSVs into a single file (currently SQLite) per plate. -We currently do this with a library called [cytominer-database](https://github.com/cytomining/cytominer-database). +We currently do this with a library called [cytominer-database](https://github.com/cytomining/cytominer-database). If you want to perform this data collation inside pycytominer using the `cyto_utils` function `collate` (and/or you want to be able to run the tests and have them all pass!), you will need `cytominer-database==0.3.4`; this will change your installation commands slightly: @@ -62,6 +63,21 @@ pip install "pycytominer[collate] @ git+git://github.com/cytomining/pycytominer@ If using `pycytominer` in a conda environment, in order to run `collate.py`, you will also want to make sure to add `cytominer-database=0.3.4` to your list of dependencies. +## Creating a cell locations lookup table + +The `CellLocation` class, designed for use with pycytominer, offers a convenient way to augment a [LoadData](https://cellprofiler-manual.s3.amazonaws.com/CPmanual/LoadData.html) file with X,Y locations of cells in each image. +The locations information is obtained from a single cell SQLite file, which contains `Nuclei` and `Image` tables with cell and image-level readouts, respectively. +Each row of a LoadData file represents a single image, indexed by multiple columns (e.g., `Metadata_Plate`, `Metadata_Well`, `Metadata_Site`). +The `CellLocation` class uses the `Image` table to identify the image-level metadata for each image in the LoadData file, and then uses the `Nuclei` table to identify the X,Y locations of cells in each image. +The X,Y locations are then added to the LoadData file as new columns, and the resulting augmented LoadData file is saved to disk. 
+ +To use this functionality, you will need to modify your installation command, similar to above: + +```bash +# Example for general case commit: +pip install "pycytominer[cell_locations] @ git+git://github.com/cytomining/pycytominer" +``` + ## Usage Using pycytominer is simple and fun. From 599a600970c686cb41771652d6cb142353b500aa Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Mar 2023 15:20:03 -0400 Subject: [PATCH 36/82] Add deps --- setup.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8d9b44c2..bb8bd969 100644 --- a/setup.py +++ b/setup.py @@ -23,7 +23,10 @@ packages=find_packages(), license=ABOUT["__license__"], install_requires=REQUIRED_PKGS, - extras_require={"collate": ["cytominer-database==0.3.4"]}, + extras_require={ + "collate": ["cytominer-database==0.3.4"], + "cell_locations": ["fsspec>=2023.1.0", "s3fs>=0.4.2", "boto3>=1.26.79"], + }, python_requires=">=3.4", include_package_data=True, ) From 3549060be2b12b60c5a75f09f6fccc785a1f9918 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Mar 2023 15:20:41 -0400 Subject: [PATCH 37/82] Move to setup --- requirements.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/requirements.txt b/requirements.txt index 7c56f3f7..5626a1e2 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,6 +5,3 @@ scikit-learn>=0.21.2 sqlalchemy>=1.3.6,<2 pyarrow>=8.0.0 pytest>=5.0.1 -fsspec>=2023.1.0 -s3fs>=0.4.2 -boto3>=1.26.79 From a628fe5cea157767fccf9bacb775b8bfb4e208be Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Mar 2023 15:22:01 -0400 Subject: [PATCH 38/82] add fire to deps --- setup.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index bb8bd969..9a21905b 100644 --- a/setup.py +++ b/setup.py @@ -25,7 +25,12 @@ install_requires=REQUIRED_PKGS, extras_require={ "collate": ["cytominer-database==0.3.4"], - "cell_locations": ["fsspec>=2023.1.0", "s3fs>=0.4.2", "boto3>=1.26.79"], + "cell_locations": [ + "fsspec>=2023.1.0", + "s3fs>=0.4.2", + "boto3>=1.26.79", + "fire>=0.5.0", + ], }, python_requires=">=3.4", include_package_data=True, From a6cd67dfecfb60fb6b13eaf459c61a545c19ea71 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Mar 2023 18:22:16 -0400 Subject: [PATCH 39/82] use pathlib --- pycytominer/cyto_utils/cell_locations.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 364854ad..6372eb82 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -2,7 +2,7 @@ Utility function to augment a metadata file with X,Y locations of cells in each image """ -import os +import pathlib import pandas as pd import sqlite3 import boto3 @@ -77,7 +77,7 @@ def __init__( overwrite: bool = False, image_column: str = "ImageNumber", object_column: str = "ObjectNumber", - image_index: List = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"], + image_index: list = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"], cell_x_loc: str = "Nuclei_Location_Center_X", cell_y_loc: str = "Nuclei_Location_Center_Y", ): @@ -94,7 +94,8 @@ def __init__( def _expanduser(self, obj): """Expand the user home directory in a path""" if obj is not None and isinstance(obj, str) and not obj.startswith("s3://"): - return os.path.expanduser(obj) + return pathlib.Path(obj).expanduser().as_posix() + else: return obj @@ -159,7 +160,7 @@ def _download_s3(self, uri): bucket, key 
= self._parse_s3_path(uri) tmp_dir = tempfile.mkdtemp() - tmp_file = os.path.join(tmp_dir, os.path.basename(key)) + tmp_file = pathlib.Path(tmp_dir) / pathlib.Path(key).name s3.Bucket(bucket).download_file(key, tmp_file) @@ -330,7 +331,7 @@ def _load_single_cell(self): # if the single_cell file was downloaded from S3, delete the temporary file if "temp_single_cell_input" in locals(): - os.remove(temp_single_cell_input) + pathlib.Path(temp_single_cell_input).unlink() # Merge the Image and Nuclei tables merged_df = pd.merge(image_df, nuclei_df, on=self.image_column, how="inner") @@ -372,7 +373,7 @@ def add_cell_location(self): ) or ( not self.augmented_metadata_output.startswith("s3://") - and os.path.exists(self.augmented_metadata_output) + and pathlib.Path(self.augmented_metadata_output).exists() ) ) ): @@ -392,6 +393,8 @@ def add_cell_location(self): # If self.augmented_metadata_output is not None, save the data if self.augmented_metadata_output is not None: + # switch to https://github.com/cytomining/pycytominer/blob/master/pycytominer/cyto_utils/output.py + # if we want to support more file types augmented_metadata_df.to_parquet( self.augmented_metadata_output, index=False ) From 841562714253eefe89fb134ca2afa411a7981cef Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Mar 2023 20:52:30 -0400 Subject: [PATCH 40/82] Add pip install .[cell_locations] (+formatting) --- .github/workflows/python-app.yml | 36 ++++++++++++++++---------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 0d769d4d..edf51b16 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -5,33 +5,33 @@ name: Python build on: push: - branches: [ master ] + branches: [master] pull_request: - branches: [ master ] + branches: [master] jobs: build: - runs-on: ${{ matrix.os }} strategy: matrix: python-version: [3.7, 3.8, 3.9] os: [ubuntu-latest, macos-latest] env: - OS: ${{ matrix.os }} + OS: ${{ matrix.os }} steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} + - uses: actions/checkout@v2 + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v2 + with: + python-version: ${{ matrix.python-version }} - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install .[collate] - - name: Test with pytest - run: | - pytest + - name: Install dependencies + run: | + python -m pip install --upgrade pip + pip install pytest + if [ -f requirements.txt ]; then pip install -r requirements.txt; fi + pip install .[collate] + pip install .[cell_location] + - name: Test with pytest + run: | + pytest From b11139928f7787e7bd0ad96917c186396cacf12a Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Mar 2023 21:08:21 -0400 Subject: [PATCH 41/82] Update docs + fix typo in actions --- .github/workflows/python-app.yml | 2 +- README.md | 22 +++++++++++++++++----- 2 files changed, 18 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index edf51b16..f97eda32 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -31,7 +31,7 @@ jobs: pip install pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi pip install .[collate] - pip 
install .[cell_location] + pip install .[cell_locations] - name: Test with pytest run: | pytest diff --git a/README.md b/README.md index f5aa3a05..ed76c487 100644 --- a/README.md +++ b/README.md @@ -65,11 +65,8 @@ If using `pycytominer` in a conda environment, in order to run `collate.py`, you ## Creating a cell locations lookup table -The `CellLocation` class, designed for use with pycytominer, offers a convenient way to augment a [LoadData](https://cellprofiler-manual.s3.amazonaws.com/CPmanual/LoadData.html) file with X,Y locations of cells in each image. -The locations information is obtained from a single cell SQLite file, which contains `Nuclei` and `Image` tables with cell and image-level readouts, respectively. -Each row of a LoadData file represents a single image, indexed by multiple columns (e.g., `Metadata_Plate`, `Metadata_Well`, `Metadata_Site`). -The `CellLocation` class uses the `Image` table to identify the image-level metadata for each image in the LoadData file, and then uses the `Nuclei` table to identify the X,Y locations of cells in each image. -The X,Y locations are then added to the LoadData file as new columns, and the resulting augmented LoadData file is saved to disk. +The `CellLocation` class offers a convenient way to augment a [LoadData](https://cellprofiler-manual.s3.amazonaws.com/CPmanual/LoadData.html) file with X,Y locations of cells in each image. +The locations information is obtained from a single cell SQLite file. To use this functionality, you will need to modify your installation command, similar to above: @@ -78,6 +75,21 @@ To use this functionality, you will need to modify your installation command, si pip install "pycytominer[cell_locations] @ git+git://github.com/cytomining/pycytominer" ``` +Use it like this + +```bash +metadata_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum_subset.parquet" +single_single_cell_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/BR00126114_subset.sqlite" +augmented_metadata_output="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum_and_cell_location_subset.parquet" + +python \ + pycytominer/cyto_utils/cell_locations_cmd.py \ + --metadata_input ${metadata_input} \ + --single_cell_input ${single_single_cell_input} \ + --augmented_metadata_output ${augmented_metadata_output} \ + add_cell_location +``` + ## Usage Using pycytominer is simple and fun. 
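
[Editor's note — illustrative sketch, not part of the patch series] The README changes above document the command-line entry point (`pycytominer.cyto_utils.cell_locations_cmd`, driven by `fire`). For reference, the same functionality can be driven directly from Python via the `CellLocation` API exercised in the test fixtures earlier in this series. The sketch below is a minimal example under that assumption; the file paths are placeholders, and the commented-out keyword arguments are the optional parameters introduced in the "overwrite is an option" commit.

```python
# Minimal sketch of using CellLocation from Python, based on the API shown in
# the conftest fixtures and tests in these patches. Paths are placeholders.
from pycytominer.cyto_utils.cell_locations import CellLocation

cell_loc = CellLocation(
    metadata_input="load_data_with_illum_subset.parquet",  # LoadData file (CSV or Parquet, local or s3://)
    single_cell_input="BR00126114_subset.sqlite",          # single-cell SQLite backend (local or s3://)
    # augmented_metadata_output="augmented_load_data.parquet",  # optional: write the result to disk instead
    # overwrite=False,                                          # optional: skip work if the output already exists
)

# With no output path configured, add_cell_location() returns a DataFrame in
# which the "CellCenters" column holds, for each image row, a list of dicts
# with ObjectNumber and Nuclei_Location_Center_X/Y values.
augmented = cell_loc.add_cell_location()

first_image_cells = augmented["CellCenters"][0]
print(len(first_image_cells), first_image_cells[0]["Nuclei_Location_Center_X"])
```

When `augmented_metadata_output` is provided, `add_cell_location()` instead writes a Parquet file and returns that path, which is what the bash example in the README relies on.
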
From 287cc1792012842ba6153698c6d6a2bef3b65c5f Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Mar 2023 21:13:55 -0400 Subject: [PATCH 42/82] Use as module --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ed76c487..4efceb93 100644 --- a/README.md +++ b/README.md @@ -80,10 +80,10 @@ Use it like this ```bash metadata_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum_subset.parquet" single_single_cell_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/BR00126114_subset.sqlite" -augmented_metadata_output="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum_and_cell_location_subset.parquet" +augmented_metadata_output="~/Desktop/load_data_with_illum_and_cell_location_subset.parquet" python \ - pycytominer/cyto_utils/cell_locations_cmd.py \ + -m pycytominer.cyto_utils.cell_locations_cmd \ --metadata_input ${metadata_input} \ --single_cell_input ${single_single_cell_input} \ --augmented_metadata_output ${augmented_metadata_output} \ From fbdf9aab095ba9ccadb58b2ba265e662bd677bec Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Mar 2023 21:16:50 -0400 Subject: [PATCH 43/82] Formatting --- .github/workflows/codecov.yml | 48 +++++++++++++++++------------------ 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 9760a0e3..5ed44617 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -2,33 +2,33 @@ name: Code coverage on: push: - branches: [ master ] + branches: [master] pull_request: - branches: [ master ] + branches: [master] jobs: run: runs-on: ubuntu-latest steps: - - uses: actions/checkout@master - - name: Setup Python - uses: actions/setup-python@master - with: - python-version: 3.7 - - name: Generate coverage report - run: | - pip install pytest - pip install pytest-cov - pip install -r requirements.txt - pip install .[collate] - pytest --cov=./ --cov-report=xml - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1 - with: - file: ./coverage.xml - files: ./coverage1.xml,./coverage2.xml - directory: ./coverage/reports/ - flags: unittests - name: codecov-umbrella - fail_ci_if_error: false - path_to_write_report: ./coverage/codecov_report.gz + - uses: actions/checkout@master + - name: Setup Python + uses: actions/setup-python@master + with: + python-version: 3.7 + - name: Generate coverage report + run: | + pip install pytest + pip install pytest-cov + pip install -r requirements.txt + pip install .[collate] + pytest --cov=./ --cov-report=xml + - name: Upload coverage to Codecov + uses: codecov/codecov-action@v1 + with: + file: ./coverage.xml + files: ./coverage1.xml,./coverage2.xml + directory: ./coverage/reports/ + flags: unittests + name: codecov-umbrella + fail_ci_if_error: false + path_to_write_report: ./coverage/codecov_report.gz From 5fa59cf577bf0565d5918601a79e74b666655c3f Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Mon, 27 Mar 2023 21:17:11 -0400 Subject: [PATCH 44/82] add cell_locations --- .github/workflows/codecov.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 5ed44617..9d2d160a 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -21,6 +21,7 @@ jobs: pip 
install pytest-cov pip install -r requirements.txt pip install .[collate] + pip install .[cell_locations] pytest --cov=./ --cov-report=xml - name: Upload coverage to Codecov uses: codecov/codecov-action@v1 From 945e5ceb84e5b388447b83269a29ffe7b7a4e441 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Tue, 28 Mar 2023 12:02:18 -0400 Subject: [PATCH 45/82] Merge in SQL --- pycytominer/cyto_utils/cell_locations.py | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 6372eb82..68c2a80f 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -315,17 +315,18 @@ def _load_single_cell(self): f"Required columns are not present in the Image table in the SQLite file" ) - # Load the required columns from the single_cell file - - nuclei_query = f"SELECT {self.image_column},{self.object_column},{self.cell_x_loc},{self.cell_y_loc} FROM Nuclei;" - image_index_str = ", ".join(self.image_index) - image_query = f"SELECT {self.image_column},{image_index_str} FROM Image;" + # merge the Image and Nuclei tables in SQL - nuclei_df = pd.read_sql_query(nuclei_query, conn) + merge_query = f""" + SELECT Nuclei.{self.image_column},Nuclei.{self.object_column},Nuclei.{self.cell_x_loc},Nuclei.{self.cell_y_loc},Image.{image_index_str} + FROM Nuclei + INNER JOIN Image + ON Nuclei.{self.image_column} = Image.{self.image_column}; + """ - image_df = pd.read_sql_query(image_query, conn) + merged_df = pd.read_sql_query(merge_query, conn) conn.close() @@ -333,9 +334,6 @@ def _load_single_cell(self): if "temp_single_cell_input" in locals(): pathlib.Path(temp_single_cell_input).unlink() - # Merge the Image and Nuclei tables - merged_df = pd.merge(image_df, nuclei_df, on=self.image_column, how="inner") - # Cast the cell location columns to float merged_df[self.cell_x_loc] = merged_df[self.cell_x_loc].astype(float) merged_df[self.cell_y_loc] = merged_df[self.cell_y_loc].astype(float) From e987ccf816291c30b8f3c77e616c589467f145ee Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 30 Mar 2023 10:52:56 -0400 Subject: [PATCH 46/82] Update README.md Co-authored-by: Dave Bunten --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 4efceb93..18e7eb9e 100644 --- a/README.md +++ b/README.md @@ -75,7 +75,7 @@ To use this functionality, you will need to modify your installation command, si pip install "pycytominer[cell_locations] @ git+git://github.com/cytomining/pycytominer" ``` -Use it like this +Example using this functionality: ```bash metadata_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum_subset.parquet" From 4f067fafc75fcacb2755363c93c5f4632aa4d91e Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 30 Mar 2023 10:53:20 -0400 Subject: [PATCH 47/82] Update .github/workflows/codecov.yml Co-authored-by: Dave Bunten --- .github/workflows/codecov.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 9d2d160a..cdaf32e9 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -19,9 +19,7 @@ jobs: run: | pip install pytest pip install pytest-cov - pip install -r requirements.txt - pip install .[collate] - pip install .[cell_locations] + pip install .[collate,cell_locations] pytest --cov=./ --cov-report=xml - name: 
Upload coverage to Codecov uses: codecov/codecov-action@v1 From 3d711f7deae0ada008f029ef872efd2ac0affefa Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 30 Mar 2023 10:54:19 -0400 Subject: [PATCH 48/82] Update .github/workflows/python-app.yml Co-authored-by: Dave Bunten --- .github/workflows/python-app.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index f97eda32..eb5ed71b 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -29,9 +29,7 @@ jobs: run: | python -m pip install --upgrade pip pip install pytest - if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - pip install .[collate] - pip install .[cell_locations] + pip install .[collate,cell_locations] - name: Test with pytest run: | pytest From cda5f761fb42f8d97d5f0cb1ae37fa9c912d22a1 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 30 Mar 2023 10:54:52 -0400 Subject: [PATCH 49/82] Update pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh Co-authored-by: Dave Bunten --- .../cell_locations_example_data/test_cell_locations.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh b/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh index 547fb954..d9c19f38 100644 --- a/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh +++ b/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh @@ -4,7 +4,7 @@ aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/backend/2021 # Download LoadData CSV file aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum.parquet . -# Write a SQL query tp select rows of the `Image` table in the SQLite file where `ImageNumber` is 1 or 2. +# Write a SQL query to select rows of the `Image` table in the SQLite file where `ImageNumber` is 1 or 2. # Only select the columns: `Metadata_Plate`, `Metadata_Well`, `Metadata_Site`, `ImageNumber` sqlite3 -header -csv BR00126114.sqlite "SELECT Metadata_Plate, Metadata_Well, Metadata_Site, ImageNumber FROM Image WHERE ImageNumber = 1 OR ImageNumber = 2;" > image_query.csv From f65b36dbfb6951b61bf4d42c7be5fe46a60e8a25 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 30 Mar 2023 10:55:19 -0400 Subject: [PATCH 50/82] Update pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh Co-authored-by: Dave Bunten --- .../cell_locations_example_data/test_cell_locations.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh b/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh index d9c19f38..e33a69bd 100644 --- a/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh +++ b/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh @@ -10,7 +10,7 @@ aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/load_data_cs sqlite3 -header -csv BR00126114.sqlite "SELECT Metadata_Plate, Metadata_Well, Metadata_Site, ImageNumber FROM Image WHERE ImageNumber = 1 OR ImageNumber = 2;" > image_query.csv -# Write a SQL query tp select rows of the `Nuclei` table in the SQLite file where `ImageNumber` is 1 or 2. 
+# Write a SQL query to select rows of the `Nuclei` table in the SQLite file where `ImageNumber` is 1 or 2. # Only select the columns: `ImageNumber`, `ObjectNumber`, `Nuclei_Location_Center_X`, `Nuclei_Location_Center_Y` sqlite3 -header -csv BR00126114.sqlite "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei WHERE ImageNumber = 1 LIMIT 10;" > nuclei_query_1.csv From 03995b8aec79a83056fb49570e09ebefcd49863d Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 30 Mar 2023 10:56:37 -0400 Subject: [PATCH 51/82] Update pycytominer/tests/test_cyto_utils/test_cell_locations.py Co-authored-by: Dave Bunten --- pycytominer/tests/test_cyto_utils/test_cell_locations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 05345dbe..99a072c0 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -35,7 +35,7 @@ def test_values(cell_loc, metadata_input_dataframe, single_cell_input_file, requ .equals(metadata_input_dataframe.reset_index(drop=True)) ) - nuclei_query = f"SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;" + nuclei_query = "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;" conn = sqlite3.connect(single_cell_input_file) From c81762622ea38bfc9c99467f4c8376213d503174 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 30 Mar 2023 10:58:46 -0400 Subject: [PATCH 52/82] Update pycytominer/cyto_utils/cell_locations.py Co-authored-by: Dave Bunten --- pycytominer/cyto_utils/cell_locations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 68c2a80f..80432dde 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -298,7 +298,7 @@ def _load_single_cell(self): and self.cell_y_loc in nuclei_columns ): raise ValueError( - f"Required columns are not present in the Nuclei table in the SQLite file" + "Required columns are not present in the Nuclei table in the SQLite file" ) c.execute("PRAGMA table_info(Image);") From 72896b93488dc0c9ec937c0bf757d34a399a2862 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 30 Mar 2023 10:59:08 -0400 Subject: [PATCH 53/82] Update pycytominer/cyto_utils/cell_locations.py Co-authored-by: Dave Bunten --- pycytominer/cyto_utils/cell_locations.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 80432dde..58f364dd 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -312,7 +312,7 @@ def _load_single_cell(self): and all(elem in image_columns for elem in self.image_index) ): raise ValueError( - f"Required columns are not present in the Image table in the SQLite file" + "Required columns are not present in the Image table in the SQLite file" ) image_index_str = ", ".join(self.image_index) From 59816ff95dc46d50648538bcc237530c02fa701f Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 30 Mar 2023 10:59:47 -0400 Subject: [PATCH 54/82] Update pycytominer/tests/test_cyto_utils/test_cell_locations.py Co-authored-by: Dave Bunten --- pycytominer/tests/test_cyto_utils/test_cell_locations.py | 1 - 1 file 
changed, 1 deletion(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 99a072c0..74042159 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -7,7 +7,6 @@ import sqlite3 -# @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) def test_shape_and_columns(cell_loc, metadata_input_dataframe, request): cell_loc = request.getfixturevalue(cell_loc) From 7df9c68eabb114a258cc4ce43533f07993a21550 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Thu, 30 Mar 2023 11:00:13 -0400 Subject: [PATCH 55/82] Update pycytominer/tests/test_cyto_utils/test_cell_locations.py Co-authored-by: Dave Bunten --- pycytominer/tests/test_cyto_utils/test_cell_locations.py | 1 - 1 file changed, 1 deletion(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 74042159..c9eb57a3 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -22,7 +22,6 @@ def test_shape_and_columns(cell_loc, metadata_input_dataframe, request): assert "Nuclei_Location_Center_Y" in cell_loc["CellCenters"][0][0].keys() -# @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) def test_values(cell_loc, metadata_input_dataframe, single_cell_input_file, request): cell_loc = request.getfixturevalue(cell_loc) From 52d758918f93a75f8e73720a7cca9607dcf29013 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 1 Apr 2023 10:26:34 -0400 Subject: [PATCH 56/82] Address various comment --- pycytominer/cyto_utils/cell_locations.py | 68 ++++++++++++++---------- 1 file changed, 40 insertions(+), 28 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 58f364dd..e6e9e1c8 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -9,6 +9,7 @@ import botocore import tempfile import collections +from typing import Union class CellLocation: @@ -71,13 +72,13 @@ class CellLocation: def __init__( self, - metadata_input: str or pd.DataFrame, - single_cell_input: str or sqlite3.Connection, + metadata_input: Union[str, pd.DataFrame], + single_cell_input: Union[str, sqlite3.Connection], augmented_metadata_output: str = None, overwrite: bool = False, image_column: str = "ImageNumber", object_column: str = "ObjectNumber", - image_index: list = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"], + image_key: list = ["Metadata_Plate", "Metadata_Well", "Metadata_Site"], cell_x_loc: str = "Nuclei_Location_Center_X", cell_y_loc: str = "Nuclei_Location_Center_Y", ): @@ -87,7 +88,7 @@ def __init__( self.overwrite = overwrite self.image_column = image_column self.object_column = object_column - self.image_index = image_index + self.image_key = image_key self.cell_x_loc = cell_x_loc self.cell_y_loc = cell_y_loc @@ -95,9 +96,7 @@ def _expanduser(self, obj): """Expand the user home directory in a path""" if obj is not None and isinstance(obj, str) and not obj.startswith("s3://"): return pathlib.Path(obj).expanduser().as_posix() - - else: - return obj + return obj def _parse_s3_path(self, s3_path): """Parse an S3 path into a bucket and key @@ -153,18 +152,19 @@ def 
_s3_file_exists(self, s3_path): def _download_s3(self, uri): """ - Download a file from S3, save it to a temporary directory, and return the path to the file + Download a file from S3 to a temporary file and return the temporary path """ s3 = boto3.resource("s3") bucket, key = self._parse_s3_path(uri) - tmp_dir = tempfile.mkdtemp() - tmp_file = pathlib.Path(tmp_dir) / pathlib.Path(key).name + tmp_file = tempfile.NamedTemporaryFile( + delete=False, suffix=pathlib.Path(key).name + ) - s3.Bucket(bucket).download_file(key, tmp_file) + s3.Bucket(bucket).download_file(key, tmp_file.name) - return tmp_file + return tmp_file.name def _load_metadata(self): """Load the metadata into a Pandas DataFrame @@ -196,19 +196,31 @@ def _load_metadata(self): # verify that the image index columns are present in the metadata object - if not all(elem in df.columns for elem in self.image_index): + if not all(elem in df.columns for elem in self.image_key): raise ValueError( - f"Image index columns {self.image_index} are not present in the metadata file" + f"Image index columns {self.image_key} are not present in the metadata file" ) return df - def _convert_to_per_row_dict(self, df): + def _create_nested_df(self, df): + """Create a new column `CellCenters` by nesting the X and Y locations of cell from an image into the row of the image + + Parameters + ---------- + df : Pandas DataFrame + The DataFrame to convert + + Returns + ------- + Pandas DataFrame + """ + # define a dictionary to store the output output_df_list = collections.defaultdict(list) # iterate over each group of cells in the merged DataFrame - group_cols = self.image_index + [self.image_column] + group_cols = self.image_key + [self.image_column] for group_values, cell_df in df.groupby(group_cols): # add the image-level information to the output dictionary @@ -309,24 +321,24 @@ def _load_single_cell(self): if not ( self.image_column in image_columns - and all(elem in image_columns for elem in self.image_index) + and all(elem in image_columns for elem in self.image_key) ): raise ValueError( "Required columns are not present in the Image table in the SQLite file" ) - image_index_str = ", ".join(self.image_index) + image_index_str = ", ".join(self.image_key) # merge the Image and Nuclei tables in SQL - merge_query = f""" + join_query = f""" SELECT Nuclei.{self.image_column},Nuclei.{self.object_column},Nuclei.{self.cell_x_loc},Nuclei.{self.cell_y_loc},Image.{image_index_str} FROM Nuclei INNER JOIN Image ON Nuclei.{self.image_column} = Image.{self.image_column}; """ - merged_df = pd.read_sql_query(merge_query, conn) + joined_df = pd.read_sql_query(join_query, conn) conn.close() @@ -335,19 +347,19 @@ def _load_single_cell(self): pathlib.Path(temp_single_cell_input).unlink() # Cast the cell location columns to float - merged_df[self.cell_x_loc] = merged_df[self.cell_x_loc].astype(float) - merged_df[self.cell_y_loc] = merged_df[self.cell_y_loc].astype(float) + joined_df[self.cell_x_loc] = joined_df[self.cell_x_loc].astype(float) + joined_df[self.cell_y_loc] = joined_df[self.cell_y_loc].astype(float) # Cast the object column to int - merged_df[self.object_column] = merged_df[self.object_column].astype(int) + joined_df[self.object_column] = joined_df[self.object_column].astype(int) # Cast the image index columns to str - for col in self.image_index: - merged_df[col] = merged_df[col].astype(str) + for col in self.image_key: + joined_df[col] = joined_df[col].astype(str) - merged_df = self._convert_to_per_row_dict(merged_df) + joined_df = 
self._create_nested_df(joined_df) - return merged_df + return joined_df def add_cell_location(self): """Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column. @@ -385,7 +397,7 @@ def add_cell_location(self): augmented_metadata_df = pd.merge( metadata_df, single_cell_df, - on=self.image_index, + on=self.image_key, how="left", ) From 8eaa9ee7224179288f07eee58bb34ee79c8ca1e3 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 1 Apr 2023 12:34:11 -0400 Subject: [PATCH 57/82] To address this warning below: pycytominer/tests/test_cyto_utils/test_cell_locations.py::test_shape_and_columns[cell_loc1] /Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pandas/io/sql.py:1405: RemovedIn20Warning: Deprecated API features detected! These feature(s) are not compatible with SQLAlchemy 2.0. To prevent incompatible upgrades prior to updating applications, ensure requirements files are pinned to "sqlalchemy<2.0". Set environment variable SQLALCHEMY_WARN_20=1 to show all deprecation warnings. Set environment variable SQLALCHEMY_SILENCE_UBER_WARNING=1 to silence this message. (Background on SQLAlchemy 2.0 at: https://sqlalche.me/e/b8d9) return self.connectable.execution_options().execute(*args, **kwargs) --- .github/workflows/python-app.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index eb5ed71b..b305f622 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -18,6 +18,7 @@ jobs: os: [ubuntu-latest, macos-latest] env: OS: ${{ matrix.os }} + SQLALCHEMY_SILENCE_UBER_WARNING: "1" steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} From 683048d3c700bd250e6c8b083198074b61616ee8 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 1 Apr 2023 12:54:16 -0400 Subject: [PATCH 58/82] use sqlalchemy --- pycytominer/cyto_utils/cell_locations.py | 41 ++++++------------- pycytominer/tests/test_cyto_utils/conftest.py | 16 ++++---- .../test_cyto_utils/test_cell_locations.py | 6 ++- 3 files changed, 24 insertions(+), 39 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index e6e9e1c8..0c0dd2c2 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -4,11 +4,11 @@ import pathlib import pandas as pd -import sqlite3 import boto3 import botocore import tempfile import collections +import sqlalchemy from typing import Union @@ -45,8 +45,8 @@ class CellLocation: metadata_input : str or Pandas DataFrame Path to the input metadata file or a Pandas DataFrame - single_cell_input : str or sqlite3.Connection - Path to the single_cell file or a sqlite3.Connection object + single_cell_input : str or sqlalchemy.engine.Engine + Path to the single_cell file or a sqlalchemy.engine.Engine object augmented_metadata_output : str Path to the output file. 
If None, the metadata file is not saved to disk @@ -73,7 +73,7 @@ class CellLocation: def __init__( self, metadata_input: Union[str, pd.DataFrame], - single_cell_input: Union[str, sqlite3.Connection], + single_cell_input: Union[str, sqlalchemy.engine.Engine], augmented_metadata_output: str = None, overwrite: bool = False, image_column: str = "ImageNumber", @@ -272,36 +272,27 @@ def _load_single_cell(self): temp_single_cell_input = self._download_s3(self.single_cell_input) # connect to the single_cell file - conn = sqlite3.connect(temp_single_cell_input) - + engine = sqlalchemy.create_engine(f"sqlite:///{temp_single_cell_input}") else: # connect to the single_cell file - conn = sqlite3.connect(self.single_cell_input) + engine = sqlalchemy.create_engine(f"sqlite:///{self.single_cell_input}") else: - conn = self.single_cell_input + engine = self.single_cell_input # Verify that the Image and Nuclei tables are present in single_cell - c = conn.cursor() - - c.execute("SELECT name FROM sqlite_master WHERE type='table';") + inspector = sqlalchemy.inspect(engine) - tables = c.fetchall() + table_names = inspector.get_table_names() - tables = [x[0] for x in tables] - - if not ("Image" in tables and "Nuclei" in tables): + if not ("Image" in table_names and "Nuclei" in table_names): raise ValueError( "Image and Nuclei tables are not present in the single_cell file" ) # Verify that the required columns are present in the single_cell file - c.execute("PRAGMA table_info(Nuclei);") - - nuclei_columns = c.fetchall() - - nuclei_columns = [x[1] for x in nuclei_columns] + nuclei_columns = [column["name"] for column in inspector.get_columns("Nuclei")] if not ( self.image_column in nuclei_columns @@ -313,11 +304,7 @@ def _load_single_cell(self): "Required columns are not present in the Nuclei table in the SQLite file" ) - c.execute("PRAGMA table_info(Image);") - - image_columns = c.fetchall() - - image_columns = [x[1] for x in image_columns] + image_columns = [column["name"] for column in inspector.get_columns("Image")] if not ( self.image_column in image_columns @@ -338,9 +325,7 @@ def _load_single_cell(self): ON Nuclei.{self.image_column} = Image.{self.image_column}; """ - joined_df = pd.read_sql_query(join_query, conn) - - conn.close() + joined_df = pd.read_sql_query(join_query, engine) # if the single_cell file was downloaded from S3, delete the temporary file if "temp_single_cell_input" in locals(): diff --git a/pycytominer/tests/test_cyto_utils/conftest.py b/pycytominer/tests/test_cyto_utils/conftest.py index 26a20a77..8ea7c9af 100644 --- a/pycytominer/tests/test_cyto_utils/conftest.py +++ b/pycytominer/tests/test_cyto_utils/conftest.py @@ -6,7 +6,7 @@ import pandas as pd import pathlib import pytest -import sqlite3 +import sqlalchemy from pycytominer.cyto_utils.cell_locations import CellLocation @@ -61,16 +61,14 @@ def fixture_metadata_input_dataframe(metadata_input_file: str) -> pd.DataFrame: return pd.read_parquet(metadata_input_file) -@pytest.fixture(name="single_cell_input_connection") -def fixture_single_cell_input_connection( +@pytest.fixture(name="single_cell_input_engine") +def fixture_single_cell_input_engine( single_cell_input_file: str, -) -> sqlite3.Connection: +) -> sqlalchemy.engine.Engine: """ Provide a single cell input file for cell_locations test data """ - conn = sqlite3.connect(single_cell_input_file) - yield conn - conn.close() + return sqlalchemy.create_engine(f"sqlite:///{single_cell_input_file}") @pytest.fixture(name="cell_loc_obj1") @@ -90,14 +88,14 @@ def fixture_cell_loc_obj1( 
@pytest.fixture(name="cell_loc_obj2") def fixture_cell_loc_obj2( metadata_input_dataframe: pd.DataFrame, - single_cell_input_connection: sqlite3.Connection, + single_cell_input_engine: sqlalchemy.engine.Engine, ) -> CellLocation: """ Provide a CellLocation object with in-memory inputs """ return CellLocation( metadata_input=metadata_input_dataframe, - single_cell_input=single_cell_input_connection, + single_cell_input=single_cell_input_engine, ) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index c9eb57a3..73969299 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -7,7 +7,8 @@ import sqlite3 -@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) +@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) +# @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) def test_shape_and_columns(cell_loc, metadata_input_dataframe, request): cell_loc = request.getfixturevalue(cell_loc) @@ -22,7 +23,8 @@ def test_shape_and_columns(cell_loc, metadata_input_dataframe, request): assert "Nuclei_Location_Center_Y" in cell_loc["CellCenters"][0][0].keys() -@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) +@pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) +# @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) def test_values(cell_loc, metadata_input_dataframe, single_cell_input_file, request): cell_loc = request.getfixturevalue(cell_loc) From 03bc671459b878394d41b8447bec3f35ad1ce017 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sat, 1 Apr 2023 13:18:42 -0400 Subject: [PATCH 59/82] More comments and switch to boto3.client --- pycytominer/cyto_utils/cell_locations.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 0c0dd2c2..d61d6809 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -54,6 +54,9 @@ class CellLocation: image_column : default = 'ImageNumber' Name of the column in the metadata file that links to the single_cell file + image_key: default = ['Metadata_Plate', 'Metadata_Well', 'Metadata_Site'] + Names of the columns in the metadata file that uniquely identify each image + object_column : default = 'ObjectNumber' Name of the column in the single_cell file that identifies each cell @@ -136,7 +139,9 @@ def _s3_file_exists(self, s3_path): True if the file exists on S3, False otherwise """ - s3 = boto3.resource("s3") + s3 = boto3.client( + "s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED) + ) bucket, key = self._parse_s3_path(s3_path) @@ -154,7 +159,10 @@ def _download_s3(self, uri): """ Download a file from S3 to a temporary file and return the temporary path """ - s3 = boto3.resource("s3") + + s3 = boto3.client( + "s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED) + ) bucket, key = self._parse_s3_path(uri) @@ -162,7 +170,7 @@ def _download_s3(self, uri): delete=False, suffix=pathlib.Path(key).name ) - s3.Bucket(bucket).download_file(key, tmp_file.name) + s3.download_file(bucket, key, tmp_file.name) return tmp_file.name @@ -362,6 +370,7 @@ def add_cell_location(self): and isinstance(self.augmented_metadata_output, str) and self.overwrite is False and ( + # Check if the file exists on S3 or locally ( 
self.augmented_metadata_output.startswith("s3://") and self._s3_file_exists(self.augmented_metadata_output) @@ -372,6 +381,9 @@ def add_cell_location(self): ) ) ): + # TODO: Consider doing a quick difference check should the file already exist. + # For example, if the file already exists and it's different than what could be possibly incoming, should the user know? + # This will involve performing all the steps below and then doing a check to see if the file is different, so this is a bit of a pain. return self.augmented_metadata_output # Load the data @@ -388,8 +400,7 @@ def add_cell_location(self): # If self.augmented_metadata_output is not None, save the data if self.augmented_metadata_output is not None: - # switch to https://github.com/cytomining/pycytominer/blob/master/pycytominer/cyto_utils/output.py - # if we want to support more file types + # TODO: switch to https://github.com/cytomining/pycytominer/blob/master/pycytominer/cyto_utils/output.py if we want to support more file types augmented_metadata_df.to_parquet( self.augmented_metadata_output, index=False ) From 1a433e391389ef9fe313725aeda49cbc5128d6d7 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 2 Apr 2023 06:37:17 -0400 Subject: [PATCH 60/82] Be explicit about anon; fix indentation bug --- pycytominer/cyto_utils/cell_locations.py | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index d61d6809..a4b3f040 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -192,13 +192,22 @@ def _load_metadata(self): ): raise ValueError("Metadata file must be a CSV or a Parquet file") + storage_options = ( + {"anon": True} if self.metadata_input.startswith("s3://") else None + ) + # load the metadata file into a Pandas DataFrame if self.metadata_input.endswith(".csv"): - df = pd.read_csv(self.metadata_input, dtype=str) + df = pd.read_csv( + self.metadata_input, dtype=str, storage_options=storage_options + ) else: - df = pd.read_parquet(self.metadata_input) - # cast all columns to string - df = df.astype(str) + df = pd.read_parquet( + self.metadata_input, storage_options=storage_options + ) + + # cast all columns to string + df = df.astype(str) else: df = self.metadata_input From 892e627a1bf44cb48384dc4a10c5ee3860783ad0 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 2 Apr 2023 07:01:47 -0400 Subject: [PATCH 61/82] explicit types --- pycytominer/cyto_utils/cell_locations.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index a4b3f040..76413627 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -95,13 +95,13 @@ def __init__( self.cell_x_loc = cell_x_loc self.cell_y_loc = cell_y_loc - def _expanduser(self, obj): + def _expanduser(self, obj: Union[str, None]): """Expand the user home directory in a path""" if obj is not None and isinstance(obj, str) and not obj.startswith("s3://"): return pathlib.Path(obj).expanduser().as_posix() return obj - def _parse_s3_path(self, s3_path): + def _parse_s3_path(self, s3_path: str): """Parse an S3 path into a bucket and key Parameters @@ -125,7 +125,7 @@ def _parse_s3_path(self, s3_path): return bucket, key - def _s3_file_exists(self, s3_path): + def _s3_file_exists(self, s3_path: str): """Check if a file exists on S3 Parameters @@ -155,7 +155,7 @@ def 
_s3_file_exists(self, s3_path): else: return True - def _download_s3(self, uri): + def _download_s3(self, uri: str): """ Download a file from S3 to a temporary file and return the temporary path """ @@ -220,7 +220,7 @@ def _load_metadata(self): return df - def _create_nested_df(self, df): + def _create_nested_df(self, df: pd.DataFrame): """Create a new column `CellCenters` by nesting the X and Y locations of cell from an image into the row of the image Parameters From 76e1f49759d0beabbdd8537bf14711f7f2d40a84 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 2 Apr 2023 07:02:01 -0400 Subject: [PATCH 62/82] Address various comment --- .../test_cyto_utils/test_cell_locations.py | 36 ++++++++++++------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 73969299..6d4ee0db 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -1,15 +1,22 @@ """This tests the output from CellLocation class""" -import os import pandas as pd -from pycytominer.cyto_utils.cell_locations import CellLocation import pytest -import sqlite3 +import sqlalchemy +from typing import Literal, Type +from _pytest.fixtures import FixtureRequest @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) -# @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) -def test_shape_and_columns(cell_loc, metadata_input_dataframe, request): +def test_output_shape_and_required_columns( + cell_loc: Literal["cell_loc1", "cell_loc2", "cell_loc3"], + metadata_input_dataframe: pd.DataFrame, + request: Type[FixtureRequest], +): + """ + This tests the shape of the output from CellLocation class and verifies that the required columns are present + """ + cell_loc = request.getfixturevalue(cell_loc) # check the shape of the data @@ -24,8 +31,15 @@ def test_shape_and_columns(cell_loc, metadata_input_dataframe, request): @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) -# @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2"]) -def test_values(cell_loc, metadata_input_dataframe, single_cell_input_file, request): +def test_output_value_correctness( + cell_loc: Literal["cell_loc1", "cell_loc2", "cell_loc3"], + metadata_input_dataframe: pd.DataFrame, + single_cell_input_file: str, + request: Type[FixtureRequest], +): + """ + This tests the correctness of the values in the output from CellLocation class by comparing the values in the output to the values in the input + """ cell_loc = request.getfixturevalue(cell_loc) # if we restrict the columns of cell_loc to the ones in metadata_input_dataframe, we should get the same dataframe @@ -35,13 +49,11 @@ def test_values(cell_loc, metadata_input_dataframe, single_cell_input_file, requ .equals(metadata_input_dataframe.reset_index(drop=True)) ) - nuclei_query = "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;" - - conn = sqlite3.connect(single_cell_input_file) + engine = sqlalchemy.create_engine(f"sqlite:///{single_cell_input_file}") - nuclei_df = pd.read_sql_query(nuclei_query, conn) + nuclei_query = "SELECT ImageNumber, ObjectNumber, Nuclei_Location_Center_X, Nuclei_Location_Center_Y FROM Nuclei;" - conn.close() + nuclei_df = pd.read_sql_query(nuclei_query, engine) # get the values in the Nuclear_Location_Center_X and Nuclear_Location_Center_Y columns # for the rows in 
nuclei_df that have ImageNumber == 1 From cf7c94a01307e40a52570c96044dcd62f8672f21 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 2 Apr 2023 07:07:34 -0400 Subject: [PATCH 63/82] Move gitignore entries to the top level --- .gitignore | 1 + .../tests/test_data/cell_locations_example_data/.gitignore | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) delete mode 100644 pycytominer/tests/test_data/cell_locations_example_data/.gitignore diff --git a/.gitignore b/.gitignore index 3eeeb0e0..6499442c 100644 --- a/.gitignore +++ b/.gitignore @@ -9,4 +9,5 @@ build *.sqlite pycytominer/tests/test_data/collate/backend/**/*.csv !pycytominer/tests/test_data/collate/backend/**/*master.csv +!pycytominer/tests/test_data/cell_locations_example_data/*.sqlite diff --git a/pycytominer/tests/test_data/cell_locations_example_data/.gitignore b/pycytominer/tests/test_data/cell_locations_example_data/.gitignore deleted file mode 100644 index 355a6663..00000000 --- a/pycytominer/tests/test_data/cell_locations_example_data/.gitignore +++ /dev/null @@ -1 +0,0 @@ -!*sqlite From 5e9ae36a8bf5ba9a72b48be4ad6c11302f9b959a Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 2 Apr 2023 07:24:35 -0400 Subject: [PATCH 64/82] rename files, add docs --- pycytominer/tests/test_cyto_utils/conftest.py | 10 +++++---- ...cell_locations.sh => shrink_BR00126114.sh} | 19 ++++++++++++++---- ...4_subset.sqlite => test_BR00126114.sqlite} | Bin ...t_BR00126114_load_data_with_illum.parquet} | Bin 4 files changed, 21 insertions(+), 8 deletions(-) rename pycytominer/tests/test_data/cell_locations_example_data/{test_cell_locations.sh => shrink_BR00126114.sh} (77%) rename pycytominer/tests/test_data/cell_locations_example_data/{BR00126114_subset.sqlite => test_BR00126114.sqlite} (100%) rename pycytominer/tests/test_data/cell_locations_example_data/{load_data_with_illum_subset.parquet => test_BR00126114_load_data_with_illum.parquet} (100%) diff --git a/pycytominer/tests/test_cyto_utils/conftest.py b/pycytominer/tests/test_cyto_utils/conftest.py index 8ea7c9af..a6c78821 100644 --- a/pycytominer/tests/test_cyto_utils/conftest.py +++ b/pycytominer/tests/test_cyto_utils/conftest.py @@ -26,7 +26,9 @@ def fixture_metadata_input_file(data_dir_cell_locations: str) -> str: """ Provide a metadata input file for cell_locations test data """ - return os.path.join(data_dir_cell_locations, "load_data_with_illum_subset.parquet") + return os.path.join( + data_dir_cell_locations, "test_BR00126114_load_data_with_illum.parquet" + ) @pytest.fixture(name="single_cell_input_file") @@ -34,7 +36,7 @@ def fixture_single_cell_input_file(data_dir_cell_locations: str) -> str: """ Provide a single cell input file for cell_locations test data """ - return os.path.join(data_dir_cell_locations, "BR00126114_subset.sqlite") + return os.path.join(data_dir_cell_locations, "test_BR00126114.sqlite") @pytest.fixture(name="metadata_input_file_s3") @@ -42,7 +44,7 @@ def fixture_metadata_input_file_s3() -> str: """ Provide a metadata input file for cell_locations test data """ - return "s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum_subset.parquet" + return "s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/test_BR00126114_load_data_with_illum.parquet" @pytest.fixture(name="single_cell_input_file_s3") @@ -50,7 +52,7 @@ def fixture_single_cell_input_file_s3() -> str: """ Provide a single cell input file for cell_locations test data """ 
- return "s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/BR00126114_subset.sqlite" + return "s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/test_BR00126114.sqlite" @pytest.fixture(name="metadata_input_dataframe") diff --git a/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh b/pycytominer/tests/test_data/cell_locations_example_data/shrink_BR00126114.sh similarity index 77% rename from pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh rename to pycytominer/tests/test_data/cell_locations_example_data/shrink_BR00126114.sh index e33a69bd..8a18202f 100644 --- a/pycytominer/tests/test_data/cell_locations_example_data/test_cell_locations.sh +++ b/pycytominer/tests/test_data/cell_locations_example_data/shrink_BR00126114.sh @@ -1,3 +1,14 @@ +#!/bin/bash + +# Create SQLite and LoadData CSV files for testing cell locations +# +# Steps: +# 1. Download SQLite file from S3 +# 2. Download LoadData CSV file from S3 +# 3. Query SQLite to select specific columns of all rows of the `Image` and `Nuclei` table in the SQLite file where `ImageNumber` is 1 or 2. +# 4. Create the SQLite file fixture using the output of the SQL queries +# 5. Create a new LoadData CSV fixture file with only the rows corresponding to the rows in SQLite file fixture + # Download SQLite file aws s3 cp s3://cellpainting-gallery/cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/BR00126114.sqlite . @@ -30,19 +41,19 @@ cat create_tables.sql # run the SQL commands in the text file to create the SQLite file -sqlite3 BR00126114_subset.sqlite < create_tables.sql +sqlite3 test_BR00126114.sqlite < create_tables.sql # Print the list of tables in the SQLite file -sqlite3 BR00126114_subset.sqlite ".tables" +sqlite3 test_BR00126114.sqlite ".tables" # Print the contents of the `Image` table in the SQLite file -sqlite3 BR00126114_subset.sqlite "SELECT * FROM Image;" +sqlite3 test_BR00126114.sqlite "SELECT * FROM Image;" # Print the contents of the `Nuclei` table in the SQLite file -sqlite3 BR00126114_subset.sqlite "SELECT * FROM Nuclei;" +sqlite3 test_BR00126114.sqlite "SELECT * FROM Nuclei;" cat << EOF > create_parquet.py import pandas as pd diff --git a/pycytominer/tests/test_data/cell_locations_example_data/BR00126114_subset.sqlite b/pycytominer/tests/test_data/cell_locations_example_data/test_BR00126114.sqlite similarity index 100% rename from pycytominer/tests/test_data/cell_locations_example_data/BR00126114_subset.sqlite rename to pycytominer/tests/test_data/cell_locations_example_data/test_BR00126114.sqlite diff --git a/pycytominer/tests/test_data/cell_locations_example_data/load_data_with_illum_subset.parquet b/pycytominer/tests/test_data/cell_locations_example_data/test_BR00126114_load_data_with_illum.parquet similarity index 100% rename from pycytominer/tests/test_data/cell_locations_example_data/load_data_with_illum_subset.parquet rename to pycytominer/tests/test_data/cell_locations_example_data/test_BR00126114_load_data_with_illum.parquet From a55d74acd0c4dedeb62de14f02ca456cfabf35c0 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 2 Apr 2023 07:30:41 -0400 Subject: [PATCH 65/82] Upgrade to python 3.10 --- .github/workflows/codecov.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index cdaf32e9..137c0e6f 100644 --- a/.github/workflows/codecov.yml +++ 
b/.github/workflows/codecov.yml @@ -14,7 +14,7 @@ jobs: - name: Setup Python uses: actions/setup-python@master with: - python-version: 3.7 + python-version: 3.10 - name: Generate coverage report run: | pip install pytest From 3e5afdd55d86bf67050121dee49c79ca6aaf2cb9 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 2 Apr 2023 07:52:10 -0400 Subject: [PATCH 66/82] refactor _load_single_cell --- pycytominer/cyto_utils/cell_locations.py | 46 ++++++++++++++++++------ 1 file changed, 36 insertions(+), 10 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 76413627..33afb22e 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -269,18 +269,13 @@ def _create_nested_df(self, df: pd.DataFrame): # convert the output dictionary to a Pandas DataFrame return pd.DataFrame(output_df_list) - def _load_single_cell(self): - """Load the required columns from the `Image` and `Nuclei` tables in the single_cell file or sqlite3.Connection object into a Pandas DataFrame - - Returns - ------- - Pandas DataFrame - The required columns from the `Image` and `Nuclei` tables loaded into a Pandas DataFrame + def _get_single_cell_engine(self): + """ + Get the sqlalchemy.engine.Engine object for the single_cell file """ if isinstance(self.single_cell_input, str): # check if the single_cell file is a SQLite file - if not self.single_cell_input.endswith(".sqlite"): raise ValueError("single_cell file must be a SQLite file") @@ -293,10 +288,18 @@ def _load_single_cell(self): else: # connect to the single_cell file engine = sqlalchemy.create_engine(f"sqlite:///{self.single_cell_input}") + temp_single_cell_input = None + else: engine = self.single_cell_input + temp_single_cell_input = None - # Verify that the Image and Nuclei tables are present in single_cell + return temp_single_cell_input, engine + + def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine): + """ + Check that the single_cell file has the required tables and columns + """ inspector = sqlalchemy.inspect(engine) @@ -331,6 +334,16 @@ def _load_single_cell(self): "Required columns are not present in the Image table in the SQLite file" ) + def _get_joined_image_nuclei_tables(self): + """ + Merge the Image and Nuclei tables in SQL + """ + # get the sqlalchemy.engine.Engine object for the single_cell file + temp_single_cell_input, engine = self._get_single_cell_engine() + + # check that the single_cell file has the required tables and columns + self._check_single_cell_correctness(engine) + image_index_str = ", ".join(self.image_key) # merge the Image and Nuclei tables in SQL @@ -345,9 +358,22 @@ def _load_single_cell(self): joined_df = pd.read_sql_query(join_query, engine) # if the single_cell file was downloaded from S3, delete the temporary file - if "temp_single_cell_input" in locals(): + if temp_single_cell_input is not None: pathlib.Path(temp_single_cell_input).unlink() + return joined_df + + def _load_single_cell(self): + """Load the required columns from the `Image` and `Nuclei` tables in the single_cell file or sqlalchemy.engine.Engine object into a Pandas DataFrame + + Returns + ------- + Pandas DataFrame + The required columns from the `Image` and `Nuclei` tables loaded into a Pandas DataFrame + """ + + joined_df = self._get_joined_image_nuclei_tables() + # Cast the cell location columns to float joined_df[self.cell_x_loc] = joined_df[self.cell_x_loc].astype(float) joined_df[self.cell_y_loc] = 
joined_df[self.cell_y_loc].astype(float) From 8d70fd08025e916e9d47e8946c059440c3b75a40 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 2 Apr 2023 07:55:19 -0400 Subject: [PATCH 67/82] fix type --- pycytominer/tests/test_cyto_utils/test_cell_locations.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cell_locations.py b/pycytominer/tests/test_cyto_utils/test_cell_locations.py index 6d4ee0db..f7102273 100644 --- a/pycytominer/tests/test_cyto_utils/test_cell_locations.py +++ b/pycytominer/tests/test_cyto_utils/test_cell_locations.py @@ -3,13 +3,13 @@ import pandas as pd import pytest import sqlalchemy -from typing import Literal, Type +from typing import Type from _pytest.fixtures import FixtureRequest @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) def test_output_shape_and_required_columns( - cell_loc: Literal["cell_loc1", "cell_loc2", "cell_loc3"], + cell_loc: str, metadata_input_dataframe: pd.DataFrame, request: Type[FixtureRequest], ): @@ -32,7 +32,7 @@ def test_output_shape_and_required_columns( @pytest.mark.parametrize("cell_loc", ["cell_loc1", "cell_loc2", "cell_loc3"]) def test_output_value_correctness( - cell_loc: Literal["cell_loc1", "cell_loc2", "cell_loc3"], + cell_loc: str, metadata_input_dataframe: pd.DataFrame, single_cell_input_file: str, request: Type[FixtureRequest], From e77c6cfe02676ebab512b0df1b6ba16f8a664597 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Sun, 2 Apr 2023 07:58:46 -0400 Subject: [PATCH 68/82] test on highest build version --- .github/workflows/codecov.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/codecov.yml b/.github/workflows/codecov.yml index 137c0e6f..1d38473f 100644 --- a/.github/workflows/codecov.yml +++ b/.github/workflows/codecov.yml @@ -14,7 +14,7 @@ jobs: - name: Setup Python uses: actions/setup-python@master with: - python-version: 3.10 + python-version: 3.9 - name: Generate coverage report run: | pip install pytest From 972e720e2e543d18d09ad799e9fc7abc1597fd56 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 12:01:07 -0400 Subject: [PATCH 69/82] explain warning --- .github/workflows/python-app.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index b305f622..65b4e941 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -18,6 +18,9 @@ jobs: os: [ubuntu-latest, macos-latest] env: OS: ${{ matrix.os }} + # This is needed to avoid a warning from SQLAlchemy + # https://sqlalche.me/e/b8d9 + # We can remove this once we upgrade to SQLAlchemy >= 2.0 SQLALCHEMY_SILENCE_UBER_WARNING: "1" steps: - uses: actions/checkout@v2 From 9b4a0bf23bc92c3b8188ef548ea23a528133b7cd Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 12:01:26 -0400 Subject: [PATCH 70/82] s3 is an attribute --- pycytominer/cyto_utils/cell_locations.py | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 33afb22e..dff20e65 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -94,6 +94,11 @@ def __init__( self.image_key = image_key self.cell_x_loc = cell_x_loc self.cell_y_loc = cell_y_loc + # Does this mean we are constrained to only anonymous access for S3 resources? 
+ # What would happen with non-anonymous access needs - is this something we should think about? + self.s3 = boto3.client( + "s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED) + ) def _expanduser(self, obj: Union[str, None]): """Expand the user home directory in a path""" @@ -139,14 +144,10 @@ def _s3_file_exists(self, s3_path: str): True if the file exists on S3, False otherwise """ - s3 = boto3.client( - "s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED) - ) - bucket, key = self._parse_s3_path(s3_path) try: - s3.Object(bucket, key).load() + self.s3.Object(bucket, key).load() except botocore.exceptions.ClientError as e: if e.response["Error"]["Code"] == "404": return False @@ -160,17 +161,13 @@ def _download_s3(self, uri: str): Download a file from S3 to a temporary file and return the temporary path """ - s3 = boto3.client( - "s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED) - ) - bucket, key = self._parse_s3_path(uri) tmp_file = tempfile.NamedTemporaryFile( delete=False, suffix=pathlib.Path(key).name ) - s3.download_file(bucket, key, tmp_file.name) + self.s3.download_file(bucket, key, tmp_file.name) return tmp_file.name From e05056ddc95b757bbad9675482254a81a2380c76 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 12:09:45 -0400 Subject: [PATCH 71/82] compact check --- pycytominer/cyto_utils/cell_locations.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index dff20e65..62b7722d 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -300,9 +300,10 @@ def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine): inspector = sqlalchemy.inspect(engine) - table_names = inspector.get_table_names() - - if not ("Image" in table_names and "Nuclei" in table_names): + if not all( + table_name in inspector.get_table_names() + for table_name in ["Image", "Nuclei"] + ): raise ValueError( "Image and Nuclei tables are not present in the single_cell file" ) @@ -311,11 +312,14 @@ def _check_single_cell_correctness(self, engine: sqlalchemy.engine.Engine): nuclei_columns = [column["name"] for column in inspector.get_columns("Nuclei")] - if not ( - self.image_column in nuclei_columns - and self.object_column in nuclei_columns - and self.cell_x_loc in nuclei_columns - and self.cell_y_loc in nuclei_columns + if not all( + column_name in nuclei_columns + for column_name in [ + self.image_column, + self.object_column, + self.cell_x_loc, + self.cell_y_loc, + ] ): raise ValueError( "Required columns are not present in the Nuclei table in the SQLite file" From 6f2e3b3c61fa735bf4ea54fad07e14bc1778e056 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 12:13:41 -0400 Subject: [PATCH 72/82] Update README.md Co-authored-by: Dave Bunten --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 18e7eb9e..73ebce55 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,7 @@ pip install "pycytominer[cell_locations] @ git+git://github.com/cytomining/pycyt Example using this functionality: ```bash -metadata_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/load_data_with_illum_subset.parquet" 
+metadata_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/test_BR00126114_load_data_with_illum.parquet" single_single_cell_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/BR00126114_subset.sqlite" augmented_metadata_output="~/Desktop/load_data_with_illum_and_cell_location_subset.parquet" From 1e8f1e6de6d3ac2bb4750cba2cdf1b9f7313bd61 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 12:23:58 -0400 Subject: [PATCH 73/82] fix typo + more docs --- README.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 73ebce55..3db1cef4 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ Example using this functionality: ```bash metadata_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/load_data_csv/2021_08_23_Batch12/BR00126114/test_BR00126114_load_data_with_illum.parquet" -single_single_cell_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/BR00126114_subset.sqlite" +single_single_cell_input="s3://cellpainting-gallery/test-cpg0016-jump/source_4/workspace/backend/2021_08_23_Batch12/BR00126114/test_BR00126114.sqlite" augmented_metadata_output="~/Desktop/load_data_with_illum_and_cell_location_subset.parquet" python \ @@ -88,6 +88,16 @@ python \ --single_cell_input ${single_single_cell_input} \ --augmented_metadata_output ${augmented_metadata_output} \ add_cell_location + +# Check the output + +python -c "import pandas as pd; print(pd.read_parquet('${augmented_metadata_output}').head())" + +# It should look something like this (depends on the width of your terminal): + +# Metadata_Plate Metadata_Well Metadata_Site ... PathName_OrigRNA ImageNumber CellCenters +# 0 BR00126114 A01 1 ... s3://cellpainting-gallery/cpg0016-jump/source_... 1 [{'Nuclei_Location_Center_X': 943.512129380054... +# 1 BR00126114 A01 2 ... s3://cellpainting-gallery/cpg0016-jump/source_... 2 [{'Nuclei_Location_Center_X': 29.9516027655562... ``` ## Usage From 3e162096aaeb636044c36b69b3b43a8460779909 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 13:32:08 -0400 Subject: [PATCH 74/82] black cells.py --- pycytominer/cyto_utils/cells.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index cd5c9621..d163e242 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -72,8 +72,8 @@ class SingleCells(object): default_datatype_float: type Numpy floating point datatype to use for load_compartment and resulting dataframes. This parameter may be used to assist with performance-related - issues by reducing the memory required for floating-point data. - For example, using np.float32 instead of np.float64 for this parameter + issues by reducing the memory required for floating-point data. + For example, using np.float32 instead of np.float64 for this parameter will reduce memory consumed by float columns by roughly 50%. Please note: using any besides np.float64 are experimentally unverified. 
@@ -365,7 +365,6 @@ def subsample_profiles(self, df, rename_col=True): self.set_subsample_random_state(random_state) if self.subsample_frac == 1: - output_df = pd.DataFrame.sample( df, n=self.subsample_n, @@ -537,7 +536,6 @@ def aggregate_compartment( compartment=compartment, n_aggregation_memory_strata=n_aggregation_memory_strata, ): - population_df = self.image_df.merge( compartment_df, how="inner", @@ -636,7 +634,7 @@ def _compartment_df_generator( con=self.conn, ) all_columns = compartment_row1.columns - if self.features != "infer": # allow to get only some features + if self.features != "infer": # allow to get only some features all_columns = [x for x in all_columns if x in self.features] typeof_str = ", ".join([f"typeof({x})" for x in all_columns]) @@ -763,9 +761,7 @@ def merge_single_cells( else: sc_df = sc_df.merge( - self.load_compartment( - compartment=right_compartment - ), + self.load_compartment(compartment=right_compartment), left_on=self.merge_cols + [left_link_col], right_on=self.merge_cols + [right_link_col], suffixes=merge_suffix, From d7b345c6623373a2c25686adb612afbf82c2e85f Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 13:34:02 -0400 Subject: [PATCH 75/82] trim code --- pycytominer/cyto_utils/cells.py | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/pycytominer/cyto_utils/cells.py b/pycytominer/cyto_utils/cells.py index d163e242..1511f354 100644 --- a/pycytominer/cyto_utils/cells.py +++ b/pycytominer/cyto_utils/cells.py @@ -752,20 +752,12 @@ def merge_single_cells( sc_df, how="left", on=subset_logic_df.columns.tolist() ).reindex(sc_df.columns, axis="columns") - sc_df = sc_df.merge( - self.load_compartment(compartment=right_compartment), - left_on=self.merge_cols + [left_link_col], - right_on=self.merge_cols + [right_link_col], - suffixes=merge_suffix, - ) - - else: - sc_df = sc_df.merge( - self.load_compartment(compartment=right_compartment), - left_on=self.merge_cols + [left_link_col], - right_on=self.merge_cols + [right_link_col], - suffixes=merge_suffix, - ) + sc_df = sc_df.merge( + self.load_compartment(compartment=right_compartment), + left_on=self.merge_cols + [left_link_col], + right_on=self.merge_cols + [right_link_col], + suffixes=merge_suffix, + ) linking_check_cols.append(linking_check) From 3b5388e8b3e429c2f76a03cc257cbab635cae795 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 15:50:22 -0400 Subject: [PATCH 76/82] black --- .../tests/test_cyto_utils/test_cells.py | 23 ++++++++----------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index f6461167..c3c59554 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -345,14 +345,16 @@ def test_get_sql_table_col_names(): # Iterate over initialized compartments for compartment in AP.compartments: expected_meta_cols = ["ObjectNumber", "ImageNumber", "TableNumber"] - expected_feat_cols = [f"{compartment.capitalize()}_{i}" for i in ["a", "b", "c", "d"]] - if compartment == 'cytoplasm': - expected_feat_cols += ["Cytoplasm_Parent_Cells","Cytoplasm_Parent_Nuclei"] + expected_feat_cols = [ + f"{compartment.capitalize()}_{i}" for i in ["a", "b", "c", "d"] + ] + if compartment == "cytoplasm": + expected_feat_cols += ["Cytoplasm_Parent_Cells", "Cytoplasm_Parent_Nuclei"] col_name_result = AP.get_sql_table_col_names(table=compartment) - assert sorted(col_name_result) == 
sorted(expected_feat_cols+expected_meta_cols) - meta_cols, feat_cols = AP.split_column_categories( - col_name_result + assert sorted(col_name_result) == sorted( + expected_feat_cols + expected_meta_cols ) + meta_cols, feat_cols = AP.split_column_categories(col_name_result) assert meta_cols == expected_meta_cols assert feat_cols == expected_feat_cols @@ -406,7 +408,6 @@ def test_merge_single_cells(): for method in ["standardize", "robustize"]: for samples in ["all", "Metadata_ImageNumber == 'x'"]: for features in ["infer", ["Cytoplasm_a", "Cells_a"]]: - norm_method_df = AP.merge_single_cells( single_cell_normalize=True, normalize_args={ @@ -476,8 +477,8 @@ def test_merge_single_cells(): traditional_norm_df.loc[:, new_compartment_cols].abs().describe(), ) -def test_merge_single_cells_subsample(): +def test_merge_single_cells_subsample(): for subsample_frac in [0.1, 0.5, 0.9]: ap_subsample = SingleCells( sql_file=TMP_SQLITE_FILE, subsample_frac=subsample_frac @@ -704,7 +705,6 @@ def test_aggregate_subsampling_count_cells(): def test_aggregate_subsampling_profile(): - assert isinstance( AP_SUBSAMPLE.aggregate_profiles(compute_subsample=True), pd.DataFrame ) @@ -724,7 +724,6 @@ def test_aggregate_subsampling_profile(): def test_aggregate_subsampling_profile_output(): - expected_result = pd.DataFrame( { "Metadata_Plate": ["plate", "plate"], @@ -768,7 +767,6 @@ def test_aggregate_subsampling_profile_output(): def test_aggregate_subsampling_profile_output_multiple_queries(): - expected_result = pd.DataFrame( { "Metadata_Plate": ["plate", "plate"], @@ -814,7 +812,6 @@ def test_aggregate_subsampling_profile_output_multiple_queries(): def test_n_aggregation_memory_strata(): - df_n1 = AP.aggregate_profiles(n_aggregation_memory_strata=1) df_n2 = AP.aggregate_profiles(n_aggregation_memory_strata=2) df_n3 = AP.aggregate_profiles(n_aggregation_memory_strata=3) @@ -832,7 +829,6 @@ def test_invalid_n_aggregation_memory_strata(): def test_sqlite_strata_conditions(): - df = pd.DataFrame( data={ "TableNumber": [[1], [2], [3], [4]], @@ -1082,4 +1078,3 @@ def test_load_non_canonical_image_table(): result.sort_index(axis="columns").drop("Metadata_Site_Count", axis="columns"), sc_aggregated_df, ) - From 53b214bf6440eb79aad11d581b8c7c2b19992cee Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 15:52:58 -0400 Subject: [PATCH 77/82] Skip test --- pycytominer/tests/test_cyto_utils/test_cells.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index c3c59554..c5092864 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -427,7 +427,15 @@ def test_merge_single_cells(): check_dtype=False, ) - # Test non-canonical compartment merging + +@pytest.mark.skip( + reason="This test will soon fail because of a logic error in merge_single_cells" +) +def test_merge_single_cells_non_canonical(): + # The test raises this warning: + # FutureWarning: Passing 'suffixes' which cause duplicate columns + # {'ObjectNumber_cytoplasm'} in the result is deprecated and will raise a + # MergeError in a future version. 
# Test non-canonical compartment merging new_sc_merge_df = AP_NEW.merge_single_cells() assert sum(new_sc_merge_df.columns.str.startswith("New")) == 4 From 56d71a85ada57200bc5a97ff09e41c0313dfacb7 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 15:56:01 -0400 Subject: [PATCH 78/82] docs --- pycytominer/tests/test_cyto_utils/test_cells.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pycytominer/tests/test_cyto_utils/test_cells.py b/pycytominer/tests/test_cyto_utils/test_cells.py index c5092864..ee16bdee 100644 --- a/pycytominer/tests/test_cyto_utils/test_cells.py +++ b/pycytominer/tests/test_cyto_utils/test_cells.py @@ -435,7 +435,10 @@ def test_merge_single_cells_non_canonical(): # The test raises this warning: # FutureWarning: Passing 'suffixes' which cause duplicate columns # {'ObjectNumber_cytoplasm'} in the result is deprecated and will raise a - # MergeError in a future version. # Test non-canonical compartment merging + # MergeError in a future version. + # See https://github.com/cytomining/pycytominer/issues/266 + + # Test non-canonical compartment merging new_sc_merge_df = AP_NEW.merge_single_cells() assert sum(new_sc_merge_df.columns.str.startswith("New")) == 4 From fdebb84327532d20c121cc710bf4863c335e8bda Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 16:10:16 -0400 Subject: [PATCH 79/82] dtypes --- pycytominer/cyto_utils/cell_locations.py | 29 ++++++++++-------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 62b7722d..6deee12f 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -356,7 +356,17 @@ def _get_joined_image_nuclei_tables(self): ON Nuclei.{self.image_column} = Image.{self.image_column}; """ - joined_df = pd.read_sql_query(join_query, engine) + column_types = { + self.image_column: "int64", + self.object_column: "int64", + self.cell_x_loc: "float", + self.cell_y_loc: "float", + } + + for image_key in self.image_key: + column_types[image_key] = "str" + + joined_df = pd.read_sql_query(join_query, engine, dtype=column_types) # if the single_cell file was downloaded from S3, delete the temporary file if temp_single_cell_input is not None: @@ -373,22 +383,7 @@ def _load_single_cell(self): The required columns from the `Image` and `Nuclei` tables loaded into a Pandas DataFrame """ - joined_df = self._get_joined_image_nuclei_tables() - - # Cast the cell location columns to float - joined_df[self.cell_x_loc] = joined_df[self.cell_x_loc].astype(float) - joined_df[self.cell_y_loc] = joined_df[self.cell_y_loc].astype(float) - - # Cast the object column to int - joined_df[self.object_column] = joined_df[self.object_column].astype(int) - - # Cast the image index columns to str - for col in self.image_key: - joined_df[col] = joined_df[col].astype(str) - - joined_df = self._create_nested_df(joined_df) - - return joined_df + return self._create_nested_df(self._get_joined_image_nuclei_tables()) def add_cell_location(self): """Add the X,Y locations of all cells to the metadata file in the corresponding row, packed into a single column. 
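A note on the "dtypes" patch above: it swaps the post-hoc `.astype()` casts for a `dtype=` mapping passed straight to `pandas.read_sql_query`, so SQLite's loosely typed columns come back with the intended pandas dtypes in one step. The following is a minimal sketch of that pattern against a throwaway in-memory SQLite table; the table layout and values are illustrative stand-ins rather than the pycytominer test fixtures, and the `dtype=` argument assumes pandas >= 1.3.

    # Minimal sketch of typing SQLite columns at load time, as in the patch above.
    # The table contents are illustrative; only the dtype-mapping idea is real.
    import sqlite3

    import pandas as pd

    conn = sqlite3.connect(":memory:")
    conn.executescript(
        """
        CREATE TABLE Nuclei (
            ImageNumber INTEGER,
            ObjectNumber INTEGER,
            Nuclei_Location_Center_X REAL,
            Nuclei_Location_Center_Y REAL
        );
        INSERT INTO Nuclei VALUES (1, 1, 10.5, 20.5), (1, 2, 30.0, 40.0);
        """
    )

    # Declare the pandas dtypes up front instead of casting after the query
    # (the dtype argument of read_sql_query requires pandas >= 1.3).
    column_types = {
        "ImageNumber": "int64",
        "ObjectNumber": "int64",
        "Nuclei_Location_Center_X": "float",
        "Nuclei_Location_Center_Y": "float",
    }

    nuclei_df = pd.read_sql_query("SELECT * FROM Nuclei;", conn, dtype=column_types)
    conn.close()

    # ImageNumber/ObjectNumber -> int64, location columns -> float64
    print(nuclei_df.dtypes)

String-typed metadata keys can be added to the same mapping, which is what the patch does with its loop over `self.image_key`, keeping all type handling in one place.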
From c037ef90be2633e507ff35b3c89536063202c6be Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 16:14:21 -0400 Subject: [PATCH 80/82] skip test --- pycytominer/tests/test_cyto_utils/test_modz.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/pycytominer/tests/test_cyto_utils/test_modz.py b/pycytominer/tests/test_cyto_utils/test_modz.py index 8a026467..920fe84f 100644 --- a/pycytominer/tests/test_cyto_utils/test_modz.py +++ b/pycytominer/tests/test_cyto_utils/test_modz.py @@ -1,5 +1,6 @@ import os import random +import pytest import numpy as np import pandas as pd from pycytominer.cyto_utils import modz @@ -120,6 +121,8 @@ def test_modz_multiple_columns(): ) +# skip this test +@pytest.skip("TypeError: Could not convert ccc to numeric") def test_modz_multiple_columns_one_metadata_column(): replicate_columns = "Metadata_g" data_replicate_multi_df = data_replicate_df.assign(h=["c", "c", "c", "d", "d", "d"]) From 24caa04eecd41517691fc6fe80010866b75e65c9 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 16:23:29 -0400 Subject: [PATCH 81/82] Fix test --- pycytominer/tests/test_cyto_utils/test_modz.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/pycytominer/tests/test_cyto_utils/test_modz.py b/pycytominer/tests/test_cyto_utils/test_modz.py index 920fe84f..ca075ed8 100644 --- a/pycytominer/tests/test_cyto_utils/test_modz.py +++ b/pycytominer/tests/test_cyto_utils/test_modz.py @@ -121,8 +121,6 @@ def test_modz_multiple_columns(): ) -# skip this test -@pytest.skip("TypeError: Could not convert ccc to numeric") def test_modz_multiple_columns_one_metadata_column(): replicate_columns = "Metadata_g" data_replicate_multi_df = data_replicate_df.assign(h=["c", "c", "c", "d", "d", "d"]) @@ -146,7 +144,11 @@ def test_modz_multiple_columns_one_metadata_column(): consensus_df = modz( data_replicate_multi_df, replicate_columns, min_weight=1, precision=precision ) - expected_result = data_replicate_multi_df.groupby(replicate_columns).mean().round(4) + expected_result = ( + data_replicate_multi_df.groupby(replicate_columns) + .mean(numeric_only=True) + .round(4) + ) expected_result.index.name = replicate_columns pd.testing.assert_frame_equal( expected_result.reset_index(), consensus_df, check_exact=False, atol=1e-3 From e68ab4b5a785f3bff34beda5d8921a1bb7e94212 Mon Sep 17 00:00:00 2001 From: Shantanu Singh Date: Wed, 5 Apr 2023 16:27:25 -0400 Subject: [PATCH 82/82] Add docs --- pycytominer/cyto_utils/cell_locations.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pycytominer/cyto_utils/cell_locations.py b/pycytominer/cyto_utils/cell_locations.py index 6deee12f..755ab7a3 100644 --- a/pycytominer/cyto_utils/cell_locations.py +++ b/pycytominer/cyto_utils/cell_locations.py @@ -94,8 +94,8 @@ def __init__( self.image_key = image_key self.cell_x_loc = cell_x_loc self.cell_y_loc = cell_y_loc - # Does this mean we are constrained to only anonymous access for S3 resources? - # What would happen with non-anonymous access needs - is this something we should think about? + # Currently constrained to only anonymous access for S3 resources + # https://github.com/cytomining/pycytominer/issues/268 self.s3 = boto3.client( "s3", config=botocore.config.Config(signature_version=botocore.UNSIGNED) )
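The two test_modz patches above hinge on a pandas behavior change: `groupby(...).mean()` used to drop non-numeric columns, while recent pandas raises the `TypeError: Could not convert ... to numeric` quoted in the removed skip reason, hence the explicit `numeric_only=True` in the fix. (As an aside, the intermediate patch uses `@pytest.skip(...)` as a decorator; the usual decorator form is `@pytest.mark.skip(...)`, as in the earlier test_cells patch, though the follow-up patch drops the skip entirely.) Below is a minimal sketch of the behavior difference; the column names echo the modz test, but the values are made up, and the commented-out failure assumes pandas >= 2.0.

    # Illustrative only: column names echo the modz test, values are invented.
    import pandas as pd

    df = pd.DataFrame(
        {
            "Metadata_g": ["a", "a", "b", "b"],  # replicate grouping column
            "h": ["c", "c", "d", "d"],           # non-numeric metadata column
            "x": [1.0, 2.0, 3.0, 4.0],
            "y": [5.0, 6.0, 7.0, 8.0],
        }
    )

    # On pandas >= 2.0 this raises TypeError because column "h" cannot be
    # averaged; older releases dropped it silently or with a deprecation warning.
    # df.groupby("Metadata_g").mean()

    # Restricting the aggregation to numeric columns restores the old result,
    # which is what the "Fix test" patch does for the expected_result frame.
    expected = df.groupby("Metadata_g").mean(numeric_only=True)
    print(expected)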