Skip to content

Commit

Permalink
sqlite col vs val types; further testing connectorx
Browse files Browse the repository at this point in the history
  • Loading branch information
d33bs committed May 24, 2022
1 parent c1681eb commit 5b26bc7
Show file tree
Hide file tree
Showing 4 changed files with 496 additions and 2 deletions.
25 changes: 25 additions & 0 deletions src/pycytominer_#198/remove-sqlite-nans.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import sqlite3

import pandas as pd

# create connections for sqlite
# reference: https://nih.figshare.com/articles/dataset/Cell_Health_-_Cell_Painting_Single_Cell_Profiles/9995672
sqlite_conn = sqlite3.connect("mod_SQ00014613.sqlite")

# Declared column types (the "type" field of PRAGMA table_info) whose columns
# are scanned for 'nan' text values.
NUMERIC_DECLARED_TYPES = ["FLOAT", "BIGINT"]


def _quote_identifier(name):
    """Return *name* as a double-quoted SQLite identifier.

    Embedded double quotes are doubled, so names containing spaces, quotes or
    reserved words can be interpolated into a statement safely.
    """
    return '"' + str(name).replace('"', '""') + '"'


def replace_nan_text(conn, table, columns):
    """Replace the text ``nan`` with 0 in the given columns of one table.

    Only rows whose stored value is text and contains ``nan`` are updated.
    SQLite's ``replace()`` returns text, so running it over every row (as an
    UPDATE without a WHERE clause does) would push each numeric value through
    a number -> text -> number round trip, which can alter real values and
    needlessly rewrites the whole table.

    Parameters
    ----------
    conn : sqlite3.Connection
        Open connection to the database to modify.
    table : str
        Name of the table to update.
    columns : iterable of str
        Names of the columns to clean.

    Returns
    -------
    None
        The changes are committed once all columns of the table are updated.
    """
    quoted_table = _quote_identifier(table)
    for column in columns:
        quoted_column = _quote_identifier(column)
        conn.execute(
            f"UPDATE {quoted_table} "
            f"SET {quoted_column} = replace({quoted_column}, 'nan', 0) "
            f"WHERE typeof({quoted_column}) = 'text' "
            f"AND instr({quoted_column}, 'nan') > 0;"
        )
    conn.commit()


try:
    # column metadata (name, declared type, ...) for each table of interest
    image_cols = pd.read_sql("PRAGMA table_info(Image);", con=sqlite_conn)
    cells_cols = pd.read_sql("PRAGMA table_info(Cells);", con=sqlite_conn)
    cyto_cols = pd.read_sql("PRAGMA table_info(Cytoplasm);", con=sqlite_conn)
    nuclei_cols = pd.read_sql("PRAGMA table_info(Nuclei);", con=sqlite_conn)

    df_dict = {
        "image": image_cols,
        "cells": cells_cols,
        "cytoplasm": cyto_cols,
        "nuclei": nuclei_cols,
    }

    for tabname, df in df_dict.items():
        # only columns declared as FLOAT / BIGINT are cleaned
        numeric_cols = df.loc[
            df["type"].isin(NUMERIC_DECLARED_TYPES), "name"
        ].tolist()
        replace_nan_text(sqlite_conn, tabname, numeric_cols)
finally:
    # release the database file even if one of the statements fails
    sqlite_conn.close()
273 changes: 273 additions & 0 deletions src/pycytominer_#198/sqlite-database-types.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,273 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "321a7122-455a-4994-88ae-189d10773d31",
"metadata": {},
"source": [
"# SQLite Database Types\n",
"\n",
"Checking on the types within the database to investigate connector-x compatibility as per https://github.com/sfu-db/connector-x/blob/main/Types.md#sqlite.\n",
"\n",
"Example errors:\n",
"- `RuntimeError: Invalid column type Text at index: 61, name: Cytoplasm_Correlation_Costes_AGP_DNA`\n",
"- `RuntimeError: Invalid column type Text at index: 64, name: Cytoplasm_Correlation_Costes_AGP_RNA`\n",
"- `RuntimeError: Invalid column type Text at index: 74, name: Cytoplasm_Correlation_Costes_Mito_DNA`\n",
"- `...Cytoplasm_Correlation_K_ER_Mito`\n",
"- "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "96145791-4f23-43c9-802b-323d7a530da5",
"metadata": {},
"outputs": [],
"source": [
"import sqlite3\n",
"\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "30e1c744-8c17-4a4b-976a-eb02c40689ce",
"metadata": {},
"outputs": [],
"source": [
"# create connections for sqlite\n",
"# reference: https://nih.figshare.com/articles/dataset/Cell_Health_-_Cell_Painting_Single_Cell_Profiles/9995672\n",
"sqlite_conn = sqlite3.connect(\"SQ00014613.sqlite\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1cd718c0-93ba-40fd-a5ec-ba5862d9bc7f",
"metadata": {},
"outputs": [],
"source": [
"sql = \"\"\"\n",
"PRAGMA table_info(Image);\n",
"\"\"\"\n",
"image_cols = pd.read_sql(sql, con=sqlite_conn)\n",
"image_cols[\"type\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b867daa1-a673-4c91-a10d-52cbec1c12ab",
"metadata": {},
"outputs": [],
"source": [
"sql = \"\"\"\n",
"PRAGMA table_info(Cells);\n",
"\"\"\"\n",
"cells_cols = pd.read_sql(sql, con=sqlite_conn)\n",
"cells_cols[\"type\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5ac19955-f5e9-48c8-b5c5-d987488bfcf0",
"metadata": {},
"outputs": [],
"source": [
"sql = \"\"\"\n",
"PRAGMA table_info(Cytoplasm);\n",
"\"\"\"\n",
"cyto_cols = pd.read_sql(sql, con=sqlite_conn)\n",
"cyto_cols[\"type\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "01b0301f-edad-46cf-8509-36c84283ebdc",
"metadata": {},
"outputs": [],
"source": [
"sql = \"\"\"\n",
"PRAGMA table_info(Nuclei);\n",
"\"\"\"\n",
"nuclei_cols = pd.read_sql(sql, con=sqlite_conn)\n",
"nuclei_cols[\"type\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5b772781-8ae9-4c53-9a43-d431091586bd",
"metadata": {},
"outputs": [],
"source": [
"df_dict = {\n",
" \"image\": image_cols,\n",
" \"cells\": cells_cols,\n",
" \"cytoplasm\": cyto_cols,\n",
" \"nuclei\": nuclei_cols,\n",
"}\n",
"len(df_dict.keys())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "16a98b86-2368-48c2-8dde-841b30198f6d",
"metadata": {},
"outputs": [],
"source": [
"df_dict[\"image\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6427d2ab-65fc-4d39-9419-548cbeec6ed8",
"metadata": {},
"outputs": [],
"source": [
"sql = \"\"\"\n",
"select * from Cytoplasm\n",
"where rowid = 61 or rowid = 60;\n",
"\"\"\"\n",
"cyto_errs = pd.read_sql(sql, con=sqlite_conn)\n",
"cyto_errs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f5aca2e4-52d2-46b8-bed4-e6e67d63711b",
"metadata": {},
"outputs": [],
"source": [
"cyto_errs[\"Cytoplasm_Correlation_Costes_AGP_DNA\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e911a997-7c06-4c86-a5a2-500f96f52828",
"metadata": {},
"outputs": [],
"source": [
"sql = \"\"\"\n",
"select ObjectNumber, \n",
" Cytoplasm_Correlation_Costes_AGP_DNA, \n",
" typeof(Cytoplasm_Correlation_Costes_AGP_DNA) from Cytoplasm\n",
"where rowid between 60 and 61;\n",
"\"\"\"\n",
"cyto_errs = pd.read_sql(sql, con=sqlite_conn)\n",
"cyto_errs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d9e49d05-be26-461b-8943-7936a8e5a468",
"metadata": {},
"outputs": [],
"source": [
"cyto_cols[cyto_cols[\"name\"] == \"Cytoplasm_Correlation_Costes_AGP_DNA\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a0a8d54d-be71-44de-aa62-fa1e746cad46",
"metadata": {},
"outputs": [],
"source": [
"sql = \"\"\"\n",
"select ObjectNumber, \n",
" Cytoplasm_Correlation_Costes_AGP_DNA, \n",
" typeof(Cytoplasm_Correlation_Costes_AGP_DNA) from Cytoplasm\n",
"where typeof(Cytoplasm_Correlation_Costes_AGP_DNA) != 'real';\n",
"\"\"\"\n",
"cyto_errs = pd.read_sql(sql, con=sqlite_conn)\n",
"cyto_errs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "8709db4e-da28-4dd9-a921-e7d6a16c9044",
"metadata": {},
"outputs": [],
"source": [
"sql = \"\"\"\n",
"select ObjectNumber, \n",
" Cytoplasm_Correlation_Costes_AGP_DNA, \n",
" typeof(Cytoplasm_Correlation_Costes_AGP_DNA) from Cytoplasm\n",
"where typeof(Cytoplasm_Correlation_Costes_AGP_DNA) != 'real';\n",
"\"\"\"\n",
"sqlite_conn.execute(sql).fetchall()[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "467df648-9283-4812-a835-14ef8ace010b",
"metadata": {},
"outputs": [],
"source": [
"sql = \"\"\"\n",
"select ObjectNumber, \n",
" Cytoplasm_Correlation_Costes_AGP_DNA,\n",
" replace(Cytoplasm_Correlation_Costes_AGP_DNA, 'nan', NULL),\n",
" typeof(Cytoplasm_Correlation_Costes_AGP_DNA) from Cytoplasm\n",
"where typeof(Cytoplasm_Correlation_Costes_AGP_DNA) != 'real';\n",
"\"\"\"\n",
"cyto_errs = pd.read_sql(sql, con=sqlite_conn)\n",
"cyto_errs"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b036e290-1de9-4847-9869-52d9dad06bf3",
"metadata": {},
"outputs": [],
"source": [
"for tabname, df in df_dict.items():\n",
" for colname in df[df[\"type\"].isin([\"FLOAT\", \"BIGINT\"])][\"name\"].values.tolist():\n",
" sql = f\"UPDATE {tabname} SET {colname} = replace({colname}, 'nan', 0);\"\n",
" sqlite_conn.execute(sql)\n",
" sqlite_conn.commit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d1147f2d-a1e1-49c3-8c3a-e645ecbeb840",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.12"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
4 changes: 2 additions & 2 deletions src/pycytominer_#198/test_connectorx.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
# reference https://github.com/cytomining/pycytominer/issues/195
# shrunk file for quicker testing as per work within shrink-demo-file.ipynb
sql_path = "testing_SQ00014613.sqlite"
# URL form of the same database file; derived from sql_path so the two
# values cannot drift apart when the test file is swapped
sql_url = f"sqlite:///{sql_path}"

# referenced from https://github.com/cytomining/pycytominer/blob/master/pycytominer/cyto_utils/cells.py
def merge_single_cells(
Expand Down Expand Up @@ -191,5 +192,4 @@ def mem_profile_func():
sc_p.load_compartment = types.MethodType(new_load_compartment, sc_p)
return merge_single_cells(self=sc_p)


mem_profile_func()
print(mem_profile_func().info())
Loading

0 comments on commit 5b26bc7

Please sign in to comment.