diff --git a/README.md b/README.md index 6a239c6..257c2e1 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Hypergraph Multimorbidity (Hypergraph-mm) -## NHS England - Digital Analytics and Research Team - PhD Internship Project +## NHS England - Digital Analytics and Research Team - PhD Internship Projects ### About the Project @@ -10,7 +10,9 @@ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black) -This repository holds supporting code for the Transforming Healthcare Data with Graph-based Techniques Using SAIL DataBank project including a copy of the framework built to perform the hypergraph calculations. A link to the original project proposal can be found [here](https://nhsx.github.io/nhsx-internship-projects/). +This repository holds supporting code for the "Transforming Healthcare Data with Graph-based Techniques Using SAIL DataBank" project including a copy of the framework built to perform the hypergraph calculations. It also supports the work presented in the pre-print [Representing multimorbid disease progressions using directed hypergraphs](https://doi.org/10.1101/2023.08.31.23294903). + +A link to the original project proposal can be found [here](https://nhsx.github.io/nhsx-internship-projects/). _**Note:** Only public or fake data are shared in this repository._ @@ -111,6 +113,6 @@ of the [Open Government 3.0][ogl] licence. ### Contact -To find out more about the [Digital Analytics and Research Team](https://www.nhsx.nhs.uk/key-tools-and-info/nhsx-analytics-unit/) visit our [project website](https://nhsx.github.io/AnalyticsUnit/projects.html) or get in touch at [england.tdau@nhs.net](mailto:england.tdau@nhs.net). +To find out more about the [Digital Analytics and Research Team](https://www.nhsx.nhs.uk/key-tools-and-info/nhsx-analytics-unit/) visit our [project website](https://nhsx.github.io/AnalyticsUnit/projects.html) or get in touch at [datascience@nhs.net](mailto:datascience@nhs.net). diff --git a/docs/index.md b/docs/index.md index 1fae3a4..d715733 100644 --- a/docs/index.md +++ b/docs/index.md @@ -1 +1 @@ -# Welcome to Hypergraph Testing +# Welcome to `hypergraph-mm` diff --git a/notebooks/full_test_case_demo.ipynb b/notebooks/full_test_case_demo.ipynb index 7bbcb9d..bccfb72 100644 --- a/notebooks/full_test_case_demo.ipynb +++ b/notebooks/full_test_case_demo.ipynb @@ -33,6 +33,7 @@ "\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", + "import matplotlib.tri as tri\n", "import numba\n", "import numpy as np\n", "import pandas as pd\n", @@ -43,6 +44,111 @@ "from hypmm import build_model, centrality, centrality_utils, utils, weight_functions" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "ptm_cmap = plt.cm.get_cmap(\"Reds\")\n", + "\n", + "\n", + "def plot_PTM(ptm_arr, cmap, dis_cols, max_val, fontsize, annotate=True):\n", + "\n", + " # Plot heatmap of transition probabilities\n", + " fig, ax = plt.subplots(1, 1, figsize=(12, 9))\n", + " sns.heatmap(\n", + " ptm_arr,\n", + " cmap=cmap,\n", + " ax=ax,\n", + " linewidth=0.5,\n", + " linecolor=\"k\",\n", + " vmax=max_val,\n", + " cbar=True,\n", + " annot=annotate,\n", + " annot_kws={\"fontsize\": fontsize},\n", + " )\n", + " cbar = ax.collections[0].colorbar\n", + " cbar.ax.tick_params(labelsize=fontsize)\n", + " ax.set_yticklabels(dis_cols, fontsize=fontsize, rotation=0)\n", + " ax.set_xticklabels(dis_cols, fontsize=fontsize, ha=\"center\", rotation=90)\n", + " ax.xaxis.tick_top()\n", + " fig.tight_layout()\n", + "\n", + " return fig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def plot_PR(\n", + " pred_PR,\n", + " succ_PR,\n", + " dis_cols,\n", + " vmax=1.0,\n", + " offset=(-0.06, 0.01),\n", + " bg_color=False,\n", + " fontsize=20,\n", + " add_text=False,\n", + " colors=None,\n", + "):\n", + "\n", + " # Define colors\n", + " N_diseases = len(dis_cols)\n", + " nx, ny = offset\n", + " if colors is None:\n", + " colors = []\n", + " for i in range(N_diseases):\n", + " np.random.seed(i)\n", + " colors.append(np.random.choice(range(156), size=3) / 255)\n", + "\n", + " # Define triangles\n", + " L = vmax\n", + " x = [L, L, 0, 0, 0.5 * L]\n", + " y = [L, 0, 0, L, 0.5 * L]\n", + " s = [0, -0.9 * L, 0, 0.9 * L, 0]\n", + " triangles = [[2, 0, 1], [2, 0, 2], [2, 0, 3]]\n", + " triang = tri.Triangulation(x, y, triangles)\n", + "\n", + " # Figure\n", + " fig, ax = plt.subplots(1, 1, figsize=(8, 8))\n", + " ax.axis([0, L, 0, L])\n", + " ax.plot([0, L], [0, L], linewidth=3, color=\"k\")\n", + " if bg_color:\n", + " ax.tripcolor(triang, s, alpha=0.3, cmap=\"bwr\", shading=\"gouraud\")\n", + " ax.grid(\"on\")\n", + "\n", + " # Annotate with disease titles\n", + " dirhyp_pagerank = pred_PR.merge(succ_PR, on=\"Disease\", how=\"inner\")\n", + " dirhyp_pagerank.columns = [\"Disease\", \"Predecessor\", \"Successor\"]\n", + " dirhyp_pagerank_arr = dirhyp_pagerank[[\"Successor\", \"Predecessor\"]].values\n", + " for i, pts in enumerate(dirhyp_pagerank_arr):\n", + " color = colors[i]\n", + " ax.scatter(pts[0], pts[1], s=750, color=color)\n", + " ax.text(pts[0] + nx, pts[1] + ny, s=dis_cols[i], color=\"k\", fontsize=20)\n", + "\n", + " # Annotate figure labels, etc.\n", + " if add_text:\n", + " ax.text(\n", + " 0.3,\n", + " 0.35,\n", + " \"Transitive Conditions\",\n", + " fontsize=fontsize,\n", + " rotation=45,\n", + " transform=ax.transAxes,\n", + " )\n", + " ax.set_xlabel(\"Successor PageRank\", fontsize=fontsize + 2)\n", + " ax.set_ylabel(\"Predecessor PageRank\", fontsize=fontsize + 2)\n", + " ax.tick_params(axis=\"both\", labelsize=fontsize)\n", + " ax.set_aspect(\"equal\", \"box\")\n", + " fig.tight_layout()\n", + "\n", + " return fig" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -123,6 +229,9 @@ "metadata": {}, "outputs": [], "source": [ + "# SYNTHETIC EXAMPLE FOR PAPER\n", + "\n", + "\n", "binmat = np.array(\n", " [\n", " [1, 1, 1],\n", @@ -146,10 +255,10 @@ " [0, 1, 2],\n", " [2, 0, 1],\n", " [1, 2, -1],\n", - " [1, -1, -1],\n", + " [0, -1, -1],\n", " [2, -1, -1],\n", " [1, 0, 2],\n", - " [1, 0, -1],\n", + " [0, 1, -1],\n", " [0, 2, -1],\n", " ],\n", " dtype=np.int8,\n", @@ -185,46 +294,6 @@ "## Compute hyperedge and hyperarc weights" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "inc_mat, weights, rev_arrs = build_model.compute_weights(\n", - " binmat,\n", - " conds_worklist,\n", - " idx_worklist,\n", - " colarr,\n", - " \"exclusive\",\n", - " weight_functions.modified_sorensen_dice_coefficient,\n", - " utils.compute_simple_progset,\n", - " dice_type=1,\n", - " plot=True,\n", - " ret_inc_mat=None,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "inc_mat, weights, prev_arrs = build_model.compute_weights(\n", - " binmat,\n", - " conds_worklist,\n", - " idx_worklist,\n", - " colarr,\n", - " \"power\",\n", - " weight_functions.modified_sorensen_dice_coefficient,\n", - " utils.compute_pwset_progset,\n", - " dice_type=1,\n", - " plot=True,\n", - " ret_inc_mat=None,\n", - ")" - ] - }, { "cell_type": "code", "execution_count": null, @@ -311,7 +380,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "## Successor Detection" + "# Probability transition matrices" ] }, { @@ -324,13 +393,33 @@ " inc_mat_tail, inc_mat_head, edge_weights, node_degree_tail, None, \"successor\", eps=0\n", ")\n", "\n", - "fig, ax = plt.subplots(1, 1, figsize=(10, 8))\n", - "sns.heatmap(P_node, cmap=\"inferno\", ax=ax, annot=True, annot_kws={\"fontsize\": 16})\n", - "ax.set_yticklabels(colarr, fontsize=15, rotation=0)\n", - "ax.set_xticklabels(colarr, fontsize=15, rotation=45)\n", - "ax.set_title(\"Successor Transition Matrix\", fontsize=18, pad=20)\n", - "ax.xaxis.tick_top()\n", - "plt.show()" + "succ_ptm = plot_PTM(P_node, ptm_cmap, colarr, max_val=0.95, fontsize=36)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "P_node = centrality.comp_ptm_std(\n", + " inc_mat_tail,\n", + " inc_mat_head,\n", + " edge_weights,\n", + " node_degree_head,\n", + " edge_degree_tail,\n", + " \"predecessor\",\n", + " eps=0,\n", + ")\n", + "\n", + "pred_ptm = plot_PTM(P_node, ptm_cmap, colarr, max_val=0.95, fontsize=36)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# PageRank" ] }, { @@ -369,40 +458,13 @@ " .reset_index(drop=True)\n", " .Disease\n", ")\n", - "all_node_sc_evc.sort_values(by=\"PageRank Centrality\", ascending=False).reset_index(\n", - " drop=True\n", - ").round({\"PageRank Centrality\": 3})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Predecessor Detection" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "P_pred_node = centrality.comp_ptm_std(\n", - " inc_mat_tail,\n", - " inc_mat_head,\n", - " edge_weights,\n", - " node_degree_head,\n", - " edge_degree_tail,\n", - " rep=\"predecessor\",\n", + "succ_node_sc_evc = (\n", + " all_node_sc_evc.sort_values(by=\"PageRank Centrality\", ascending=False)\n", + " .reset_index(drop=True)\n", + " .round({\"PageRank Centrality\": 3})\n", ")\n", "\n", - "fig, ax = plt.subplots(1, 1, figsize=(10, 8))\n", - "sns.heatmap(P_pred_node, cmap=\"inferno\", ax=ax, annot=True, annot_kws={\"fontsize\": 16})\n", - "ax.set_yticklabels(colarr, fontsize=15, rotation=0)\n", - "ax.set_xticklabels(colarr, fontsize=15, rotation=45)\n", - "ax.set_title(\"Predecessor Transition Matrix\", fontsize=18, pad=20)\n", - "ax.xaxis.tick_top()\n", - "plt.show()" + "succ_node_sc_evc" ] }, { @@ -441,18 +503,20 @@ " .reset_index(drop=True)\n", " .Disease\n", ")\n", - "all_node_pr_evc.sort_values(by=\"PageRank Centrality\", ascending=False).reset_index(\n", - " drop=True\n", - ").round({\"PageRank Centrality\": 3})" + "pred_node_sc_evc = (\n", + " all_node_pr_evc.sort_values(by=\"PageRank Centrality\", ascending=False)\n", + " .reset_index(drop=True)\n", + " .round({\"PageRank Centrality\": 3})\n", + ")\n", + "\n", + "pred_node_sc_evc" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "# Compute Undirected Representation of Directed Hypergraph\n", - "\n", - "## Node Adjacency Matrix" + "## Successor-Predecessor PageRank Visualisation" ] }, { @@ -461,30 +525,24 @@ "metadata": {}, "outputs": [], "source": [ - "# Construct incidence matrix for undirected representation of directed hypergraph\n", - "incidence_matrix = np.concatenate([inc_mat_tail, inc_mat_head], axis=0)" + "colors = [\"r\", \"g\", \"b\"]\n", + "PR_fig = plot_PR(\n", + " pred_node_sc_evc,\n", + " succ_node_sc_evc,\n", + " colarr,\n", + " colors=colors,\n", + " add_text=True,\n", + " bg_color=True,\n", + " vmax=1.0,\n", + ")" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "node_adj = centrality.comp_adjacency(\n", - " incidence_matrix,\n", - " hyperarc_weights,\n", - " node_weights,\n", - " rep=\"standard\",\n", - " weight_resultant=True,\n", - ")\n", - "\n", - "fig, ax = plt.subplots(1, 1, figsize=(10, 7))\n", - "sns.heatmap(node_adj, cmap=\"inferno\", ax=ax, annot=True)\n", - "ax.set_yticks(ax.get_yticks(), node_titles, fontsize=12, rotation=0)\n", - "ax.set_xticklabels(node_titles, fontsize=12, rotation=45, ha=\"left\")\n", - "ax.xaxis.tick_top()\n", - "plt.show()" + "---\n", + "# Test to compute equally likely disease transitions among conditions" ] }, { @@ -493,34 +551,88 @@ "metadata": {}, "outputs": [], "source": [ - "node_centrality = centrality.eigenvector_centrality(\n", - " incidence_matrix,\n", - " hyperarc_weights,\n", - " node_weights,\n", - " rep=\"standard\",\n", - " tolerance=1e-6,\n", - " max_iterations=1000,\n", - " weight_resultant=True,\n", - " random_seed=None,\n", + "# PREPARED EXAMPLE TO GENERATE EQUALLY LIKELY PAGERANK\n", + "\n", + "binmat = np.array(\n", + " [\n", + " [1, 1, 1],\n", + " [1, 1, 1],\n", + " [1, 1, 1],\n", + " [1, 1, 1],\n", + " [1, 1, 1],\n", + " [1, 1, 1],\n", + " [1, 1, 1],\n", + " [1, 1, 1],\n", + " [1, 1, 1],\n", + " [1, 1, 1],\n", + " [1, 1, 1],\n", + " [1, 1, 1],\n", + " ],\n", + " dtype=np.int8,\n", ")\n", - "node_evc = pd.DataFrame(\n", - " {\n", - " \"Disease\": node_titles,\n", - " \"Eigenvector Centrality\": node_centrality,\n", - " }\n", + "\n", + "conds_worklist = np.array(\n", + " [\n", + " [0, 1, 2],\n", + " [0, 2, 1],\n", + " [1, 2, 0],\n", + " [1, 0, 2],\n", + " [2, 0, 1],\n", + " [2, 1, 0],\n", + " [0, 1, 2],\n", + " [0, 2, 1],\n", + " [1, 2, 0],\n", + " [1, 0, 2],\n", + " [2, 0, 1],\n", + " [2, 1, 0],\n", + " ],\n", + " dtype=np.int8,\n", ")\n", - "node_evc.sort_values(by=\"Eigenvector Centrality\", ascending=False).reset_index(\n", - " drop=True\n", - ")" + "\n", + "idx_worklist = np.array(\n", + " [\n", + " [-1, -1, -1],\n", + " [-1, -1, -1],\n", + " [-1, -1, -1],\n", + " [-1, -1, -1],\n", + " [-1, -1, -1],\n", + " [-1, -1, -1],\n", + " [-1, -1, -1],\n", + " [-1, -1, -1],\n", + " [-1, -1, -1],\n", + " [-1, -1, -1],\n", + " [-1, -1, -1],\n", + " [-1, -1, -1],\n", + " ],\n", + " dtype=np.int8,\n", + ")\n", + "\n", + "colarr = np.array([\"A\", \"B\", \"C\"], dtype=\" C to A, B, C -> C.\n", - "# This is so the hyperarc adjacency matrix will now measure strength of connection by\n", - "# not only matching the tail-head combination of hyperarcs together, but also matching those\n", - "# hyperarcs which feed into higher degree hyperarcs too. Note the difference in\n", - "# adjacency strength between A -> B and A, B -> C in the following setup in comparison to above.\n", - "new_inc_mat_tail = inc_mat_tail + inc_mat_head\n", - "new_inc_mat_tail[new_inc_mat_tail > 1] = 1\n", - "edge_incidence_matrix = np.concatenate([new_inc_mat_tail, inc_mat_head], axis=0)" + "P_node = centrality.comp_ptm_std(\n", + " inc_mat_tail, inc_mat_head, edge_weights, node_degree_tail, None, \"successor\", eps=0\n", + ")\n", + "\n", + "succ_ptm = plot_PTM(P_node, ptm_cmap, colarr, max_val=0.95, fontsize=36)" ] }, { @@ -602,20 +694,17 @@ "metadata": {}, "outputs": [], "source": [ - "edge_adj = centrality.comp_adjacency(\n", - " edge_incidence_matrix,\n", - " hyperarc_weights,\n", - " node_weights,\n", - " rep=\"dual\",\n", - " weight_resultant=True,\n", + "P_node = centrality.comp_ptm_std(\n", + " inc_mat_tail,\n", + " inc_mat_head,\n", + " edge_weights,\n", + " node_degree_head,\n", + " edge_degree_tail,\n", + " \"predecessor\",\n", + " eps=0,\n", ")\n", "\n", - "fig, ax = plt.subplots(1, 1, figsize=(10, 7))\n", - "sns.heatmap(edge_adj, cmap=\"inferno\", ax=ax, annot=True)\n", - "ax.set_yticks(ax.get_yticks(), hyperarc_titles, fontsize=12, rotation=0)\n", - "ax.set_xticklabels(hyperarc_titles, fontsize=12, rotation=45, ha=\"left\")\n", - "ax.xaxis.tick_top()\n", - "plt.show()" + "pred_ptm = plot_PTM(P_node, ptm_cmap, colarr, max_val=0.95, fontsize=36)" ] }, { @@ -624,103 +713,43 @@ "metadata": {}, "outputs": [], "source": [ - "hyperarc_centrality = centrality.eigenvector_centrality(\n", - " edge_incidence_matrix,\n", - " hyperarc_weights,\n", - " node_weights,\n", - " rep=\"dual\",\n", + "inpt = (\n", + " (inc_mat_tail, inc_mat_head),\n", + " (edge_weights, node_weights),\n", + " node_degs,\n", + " edge_degree_tail,\n", + ")\n", + "node_pagerank = centrality.pagerank_centrality(\n", + " inpt,\n", + " rep=\"standard\",\n", + " typ=\"successor\",\n", " tolerance=1e-6,\n", " max_iterations=1000,\n", - " weight_resultant=True,\n", + " is_irreducible=False,\n", + " weight_resultant=False,\n", " random_seed=None,\n", + " eps=0.0001,\n", ")\n", "\n", - "n_conds = n_diseases * [1] + [\n", - " len(d.split(\",\")) + 1 for d in hyperarc_titles[n_diseases:]\n", - "]\n", - "hyperarc_evc = pd.DataFrame(\n", + "all_node_sc_evc = pd.DataFrame(\n", " {\n", - " \"Disease\": hyperarc_titles,\n", - " \"Degree\": n_conds,\n", - " \"Eigenvector Centrality\": hyperarc_centrality,\n", + " \"Disease\": colarr,\n", + " \"PageRank Centrality\": node_pagerank,\n", " }\n", ")\n", - "hyperarc_evc.sort_values(by=\"Eigenvector Centrality\", ascending=False).reset_index(\n", - " drop=True\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "degrees = []\n", - "top_progressions = pd.DataFrame(columns=[\"Disease\", \"Degree\", \"Eigenvector Centrality\"])\n", - "for i, row in hyperarc_evc.sort_values(\n", - " by=\"Eigenvector Centrality\", ascending=False\n", - ").iterrows():\n", - " dis, deg, cent = row\n", - " if deg not in degrees:\n", - " degrees.append(deg)\n", - " top_progressions = pd.concat([top_progressions, pd.DataFrame(row).T], axis=0)\n", - "top_progressions.reset_index(drop=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Multimorbidity pathway plot using progressive edge adjacency" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Extract top top_n hyperarcs for up to max_d-degree hyperarcs\n", - "top_n = 5 # 3\n", - "top_progressions = pd.DataFrame()\n", - "max_d = 3 # 3\n", - "\n", - "for deg in range(1, max_d + 1):\n", - " deg_hyperarc_evc = hyperarc_evc[hyperarc_evc.Degree == deg].sort_values(\n", - " by=\"Eigenvector Centrality\", ascending=False, axis=0\n", - " )\n", - " top_progressions = pd.concat(\n", - " [top_progressions, deg_hyperarc_evc.iloc[:top_n]], axis=0\n", - " )\n", - "top_progressions.reset_index(drop=True).sort_values(by=\"Degree\", ascending=True, axis=0)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Sort indexing out for plot and rename self-transitions to original disease title\n", - "rank_progressions = top_progressions.copy(deep=True)\n", - "\n", - "index_out = []\n", - "for deg in range(1, max_d + 1):\n", - " local_max = rank_progressions[rank_progressions[\"Degree\"] == deg].shape[0]\n", - "\n", - " for i in range(local_max):\n", - " if i <= top_n:\n", - " index_out.append(i)\n", - "\n", - "rank_progressions.index = index_out\n", - "rank_progressions.reset_index(inplace=True)\n", - "rank_progressions[\"Disease\"] = [\n", - " prog.split(\" -> \")[0] if (prog.split(\" -> \")[0] == prog.split(\" -> \")[1]) else prog\n", - " for prog in rank_progressions.Disease\n", - "]\n", + "print(f\"PageRank sum: {node_pagerank.sum()}\")\n", + "succ_order = (\n", + " all_node_sc_evc.sort_values(by=\"PageRank Centrality\", ascending=False)\n", + " .reset_index(drop=True)\n", + " .Disease\n", + ")\n", + "succ_node_sc_evc = (\n", + " all_node_sc_evc.sort_values(by=\"PageRank Centrality\", ascending=False)\n", + " .reset_index(drop=True)\n", + " .round({\"PageRank Centrality\": 3})\n", + ")\n", "\n", - "rank_progressions" + "succ_node_sc_evc" ] }, { @@ -729,48 +758,43 @@ "metadata": {}, "outputs": [], "source": [ - "# populate list for every hyperarc where they feed into hyperarcs of degree above\n", - "# for plotting purposes below\n", - "prog_idx_lists = []\n", - "\n", - "# Loop over degrees\n", - "for deg in range(1, max_d):\n", - "\n", - " # Extract degree specific top ranking hyperarcs\n", - " deg_dis_progs = rank_progressions[rank_progressions.Degree == deg].reset_index(\n", - " drop=True\n", - " )\n", - " lst_idx = deg - 1\n", - " n_degs = deg_dis_progs.shape[0]\n", - "\n", - " # Initialise list of lists for each top ranking hyperarc of that degree\n", - " prog_idx_lists.append([[] for i in range(n_degs)])\n", - "\n", - " # Loop over these top ranking hyperarc of degree deg\n", - " for deg_dis_prog in deg_dis_progs.iterrows():\n", - " i, (idx, dis_prog, dis_deg, eig_cent) = deg_dis_prog\n", - "\n", - " # If not a self-transition, extract tail members and head member\n", - " if deg != 1:\n", - " prog_tail = dis_prog.split(\" -> \")[0].split(\", \")\n", - " prog_head = dis_prog.split(\" -> \")[1]\n", - " prog_dis = np.sort(prog_tail + [prog_head])\n", - "\n", - " # Otherwise, just extract whole progression as it's one disease\n", - " else:\n", - " prog_dis = dis_prog\n", + "inpt = (\n", + " (inc_mat_tail, inc_mat_head),\n", + " (edge_weights, node_weights),\n", + " node_degs,\n", + " edge_degree_tail,\n", + ")\n", + "node_pagerank = centrality.pagerank_centrality(\n", + " inpt,\n", + " rep=\"standard\",\n", + " typ=\"predecessor\",\n", + " tolerance=1e-6,\n", + " max_iterations=100,\n", + " is_irreducible=False,\n", + " weight_resultant=False,\n", + " random_seed=None,\n", + " eps=0.0001,\n", + ")\n", "\n", - " # Extract all top ranking hyperarcs of the degree above\n", - " degabove_dis_progs = rank_progressions[\n", - " rank_progressions.Degree == dis_deg + 1\n", - " ].Disease\n", + "all_node_pr_evc = pd.DataFrame(\n", + " {\n", + " \"Disease\": colarr,\n", + " \"PageRank Centrality\": node_pagerank,\n", + " }\n", + ")\n", + "print(f\"PageRank sum: {node_pagerank.sum()}\")\n", + "pred_order = (\n", + " all_node_pr_evc.sort_values(by=\"PageRank Centrality\", ascending=False)\n", + " .reset_index(drop=True)\n", + " .Disease\n", + ")\n", + "pred_node_sc_evc = (\n", + " all_node_pr_evc.sort_values(by=\"PageRank Centrality\", ascending=False)\n", + " .reset_index(drop=True)\n", + " .round({\"PageRank Centrality\": 3})\n", + ")\n", "\n", - " # Loop over these top ranking hyperarcs of the degree above, check if their tail set matches\n", - " # the prog_dis of above, append ranking index to the corresponding list in prog_idx_lists\n", - " for j, degabove_dis in enumerate(degabove_dis_progs):\n", - " degabove_tail = np.sort(degabove_dis.split(\" -> \")[0].split(\", \"))\n", - " if np.all(prog_dis == degabove_tail):\n", - " prog_idx_lists[lst_idx][i].append(j)" + "pred_node_sc_evc" ] }, { @@ -779,63 +803,17 @@ "metadata": {}, "outputs": [], "source": [ - "# Line differentiation and progression line colour differentiation\n", - "linestyle_lsts = list(matplotlib.lines.lineStyles.keys())[:4]\n", - "prog_colors = list(plt.get_cmap(\"nipy_spectral\")(np.linspace(0.1, 0.9, max_d)))\n", - "\n", - "# Plot progressions\n", - "sq_s = 10\n", - "fig, ax = plt.subplots(1, 1, figsize=(sq_s, sq_s))\n", - "sns.scatterplot(\n", - " y=\"index\",\n", - " x=\"Degree\",\n", - " hue=\"Disease\",\n", - " s=200,\n", - " marker=\"o\",\n", - " data=rank_progressions,\n", - " ax=ax,\n", - " legend=False,\n", - ")\n", - "ax.set_ylabel(\"Progression Rank\", fontsize=15)\n", - "ax.set_xlabel(\"Hyperarc Degree\", fontsize=15)\n", - "ax.set_yticks(np.arange(top_n))\n", - "ax.set_xticks(np.arange(1, max_d + 1))\n", - "ax.set_xticklabels(np.arange(1, max_d + 1), fontsize=15)\n", - "ax.set_yticklabels(np.arange(1, top_n + 1), fontsize=15)\n", - "ax.axis([0, max_d + 1, top_n, -1])\n", - "ax.grid(\"on\")\n", - "\n", - "# Include lines to represent degree-to-degree progressions\n", - "for i, deg_lst in enumerate(prog_idx_lists):\n", - " for j, idx_list in enumerate(deg_lst):\n", - " for idx in idx_list:\n", - " plt.plot(\n", - " [i + 1, i + 2],\n", - " [j, idx],\n", - " linestyle=linestyle_lsts[j % len(linestyle_lsts)],\n", - " c=prog_colors[i],\n", - " linewidth=2,\n", - " )\n", - "\n", - "# Annotate progressions\n", - "for i, dis in enumerate(rank_progressions.Disease):\n", - " row = rank_progressions.iloc[i]\n", - " deg, idx = (row[\"Degree\"], row[\"index\"])\n", - " n_dis = len(dis)\n", - " plt.annotate(\n", - " dis, (deg - (0.5 / 10) * (n_dis / 2), idx - 0.15), fontsize=int(1.5 * sq_s)\n", - " )\n", - "\n", - "fig.tight_layout()\n", - "plt.show()" + "colors = [\"r\", \"g\", \"b\"]\n", + "PR_fig = plot_PR(\n", + " pred_node_sc_evc,\n", + " succ_node_sc_evc,\n", + " colarr,\n", + " colors=colors,\n", + " add_text=True,\n", + " bg_color=True,\n", + " vmax=1.0,\n", + ")" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -854,7 +832,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.16" + "version": "3.8.17" }, "vscode": { "interpreter": { diff --git a/notebooks/full_test_case_demo_JB.ipynb b/notebooks/full_test_case_demo_JB.ipynb deleted file mode 100644 index 2f67c8f..0000000 --- a/notebooks/full_test_case_demo_JB.ipynb +++ /dev/null @@ -1,1143 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "---\n", - "# Initial Imports" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "\n", - "print(os.getcwd())\n", - "os.chdir(\"../\")\n", - "print(os.getcwd())" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import random\n", - "import time as t\n", - "from importlib import reload\n", - "\n", - "import matplotlib\n", - "import matplotlib.pyplot as plt\n", - "import matplotlib.tri as tri\n", - "import numba\n", - "import numpy as np\n", - "import pandas as pd\n", - "import seaborn as sns\n", - "\n", - "%matplotlib inline\n", - "\n", - "from hypmm import build_model, centrality, centrality_utils, utils, weight_functions" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "ptm_cmap = plt.cm.get_cmap(\"Reds\")\n", - "\n", - "\n", - "def plot_PTM(ptm_arr, cmap, dis_cols, max_val, fontsize, annotate=True):\n", - "\n", - " # Plot heatmap of transition probabilities\n", - " fig, ax = plt.subplots(1, 1, figsize=(12, 9))\n", - " sns.heatmap(\n", - " ptm_arr,\n", - " cmap=cmap,\n", - " ax=ax,\n", - " linewidth=0.5,\n", - " linecolor=\"k\",\n", - " vmax=max_val,\n", - " cbar=True,\n", - " annot=annotate,\n", - " annot_kws={\"fontsize\": fontsize},\n", - " )\n", - " cbar = ax.collections[0].colorbar\n", - " cbar.ax.tick_params(labelsize=fontsize)\n", - " ax.set_yticklabels(dis_cols, fontsize=fontsize, rotation=0)\n", - " ax.set_xticklabels(dis_cols, fontsize=fontsize, ha=\"center\", rotation=90)\n", - " ax.xaxis.tick_top()\n", - " fig.tight_layout()\n", - "\n", - " return fig" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def plot_PR(\n", - " pred_PR,\n", - " succ_PR,\n", - " dis_cols,\n", - " vmax=1.0,\n", - " offset=(-0.06, 0.01),\n", - " bg_color=False,\n", - " fontsize=20,\n", - " add_text=False,\n", - " colors=None,\n", - "):\n", - "\n", - " # Define colors\n", - " N_diseases = len(dis_cols)\n", - " nx, ny = offset\n", - " if colors is None:\n", - " colors = []\n", - " for i in range(N_diseases):\n", - " np.random.seed(i)\n", - " colors.append(np.random.choice(range(156), size=3) / 255)\n", - "\n", - " # Define triangles\n", - " L = vmax\n", - " x = [L, L, 0, 0, 0.5 * L]\n", - " y = [L, 0, 0, L, 0.5 * L]\n", - " s = [0, -0.9 * L, 0, 0.9 * L, 0]\n", - " triangles = [[2, 0, 1], [2, 0, 2], [2, 0, 3]]\n", - " triang = tri.Triangulation(x, y, triangles)\n", - "\n", - " # Figure\n", - " fig, ax = plt.subplots(1, 1, figsize=(8, 8))\n", - " ax.axis([0, L, 0, L])\n", - " ax.plot([0, L], [0, L], linewidth=3, color=\"k\")\n", - " if bg_color:\n", - " ax.tripcolor(triang, s, alpha=0.3, cmap=\"bwr\", shading=\"gouraud\")\n", - " ax.grid(\"on\")\n", - "\n", - " # Annotate with disease titles\n", - " dirhyp_pagerank = pred_PR.merge(succ_PR, on=\"Disease\", how=\"inner\")\n", - " dirhyp_pagerank.columns = [\"Disease\", \"Predecessor\", \"Successor\"]\n", - " dirhyp_pagerank_arr = dirhyp_pagerank[[\"Successor\", \"Predecessor\"]].values\n", - " for i, pts in enumerate(dirhyp_pagerank_arr):\n", - " color = colors[i]\n", - " ax.scatter(pts[0], pts[1], s=750, color=color)\n", - " ax.text(pts[0] + nx, pts[1] + ny, s=dis_cols[i], color=\"k\", fontsize=20)\n", - "\n", - " # Annotate figure labels, etc.\n", - " if add_text:\n", - " ax.text(\n", - " 0.3,\n", - " 0.35,\n", - " \"Transitive Conditions\",\n", - " fontsize=fontsize,\n", - " rotation=45,\n", - " transform=ax.transAxes,\n", - " )\n", - " ax.set_xlabel(\"Successor PageRank\", fontsize=fontsize + 2)\n", - " ax.set_ylabel(\"Predecessor PageRank\", fontsize=fontsize + 2)\n", - " ax.tick_params(axis=\"both\", labelsize=fontsize)\n", - " ax.set_aspect(\"equal\", \"box\")\n", - " fig.tight_layout()\n", - "\n", - " return fig" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Definitions from doc strings\n", - "\n", - "`binmat (np.array, dtype=np.int8)`: \n", - "\n", - "Binary flag matrix whose test observations are represented by rows and diseases represented by columns.\n", - "\n", - "`conds_worklist (np.array, dtype=np.int8)`: \n", - "\n", - "For each individual, the worklist stores the ordered set of conditions via their columns indexes. For compatability with Numba, once all conditions specified per individual, rest of row is a stream if -1's.\n", - "\n", - "`idx_worklist (np.array, dtype=np.int8)`:\n", - "\n", - "Worklist to specify any duplicates. For each individuals, if -2 then they only have 1 condition, if -1 then they have no duplicates and if they have positive integers indexing `conds_worklist` then it specifies which conditions\n", - "have a duplicate which needs taken care of." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Example patients definition to input format\n", - "\n", - "**Patients 1, 2 and 3:**\n", - "- Has diseases A, B, and C (i.e. `binmat: [1, 1, 1]`)\n", - "- Disease progression is A -> B -> C (i.e. `conds_worklist: [0, 1, 2]`)\n", - "- No duplicates are present (i.e. `idx_worklist: [-1, -1, -1]`)\n", - "\n", - "\n", - "**Patient 4:**\n", - "- Has diseases A, B, and C (i.e. `binmat: [1, 1, 1]`)\n", - "- Disease progression is C -> A -> B (i.e. `conds_worklist: [2, 0, 1]`)\n", - "- No duplicates are present (i.e. `idx_worklist: [-1, -1, -1]`)\n", - "\n", - "\n", - "**Patient 5:**\n", - "- Has diseases B and C (i.e. `binmat: [0, 1, 1]`)\n", - "- Disease progression is B -> C (i.e. `conds_worklist: [1, 2, -1]`)\n", - "- No duplicates are present (i.e. `idx_worklist: [-1, -1, -1]`)\n", - "\n", - "\n", - "**Patient 6:**\n", - "- Has disease B (i.e. `binmat: [0, 1, 0]`)\n", - "- No disease progression (just B) (i.e. `conds_worklist: [1, -1, -1]`)\n", - "- No duplicates are present, and only has one condition (i.e. `idx_worklist: [-2, -1, -1]`)\n", - "\n", - "\n", - "**Patient 7:**\n", - "- Has disease C (i.e. `binmat: [0, 0, 1]`)\n", - "- No disease progression (just C) (i.e. `conds_worklist: [2, -1, -1]`)\n", - "- No duplicates are present, and only has one condition (i.e. `idx_worklist: [-2, -1, -1]`)\n", - "\n", - "\n", - "**Patient 8:**\n", - "- Has diseases A, B and C (i.e. `binmat: [1, 1, 1]`)\n", - "- Disease progression is B -> A -> C (i.e. `conds_worklist: [1, 0, 2]`)\n", - "- No duplicates are present (i.e. `idx_worklist: [-1, -1, -1]`)\n", - "\n", - "\n", - "**Patient 9:**\n", - "- Has diseases A and B (i.e. `binmat: [1, 1, 0]`)\n", - "- Disease progression is B -> A (i.e. `conds_worklist: [1, 0, -1]`)\n", - "- No duplicates are present (i.e. `idx_worklist: [-1, -1, -1]`)\n", - "\n", - "\n", - "**Patient 10:**\n", - "- Has diseases A and C (i.e. `binmat: [1, 0, 1]`)\n", - "- Disease progression is A -> C (i.e. `conds_worklist: [0, 2, -1]`)\n", - "- No duplicates are present (i.e. `idx_worklist: [-1, -1, -1]`)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# # TEST CASE USED WITH ZOE AND DAN\n", - "\n", - "# binmat = np.array(\n", - "# [\n", - "# [1, 1, 1],\n", - "# [1, 1, 1],\n", - "# [1, 1, 1],\n", - "# [1, 1, 1],\n", - "# [0, 1, 1],\n", - "# [0, 1, 0],\n", - "# [0, 0, 1],\n", - "# [1, 1, 1],\n", - "# [1, 1, 0],\n", - "# [1, 0, 1],\n", - "# ],\n", - "# dtype=np.int8,\n", - "# )\n", - "\n", - "# conds_worklist = np.array(\n", - "# [\n", - "# [0, 1, 2],\n", - "# [0, 1, 2],\n", - "# [0, 1, 2],\n", - "# [2, 0, 1],\n", - "# [1, 2, -1],\n", - "# [1, -1, -1],\n", - "# [2, -1, -1],\n", - "# [1, 0, 2],\n", - "# [1, 0, -1],\n", - "# [0, 2, -1],\n", - "# ],\n", - "# dtype=np.int8,\n", - "# )\n", - "\n", - "# idx_worklist = np.array(\n", - "# [\n", - "# [-1, -1, -1],\n", - "# [-1, -1, -1],\n", - "# [-1, -1, -1],\n", - "# [-1, -1, -1],\n", - "# [-1, -1, -1],\n", - "# [-2, -1, -1],\n", - "# [-2, -1, -1],\n", - "# [-1, -1, -1],\n", - "# [-1, -1, -1],\n", - "# [-1, -1, -1],\n", - "# ],\n", - "# dtype=np.int8,\n", - "# )\n", - "\n", - "# colarr = np.array([\"A\", \"B\", \"C\"], dtype=\"