Merge pull request #186 from PixelgenTechnologies/exe-1997-improve-wpmds

Updated probability distance calculations
PixelgenTechnologies · Sep 2, 2024 · 8d05475 · 8d05475
2 parents 50261bb + c9baf9b
commit 8d05475
Show file tree

Hide file tree

Showing 3 changed files with 82 additions and 13 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,9 +7,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [x.x.x] - 2024-xx-xx
 
-
 ### Changed
 
+-   Support for MultiGraphs in `pmds_layout`
 -   Support multiple targets in `plot_colocalization_diff_volcano` and `plot_colocalization_diff_heatmap`.
 
 ### Fixed

diff --git a/src/pixelator/graph/backends/implementations/_networkx.py b/src/pixelator/graph/backends/implementations/_networkx.py
@@ -775,7 +775,10 @@ def pmds_layout(
     Options are:
 
     * an np.array with non-negative values (same number of elements as edges in g)
-    * "prob_dist" to use -log(P)^3, where P is the probability of a random walker to traverse the end nodes of and edge (i->j) and back again (j->i) in 5 steps.
+    * "prob_dist" to weight each edge (i, j) by -log(P)^3, where P is the probability
+        of a random walker to go from i to j in 5 steps and then back again (j->i)
+        in 5 steps. For this computation, self-loops are added to the graph to ensure
+        that all transitions are possible.
     * None to use unweighted shortest path lengths
     :param seed: Set seed for reproducibility
     :return: A dataframe with layout coordinates
@@ -876,7 +879,35 @@ def _prob_edge_weights(
     """Compute edge weights based on k-step transition probabilities.
 
     The transition probabilities are computed using powers of the
-    stochastic matrix of the graph with self-loops allowed.
+    stochastic matrix of the graph with self-loops allowed. Self-loops
+    are necessary for bipartite graphs and without them many transitions
+    will be impossible. For instance, if we have a bipartite graph with
+    A nodes and B nodes where A can only be connected with B and vice versa,
+    there is no possible 2-step path from an A node to a B node. However,
+    if we allow self-loops, we can go from an A node to itself and then to
+    a B node in two steps. By allowing self-loops, we make all transitions
+    within the neighborhood k possible.
+
+    The transition probabilities are computed for a k-step random walk
+    by taking the k'th power of the stochastic matrix. Transition
+    probabilities are not symmetric, i.e. it matters what node we start
+    from. The probability of going from i to j in k steps
+    is not the same as the probability of going from j to i in k steps.
+    This is impractical for layout algorithms that require a single weight
+    per edge. To make the transition probabilities symmetric, we multiply
+    the transition probabilities in both directions (pk(i, j) * pk(j, i)).
+    This way, we get the probability of going from i to j in k steps and then
+    back again in k steps, so it no longer matters if we start in i or j.
+
+    Once we have computed the transition probabilities for all k-step
+    paths, we then extract the probabilities for the edges in the original
+    graph.
+
+    If we consider an edge (u, v) in the original graph, we now have
+    the probability of a random walker going from u to v in k steps and then
+    back again in k steps with self-loops allowed. If u anv v are well
+    connected (if there are many possible k-step paths between them), this
+    probability should be high.
 
     :param g: A networkx graph object
     :param k: The number of steps in the random walk
@@ -888,12 +919,13 @@ def _prob_edge_weights(
     if not isinstance(k, int) and k < 1 or k > 10:
         raise ValueError("'k' must be an integer between 1 and 10.")
 
-    A = nx.to_scipy_sparse_array(g, weight=None, nodelist=list(g.nodes), format="csr")
+    # Get the adjacency matrix. By default it is order by the nodes
+    A = nx.to_scipy_sparse_array(g, weight=None, format="csr")
 
     # Add 1 to the diagonal to allow self-loops
     A = A + sp.sparse.diags([1] * A.shape[0], format="csr")
 
-    # Divide by row sum
+    # Divide by row sum to get the stochastic matrix
     D = sp.sparse.diags(1 / A.sum(axis=1), format="csr")
     P = D @ A
 
@@ -904,18 +936,28 @@ def _prob_edge_weights(
     P_step = P_step.multiply(A)
 
     # Compute bi-directional transition probabilities which are symmetric.
-    # Now we get the probability of going from i to j and back again in k steps,
-    # so it doesn't matter if we start in i or j.
+    # Now we get the probability of going from i to j and back again in k
+    # steps, so it no longer matters if we start in i or j.
     P_step_bidirectional = P_step.multiply(P_step.T)
 
     # Extract the transition probabilities for edges in g
-    edges = list(g.edges)
-    edge_probs = P_step_bidirectional[
-        [list(g.nodes).index(u) for u, _ in edges],
-        [list(g.nodes).index(v) for _, v in edges],
-    ]
+    edges = np.array(g.edges)
+    nodes = np.array(g.nodes)
 
-    return np.array(edge_probs)[0]
+    # Get end node IDs for the edges
+    node_from = edges[:, 0]
+    node_to = edges[:, 1]
+
+    # Find the correct positions to extract from the probability matrix
+    index_dict = {val: idx for idx, val in enumerate(nodes)}
+    vectorized_index = np.vectorize(lambda x: index_dict.get(x, -1))
+    row_indices = vectorized_index(node_from)
+    col_indices = vectorized_index(node_to)
+
+    # Extract the probabilities
+    edge_probs = np.asarray(P_step_bidirectional[row_indices, col_indices])[0]
+
+    return edge_probs
 
 
 def _mat_pow(mat, power):

diff --git a/tests/graph/test_graph.py b/tests/graph/test_graph.py
@@ -6,13 +6,15 @@
 import random
 from unittest.mock import MagicMock
 
+import networkx as nx
 import numpy as np
 import pandas as pd
 import pytest
 from numpy.testing import assert_array_almost_equal, assert_array_equal
 from pandas.testing import assert_frame_equal
 
 from pixelator.graph import Graph
+from pixelator.graph.backends.implementations._networkx import pmds_layout
 from tests.graph.networkx.test_tools import random_sequence
 from tests.test_tools import enforce_edgelist_types_for_tests
 
@@ -579,6 +581,31 @@ def test_layout_coordinates_3d_pmds_with_weights(pentagram_graph):
     assert_array_almost_equal(l2, expected, decimal=4)
 
 
+def test_pmds_layout_3d_with_weights_multigraph(pentagram_graph):
+    g = pentagram_graph.raw
+    g_multi = nx.MultiGraph(g)
+
+    result = pmds_layout(
+        g_multi,
+        pivots=4,
+        dim=3,
+        weights="prob_dist",
+        seed=123,
+    )
+    result = pd.DataFrame.from_dict(result, orient="index", columns=["x", "y", "z"])
+
+    l2 = np.linalg.norm(result[["x", "y", "z"]], axis=1)
+    expected = [
+        3569.35412551,
+        2998.85729688,
+        2998.85729688,
+        3569.35412551,
+        2998.85729688,
+    ]
+
+    assert_array_almost_equal(l2, expected, decimal=4)
+
+
 @pytest.mark.parametrize("enable_backend", ["networkx"], indirect=True)
 def test_layout_coordinates_for_all_algorithms(enable_backend, pentagram_graph):
     # Just making sure all existing algorithms get exercised