Add gene threshold and add input for plotting cond factors

meyer-lab · Aug 15, 2024 · 772d1b7 · 772d1b7
1 parent a1743ef
commit 772d1b7
Show file tree

Hide file tree

Showing 3 changed files with 47 additions and 27 deletions.
diff --git a/pf2rnaseq/figures/commonFuncs/plotFactors.py b/pf2rnaseq/figures/commonFuncs/plotFactors.py
@@ -14,22 +14,21 @@
 def plot_condition_factors(
     data: AnnData,
     ax: Axes,
+    condition_name: str,
     cond_group_labels: Optional[pd.Series] = None,
     ThomsonNorm=False,
     groupConditions=False,
 ):
     """Plots Pf2 condition factors"""
     pd.set_option("display.max_rows", None)
-    yt = pd.Series(np.unique(data.obs["time"]))
+    yt = pd.Series(np.unique(data.obs[condition_name]))
     X = np.array(data.uns["Pf2_A"])
 
-    X = np.log10(X)
-    if ThomsonNorm is True:
-        controls = yt.str.contains("CTRL")
-        X = X[controls]
+    # X = np.log10(X)
+    XX = X
 
-    X -= np.median(X, axis=0)
-    X /= np.std(X, axis=0)
+    X -= np.median(XX, axis=0)
+    X /= np.std(XX, axis=0)
 
     ind = reorder_table(X)
     X = X[ind]
@@ -172,6 +171,7 @@ def reorder_table(projs: np.ndarray) -> np.ndarray:
     return sch.leaves_list(Z)
 
 
+
 def bot_top_genes(X, cmp, geneAmount=5):
     """Saves most pos/negatively genes"""
     df = pd.DataFrame(

diff --git a/pf2rnaseq/figures/figureCA1.py b/pf2rnaseq/figures/figureCA1.py
@@ -15,7 +15,7 @@
 )
 from pf2rnaseq.figures.commonFuncs.plotPaCMAP import plot_labels_pacmap
 import numpy as np
-from pf2rnaseq.imports import import_CA, prepare_dataset
+from pf2rnaseq.imports import import_CA, prepare_dataset_bulk
 
 from pf2rnaseq.factorization import pf2
 
@@ -30,18 +30,16 @@ def makeFigure():
 
     #Import the CA data as a DataFrame and put into an Anndata object
     CA_ad = import_CA()
-    X = prepare_dataset(CA_ad, "sample_id")
+    X = prepare_dataset_bulk(CA_ad, "sample_id", geneThreshold=0.001)
 
-    pf2(X, rank=10)
+    X = pf2(X, rank=10)
 
-    plot_condition_factors(X, ax[0])
+    plot_condition_factors(X, ax[0], condition_name="sample_id")
     plot_eigenstate_factors(X, ax[1])
     plot_gene_factors(X, ax[2])
     plot_factor_weight(X, ax[3])
 
-    plot_labels_pacmap(X, "Condition", ax[4])
+    # plot_labels_pacmap(X, "time", ax[4])
 
 
-    return f
-
-makeFigure()
+    return f
diff --git a/pf2rnaseq/imports.py b/pf2rnaseq/imports.py
@@ -22,27 +22,49 @@ def import_CA():
 
     return CA_ad
 
+
+def prepare_dataset_bulk(
+    X: ad.AnnData, condition_name: str, 
+    geneThreshold: float
+) -> ad.AnnData:
+    assert isinstance(X.X, spmatrix)
+    assert np.amin(X.X.data) >= 0.0  # type: ignore
+
+    # Filter out genes with too few reads
+    readmean, _ = mean_variance_axis(X.X, axis=0)  # type: ignore
+    X = X[:, readmean > geneThreshold]
+
+    # # Get the indices for subsetting the data
+    _, sgIndex = np.unique(X.obs_vector(condition_name), return_inverse=True)
+    X.obs["condition_unique_idxs"] = sgIndex
+
+    # Pre-calculate gene means
+    means, _ = mean_variance_axis(X.X, axis=0)  # type: ignore
+    X.var["means"] = means
+
+    return X
+
 def prepare_dataset(
     X: ad.AnnData, condition_name: str, 
-    # geneThreshold: float
+    geneThreshold: float
 ) -> ad.AnnData:
     assert isinstance(X.X, spmatrix)
     assert np.amin(X.X.data) >= 0.0  # type: ignore
 
-    # # Filter out genes with too few reads
-    # readmean, _ = mean_variance_axis(X.X, axis=0)  # type: ignore
-    # X = X[:, readmean > geneThreshold]
+    # Filter out genes with too few reads
+    readmean, _ = mean_variance_axis(X.X, axis=0)  # type: ignore
+    X = X[:, readmean > geneThreshold]
 
-    # # Normalize read depth
-    # sc.pp.normalize_total(X, exclude_highly_expressed=False, inplace=True)
+    # Normalize read depth
+    sc.pp.normalize_total(X, exclude_highly_expressed=False, inplace=True)
 
-    # # Scale genes by sum
-    # readmean, _ = mean_variance_axis(X.X, axis=0)  # type: ignore
-    # readsum = X.shape[0] * readmean
-    # inplace_column_scale(X.X, 1.0 / readsum)
+    # Scale genes by sum
+    readmean, _ = mean_variance_axis(X.X, axis=0)  # type: ignore
+    readsum = X.shape[0] * readmean
+    inplace_column_scale(X.X, 1.0 / readsum)
 
-    # # Transform values
-    # X.X.data = np.log10((1000.0 * X.X.data) + 1.0)  # type: ignore
+    # Transform values
+    X.X.data = np.log10((1000.0 * X.X.data) + 1.0)  # type: ignore
 
     # # Get the indices for subsetting the data
     _, sgIndex = np.unique(X.obs_vector(condition_name), return_inverse=True)