Merge pull request #167 from PixelgenTechnologies/dev

Release 0.18.0
PixelgenTechnologies · Jul 11, 2024 · b3a862d · b3a862d
2 parents 6c1b0c9 + 853b710
commit b3a862d
Show file tree

Hide file tree

Showing 77 changed files with 3,772 additions and 2,028 deletions.
diff --git a/.github/actions/build-dev-image/action.yml b/.github/actions/build-dev-image/action.yml
@@ -1,9 +1,6 @@
 name: "build-dev-image"
 description: "Build a dev image and push it to the Pixelgen container registries"
 inputs:
-  tag:
-    description: "The tag to use for the image"
-    required: true
   cache-from:
     description: "The cache to use for the build"
     default: ""
@@ -61,7 +58,12 @@ runs:
           bakeSetCommands.push(...cacheToArray)
           bakeSetCommands.push(...platformArray)
 
-          bakeSetCommandsText = bakeSetCommands.join('\n')
+          // Override the version in the container if we are run in a version tag push event
+          if ( context.eventName === "push" && context.ref.startsWith('refs/tags/v') ) {
+            bakeSetCommands.push("*.args.VERSION_OVERRIDE=" + "${{ steps.meta.outputs.version }}" )
+          }
+
+          const bakeSetCommandsText = bakeSetCommands.join('\n')
           core.setOutput('set', bakeSetCommandsText)
 
     - name: Build and push to ghcr and ecr

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -132,7 +132,8 @@ jobs:
       fail-fast: false
       matrix:
         case:
-          - D21
+          - mpx_v1
+          - mpx_v2
 
     if: needs.pre_job.outputs.should_skip == '{}' || !fromJSON(needs.pre_job.outputs.paths_result).python.should_skip
     steps:

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -18,7 +18,7 @@ repos:
       - id: ruff-format
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: "v1.3.0" # Use the sha / tag you want to point at
+    rev: "v1.10.1" # Use the sha / tag you want to point at
     hooks:
       - id: mypy
         exclude: "(^cue.mod/)|(^docs/)"

diff --git a/CHANGELOG.md b/CHANGELOG.md
diff --git a/CITATIONS.md b/CITATIONS.md
@@ -25,3 +25,7 @@
 - [local G](https://doi.org/10.1007/s11749-018-0599-x)
 
   > Bivand, R.S., Wong, D.W.S. Comparing implementations of global and local indicators of spatial association. TEST 27, 716–748 (2018). https://doi.org/10.1007/s11749-018-0599-x
+
+- [dsb](https://doi.org/10.1038/s41467-022-29356-8)
+
+  > Mulè, M.P., Martins, A.J. & Tsang, J.S. Normalizing and denoising protein expression data from droplet-based single cell profiling. Nat Commun 13, 2099 (2022). https://doi.org/10.1038/s41467-022-29356-8
diff --git a/poetry.lock b/poetry.lock
diff --git a/src/pixelator/analysis/__init__.py b/src/pixelator/analysis/__init__.py
@@ -33,6 +33,7 @@ class AnalysisParameters:
     colocalization_neighbourhood_size: int
     colocalization_n_permutations: int
     colocalization_min_region_count: int
+    colocalization_min_marker_count: int
 
 
 def analyse_pixels(

diff --git a/src/pixelator/analysis/colocalization/__init__.py b/src/pixelator/analysis/colocalization/__init__.py
@@ -51,6 +51,7 @@ def colocalization_from_component_edgelist(
     n_permutations: int = 50,
     use_full_bipartite: bool = True,
     min_region_count: int = 5,
+    min_marker_count: int = 5,
     random_seed: Optional[int] = None,
 ) -> pd.DataFrame:
     """Get the colocalization scores for the component in the given `edgelist`.
@@ -65,6 +66,8 @@ def colocalization_from_component_edgelist(
                                projection, defaults to True
     :param min_region_count: minimum number of counts in region to consider, defaults
                              to 5
+    :param min_marker_count: the minimum number of counts of a marker to calculate
+                             colocalization
     :param random_seed: Set the random seed for the permutation tests, defaults to None
     :return: a dataframe with computed colocalization scores
     :rtype: pd.DataFrame
@@ -84,6 +87,7 @@ def colocalization_from_component_edgelist(
         neighbourhood_size=neighbourhood_size,
         n_permutations=n_permutations,
         min_region_count=min_region_count,
+        min_marker_count=min_marker_count,
         random_seed=random_seed,
     )
 
@@ -109,6 +113,7 @@ def colocalization_from_component_graph(
     neighbourhood_size: int = 1,
     n_permutations: int = 50,
     min_region_count: int = 5,
+    min_marker_count: int = 5,
     random_seed: Optional[int] = None,
 ) -> pd.DataFrame:
     """Compute the colocalization scores for this component graph.
@@ -121,20 +126,29 @@ def colocalization_from_component_graph(
                            p-values and z-scores, defaults to 50
     :param min_region_count: minimum number of counts in region to consider, defaults
                              to 5
+    :param min_marker_count: the minimum number of counts of a marker to calculate
+                             colocalization
     :param random_seed: Set the random seed for the permutation tests, defaults to None
     :return: a dataframe containing colocalization scores for this component
     :rtype: pd.DataFrame
     """
     logger.debug("Computing colocalization for component: %s", component_id)
     logger.debug("Prepare the graph data for computing colocalization")
-    marker_counts_by_region = prepare_from_graph(graph, n_neighbours=neighbourhood_size)
 
+    raw_marker_counts = graph.node_marker_counts
+    # Record markers to keep
+    # Remove markers with zero variance and markers below minimum marker count
+    markers_to_keep = raw_marker_counts.columns[
+        (raw_marker_counts != 0).any(axis=0)
+        & (raw_marker_counts.nunique() > 1)
+        & (raw_marker_counts.sum() >= min_marker_count)
+    ]
+
+    marker_counts_by_region = prepare_from_graph(graph, n_neighbours=neighbourhood_size)
+    marker_counts_by_region = marker_counts_by_region[markers_to_keep]
     marker_counts_by_region = filter_by_region_counts(
         marker_counts_by_region, min_region_counts=min_region_count
     )
-    marker_counts_by_region = filter_by_unique_values(
-        marker_counts_by_region, at_least_n_unique=2
-    )
 
     nrow, ncols = marker_counts_by_region.shape
     if nrow == 0:
@@ -187,6 +201,7 @@ def colocalization_scores(
     neighbourhood_size: int = 1,
     n_permutations: int = 50,
     min_region_count: int = 5,
+    min_marker_count: int = 5,
     random_seed: Optional[int] = None,
 ) -> pd.DataFrame:
     """Compute colocalization scores for antibody pairs.
@@ -225,6 +240,8 @@ def colocalization_scores(
     :param min_region_count: The minimum size of the region (e.g. number
                              of counts in the neighbourhood) required
                              for it to be considered
+    :param min_marker_count: the minimum number of counts of a marker to calculate
+                             colocalization
     :param random_seed: Set a random seed for the permutation function
     :returns: a pd.DataFrame of scores
     :rtype: pd.DataFrame
@@ -273,6 +290,7 @@ def data():
                 neighbourhood_size=neighbourhood_size,
                 n_permutations=n_permutations,
                 min_region_count=min_region_count,
+                min_marker_count=min_marker_count,
                 random_seed=random_seed,
             )
 
@@ -402,6 +420,7 @@ def __init__(
         neighbourhood_size: int,
         n_permutations: int,
         min_region_count: int,
+        min_marker_count: int,
     ):
         """Initialize the ColocalizationAnalysis.
 
@@ -413,11 +432,14 @@ def __init__(
         :param min_region_count: The minimum size of the region (e.g. number
                              of counts in the neighbourhood) required
                              for it to be considered for colocalization analysis
+        :param min_marker_count: the minimum number of counts of a marker to calculate
+                             colocalization
         """
         self.transformation_type = transformation_type
         self.neighbourhood_size = neighbourhood_size
         self.n_permutations = n_permutations
         self.min_region_count = min_region_count
+        self.min_marker_count = min_marker_count
 
     def run_on_component(self, component: Graph, component_id: str) -> pd.DataFrame:
         """Run colocalization analysis on the component."""
@@ -429,6 +451,7 @@ def run_on_component(self, component: Graph, component_id: str) -> pd.DataFrame:
             neighbourhood_size=self.neighbourhood_size,
             n_permutations=self.n_permutations,
             min_region_count=self.min_region_count,
+            min_marker_count=self.min_marker_count,
         )
 
     def post_process_data(self, data: pd.DataFrame) -> pd.DataFrame:

diff --git a/src/pixelator/analysis/normalization/__init__.py b/src/pixelator/analysis/normalization/__init__.py
@@ -0,0 +1,66 @@
+"""Functions for the colocalization analysis in pixelator.
+
+Copyright © 2024 Pixelgen Technologies AB.
+"""
+
+from typing import List, Union
+
+import numpy as np
+import pandas as pd
+from sklearn.decomposition import PCA
+from sklearn.mixture import GaussianMixture
+from sklearn.preprocessing import StandardScaler
+
+
+def _regress_out_confounder(pheno, exprs, rcond=1e-8):
+    """Linear regression to remove confounding factors from abundance data."""
+    design_matrix = np.column_stack((np.ones((len(pheno), 1)), pheno))
+    coefficients, res, rank, s = np.linalg.lstsq(design_matrix, exprs, rcond=rcond)
+    beta = coefficients[1:]  # remove intercept term
+    return exprs - design_matrix[:, 1:].dot(beta)
+
+
+def _get_background_abundance(dataframe: pd.DataFrame, axis=0):
+    """Fit a double gaussian distribution to the abundance data and return the mean of the first gaussian as an estimation of the background level."""
+    background = pd.Series(index=dataframe.index if axis == 0 else dataframe.columns)
+    scores = pd.Series(index=dataframe.index if axis == 0 else dataframe.columns)
+    gmm = GaussianMixture(n_components=2, max_iter=1000, random_state=0)
+    if axis not in {0, 1}:
+        raise ValueError(f"Axis was {axis}. Must be 0 or 1")
+    ax_iter = dataframe.index if axis == 0 else dataframe.columns
+    for i in ax_iter:
+        current_axis = dataframe.loc[i, :] if axis == 0 else dataframe.loc[:, i]
+        gmm = gmm.fit(current_axis.to_frame())
+        background[i] = np.min(gmm.means_)
+        scores[i] = np.abs(gmm.means_[1] - gmm.means_[0]) / np.sum(gmm.covariances_)
+    return background, scores
+
+
+def dsb_normalize(
+    raw_abundance: pd.DataFrame, isotype_controls: Union[List, None] = None
+):
+    """empty-droplet-free method as implemented in Mulè et. al. dsb package.
+
+    The normalization steps are: 1- log1p transformation, 2- remove background
+    abundance per marker, 3- regularize abundance per component.
+
+    :param raw_abundance: the raw abundance count data.
+    :param isotype_controls: list of isotype controls.
+    :return: normalized abundance data.
+    """
+    log_abundance = np.log1p(raw_abundance)
+    marker_background, _ = _get_background_abundance(log_abundance, axis=1)
+    log_abundance = log_abundance - marker_background
+    component_background, _ = _get_background_abundance(log_abundance, axis=0)
+
+    if isotype_controls is not None:
+        control_signals = log_abundance.loc[:, isotype_controls]
+        control_signals["component_background"] = component_background
+        control_signals = StandardScaler().fit_transform(control_signals)
+        pheno = PCA(n_components=1).fit_transform(control_signals)
+    else:
+        raise ValueError(f"At least one isotype control must be provided.")
+
+    normalized_abundance = _regress_out_confounder(pheno, log_abundance)
+
+    return normalized_abundance
diff --git a/src/pixelator/cli/analysis.py b/src/pixelator/cli/analysis.py
@@ -92,7 +92,7 @@
 )
 @click.option(
     "--colocalization-transformation",
-    default="log1p",
+    default="rate-diff",
     required=False,
     type=click.Choice(get_args(TransformationTypes)),
     show_default=True,
@@ -134,6 +134,14 @@
         "valid for computing colocalization"
     ),
 )
+@click.option(
+    "--colocalization-min-marker-count",
+    default=5,
+    required=False,
+    type=click.IntRange(min=0),
+    show_default=True,
+    help=("The minimum number of marker counts in component for colocalization"),
+)
 @output_option
 @click.pass_context
 @timer
@@ -150,6 +158,7 @@ def analysis(
     colocalization_neighbourhood_size,
     colocalization_n_permutations,
     colocalization_min_region_count,
+    colocalization_min_marker_count,
     output,
 ):
     """
@@ -171,6 +180,7 @@ def analysis(
         colocalization_neighbourhood_size=colocalization_neighbourhood_size,
         colocalization_n_permutations=colocalization_n_permutations,
         colocalization_min_region_count=colocalization_min_region_count,
+        colocalization_min_marker_count=colocalization_min_marker_count,
     )
 
     # some basic sanity check on the input files
@@ -214,6 +224,7 @@ def analysis(
                 neighbourhood_size=colocalization_neighbourhood_size,
                 n_permutations=colocalization_n_permutations,
                 min_region_count=colocalization_min_region_count,
+                min_marker_count=colocalization_min_marker_count,
             )
         )
 

diff --git a/src/pixelator/cli/collapse.py b/src/pixelator/cli/collapse.py
@@ -3,6 +3,7 @@
 Copyright © 2022 Pixelgen Technologies AB.
 """
 
+import sys
 from collections import defaultdict
 from concurrent import futures
 from pathlib import Path
@@ -25,6 +26,38 @@
 )
 
 
+def _handle_errors(jobs, executor):
+    for job in jobs:
+        exception = job.exception()
+        if exception is None:
+            continue
+
+        logger.error(
+            "Found an issue in the process pool. Trying to determine what went wrong and set the correct exit code. Exception was: %s",
+            exception,
+        )
+        process_map = executor._processes
+        for pid in process_map.keys():
+            exit_code = process_map[pid].exitcode
+            if exit_code is not None and exit_code != 0:
+                logger.error(
+                    "The child process in the process pool returned a non-zero exit code: %s.",
+                    exit_code,
+                )
+                # If we have an out of memory exception, make sure we exit with that.
+                if abs(exit_code) == 9:
+                    logger.error(
+                        "One of the child processes was killed (exit code: 9). "
+                        "Usually this is caused by the out-of-memory killer terminating the process. "
+                        "The parent process will return an exit code of 137 to indicate that it terminated because of a kill signal in the child process."
+                    )
+                    sys.exit(137)
+        logger.error(
+            "Was unable to determine what when wrong in process pool. Will raise original exception."
+        )
+        raise exception
+
+
 @click.command(
     "collapse",
     short_help=(
@@ -238,12 +271,12 @@ def collapse(
                             min_count=min_count,
                         )
                     )
+            jobs = list(futures.as_completed(jobs))
+            _handle_errors(jobs, executor)
 
         total_input_reads = 0
         tmp_files = []
-        for job in futures.as_completed(jobs):
-            if job.exception() is not None:
-                raise job.exception()
+        for job in jobs:
             # the worker returns a path to a file (temp antibody edge list)
             tmp_file, input_reads_count = job.result()
             if tmp_file is not None:

diff --git a/src/pixelator/cli/layout.py b/src/pixelator/cli/layout.py
@@ -48,7 +48,7 @@
     "--layout-algorithm",
     required=False,
     multiple=True,
-    default=["pmds_3d"],
+    default=["wpmds_3d"],
     help="Select a layout algorithm to use. This can be specified multiple times to compute multiple layouts. Default: pmds_3d",
     type=click.Choice(get_args(SupportedLayoutAlgorithm)),
 )

diff --git a/src/pixelator/cli/misc.py b/src/pixelator/cli/misc.py
@@ -41,7 +41,7 @@ def list_single_cell_panels(ctx: click.Context, param: Any, value: Any) -> None:
     if not value or ctx.resilient_parsing:
         return
 
-    options = list(config.panels.keys())
+    options = config.list_panel_names(include_aliases=True)
     for option in options:
         click_echo(option)