Merge pull request #108 from theislab/dev
Dev
davidsebfischer authored Mar 11, 2020
2 parents c839ef2 + 8842f6b commit 31b905b
Showing 14 changed files with 336 additions and 374 deletions.
2 changes: 1 addition & 1 deletion batchglm/api/data.py
@@ -1,4 +1,4 @@
from batchglm.data import design_matrix
from batchglm.data import constraint_matrix_from_dict, constraint_matrix_from_string, string_constraints_from_dict, \
    constraint_system_from_star
- from batchglm.data import view_coef_names, preview_coef_names
+ from batchglm.data import view_coef_names, preview_coef_names, bin_continuous_covariate
28 changes: 28 additions & 0 deletions batchglm/data.py
@@ -453,3 +453,31 @@ def constraint_matrix_from_string(
    )

    return constraint_mat


def bin_continuous_covariate(
        sample_description: pd.DataFrame,
        factor_to_bin: str,
        bins: Union[int, list, np.ndarray, Tuple]
):
r"""
Bin a continuous covariate.
Adds the binned covariate to the table. Binning is performed on quantiles of the distribution.
:param sample_description: Sample description table.
:param factor_to_bin: Name of columns of factor to bin.
:param bins: Number of bins or iteratable with bin borders. If given as integer, the bins are defined on the
quantiles of the covariate, ie the bottom 20% of observations are in the first bin if bins==5.
:return: Sample description table with binned covariate added.
"""
    if isinstance(bins, (list, tuple, np.ndarray)):
        bins = np.asarray(bins)
    else:
        # Evenly spaced quantile cut points, e.g. [0., 0.2, 0.4, 0.6, 0.8] for bins == 5.
        bins = np.arange(0, 1, 1 / bins)

    # Double argsort yields each observation's rank; dividing by the number of
    # observations maps ranks to empirical quantiles, which np.digitize then bins.
    sample_description[factor_to_bin + "_binned"] = np.digitize(
        np.argsort(np.argsort(sample_description[factor_to_bin].values)) / sample_description.shape[0],
        bins
    )
    return sample_description
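
A minimal usage sketch of the new helper (the DataFrame contents and column name are hypothetical):

import numpy as np
import pandas as pd

from batchglm.api.data import bin_continuous_covariate

# Hypothetical sample table with a continuous "age" covariate.
sd = pd.DataFrame({"age": np.random.uniform(20, 80, size=100)})
sd = bin_continuous_covariate(sample_description=sd, factor_to_bin="age", bins=5)
# "age_binned" now holds quantile bin labels 1..5 with ~20 observations each.
print(sd["age_binned"].value_counts())
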
19 changes: 19 additions & 0 deletions batchglm/models/base/estimator.py
@@ -5,6 +5,7 @@
import numpy as np
import pandas as pd
import pprint
import sys

try:
    import anndata
@@ -112,6 +113,14 @@ def train_sequence(
logger.debug("training strategy:\n%s", pprint.pformat(training_strategy))
for idx, d in enumerate(training_strategy):
logger.debug("Beginning with training sequence #%d", idx + 1)
# Override duplicate arguments with user choice:
if np.any([x in list(d.keys()) for x in list(kwargs.keys())]):
d = dict([(x, y) for x, y in d.items() if x not in list(kwargs.keys())])
for x in [xx for xx in list(d.keys()) if xx in list(kwargs.keys())]:
sys.stdout.write(
"overrding %s from training strategy with value %s with new value %s\n" %
(x, str(d[x]), str(kwargs[x]))
)
self.train(**d, **kwargs)
logger.debug("Training sequence #%d complete", idx + 1)

@@ -165,6 +174,11 @@ def _plot_coef_vs_ref(
        from matplotlib import gridspec
        from matplotlib import rcParams

        if isinstance(true_values, dask.array.core.Array):
            true_values = true_values.compute()
        if isinstance(estim_values, dask.array.core.Array):
            estim_values = estim_values.compute()

        plt.ioff()

        n_par = true_values.shape[0]
@@ -258,6 +272,11 @@ def _plot_deviation(
        import seaborn as sns
        import matplotlib.pyplot as plt

        if isinstance(true_values, dask.array.core.Array):
            true_values = true_values.compute()
        if isinstance(estim_values, dask.array.core.Array):
            estim_values = estim_values.compute()

        plt.ioff()

        n_par = true_values.shape[0]
13 changes: 13 additions & 0 deletions batchglm/models/base_glm/simulator.py
@@ -171,3 +171,16 @@ def constraints_loc(self):
    @property
    def constraints_scale(self):
        return np.identity(n=self.b_var.shape[0])

    def np_clip_param(
            self,
            param,
            name
    ):
        # TODO: inherit this from somewhere?
        bounds_min, bounds_max = self.param_bounds(param.dtype)
        return np.clip(
            param,
            bounds_min[name],
            bounds_max[name]
        )
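
A short sketch of the intended use (the simulator instance and the parameter values are assumptions):

# Hypothetical: sim is a Simulator exposing param_bounds(); "loc" is one of its keys.
loc = np.array([0., np.inf, 5.])
clipped = sim.np_clip_param(loc, "loc")
# 0. is raised to bounds_min["loc"] and inf is capped at bounds_max["loc"],
# keeping downstream log/exp arithmetic finite.
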
4 changes: 3 additions & 1 deletion batchglm/models/glm_nb/external.py
@@ -5,4 +5,6 @@
from batchglm.models.base_glm import closedform_glm_mean, closedform_glm_scale

import batchglm.data as data_utils
from batchglm.utils.linalg import groupwise_solve_lm

from batchglm import pkg_constants
33 changes: 33 additions & 0 deletions batchglm/models/glm_nb/simulator.py
@@ -2,6 +2,7 @@

from .model import Model
from .external import _SimulatorGLM, InputDataGLM
from .external import pkg_constants


class Simulator(_SimulatorGLM, Model):
@@ -58,3 +59,35 @@ def generate_data(self):
            design_scale_names=None
        )

    def param_bounds(
            self,
            dtype
    ):
        # TODO: inherit this from somewhere?
        dtype = np.dtype(dtype)
        dmin = np.finfo(dtype).min
        dmax = np.finfo(dtype).max
        dtype = dtype.type

        sf = dtype(pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT)
        bounds_min = {
            "a_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
            "b_var": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
            "eta_loc": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
            "eta_scale": np.log(np.nextafter(0, np.inf, dtype=dtype)) / sf,
            "loc": np.nextafter(0, np.inf, dtype=dtype),
            "scale": np.nextafter(0, np.inf, dtype=dtype),
            "likelihood": dtype(0),
            "ll": np.log(np.nextafter(0, np.inf, dtype=dtype)),
        }
        bounds_max = {
            "a_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
            "b_var": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
            "eta_loc": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
            "eta_scale": np.nextafter(np.log(dmax), -np.inf, dtype=dtype) / sf,
            "loc": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
            "scale": np.nextafter(dmax, -np.inf, dtype=dtype) / sf,
            "likelihood": dtype(1),
            "ll": dtype(0),
        }
        return bounds_min, bounds_max
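
Dividing by sf keeps the log-scale bounds a safe margin inside the floating-point limits, so exponentiating a bounded parameter cannot overflow. A numeric sketch for float64 (the margin value 8.0 is an assumption; the real factor is pkg_constants.ACCURACY_MARGIN_RELATIVE_TO_LIMIT):

import numpy as np

sf = 8.0                                     # assumed margin factor
log_max = np.log(np.finfo(np.float64).max)   # ~709.78
print(log_max / sf)                          # ~88.7, the upper bound for a_var / b_var
print(np.exp(log_max / sf))                  # ~3.4e38, comfortably finite
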
144 changes: 144 additions & 0 deletions batchglm/models/glm_nb/utils.py
@@ -1,3 +1,5 @@
import dask
import logging
import numpy as np
import scipy.sparse
from typing import Union
@@ -71,3 +73,145 @@ def compute_scales_fun(variance, mean):
        inv_link_fn=invlink_fn,
        compute_scales_fun=compute_scales_fun
    )


def init_par(
        input_data,
        init_a,
        init_b,
        init_model
):
r"""
standard:
Only initialise intercept and keep other coefficients as zero.
closed-form:
Initialize with Maximum Likelihood / Maximum of Momentum estimators
Idea:
$$
\theta &= f(x) \\
\Rightarrow f^{-1}(\theta) &= x \\
&= (D \cdot D^{+}) \cdot x \\
&= D \cdot (D^{+} \cdot x) \\
&= D \cdot x' = f^{-1}(\theta)
$$
"""
    train_loc = True
    train_scale = True

    if init_model is None:
        groupwise_means = None
        init_a_str = None
        if isinstance(init_a, str):
            init_a_str = init_a.lower()
            # Resolve "auto" to a concrete initialization:
            if init_a.lower() == "auto":
                if isinstance(input_data.design_loc, dask.array.core.Array):
                    dloc = input_data.design_loc.compute()
                else:
                    dloc = input_data.design_loc
                # The closed form only applies to one-hot (two-level, 0/1) design matrices.
                one_hot = len(np.unique(dloc)) == 2 and \
                    np.min(dloc) == 0. and \
                    np.max(dloc) == 1.
                init_a = "standard" if not one_hot else "closed_form"

            if init_a.lower() == "closed_form":
                groupwise_means, init_a, rmsd_a = closedform_nb_glm_logmu(
                    x=input_data.x,
                    design_loc=input_data.design_loc,
                    constraints_loc=input_data.constraints_loc,
                    size_factors=input_data.size_factors,
                    link_fn=lambda mu: np.log(mu + np.nextafter(0, 1, dtype=mu.dtype))
                )

                # Train the location model only if the closed-form solution is inaccurate:
                train_loc = not (np.all(np.abs(rmsd_a) < 1e-20) or rmsd_a.size == 0)

                if input_data.size_factors is not None:
                    if np.any(input_data.size_factors != 1):
                        train_loc = True
            elif init_a.lower() == "standard":
                overall_means = np.mean(input_data.x, axis=0)  # directly calculate the mean
                init_a = np.zeros([input_data.num_loc_params, input_data.num_features])
                init_a[0, :] = np.log(overall_means)
                train_loc = True
            elif init_a.lower() == "all_zero":
                init_a = np.zeros([input_data.num_loc_params, input_data.num_features])
                train_loc = True
            else:
                raise ValueError("init_a string %s not recognized" % init_a)

        if isinstance(init_b, str):
            if init_b.lower() == "auto":
                init_b = "standard"

            if init_b.lower() == "standard":
                groupwise_scales, init_b_intercept, rmsd_b = closedform_nb_glm_logphi(
                    x=input_data.x,
                    design_scale=input_data.design_scale[:, [0]],
                    constraints=input_data.constraints_scale[[0], :][:, [0]],
                    size_factors=input_data.size_factors,
                    groupwise_means=None,
                    link_fn=lambda r: np.log(r + np.nextafter(0, 1, dtype=r.dtype))
                )
                init_b = np.zeros([input_data.num_scale_params, input_data.num_features])
                init_b[0, :] = init_b_intercept
            elif init_b.lower() == "closed_form":
                dmats_unequal = False
                if input_data.design_loc.shape[1] == input_data.design_scale.shape[1]:
                    if np.any(input_data.design_loc != input_data.design_scale):
                        dmats_unequal = True

                inits_unequal = False
                if init_a_str is not None:
                    if init_a_str != init_b:
                        inits_unequal = True

                if inits_unequal or dmats_unequal:
                    raise ValueError("cannot use closed_form init for scale model "
                                     "if scale model differs from loc model")

                groupwise_scales, init_b, rmsd_b = closedform_nb_glm_logphi(
                    x=input_data.x,
                    design_scale=input_data.design_scale,
                    constraints=input_data.constraints_scale,
                    size_factors=input_data.size_factors,
                    groupwise_means=groupwise_means,
                    link_fn=lambda r: np.log(r)
                )
            elif init_b.lower() == "all_zero":
                init_b = np.zeros([input_data.num_scale_params, input_data.x.shape[1]])
            else:
                raise ValueError("init_b string %s not recognized" % init_b)
    else:
        # Location model:
        if isinstance(init_a, str) and (init_a.lower() == "auto" or init_a.lower() == "init_model"):
            my_loc_names = set(input_data.loc_names)
            my_loc_names = my_loc_names.intersection(set(init_model.input_data.loc_names))

            init_loc = np.zeros([input_data.num_loc_params, input_data.num_features])
            for parm in my_loc_names:
                init_idx = np.where(init_model.input_data.loc_names == parm)[0]
                my_idx = np.where(input_data.loc_names == parm)[0]
                init_loc[my_idx] = init_model.a_var[init_idx]

            init_a = init_loc
            logging.getLogger("batchglm").debug("Using initialization based on input model for mean")

        # Scale model:
        if isinstance(init_b, str) and (init_b.lower() == "auto" or init_b.lower() == "init_model"):
            my_scale_names = set(input_data.scale_names)
            my_scale_names = my_scale_names.intersection(set(init_model.input_data.scale_names))

            init_scale = np.zeros([input_data.num_scale_params, input_data.num_features])
            for parm in my_scale_names:
                init_idx = np.where(init_model.input_data.scale_names == parm)[0]
                my_idx = np.where(input_data.scale_names == parm)[0]
                init_scale[my_idx] = init_model.b_var[init_idx]

            init_b = init_scale
            logging.getLogger("batchglm").debug("Using initialization based on input model for dispersion")

    return init_a, init_b, train_loc, train_scale
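
A minimal call sketch (input_data is assumed to be an already constructed InputDataGLM; the argument values only illustrate the dispatch):

# "auto" resolves init_a to "closed_form" for one-hot design_loc matrices and to
# "standard" otherwise; init_b "auto" always resolves to "standard".
init_a, init_b, train_loc, train_scale = init_par(
    input_data=input_data,
    init_a="auto",
    init_b="auto",
    init_model=None
)
# init_a / init_b are initial parameter matrices; train_loc / train_scale flag
# whether the location and scale models still require iterative training.
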
