Merge pull request #364 from pavlin-policar/sparse-1
ScPreprocess: Add sparse support
JakaKokosar authored Aug 31, 2020
2 parents 80585b6 + 38d0fd5 commit 1e7e303
Showing 2 changed files with 262 additions and 115 deletions.
124 changes: 80 additions & 44 deletions orangecontrib/single_cell/preprocess/scpreprocess.py
@@ -1,12 +1,18 @@
from typing import Tuple, Optional
from typing import Tuple
import warnings
from typing import Union

import numpy as np
import scipy.sparse as sp
from scipy.stats import zscore, percentileofscore

from Orange.data import Domain, Table
from Orange.preprocess.preprocess import Preprocess
from Orange.util import Enum
import Orange.statistics.util as ut


AnyArray = Union[np.ndarray, sp.csr_matrix, sp.csc_matrix]


class LogarithmicScale(Preprocess):
@@ -17,14 +23,23 @@ class LogarithmicScale(Preprocess):
def __init__(self, base=BinaryLog):
self.base = base

def __call__(self, data):
def __call__(self, data: Table) -> Table:
new_data = data.copy()

if self.base == LogarithmicScale.BinaryLog:
new_data.X = np.log2(1 + data.X)
elif self.base == LogarithmicScale.NaturalLog:
new_data.X = np.log(1 + data.X)
def func(x, *args, **kwargs):
return np.log2(x + 1, *args, **kwargs)
elif self.base == LogarithmicScale.CommonLog:
new_data.X = np.log10(1 + data.X)
def func(x, *args, **kwargs):
return np.log10(x + 1, *args, **kwargs)
elif self.base == LogarithmicScale.NaturalLog:
func = np.log1p

if sp.issparse(new_data.X):
func(new_data.X.data, out=new_data.X.data)
else:
func(new_data.X, out=new_data.X)

return new_data
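
Note (illustrative sketch, not part of the commit): the in-place trick above is safe for sparse inputs because all three log variants map 0 to 0 (e.g. log2(0 + 1) = 0), so only the stored .data values need updating and implicit zeros stay implicit. A minimal sketch:

import numpy as np
import scipy.sparse as sp

X = sp.csr_matrix(np.array([[0.0, 3.0], [7.0, 0.0]]))
np.log2(X.data + 1, out=X.data)  # same operation as the BinaryLog branch
print(X.toarray())               # zeros untouched, 3 -> 2.0, 7 -> 3.0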


@@ -37,18 +52,17 @@ def __init__(self, condition=GreaterOrEqual, threshold=1):
self.condition = condition
self.threshold = threshold

def __call__(self, data):
def __call__(self, data: Table) -> Table:
new_data = data.copy()
if self.condition == Binarize.GreaterOrEqual:
new_data.X = np.where(data.X >= self.threshold, 1, 0)
new_data.X = new_data.X >= self.threshold
elif self.condition == Binarize.Greater:
new_data.X = np.where(data.X > self.threshold, 1, 0)
new_data.X = new_data.X > self.threshold
return new_data
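
Note (illustrative sketch, not part of the commit): replacing np.where(..., 1, 0) with a plain comparison keeps the same mask but with dtype bool, and comparison with a positive threshold also works on SciPy sparse matrices, returning a sparse boolean mask. A small sketch:

import numpy as np
import scipy.sparse as sp

X = np.array([[0.0, 1.0, 2.5]])
print(X >= 1)                             # [[False  True  True]]
print((sp.csr_matrix(X) >= 1).toarray())  # same mask, computed without densifying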


class Normalize(Preprocess):
Method = Enum("Normalize", ("CPM", "Median"),
qualname="Normalize.Method")
Method = Enum("Normalize", ("CPM", "Median"), qualname="Normalize.Method")
CPM, Median = Method

def __init__(self, method=CPM):
@@ -62,46 +76,67 @@ def normalize(self, *args):


class NormalizeSamples(Normalize):
def __call__(self, data):
def __call__(self, data: Table) -> Table:
new_data = data.copy()
new_data.X = self.normalize(data.X)
return new_data

def normalize(self, table):
row_sums = np.nansum(table, axis=1)
row_sums[row_sums == 0] = 1
table = table / row_sums[:, None]
factor = np.nanmedian(row_sums) \
if self.method == NormalizeSamples.Median else 10 ** 6
return table * factor
def normalize(self, table: AnyArray) -> AnyArray:
row_sums = ut.nansum(table, axis=1)
row_sums[row_sums == 0] = 1 # avoid division by zero errors

if self.method == NormalizeSamples.Median:
factor = np.nanmedian(row_sums)
else:
factor = 1e6

if sp.issparse(table):
table = sp.diags(1 / row_sums) @ table
else:
table = table / row_sums[:, None]

table *= factor

return table
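
Note (illustrative sketch, not part of the commit): the sp.diags trick above scales each row by its reciprocal sum without densifying the matrix, matching the broadcast division used on the dense path. A minimal sketch of CPM normalization both ways:

import numpy as np
import scipy.sparse as sp

X = np.array([[1.0, 3.0], [0.0, 5.0]])
row_sums = X.sum(axis=1)                                   # [4., 5.]

dense = X / row_sums[:, None] * 1e6                        # dense path
sparse = sp.diags(1 / row_sums) @ sp.csr_matrix(X) * 1e6   # sparse path
print(np.allclose(dense, sparse.toarray()))                # True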


class NormalizeGroups(Normalize):
def __init__(self, group_var, method=Normalize.CPM):
super().__init__(method)
self.group_var = group_var

def __call__(self, data):
def __call__(self, data: Table) -> Table:
group_col = data.get_column_view(self.group_var)[0]
group_col = group_col.astype("int64")
new_data = data.copy()
new_data.X = self.normalize(data.X, group_col)
return new_data

def normalize(self, table, group_col):
group_sums = np.bincount(group_col, np.nansum(table, axis=1))
def normalize(self, table: AnyArray, group_col: np.ndarray) -> AnyArray:
group_sums = np.bincount(group_col, ut.nansum(table, axis=1))
group_sums[group_sums == 0] = 1
group_sums_row = np.zeros_like(group_col)
medians = []
row_sums = np.nansum(table, axis=1)
row_sums = ut.nansum(table, axis=1)
for value, group_sum in zip(np.unique(group_col), group_sums):
mask = group_col == value
group_sums_row[mask] = group_sum
if self.method == NormalizeGroups.Median:
medians.append(np.nanmedian(row_sums[mask]))
factor = np.min(medians) \
if self.method == NormalizeGroups.Median else 10 ** 6
return table / group_sums_row[:, None] * factor

if self.method == NormalizeGroups.Median:
factor = np.min(medians)
else:
factor = 1e6

if sp.issparse(table):
table = sp.diags(1 / group_sums_row) @ table
else:
table = table / group_sums_row[:, None]

table *= factor

return table
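
Note (illustrative sketch, not part of the commit): np.bincount with weights computes one total per group in a single pass, and indexing the result with the group column broadcasts the totals back to rows, which is essentially what the group_sums_row loop above builds. A toy sketch:

import numpy as np

group_col = np.array([0, 0, 1, 1, 1])
row_sums = np.array([2.0, 4.0, 1.0, 1.0, 3.0])

group_sums = np.bincount(group_col, weights=row_sums)
print(group_sums)             # [6. 5.] -- total per group
print(group_sums[group_col])  # [6. 6. 5. 5. 5.] -- one denominator per row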


class Standardize(Preprocess):
@@ -129,10 +164,10 @@ def __init__(self, method=Dispersion, n_genes=1000, n_groups=20):
self.n_genes = n_genes
self.n_groups = n_groups if n_groups and n_groups > 1 else 1

def __call__(self, data):
def __call__(self, data: Table) -> Table:
n_groups = min(self.n_groups, len(data.domain.attributes))
mean = np.nanmean(data.X, axis=0)
variance = np.nanvar(data.X, axis=0)
mean = ut.nanmean(data.X, axis=0)
variance = ut.nanvar(data.X, axis=0)
percentiles = [percentileofscore(mean, m) for m in mean]
_, bins = np.histogram(percentiles, n_groups)
bin_indices = np.digitize(percentiles, bins, True)
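
Note (illustrative sketch, not part of the commit): the switch from np.nanmean / np.nanvar to Orange.statistics.util matters because NumPy's nan-aware reductions do not accept SciPy sparse matrices, while Orange's helpers handle both dense and sparse input. A sketch, assuming the Orange dependency is installed:

import numpy as np
import scipy.sparse as sp
import Orange.statistics.util as ut

X = sp.csr_matrix(np.array([[0.0, 2.0], [4.0, 0.0]]))
print(ut.nanmean(X, axis=0))  # [2. 1.] -- works on sparse input
# np.nanmean(X, axis=0) would fail here: sparse matrices are not supported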
@@ -190,23 +225,24 @@ def __call__(self, data: Table) -> Table:
warnings.warn(f"{sum(selected)} genes selected", DropoutWarning)
return self.filter_columns(data, selected)

def detection(self, table: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
mask = table > self.threshold
def detection(self, table: AnyArray) -> Tuple[np.ndarray, np.ndarray]:
with np.errstate(invalid="ignore"): # comparison can include nans
mask = table > self.threshold

if sp.issparse(table):
zero_rate = 1 - np.squeeze(np.array(mask.mean(axis=0)))
A = table.multiply(mask)
A.data = np.log2(A.data)
mean_expr = np.zeros_like(zero_rate) * np.nan
detected = zero_rate < 1
detected_mean = np.squeeze(np.array(A[:, detected].mean(axis=0)))
mean_expr[detected] = detected_mean / (1 - zero_rate[detected])
A = table.copy()
np.log2(A.data, out=A.data)
else:
zero_rate = 1 - np.mean(mask, axis=0)
mean_expr = np.zeros_like(zero_rate) * np.nan
detected = zero_rate < 1
mean_expr[detected] = np.nanmean(
np.where(table[:, detected] > self.threshold,
np.log2(table[:, detected]), np.nan), axis=0)
A = np.ma.log2(table) # avoid log2(0)
A.mask = False

detection_rate = ut.nanmean(mask, axis=0)
zero_rate = 1 - detection_rate
detected = detection_rate > 0
detected_mean = ut.nanmean(A[:, detected], axis=0)

mean_expr = np.full_like(zero_rate, fill_value=np.nan)
mean_expr[detected] = detected_mean / detection_rate[detected]

low_detection = np.array(np.sum(mask, axis=0)).squeeze()
zero_rate[low_detection < self.at_least] = np.nan
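
Note (worked toy example, not part of the commit): detection() returns two per-gene statistics, the dropout (zero) rate and the mean log2 expression over the cells where the gene is detected. Because undetected entries contribute zeros, dividing the plain column mean of the log2 values by the detection rate rescales it to a mean over detected cells only, which is what the mean_expr[detected] line above computes. A dense sketch:

import numpy as np

table = np.array([[0.0, 4.0],
                  [0.0, 8.0],
                  [2.0, 0.0]])
threshold = 0

mask = table > threshold
detection_rate = mask.mean(axis=0)       # [0.33, 0.67]
zero_rate = 1 - detection_rate           # [0.67, 0.33]

log2 = np.zeros_like(table)
np.log2(table, where=mask, out=log2)     # log2 of detected entries, zeros elsewhere

mean_expr = log2.mean(axis=0) / detection_rate
print(zero_rate, mean_expr)              # mean_expr == [1. 2.5], the detected-cell means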
