Skip to content

Commit

Permalink
near complete docstrings coverage. redefined some lambda functions as…
Browse files Browse the repository at this point in the history
… regular functions or removed entirely
  • Loading branch information
parashardhapola committed Jul 29, 2021
1 parent f72d2c7 commit b454bdf
Showing 1 changed file with 64 additions and 32 deletions.
96 changes: 64 additions & 32 deletions scarf/assay.py
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ def __init__(
Args:
z (zarr.Group): Zarr hierarchy where raw data is located
name (str): A label/name for assay.
cell_data: Metadata for the cells.
cell_data: Metadata class object for the cell attributes.
nthreads: number for threads to use for dask parallel computations
min_cells_per_feature:
"""
Expand Down Expand Up @@ -339,7 +339,7 @@ def _get_summary_stats_loc(cell_key: str) -> Tuple[str, str]:
cell_key: Name of the key (column) from cell attribute table
Returns: A tuple of two strings. First is the text that will be prepended to column
names when summary statistics are loaded onto the feature metadata table. The
names when summary statistics are loaded onto the feature attributes table. The
second is the location of the summary statistics group in the zarr hierarchy of
the assay.
Expand Down Expand Up @@ -543,9 +543,7 @@ def __repr__(self):

class RNAassay(Assay):
"""
This assay is designed for feature selection and normalization of scRNA-Seq data.
Subclass of Assay.
This subclass of Assay is designed for feature selection and normalization of scRNA-Seq data.
"""

def __init__(self, z: zarr.hierarchy, name: str, cell_data: MetaData, **kwargs):
Expand All @@ -554,7 +552,7 @@ def __init__(self, z: zarr.hierarchy, name: str, cell_data: MetaData, **kwargs):
Args:
z (zarr.Group): Zarr hierarchy where raw data is located
name (str): A label/name for assay.
cell_data: Metadata for the cells.
cell_data: Metadata class object for the cell attributes.
**kwargs:
"""
super().__init__(z, name, cell_data, **kwargs)
Expand All @@ -576,7 +574,7 @@ def normed(
) -> daskarr:
"""
This function normalizes the raw data and returns a delayed dask array of the normalized
data. Unlike the `normed` method in the generic Assay class this method is optimized for RNA-Seq data and
data. Unlike the `normed` method in the generic Assay class this method is optimized for scRNA-Seq data and
takes additional parameters that will be used by `norm_lib_size` (default normalization
method for this class).
Expand Down Expand Up @@ -743,9 +741,11 @@ def mark_hvgs(
**plot_kwargs: Keyword arguments for matplotlib.pyplot.scatter function
"""

def col_renamer(x):
return f"{identifier}_{x}"

self.set_feature_stats(cell_key, min_cells)
identifier = self._load_stats_loc(cell_key)
col_renamer = lambda x: f"{identifier}_{x}"
c_var_col = f"c_var__{n_bins}__{lowess_frac}"
if col_renamer(c_var_col) in self.feats.columns:
logger.info("Using existing corrected dispersion values")
Expand Down Expand Up @@ -818,14 +818,18 @@ def mark_hvgs(


class ATACassay(Assay):
# TODO: add docstring
"""
This subclass of Assay is designed for feature selection and normalization of scATAC-Seq data.
"""

def __init__(self, z: zarr.hierarchy, name: str, cell_data: MetaData, **kwargs):
"""
This Assay subclass is designed for feature selection and normalization of scATAC-Seq data
Args:
z:
name:
cell_data:
z (zarr.Group): Zarr hierarchy where raw data is located
name (str): A label/name for assay.
cell_data: Metadata class object for the cell attributes.
**kwargs:
"""
super().__init__(z, name, cell_data, **kwargs)
Expand All @@ -836,15 +840,24 @@ def __init__(self, z: zarr.hierarchy, name: str, cell_data: MetaData, **kwargs):

def normed(
self, cell_idx: np.ndarray = None, feat_idx: np.ndarray = None, **kwargs
):
) -> daskarr:
"""
This function normalizes the raw data and returns a delayed dask array of the normalized
data. Unlike the `normed` method in the generic Assay class this method is optimized for scATAC-Seq data.
This method uses the normalization indicated by attribute self.normMethod which by default is set to
`norm_tf_idf`. The TF-IDF normalization is performed using only the cells and features indicated by the
'cell_idx' and 'feat_idx' parameters.
Args:
cell_idx:
feat_idx:
cell_idx: Indices of cells to be included in the normalized matrix
(Default value: All those marked True in 'I' column of cell
attribute table)
feat_idx: Indices of features to be included in the normalized matrix
(Default value: All those marked True in 'I' column of
feature attribute table)
**kwargs:
Returns:
Returns: A dask array (delayed matrix) containing normalized data.
"""
if cell_idx is None:
Expand All @@ -859,11 +872,13 @@ def normed(

def set_feature_stats(self, cell_key: str) -> None:
"""
Calculates prevalence of each valid feature of the assay using only cells that are marked True by the
'cell_key' parameter. Prevalence of a feature is the sum of all its TF-IDF normalized values across cells.
Args:
cell_key:
cell_key: Name of the key (column) from cell attribute table.
Returns:
Returns: None
"""
feat_key = "I" # Here we choose to calculate stats for all the features
Expand Down Expand Up @@ -892,13 +907,17 @@ def mark_prevalent_peaks(
self, cell_key: str, top_n: int, prevalence_key_name: str
) -> None:
"""
Marks `top_n` peaks with highest prevalence as prevalent peaks.
Args:
cell_key:
top_n:
prevalence_key_name:
cell_key: Cells to use for selection of most prevalent peaks. The provided value for `cell_key` should be a
column in cell attributes table with boolean values.
top_n: Number of top prevalent peaks to be selected.
prevalence_key_name: Base label for marking prevalent peaks in the features attributes column. The value for
'cell_key' parameter is prepended to this value.
Returns:
Returns: None
"""
if top_n >= self.feats.N:
Expand All @@ -909,9 +928,8 @@ def mark_prevalent_peaks(
raise TypeError("ERROR: n_top must a positive integer value")
self.set_feature_stats(cell_key)
identifier = self._load_stats_loc(cell_key)
col_renamer = lambda x: f"{identifier}_{x}"
idx = (
pd.Series(self.feats.fetch_all(col_renamer("prevalence")))
pd.Series(self.feats.fetch_all(f"{identifier}_prevalence"))
.sort_values(ascending=False)[:top_n]
.index
)
Expand All @@ -926,30 +944,44 @@ def mark_prevalent_peaks(


class ADTassay(Assay):
# TODO: add docstring
"""
This subclass of Assay is designed for normalization of ADT/HTO (feature-barcodes library) data from
CITE-Seq experiments.
"""

def __init__(self, z: zarr.hierarchy, name: str, cell_data: MetaData, **kwargs):
"""
This subclass of Assay is designed for normalization of ADT/HTO (feature-barcodes library) data from
CITE-Seq experiments.
Args:
z:
name:
cell_data:
z (zarr.Group): Zarr hierarchy where raw data is located
name (str): A label/name for assay.
cell_data: Metadata class object for the cell attributes.
**kwargs:
"""
super().__init__(z, name, cell_data, **kwargs)
self.normMethod = norm_clr

def normed(
self, cell_idx: np.ndarray = None, feat_idx: np.ndarray = None, **kwargs
):
) -> daskarr:
"""
This function normalizes the raw data and returns a delayed dask array of the normalized
data. This method uses the normalization indicated by attribute self.normMethod which by default is set to
`norm_clr`. The centered log-ratio normalization is performed using only the cells and features indicated by the
'cell_idx' and 'feat_idx' parameters.
Args:
cell_idx:
feat_idx:
cell_idx: Indices of cells to be included in the normalized matrix
(Default value: All those marked True in 'I' column of cell
attribute table)
feat_idx: Indices of features to be included in the normalized matrix
(Default value: All those marked True in 'I' column of
feature attribute table)
**kwargs:
Returns:
Returns: A dask array (delayed matrix) containing normalized data.
"""
if cell_idx is None:
Expand Down

0 comments on commit b454bdf

Please sign in to comment.