Skip to content

Commit

Permalink
Merge pull request #466 from pepkit/443_add_classmethod
Browse files Browse the repository at this point in the history
Updated initiation object method with class methods
  • Loading branch information
nsheff authored Dec 18, 2023
2 parents e7e923d + 4316596 commit 2ec46e8
Show file tree
Hide file tree
Showing 10 changed files with 237 additions and 39 deletions.
4 changes: 4 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm

_Due to the changes mentioned above, a few item functionalities may be disabled. For example, the `name` and `description` properties can now be accessed and modified using attribute functionality_

### Added
- Constructor class methods: `from_dict`, `from_pandas`, `from_pep_config`, `from_sample_yaml`


## [0.35.7] -- 2023-07-19
### Fixed
- incorrect setting of sample and subsample indexes using from_dict function (#452)
Expand Down
75 changes: 75 additions & 0 deletions docs/initialize.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
# How to initialize a peppy project using different methods

peppy supports multiple ways to initialize a project. The most common way is to use a configuration file.
However, peppy can also read a sample sheet directly, either as a CSV file or as a YAML file.
Additionally, a project can be initialized from in-memory Python objects such as a pandas DataFrame or a dictionary.

## 1. Using a configuration file
```python
import peppy
project = peppy.Project.from_pep_config("path/to/project/config.yaml")
```

## 2. Using a CSV file (sample sheet)
```python
import peppy
project = peppy.Project.from_pep_config("path/to/project/sample_sheet.csv")
```

## 3. Using a YAML sample sheet

```python
import peppy

project = peppy.Project.from_sample_yaml("path/to/project/sample_sheet.yaml")
```


## 4. Using a pandas dataframe
```python
import pandas as pd
import peppy
df = pd.read_csv("path/to/project/sample_sheet.csv")
project = peppy.Project.from_pandas(df)
```

## 5. Using a peppy-generated dict
```python
import peppy
project = peppy.Project.from_dict(
{'_config': {'description': None,
'name': 'example_basic',
'pep_version': '2.0.0',
'sample_table': 'sample_table.csv',},
'_sample_dict': [{'organism': 'pig', 'sample_name': 'pig_0h', 'time': '0'},
{'organism': 'pig', 'sample_name': 'pig_1h', 'time': '1'},
{'organism': 'frog', 'sample_name': 'frog_0h', 'time': '0'},
{'organism': 'frog', 'sample_name': 'frog_1h', 'time': '1'}],
'_subsample_list': [[{'read1': 'frog1a_data.txt',
'read2': 'frog1a_data2.txt',
'sample_name': 'frog_0h'},
{'read1': 'frog1b_data.txt',
'read2': 'frog1b_data2.txt',
'sample_name': 'pig_0h'},
{'read1': 'frog1c_data.txt',
'read2': 'frog1b_data2.txt',
'sample_name': 'pig_0h'}]]})
```

## 5.1 Generate a dict from a peppy project and reuse it
```python
import peppy

project = peppy.Project("https://raw.githubusercontent.com/pepkit/example_peps/master/example_basic/sample_table.csv")
project_dict = project.to_dict(extended=True)
project_copy = peppy.Project.from_dict(project_dict)

# now you can check if this project is the same as the original project
print(project_copy == project)
```

## 6. Using a CSV file from a URL
```python
import peppy
project = peppy.Project("https://raw.githubusercontent.com/pepkit/example_peps/master/example_basic/sample_table.csv")
```
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ nav:
- How to use append sample modifier: feature1_append.md
- How to use imply sample modifier: feature2_imply.md
- How to validate a PEP: validating.md
- How to initialize a peppy project: initialize.md
- Reference:
- API: autodoc_build/peppy.md
- Support: support.md
Expand Down
2 changes: 1 addition & 1 deletion peppy/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.40.0a5"
__version__ = "0.40.0a6"
114 changes: 83 additions & 31 deletions peppy/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

import numpy as np
import pandas as pd
import yaml
from pandas.core.common import flatten
from rich.console import Console
from rich.progress import track
Expand Down Expand Up @@ -91,9 +92,6 @@ class Project(MutableMapping):
:param str | Iterable[str] amendments: names of the amendments to activate
:param Iterable[str] amendments: amendments to use within configuration file
:param bool defer_samples_creation: whether the sample creation should be skipped
:param Dict[Any]: dict representation of the project {_config: str,
_samples: list | dict,
_subsamples: list[list | dict]}
:Example:
Expand All @@ -111,7 +109,6 @@ def __init__(
sample_table_index: Union[str, Iterable[str]] = None,
subsample_table_index: Union[str, Iterable[str]] = None,
defer_samples_creation: bool = False,
from_dict: dict = None,
):
_LOGGER.debug(
"Creating {}{}".format(
Expand Down Expand Up @@ -166,49 +163,66 @@ def __init__(
self._sample_table = self._get_table_from_samples(
index=self.st_index, initial=True
)
if from_dict:
self.from_dict(from_dict)

def __eq__(self, other):
    """
    Compare two projects by the dict representation of their samples.

    Only the samples are compared: both sides are converted to a list of
    ``sample.to_dict()`` results and those lists are checked for equality
    (so sample order matters).

    :param other: object to compare with
    :return bool | NotImplemented: True/False for objects exposing
        ``samples``; NotImplemented for anything else
    """
    # An object without `samples` cannot be compared here. Returning
    # NotImplemented lets Python try the reflected comparison and fall back
    # to False instead of surfacing an AttributeError from `==`.
    if not hasattr(other, "samples"):
        return NotImplemented
    # `samples` may be None when nothing was loaded (the class itself checks
    # for that case), so treat None as "no samples" rather than iterating it.
    own_samples = [s.to_dict() for s in (self.samples or [])]
    other_samples = [s.to_dict() for s in (other.samples or [])]
    return own_samples == other_samples

@classmethod
def from_pandas(
self,
cls,
samples_df: pd.DataFrame,
sub_samples_df: List[pd.DataFrame] = None,
config: dict = None,
) -> "Project":
):
"""
Init a peppy project instance from a pandas Dataframe
:param samples_df: in-memory pandas DataFrame object of samples
:param sub_samples_df: in-memory list of pandas DataFrame objects of sub-samples
:param config: dict of yaml file
"""
tmp_obj = cls()
if not config:
config = {CONFIG_VERSION_KEY: PEP_LATEST_VERSION}
self[SAMPLE_DF_KEY] = samples_df.replace(np.nan, "")
self[SUBSAMPLE_DF_KEY] = sub_samples_df
tmp_obj[SAMPLE_DF_KEY] = samples_df.replace(np.nan, "")
tmp_obj[SUBSAMPLE_DF_KEY] = sub_samples_df

self[SAMPLE_DF_LARGE] = self[SAMPLE_DF_KEY].shape[0] > 1000
tmp_obj[SAMPLE_DF_LARGE] = tmp_obj[SAMPLE_DF_KEY].shape[0] > 1000

self[CONFIG_KEY] = config
tmp_obj[CONFIG_KEY] = config

self.create_samples(modify=False if self[SAMPLE_TABLE_FILE_KEY] else True)
self._sample_table = self._get_table_from_samples(
index=self.st_index, initial=True
tmp_obj.create_samples(modify=False if tmp_obj[SAMPLE_TABLE_FILE_KEY] else True)
tmp_obj._sample_table = tmp_obj._get_table_from_samples(
index=tmp_obj.st_index, initial=True
)
return self
return tmp_obj

def from_dict(self, pep_dictionary: dict) -> "Project":
@classmethod
def from_dict(cls, pep_dictionary: dict):
"""
Init a peppy project instance from a dictionary representation
of an already processed PEP.
:param dict pep_dictionary: in-memory dict representation of pep.
:param Dict[Any] pep_dictionary: dict representation of the project {_config: dict,
_samples: list | dict,
_subsamples: list[list | dict]}
"""
_LOGGER.info("Processing project from dictionary...")
temp_obj = cls()
return temp_obj._from_dict(pep_dictionary)

def _from_dict(self, pep_dictionary) -> "Project":
"""
Initiate a peppy project instance from a dictionary representation of an already processed PEP.
# This function is needed in looper to reinit the project after it was created from a dictionary representation.
:param Dict[Any] pep_dictionary: dict representation of the project {_config: dict,
_samples: list | dict,
_subsamples: list[list | dict]}
"""
self[SAMPLE_DF_KEY] = pd.DataFrame(pep_dictionary[SAMPLE_RAW_DICT_KEY])
self[CONFIG_KEY] = pep_dictionary[CONFIG_KEY]

Expand All @@ -233,6 +247,50 @@ def from_dict(self, pep_dictionary: dict) -> "Project":

return self

@classmethod
def from_pep_config(
    cls,
    cfg: str = None,
    amendments: Union[str, Iterable[str]] = None,
    sample_table_index: Union[str, Iterable[str]] = None,
    subsample_table_index: Union[str, Iterable[str]] = None,
    defer_samples_creation: bool = False,
):
    """
    Init a peppy project instance from a project config or a sample table.

    Every argument is forwarded, by keyword, to the class constructor.

    :param str cfg: Project config file (YAML) or sample table (CSV/TSV)
        with one row per sample to constitute project
    :param str | Iterable[str] amendments: names of the amendments to
        activate within the configuration file
    :param str | Iterable[str] sample_table_index: name of the columns to set
        the sample_table index to
    :param str | Iterable[str] subsample_table_index: name of the columns to
        set the subsample_table index to
    :param bool defer_samples_creation: whether the sample creation should
        be skipped
    :return: the object produced by calling ``cls`` with these arguments
    """
    # TODO: this only mirrors the __init__ signature; it should be refactored
    constructor_kwargs = {
        "cfg": cfg,
        "amendments": amendments,
        "sample_table_index": sample_table_index,
        "subsample_table_index": subsample_table_index,
        "defer_samples_creation": defer_samples_creation,
    }
    return cls(**constructor_kwargs)

@classmethod
def from_sample_yaml(cls, yaml_file: str):
    """
    Init a peppy project instance from a YAML sample sheet.

    The file is parsed with ``yaml.safe_load``, the result is turned into a
    pandas DataFrame and handed to ``cls.from_pandas``.

    :param str yaml_file: path to the YAML file holding the samples
    :return: whatever ``cls.from_pandas`` returns for the loaded table
    """
    _LOGGER.info("Processing project from yaml...")
    with open(yaml_file, "r") as yaml_handle:
        loaded_samples = yaml.safe_load(yaml_handle)
    # NOTE(review): presumably the YAML is a list of per-sample mappings
    # (one entry per row) -- confirm against the example sample.yaml.
    samples_frame = pd.DataFrame.from_dict(loaded_samples)
    return cls.from_pandas(samples_frame)

def to_dict(
self,
# expand: bool = False, # expand was used to expand paths. This functionality was removed, because of attmapp
Expand All @@ -244,7 +302,6 @@ def to_dict(
"""
Convert the Project object to a dictionary.
:param bool expand: whether to expand the paths
:param bool extended: whether to produce complete project dict (used to reinit the project)
:param Literal orient: orientation of the returned df
:return dict: a dictionary representation of the Project object
Expand All @@ -256,7 +313,10 @@ def to_dict(
]
else:
sub_df = None
self[CONFIG_KEY][NAME_KEY] = self.name
try:
self[CONFIG_KEY][NAME_KEY] = self.name
except NotImplementedError:
self[CONFIG_KEY][NAME_KEY] = "unnamed"
self[CONFIG_KEY][DESC_KEY] = self.description
p_dict = {
SAMPLE_RAW_DICT_KEY: self[SAMPLE_DF_KEY].to_dict(orient=orient),
Expand All @@ -275,7 +335,7 @@ def create_samples(self, modify: bool = False):
"""
self._samples: List[Sample] = self.load_samples()
if self.samples is None:
_LOGGER.info("No samples found in the project.")
_LOGGER.debug("No samples found in the project.")

if modify:
self.modify_samples()
Expand Down Expand Up @@ -1406,18 +1466,10 @@ def __delitem__(self, key):
def __repr__(self):
return str(self)

# pickle now is impossible, because it's impossible to initialize Project class without using actual files
def __reduce__(self):
return (
self.__class__,
(
None,
None,
None,
None,
False,
self.to_dict(extended=True, orient="records"),
),
self.__class__.from_dict,
(self.to_dict(extended=True, orient="records"),),
)


Expand Down
30 changes: 27 additions & 3 deletions peppy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

import logging
import os
from typing import Dict
from typing import Dict, Mapping
from urllib.request import urlopen

import yaml
Expand Down Expand Up @@ -104,6 +104,30 @@ def _raise_faulty_arg():
_raise_faulty_arg()


def _expandpath(path: str):
    """
    Expand a filesystem path that may or may not contain user/env vars.

    :param str path: path to expand
    :return str: expanded version of input path
    """
    # `~` is resolved first, then `$VAR` / `${VAR}` references.
    return os.path.expandvars(os.path.expanduser(path))


def expand_paths(x: dict) -> dict:
    """
    Recursively expand paths in a dict.

    String values are expanded with ``_expandpath``; mappings are rebuilt
    with every value processed recursively (the input is not modified);
    any other value is returned unchanged.

    :param dict x: dict to expand (the recursion also passes in the nested
        values, so strings and arbitrary objects are accepted too)
    :return dict: dict with expanded paths
    """
    if isinstance(x, str):
        # Use the helper defined in this module so the function does not
        # depend on an `expandpath` name being available from elsewhere.
        return _expandpath(x)
    if isinstance(x, Mapping):
        return {key: expand_paths(value) for key, value in x.items()}
    # NOTE(review): lists are returned as-is, so strings nested inside a
    # list are not expanded -- confirm this is intended.
    return x


def load_yaml(filepath):
"""
Load a local or remote YAML file into a Python dict
Expand All @@ -123,11 +147,11 @@ def load_yaml(filepath):
)
else:
data = response.read().decode("utf-8")
return yaml.safe_load(data)
return expand_paths(yaml.safe_load(data))
else:
with open(os.path.abspath(filepath), "r") as f:
data = yaml.safe_load(f)
return data
return expand_paths(data)


def is_cfg_or_anno(file_path, formats=None):
Expand Down
1 change: 0 additions & 1 deletion requirements/requirements-all.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,4 +3,3 @@ pyyaml
rich>=10.3.0
ubiquerg>=0.6.2
numpy
yacman>=0.9.0
5 changes: 5 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,11 @@ def example_pep_csv_path(request):
return get_path_to_example_file(EPB, request.param, "sample_table.csv")


@pytest.fixture
def example_yaml_sample_file(request):
    """Resolve ``sample.yaml`` via ``get_path_to_example_file`` for the parametrized value."""
    # `request.param` carries the value supplied through test parametrization.
    selected_example = request.param
    return get_path_to_example_file(EPB, selected_example, "sample.yaml")


@pytest.fixture
def example_pep_nextflow_csv_path():
return get_path_to_example_file(EPB, "nextflow_taxprofiler_pep", "samplesheet.csv")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
- sample_name: sample1
file: path/to/file.tsv
- sample_name: sample2
file: path/to/2.tsv
- sample_name: sample3
file: path/to/3.tsv
Loading

0 comments on commit 2ec46e8

Please sign in to comment.