Merge pull request #466 from pepkit/443_add_classmethod

Updated initiation object method with class methods
pepkit · Dec 18, 2023 · 2ec46e8 · 2ec46e8
2 parents e7e923d + 4316596
commit 2ec46e8
Show file tree

Hide file tree

Showing 10 changed files with 237 additions and 39 deletions.
diff --git a/docs/changelog.md b/docs/changelog.md
@@ -13,6 +13,10 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 
 _Due to the changes mentioned above, a few item functionalities may be disabled. For example, the `name` and `description` properties can now be accessed and modified using attribute functionality_
 
+### Added
+- Constructor methods: `from_dict`, `from_pandas`, `from_pep_config`, `from_sample_yaml`
+
+
 ## [0.35.7] -- 2023-07-19
 ### Fixed 
 - incorrect setting of sample and subsample indexes using from_dict function (#452)

diff --git a/docs/initialize.md b/docs/initialize.md
@@ -0,0 +1,75 @@
+# How to initialize a peppy project using different methods
+
+peppy supports multiple ways to initialize a project. The most common way is to use a configuration file.
+However, a project can also be created directly from a sample sheet stored as a CSV file or as a YAML file.
+Additionally, a project can be initialized from in-memory Python objects such as a pandas DataFrame or a dictionary.
+
+## 1. Using a configuration file
+```python
+import peppy
+project = peppy.Project.from_pep_config("path/to/project/config.yaml")
+```
+
+## 2. Using a CSV file (sample sheet)
+```python
+import peppy
+project = peppy.Project.from_pep_config("path/to/project/sample_sheet.csv")
+```
+
+## 3. Using a YAML sample sheet
+
+```python
+import peppy
+
+project = peppy.Project.from_sample_yaml("path/to/project/sample_sheet.yaml")
+```
+
+
+## 4. Using a pandas dataframe
+```python
+import pandas as pd
+import peppy
+df = pd.read_csv("path/to/project/sample_sheet.csv")
+project = peppy.Project.from_pandas(df)
+```
+
+## 5. Using a peppy-generated dict
+```python
+import peppy
+project = peppy.Project.from_dict(
+    {'_config': {'description': None,
+                 'name': 'example_basic',
+                 'pep_version': '2.0.0',
+                 'sample_table': 'sample_table.csv',},
+    '_sample_dict': [{'organism': 'pig', 'sample_name': 'pig_0h', 'time': '0'},
+                     {'organism': 'pig', 'sample_name': 'pig_1h', 'time': '1'},
+                     {'organism': 'frog', 'sample_name': 'frog_0h', 'time': '0'},
+                     {'organism': 'frog', 'sample_name': 'frog_1h', 'time': '1'}],
+    '_subsample_list': [[{'read1': 'frog1a_data.txt',
+                       'read2': 'frog1a_data2.txt',
+                       'sample_name': 'frog_0h'},
+                      {'read1': 'frog1b_data.txt',
+                       'read2': 'frog1b_data2.txt',
+                       'sample_name': 'pig_0h'},
+                      {'read1': 'frog1c_data.txt',
+                       'read2': 'frog1b_data2.txt',
+                       'sample_name': 'pig_0h'}]]})
+```
+
+## 5.1 Generate a dict from a peppy project and reuse it
+```python
+import peppy
+
+project = peppy.Project("https://raw.githubusercontent.com/pepkit/example_peps/master/example_basic/sample_table.csv")
+project_dict = project.to_dict(extended=True)
+project_copy = peppy.Project.from_dict(project_dict)
+
+# now you can check if this project is the same as the original project
+print(project_copy == project)
+```
+
+## 6. Using a CSV file from a URL
+```python
+import peppy
+project = peppy.Project("https://raw.githubusercontent.com/pepkit/example_peps/master/example_basic/sample_table.csv")
+```
diff --git a/mkdocs.yml b/mkdocs.yml
@@ -14,6 +14,7 @@ nav:
     - How to use append sample modifier: feature1_append.md
     - How to use imply sample modifier: feature2_imply.md
     - How to validate a PEP: validating.md
+    - How to initialize a peppy project: initialize.md
   - Reference:
     - API: autodoc_build/peppy.md
     - Support: support.md

diff --git a/peppy/_version.py b/peppy/_version.py
@@ -1 +1 @@
-__version__ = "0.40.0a5"
+__version__ = "0.40.0a6"
diff --git a/peppy/project.py b/peppy/project.py
@@ -10,6 +10,7 @@
 
 import numpy as np
 import pandas as pd
+import yaml
 from pandas.core.common import flatten
 from rich.console import Console
 from rich.progress import track
@@ -91,9 +92,6 @@ class Project(MutableMapping):
     :param str | Iterable[str] amendments: names of the amendments to activate
     :param Iterable[str] amendments: amendments to use within configuration file
     :param bool defer_samples_creation: whether the sample creation should be skipped
-    :param Dict[Any]: dict representation of the project {_config: str,
-                                                          _samples: list | dict,
-                                                          _subsamples: list[list | dict]}
 
     :Example:
 
@@ -111,7 +109,6 @@ def __init__(
         sample_table_index: Union[str, Iterable[str]] = None,
         subsample_table_index: Union[str, Iterable[str]] = None,
         defer_samples_creation: bool = False,
-        from_dict: dict = None,
     ):
         _LOGGER.debug(
             "Creating {}{}".format(
@@ -166,49 +163,66 @@ def __init__(
         self._sample_table = self._get_table_from_samples(
             index=self.st_index, initial=True
         )
-        if from_dict:
-            self.from_dict(from_dict)
 
     def __eq__(self, other):
         return [s.to_dict() for s in self.samples] == [
             s.to_dict() for s in other.samples
         ]
 
+    @classmethod
     def from_pandas(
-        self,
+        cls,
         samples_df: pd.DataFrame,
         sub_samples_df: List[pd.DataFrame] = None,
         config: dict = None,
-    ) -> "Project":
+    ):
         """
         Init a peppy project instance from a pandas Dataframe
+
         :param samples_df: in-memory pandas DataFrame object of samples
         :param sub_samples_df: in-memory list of pandas DataFrame objects of sub-samples
         :param config: dict of yaml file
         """
+        tmp_obj = cls()
         if not config:
             config = {CONFIG_VERSION_KEY: PEP_LATEST_VERSION}
-        self[SAMPLE_DF_KEY] = samples_df.replace(np.nan, "")
-        self[SUBSAMPLE_DF_KEY] = sub_samples_df
+        tmp_obj[SAMPLE_DF_KEY] = samples_df.replace(np.nan, "")
+        tmp_obj[SUBSAMPLE_DF_KEY] = sub_samples_df
 
-        self[SAMPLE_DF_LARGE] = self[SAMPLE_DF_KEY].shape[0] > 1000
+        tmp_obj[SAMPLE_DF_LARGE] = tmp_obj[SAMPLE_DF_KEY].shape[0] > 1000
 
-        self[CONFIG_KEY] = config
+        tmp_obj[CONFIG_KEY] = config
 
-        self.create_samples(modify=False if self[SAMPLE_TABLE_FILE_KEY] else True)
-        self._sample_table = self._get_table_from_samples(
-            index=self.st_index, initial=True
+        tmp_obj.create_samples(modify=False if tmp_obj[SAMPLE_TABLE_FILE_KEY] else True)
+        tmp_obj._sample_table = tmp_obj._get_table_from_samples(
+            index=tmp_obj.st_index, initial=True
         )
-        return self
+        return tmp_obj
 
-    def from_dict(self, pep_dictionary: dict) -> "Project":
+    @classmethod
+    def from_dict(cls, pep_dictionary: dict):
         """
         Init a peppy project instance from a dictionary representation
         of an already processed PEP.
-        :param dict pep_dictionary: in-memory dict representation of pep.
+
+        :param Dict[Any] pep_dictionary: dict representation of the project {_config: dict,
+                                                                             _samples: list | dict,
+                                                                             _subsamples: list[list | dict]}
         """
         _LOGGER.info("Processing project from dictionary...")
+        temp_obj = cls()
+        return temp_obj._from_dict(pep_dictionary)
+
+    def _from_dict(self, pep_dictionary) -> "Project":
+        """
+        Initiate a peppy project instance from a dictionary representation of an already processed PEP.
 
+        # This function is needed in looper to reinit the project after it was created from a dictionary representation.
+
+        :param Dict[Any] pep_dictionary: dict representation of the project {_config: dict,
+                                                                             _samples: list | dict,
+                                                                             _subsamples: list[list | dict]}
+        """
         self[SAMPLE_DF_KEY] = pd.DataFrame(pep_dictionary[SAMPLE_RAW_DICT_KEY])
         self[CONFIG_KEY] = pep_dictionary[CONFIG_KEY]
 
@@ -233,6 +247,50 @@ def from_dict(self, pep_dictionary: dict) -> "Project":
 
         return self
 
+    @classmethod
+    def from_pep_config(
+        cls,
+        cfg: str = None,
+        amendments: Union[str, Iterable[str]] = None,
+        sample_table_index: Union[str, Iterable[str]] = None,
+        subsample_table_index: Union[str, Iterable[str]] = None,
+        defer_samples_creation: bool = False,
+    ):
+        """
+        Init a peppy project instance from a PEP config file (YAML) or a sample table (CSV/TSV)
+
+        :param str cfg: Project config file (YAML) or sample table (CSV/TSV)
+            with one row per sample to constitute project
+        :param str | Iterable[str] sample_table_index: name of the columns to set
+            the sample_table index to
+        :param str | Iterable[str] subsample_table_index: name of the columns to set
+            the subsample_table index to
+        :param str | Iterable[str] amendments: names of the amendments to activate
+        :param Iterable[str] amendments: amendments to use within configuration file
+        :param bool defer_samples_creation: whether the sample creation should be skipped
+        """
+        # TODO: this is just a copy of the __init__ method. It should be refactored
+        return cls(
+            cfg=cfg,
+            amendments=amendments,
+            sample_table_index=sample_table_index,
+            subsample_table_index=subsample_table_index,
+            defer_samples_creation=defer_samples_creation,
+        )
+
+    @classmethod
+    def from_sample_yaml(cls, yaml_file: str):
+        """
+        Init a peppy project instance from a yaml file
+
+        :param str yaml_file: path to yaml file
+        """
+        _LOGGER.info("Processing project from yaml...")
+        with open(yaml_file, "r") as f:
+            prj_dict = yaml.safe_load(f)
+        pd_df = pd.DataFrame.from_dict(prj_dict)
+        return cls.from_pandas(pd_df)
+
     def to_dict(
         self,
         # expand: bool = False, # expand was used to expand paths. This functionality was removed, because of attmapp
@@ -244,7 +302,6 @@ def to_dict(
         """
         Convert the Project object to a dictionary.
 
-        :param bool expand: whether to expand the paths
         :param bool extended: whether to produce complete project dict (used to reinit the project)
         :param Literal orient: orientation of the returned df
         :return dict: a dictionary representation of the Project object
@@ -256,7 +313,10 @@ def to_dict(
                 ]
             else:
                 sub_df = None
-            self[CONFIG_KEY][NAME_KEY] = self.name
+            try:
+                self[CONFIG_KEY][NAME_KEY] = self.name
+            except NotImplementedError:
+                self[CONFIG_KEY][NAME_KEY] = "unnamed"
             self[CONFIG_KEY][DESC_KEY] = self.description
             p_dict = {
                 SAMPLE_RAW_DICT_KEY: self[SAMPLE_DF_KEY].to_dict(orient=orient),
@@ -275,7 +335,7 @@ def create_samples(self, modify: bool = False):
         """
         self._samples: List[Sample] = self.load_samples()
         if self.samples is None:
-            _LOGGER.info("No samples found in the project.")
+            _LOGGER.debug("No samples found in the project.")
 
         if modify:
             self.modify_samples()
@@ -1406,18 +1466,10 @@ def __delitem__(self, key):
     def __repr__(self):
         return str(self)
 
-    # pickle now is impossible, because it's impossible to initialize Project class without using actual files
     def __reduce__(self):
         return (
-            self.__class__,
-            (
-                None,
-                None,
-                None,
-                None,
-                False,
-                self.to_dict(extended=True, orient="records"),
-            ),
+            self.__class__.from_dict,
+            (self.to_dict(extended=True, orient="records"),),
         )
 
 

diff --git a/peppy/utils.py b/peppy/utils.py
@@ -2,7 +2,7 @@
 
 import logging
 import os
-from typing import Dict
+from typing import Dict, Mapping
 from urllib.request import urlopen
 
 import yaml
@@ -104,6 +104,30 @@ def _raise_faulty_arg():
         _raise_faulty_arg()
 
 
+def _expandpath(path: str):
+    """
+    Expand a filesystem path that may or may not contain user/env vars.
+
+    :param str path: path to expand
+    :return str: expanded version of input path
+    """
+    return os.path.expandvars(os.path.expanduser(path))
+
+
+def expand_paths(x: dict) -> dict:
+    """
+    Recursively expand paths in a dict.
+
+    :param dict x: dict to expand
+    :return dict: dict with expanded paths
+    """
+    if isinstance(x, str):
+        return expandpath(x)
+    elif isinstance(x, Mapping):
+        return {k: expand_paths(v) for k, v in x.items()}
+    return x
+
+
 def load_yaml(filepath):
     """
     Load a local or remote YAML file into a Python dict
@@ -123,11 +147,11 @@ def load_yaml(filepath):
             )
         else:
             data = response.read().decode("utf-8")
-            return yaml.safe_load(data)
+            return expand_paths(yaml.safe_load(data))
     else:
         with open(os.path.abspath(filepath), "r") as f:
             data = yaml.safe_load(f)
-        return data
+        return expand_paths(data)
 
 
 def is_cfg_or_anno(file_path, formats=None):

diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt
@@ -3,4 +3,3 @@ pyyaml
 rich>=10.3.0
 ubiquerg>=0.6.2
 numpy
-yacman>=0.9.0
diff --git a/tests/conftest.py b/tests/conftest.py
@@ -36,6 +36,11 @@ def example_pep_csv_path(request):
     return get_path_to_example_file(EPB, request.param, "sample_table.csv")
 
 
+@pytest.fixture
+def example_yaml_sample_file(request):
+    return get_path_to_example_file(EPB, request.param, "sample.yaml")
+
+
 @pytest.fixture
 def example_pep_nextflow_csv_path():
     return get_path_to_example_file(EPB, "nextflow_taxprofiler_pep", "samplesheet.csv")

diff --git a/tests/data/example_peps-master/example_basic_sample_yaml/sample.yaml b/tests/data/example_peps-master/example_basic_sample_yaml/sample.yaml
@@ -0,0 +1,6 @@
+- sample_name: sample1
+  file: path/to/file.tsv
+- sample_name: sample2
+  file: path/to/2.tsv
+- sample_name: sample3
+  file: path/to/3.tsv