Merge pull request #415 from pepkit/dev
Release v0.35.1
khoroshevskyi authored Sep 7, 2022
2 parents 659b337 + d6f5d43 commit 7197c26
Showing 20 changed files with 1,193 additions and 738 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -2,13 +2,13 @@

![Run pytests](https://github.com/pepkit/peppy/workflows/Run%20pytests/badge.svg)
[![codecov](https://codecov.io/gh/pepkit/peppy/branch/master/graph/badge.svg)](https://codecov.io/gh/pepkit/peppy)
[![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pep.databio.org)
[![PEP compatible](https://pepkit.github.io/img/PEP-compatible-green.svg)](https://pep.databio.org)
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)

`peppy` is the official Python package for reading **Portable Encapsulated Projects** (**PEP**s).

Links to complete documentation:

* Complete documentation and API for the `peppy` python package is at [peppy.databio.org](http://peppy.databio.org).
* Reference documentation for standard **PEP** format is at [pep.databio.org](http://pep.databio.org/).
* Complete documentation and API for the `peppy` python package is at [peppy.databio.org](https://peppy.databio.org).
* Reference documentation for standard **PEP** format is at [pep.databio.org](https://pep.databio.org/).
* Example PEPs for testing `peppy` are in the [example_peps repository](https://github.com/pepkit/example_peps).
12 changes: 12 additions & 0 deletions docs/changelog.md
@@ -2,6 +2,18 @@

This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.

## [0.35.1] -- 2022-09-07
### Changed
- Organization of test files: separated unit tests from smoke tests.

### Fixed
- The root cause of `np.nan` values showing up in pandas DataFrames: the values are now replaced with `None` right after the data is read, which made it possible to remove all custom `np.nan`-to-`None` converters used later in the code.
- Type annotations in some methods.
- Code redundancy in fixtures in conftest.
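
The nan fix above can be sketched as follows. This is an illustrative snippet with hypothetical data, not the exact peppy code:

```python
import numpy as np
import pandas as pd

# Hypothetical sample table: pandas represents missing CSV cells as np.nan.
df = pd.DataFrame(
    {"sample_name": ["frog_1", "frog_2"], "file": ["data/frog1_data.txt", np.nan]}
)

# Replace every NaN with None immediately after reading, so downstream code
# never needs its own nan-to-None conversion.
df = df.where(pd.notnull(df), None)

assert df.loc[1, "file"] is None
```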

### Added
- New test cases with test data.

## [0.35.0] -- 2022-08-25

### Changed
2 changes: 1 addition & 1 deletion peppy/_version.py
@@ -1 +1 @@
__version__ = "0.35.0"
__version__ = "0.35.1"
1 change: 1 addition & 0 deletions peppy/parsers.py
@@ -87,6 +87,7 @@ def parse(self) -> pd.DataFrame:
"""
self.validate_path()
self._table = pd.read_csv(self.path, **self._pandas_kwargs)
self._table = self._table.where(pd.notnull(self._table), None)
return self.table


84 changes: 12 additions & 72 deletions peppy/project.py
@@ -8,6 +8,7 @@
from logging import getLogger
from typing import Dict, Iterable, List, Tuple, Union

import numpy as np
import pandas as pd
from attmap import PathExAttMap
from pandas.core.common import flatten
@@ -50,14 +51,14 @@
SAMPLE_MODIFIERS,
SAMPLE_MODS_KEY,
SAMPLE_NAME_ATTR,
SAMPLE_RAW_DICT_KEY,
SAMPLE_TABLE_FILE_KEY,
SAMPLE_TABLE_INDEX_KEY,
SUBSAMPLE_DF_KEY,
SUBSAMPLE_NAME_ATTR,
SUBSAMPLE_RAW_DICT_KEY,
SUBSAMPLE_TABLE_INDEX_KEY,
SUBSAMPLE_TABLES_FILE_KEY,
SAMPLE_RAW_DICT_KEY,
SUBSAMPLE_RAW_DICT_KEY,
)
from .exceptions import *
from .parsers import select_parser
@@ -166,72 +167,12 @@ def __eq__(self, other):
s.to_dict() for s in other.samples
]

def _convert_to_dict(self, project_value=None):
"""
Recursively transform project values, objects, and attributes into a
dictionary-compatible format. Useful for creating an extended dictionary
representation of the peppy project.
:param object project_value: the value to transform
"""
if isinstance(project_value, list):
new_list = []
for item_value in project_value:
new_list.append(self._convert_to_dict(item_value))
return new_list

elif isinstance(project_value, dict):
new_dict = {}
for key, value in project_value.items():
if key != "_project":
new_dict[key] = self._convert_to_dict(value)
return new_dict

elif isinstance(project_value, PathExAttMap):
new_dict = PathExAttMap.to_dict(project_value)
return self._convert_to_dict(new_dict)

elif isinstance(project_value, Sample):
new_dict = PathExAttMap.to_dict(project_value)
return new_dict

elif isinstance(project_value, pd.DataFrame):
project_value = project_value.to_dict()
project_value = self._nan_converter(project_value)
return project_value

else:
return project_value

def _nan_converter(self, nan_dict: Dict) -> Union[Dict, List]:
"""
Recursively search for nan values and convert them to None
:param dict nan_dict: dictionary with nan values
"""
if isinstance(nan_dict, list):
new_list = []
for list_item in nan_dict:
new_list.append(self._nan_converter(list_item))

return new_list

elif isinstance(nan_dict, dict):
new_dict = {}
for key, value in nan_dict.items():
new_dict[key] = self._nan_converter(value)
return new_dict
elif isinstance(nan_dict, float):
if math.isnan(nan_dict):
return None
else:
return nan_dict

def from_pandas(
self,
samples_df: pd.DataFrame,
sub_samples_df: List[pd.DataFrame] = None,
config: dict = None,
) -> object:
) -> "Project":
"""
Initialize a peppy Project instance from a pandas DataFrame
:param samples_df: in-memory pandas DataFrame object of samples
@@ -253,7 +194,7 @@ def from_pandas(
)
return self

def from_dict(self, pep_dictionary: dict) -> object:
def from_dict(self, pep_dictionary: dict) -> "Project":
"""
Initialize a peppy Project instance from a dictionary representation
of an already processed PEP.
@@ -303,7 +244,6 @@ def to_dict(self, expand: bool = False, extended: bool = False) -> dict:
NAME_KEY: self[NAME_KEY],
DESC_KEY: self[DESC_KEY],
}
p_dict = self._nan_converter(p_dict)
else:
p_dict = self.config.to_dict(expand=expand)
p_dict["_samples"] = [s.to_dict() for s in self.samples]
@@ -472,7 +412,7 @@ def load_samples(self):
self[SUBSAMPLE_DF_KEY] = None

for _, r in self[SAMPLE_DF_KEY].iterrows():
samples_list.append(Sample(r.dropna(), prj=self))
samples_list.append(Sample(r, prj=self))
return samples_list

def modify_samples(self):
@@ -727,10 +667,12 @@ def attr_merge(self):
_LOGGER.debug("No {} found, skipping merge".format(CFG_SUBSAMPLE_TABLE_KEY))
return
for subsample_table in self[SUBSAMPLE_DF_KEY]:
for n in list(subsample_table[self.st_index]):
if n not in [s[self.st_index] for s in self.samples]:
for sample_name in list(subsample_table[self.st_index]):
if sample_name not in [s[self.st_index] for s in self.samples]:
_LOGGER.warning(
("Couldn't find matching sample for subsample: {}").format(n)
("Couldn't find matching sample for subsample: {}").format(
sample_name
)
)
for sample in track(
self.samples,
@@ -749,9 +691,7 @@ def attr_merge(self):
sample_indexer = (
subsample_table[sample_colname] == sample[self.st_index]
)
this_sample_rows = subsample_table[sample_indexer].dropna(
how="all", axis=1
)
this_sample_rows = subsample_table[sample_indexer]
if len(this_sample_rows) == 0:
_LOGGER.debug(
"No merge rows for sample '%s', skipping",
60 changes: 60 additions & 0 deletions tests/conftest.py
@@ -0,0 +1,60 @@
""" Configuration for modules with independent tests of models. """

import os

import pandas as pd
import pytest

__author__ = "Michal Stolarczyk"
__email__ = "[email protected]"

# example_peps branch, see: https://github.com/pepkit/example_peps
EPB = "master"


def merge_paths(pep_branch, directory_name):
return os.path.join(
os.path.dirname(os.path.dirname(os.path.abspath(__file__))),
"tests",
"data",
"example_peps-{}".format(pep_branch),
"example_{}".format(directory_name),
)


def get_path_to_example_file(branch, directory_name, file_name):
return os.path.join(merge_paths(branch, directory_name), file_name)


@pytest.fixture
def example_pep_cfg_path(request):
return get_path_to_example_file(EPB, request.param, "project_config.yaml")


@pytest.fixture
def example_pep_csv_path(request):
return get_path_to_example_file(EPB, request.param, "sample_table.csv")


@pytest.fixture
def example_pep_cfg_noname_path(request):
return get_path_to_example_file(EPB, "noname", request.param)


@pytest.fixture
def example_peps_cfg_paths(request):
"""
Same as the fixture above, but returns a list of paths
(useful for comparing multiple PEPs).
"""
return [
get_path_to_example_file(EPB, p, "project_config.yaml") for p in request.param
]


@pytest.fixture
def config_with_pandas_obj(request):
return pd.read_csv(
get_path_to_example_file(EPB, request.param, "sample_table.csv"), dtype=str
)
@@ -0,0 +1,3 @@
pep_version: "2.1.0"
sample_table: "samplesheet.csv"

@@ -0,0 +1,7 @@
sample,instrument_platform,run_accession,fastq_1,fastq_2,fasta
2611,ILLUMINA,ERR5766174,,,https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fasta/ERX5474930_ERR5766174_1.fa.gz
2612,ILLUMINA,ERR5766176,https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fastq/ERX5474932_ERR5766176_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fastq/ERX5474932_ERR5766176_2.fastq.gz,
2612,ILLUMINA,ERR5766176_B,https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fastq/ERX5474932_ERR5766176_B_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fastq/ERX5474932_ERR5766176_B_2.fastq.gz,
2612,ILLUMINA,ERR5766180,https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fastq/ERX5474936_ERR5766180_1.fastq.gz,,
2613,ILLUMINA,ERR5766181,https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fastq/ERX5474937_ERR5766181_1.fastq.gz,https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fastq/ERX5474937_ERR5766181_2.fastq.gz,
ERR3201952,OXFORD_NANOPORE,ERR3201952,https://raw.githubusercontent.com/nf-core/test-datasets/taxprofiler/data/fastq/ERR3201952.fastq.gz,,
16 changes: 16 additions & 0 deletions tests/data/example_peps-master/example_node_alias/README.md
@@ -0,0 +1,16 @@
# YAML Aliases

You can also use YAML aliases in PEPs, since the config file is just a YAML file. These allow you to define variables within the YAML file, and then re-use these in other places in the file. Unfortunately, you can't import aliases across files (each file must contain its own definitions so it's self-sufficient as a YAML file).

To do it, just define a value as an anchor with the `&` character. Then, recall (duplicate) that value later with the `*` character. For example:

```yaml
list:
- sandwich
- drink
- &thing chips
- crackers
- *thing
```

This will put `chips` in the list twice. The first occurrence defines the anchor with `&thing`; the alias `*thing` then expands to the same value, `chips`.
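
A quick way to see how a YAML parser resolves the alias (this sketch assumes the third-party PyYAML package; the snippet is illustrative and not part of the PEP specification):

```python
import yaml  # third-party PyYAML, assumed available

doc = """
list:
  - sandwich
  - drink
  - &thing chips
  - crackers
  - *thing
"""

# The alias *thing expands to the anchored value "chips" at load time,
# so "chips" appears twice in the resulting list.
data = yaml.safe_load(doc)
assert data["list"] == ["sandwich", "drink", "chips", "crackers", "chips"]
```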
@@ -0,0 +1,8 @@
pep_version: "2.0.0"
sample_table: sample_table.csv
output_dir: $HOME/hello_looper_results
pipeline_dir: $HOME/pipeline_dir

project_modifiers:
import:
- project_config1.yaml
@@ -0,0 +1,13 @@
pep_version: "2.0.0"
sample_modifiers:
append:
imported_attr: imported_val

value: &anchor Foo
testvalue: *anchor
anchors:
- &property value
- &trimmer trimmomatic
- &setting 4

trimmer: *trimmer
@@ -0,0 +1,3 @@
sample_name,protocol,file
frog_1,anySampleType,data/frog1_data.txt
frog_2,anySampleType,data/frog2_data.txt

This file was deleted.

This file was deleted.

This file was deleted.
