Skip to content

Commit

Permalink
Merge pull request #398 from pepkit/dev
Browse files Browse the repository at this point in the history
Release v.0.34.0
  • Loading branch information
rafalstepien authored Aug 18, 2022
2 parents c4075dd + cee4ea3 commit 30e7807
Show file tree
Hide file tree
Showing 8 changed files with 141 additions and 52 deletions.
13 changes: 13 additions & 0 deletions docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,19 @@

This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format.

## [0.34.0] -- 2022-08-17

### Changed

- The way of initializing a project from a dictionary. It is now possible as follows: `Project().from_dict()`

### Fixed

- Fix error that was raised when duplicated sample in `sample_table` had different read types (single-end mixed with paired-end).

### Added

- Feature of initializing `peppy.Project` from `pandas.DataFrame`

## [0.33.0] -- 2022-07-25

### Changed
Expand Down
2 changes: 1 addition & 1 deletion peppy/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.33.0"
__version__ = "0.34.0"
1 change: 1 addition & 0 deletions peppy/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,5 +117,6 @@
"REQUIRED_VERSION",
]

PEP_LATEST_VERSION = "2.1.0"

__all__ = PROJECT_CONSTANTS + SAMPLE_CONSTANTS + OTHER_CONSTANTS
151 changes: 104 additions & 47 deletions peppy/project.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import os
from collections.abc import Mapping
from logging import getLogger
from typing import Dict, List, Tuple, Union
from typing import Dict, List, Tuple, Union, Iterable

import pandas as pd
from attmap import PathExAttMap
Expand Down Expand Up @@ -56,6 +56,7 @@
SUBSAMPLE_NAME_ATTR,
SUBSAMPLE_TABLE_INDEX_KEY,
SUBSAMPLE_TABLES_FILE_KEY,
PEP_LATEST_VERSION,
)

from .exceptions import *
Expand Down Expand Up @@ -99,12 +100,11 @@ class Project(PathExAttMap):

def __init__(
self,
cfg=None,
amendments=None,
sample_table_index=None,
subsample_table_index=None,
defer_samples_creation=False,
project_dict=None,
cfg: str = None,
amendments: Union[str, Iterable[str]] = None,
sample_table_index: Union[str, Iterable[str]] = None,
subsample_table_index: Union[str, Iterable[str]] = None,
defer_samples_creation: bool = False,
):
_LOGGER.debug(
"Creating {}{}".format(
Expand Down Expand Up @@ -151,18 +151,14 @@ def __init__(

self.name = self.infer_name()
self.description = self.get_description()

if not defer_samples_creation:
self.create_samples(modify=False if self[SAMPLE_TABLE_FILE_KEY] else True)
self._sample_table = self._get_table_from_samples(
index=self.st_index, initial=True
)

# init project from dict
if project_dict:
self.from_dict(project_dict)

def __eq__(self, other):

dict_self = self._convert_to_dict(self)
dict_other = self._convert_to_dict(other)

Expand Down Expand Up @@ -231,11 +227,25 @@ def _nan_converter(self, nan_dict: Dict) -> Union[Dict, List]:
else:
return nan_dict

def from_dict(self, pep_dictionary: dict) -> None:
def from_pandas(self, pandas_df: pd.DataFrame) -> object:
    """
    Initialize a peppy Project instance from an in-memory pandas DataFrame.

    :param pandas.DataFrame pandas_df: sample table as a DataFrame,
        one row per sample
    :return peppy.Project: self, to allow chained construction
    """
    self[SAMPLE_DF_KEY] = pandas_df
    # mark tables with more than 1000 rows as "large" -- presumably used
    # to decide whether progress bars are displayed; confirm against
    # `is_sample_table_large` consumers
    self[SAMPLE_DF_LARGE] = self[SAMPLE_DF_KEY].shape[0] > 1000

    # samples may only be modified when no sample table file backs them
    # (idiomatic `not X` replaces the original `False if X else True`)
    self.create_samples(modify=not self[SAMPLE_TABLE_FILE_KEY])
    self._sample_table = self._get_table_from_samples(
        index=self.st_index, initial=True
    )
    return self

def from_dict(self, pep_dictionary: dict) -> object:
"""
Init a peppy project instance from a dictionary representation
of an already processed PEP.
:param dict d: in-memory dict representation of processed pep.
:param dict pep_dictionary: in-memory dict representation of processed pep.
"""
_LOGGER.info(f"Processing project from dictionary...")
if CONFIG_KEY not in self:
Expand Down Expand Up @@ -280,6 +290,8 @@ def from_dict(self, pep_dictionary: dict) -> None:

_LOGGER.info(f"Project '{self.name}' has been initiated")

return self

def to_dict(self, expand: bool = False, extended: bool = False) -> dict:
"""
Convert the Project object to a dictionary.
Expand All @@ -296,11 +308,11 @@ def to_dict(self, expand: bool = False, extended: bool = False) -> dict:
p_dict["_samples"] = [s.to_dict() for s in self.samples]
return p_dict

def create_samples(self, modify=False):
def create_samples(self, modify: bool = False):
"""
Populate Project with Sample objects
"""
self._samples = self.load_samples()
self._samples: List[Sample] = self.load_samples()
if modify:
self.modify_samples()
else:
Expand Down Expand Up @@ -440,6 +452,14 @@ def load_samples(self):
if SAMPLE_DF_KEY not in self:
return []

if CONFIG_KEY not in self:
self[CONFIG_KEY] = {CONFIG_VERSION_KEY: PEP_LATEST_VERSION}
self[CONFIG_FILE_KEY] = None

elif len(self[CONFIG_KEY]) < 1:
self[CONFIG_KEY][CONFIG_VERSION_KEY] = PEP_LATEST_VERSION
self[CONFIG_FILE_KEY] = None

for _, r in self[SAMPLE_DF_KEY].iterrows():
samples_list.append(Sample(r.dropna(), prj=self))
return samples_list
Expand Down Expand Up @@ -594,56 +614,42 @@ def _auto_merge_duplicated_names(self):
specified in the config
"""
sample_names_list = [getattr(s, self.st_index) for s in self.samples]
dups_set = set(
[
x
for x in track(
sample_names_list,
description="Detecting duplicate sample names",
disable=not self.is_sample_table_large,
)
if sample_names_list.count(x) > 1
]
)
if not dups_set:
# all sample names are unique
duplicated_sample_ids = self._get_duplicated_sample_ids(sample_names_list)

if not duplicated_sample_ids:
return

_LOGGER.info(
f"Found {len(dups_set)} samples with non-unique names: {dups_set}. Attempting to auto-merge."
f"Found {len(duplicated_sample_ids)} samples with non-unique names: {duplicated_sample_ids}. Attempting to auto-merge."
)
if SUBSAMPLE_DF_KEY in self and self[SUBSAMPLE_DF_KEY] is not None:
raise IllegalStateException(
f"Duplicated sample names found and subsample_table is specified in the config; "
f"you may use either auto-merging or subsample_table-based merging. "
f"Duplicates: {dups_set}"
f"Duplicates: {duplicated_sample_ids}"
)
for duplication in dups_set:

for duplicated_id in duplicated_sample_ids:
(
duplicated_samples,
non_duplicated_samples,
) = self._get_duplicated_and_not_duplicated_samples(
duplication, self.st_index, self.samples
duplicated_id, self.st_index, self.samples
)
self._samples = non_duplicated_samples

sample_attrs = [
sample_attributes = [
attr
for attr in duplicated_samples[0].keys()
if not attr.startswith("_")
]

merged_attrs = {}
for attr in sample_attrs:
merged_attrs[attr] = list(
flatten([getattr(s, attr) for s in duplicated_samples])
)
merged_attrs = self._get_merged_attributes(
sample_attributes, duplicated_samples
)
self._samples = non_duplicated_samples

# make single element lists scalars
for attribute, values in merged_attrs.items():
if isinstance(
values, list
) and self._all_values_in_the_list_are_the_same(values):
merged_attrs[attribute] = values[0]
for attribute_name, values in merged_attrs.items():
if isinstance(values, list) and len(list(set(values))) == 1:
merged_attrs[attribute_name] = values[0]

self.add_samples(Sample(series=merged_attrs))

Expand Down Expand Up @@ -674,6 +680,36 @@ def _get_duplicated_and_not_duplicated_samples(
def _all_values_in_the_list_are_the_same(list_of_values: List) -> bool:
return all(value == list_of_values[0] for value in list_of_values)

@staticmethod
def _get_duplicated_sample_ids(sample_names_list: List) -> set:
    """
    Return the set of sample identifiers that occur more than once.

    :param List sample_names_list: sample identifiers, one per sample
    :return set: identifiers that appear at least twice in the input
    """
    from collections import Counter

    # Count every name once up front (O(n)) instead of calling
    # list.count() for each element, which made this pass quadratic.
    name_counts = Counter(sample_names_list)
    return {
        sample_id
        for sample_id in track(
            sample_names_list,
            description="Detecting duplicate sample names",
            # NOTE(review): this reads the attribute off the class, not an
            # instance; if `is_sample_table_large` is a property, the
            # descriptor object is always truthy, so `disable` is always
            # False regardless of table size -- confirm intended source.
            disable=not Project.is_sample_table_large,
        )
        if name_counts[sample_id] > 1
    }

@staticmethod
def _get_merged_attributes(
    sample_attributes: List[str], duplicated_samples: List[Sample]
) -> dict:
    """
    Collect, per attribute, the flattened values gathered from a group
    of samples that share one identifier.

    :param List[str] sample_attributes: attribute names to merge
    :param List[Sample] duplicated_samples: samples sharing one identifier
    :return dict: attribute name -> flattened list of the values taken
        from every sample; a sample missing the attribute contributes ""
    """
    return {
        attribute: list(
            flatten(
                [getattr(sample, attribute, "") for sample in duplicated_samples]
            )
        )
        for attribute in sample_attributes
    }

def attr_merge(self):
"""
Merge sample subannotations (from subsample table) with
Expand Down Expand Up @@ -1222,6 +1258,27 @@ def _read_sample_data(self):
:param List[str] sample_table: a list of paths to sample tables
"""

def _read_tab(pth):
    """
    Internal read table function
    :param str pth: absolute path to the file to read
    :return pandas.DataFrame: table object
    """
    # Read every column as str and treat ONLY empty fields as NA:
    # keep_default_na=False disables pandas' built-in NA token list
    # (e.g. "NA", "NaN"), then na_values=[""] re-enables blanks alone.
    csv_kwargs = {
        "dtype": str,
        "index_col": False,
        "keep_default_na": False,
        "na_values": [""],
    }
    try:
        # delimiter is inferred per file by the project helper rather
        # than assumed to be a comma
        return pd.read_csv(pth, sep=infer_delimiter(pth), **csv_kwargs)
    except Exception as e:
        # surface any parse/IO failure as the project's table exception,
        # preserving the original error text for the user
        raise SampleTableFileException(
            f"Could not read table: {pth}. "
            f"Caught exception: {getattr(e, 'message', repr(e))}"
        )

no_metadata_msg = "No {} specified"
if self[SAMPLE_TABLE_FILE_KEY] is not None:
st = self[SAMPLE_TABLE_FILE_KEY]
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def get_static(name, condition=None):
"Programming Language :: Python :: 3.7",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Topic :: Scientific/Engineering :: Bio-Informatics",
],
keywords="project, metadata, bioinformatics, sequencing, ngs, workflow",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
sample,run_accession,instrument_platform,fastq_1,fastq_2,fasta
2611,ERR5766174,ILLUMINA,,,/<path>/<to>/fasta/ERX5474930_ERR5766174_1.fa.gz
2612,ERR5766176,ILLUMINA,/<path>/<to>/fastq/ERX5474932_ERR5766176_1.fastq.gz,/<path>/<to>/fastq/ERX5474932_ERR5766176_2.fastq.gz,
2612,ERR5766180,ILLUMINA,/<path>/<to>/fastq/ERX5474936_ERR5766180_1.fastq.gz,,
2613,ERR5766181,ILLUMINA,/<path>/<to>/fastq/ERX5474937_ERR5766181_1.fastq.gz,/<path>/<to>/fastq/ERX5474937_ERR5766181_2.fastq.gz,
ERR3201952,ERR3201952,OXFORD_NANOPORE,/<path>/<to>/fastq/ERR3201952.fastq.gz,,
5 changes: 5 additions & 0 deletions tests/smoketests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,3 +61,8 @@ def example_peps_cfg_paths(request):
)
for p in request.param
]


@pytest.fixture
def nextflow_sample_table_path():
    # Path to a nextflow-style samplesheet whose rows include a duplicated
    # sample id ("2612") with mixed read types (one paired-end row, one
    # single-end row), used to exercise auto-merging.
    return "tests/data/example_peps-master/example_nextflow_samplesheet/samplesheet.csv"
14 changes: 10 additions & 4 deletions tests/smoketests/test_Project.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def _cmp_all_samples_attr(p1, p2, attr):
]


class ProjectConstructorTests:
class TestProjectConstructor:
def test_empty(self):
"""Verify that an empty Project instance can be created"""
p = Project()
Expand Down Expand Up @@ -355,8 +355,14 @@ def test_sample_table_version(self, example_pep_csv_path):
p = Project(cfg=example_pep_csv_path)
assert isinstance(p.pep_version, str)

def test_auto_merge_duplicated_names_works_for_different_read_types(
    self, nextflow_sample_table_path
):
    # The samplesheet holds 5 sample rows, two of which share the id
    # "2612" (one paired-end, one single-end); auto-merge must collapse
    # them into a single sample, leaving 4 unique samples.
    p = Project(nextflow_sample_table_path, sample_table_index="sample")
    assert len(p.samples) == 4

class ProjectManipulationTests:

class TestProjectManipulationTests:
@pytest.mark.parametrize("example_pep_cfg_path", ["amendments1"], indirect=True)
def test_amendments_activation_interactive(self, example_pep_cfg_path):
"""
Expand Down Expand Up @@ -456,7 +462,7 @@ def test_get_sample_nonexistent(self, example_pep_cfg_path):
p.get_sample(sample_name="kdkdkdk")


class SampleModifiersTests:
class TestSampleModifiers:
@pytest.mark.parametrize("example_pep_cfg_path", ["append"], indirect=True)
def test_append(self, example_pep_cfg_path):
"""Verify that the appended attribute is added to the samples"""
Expand Down Expand Up @@ -522,7 +528,7 @@ def test_subtable(self, example_pep_cfg_path):
)


class PostInitSampleCreationTests:
class TestPostInitSampleCreation:
@pytest.mark.parametrize("example_pep_cfg_path", ["append"], indirect=True)
def test_append(self, example_pep_cfg_path):
"""
Expand Down

0 comments on commit 30e7807

Please sign in to comment.