From 6bdc32ccad1c917afafda06f588f10646e86bfb2 Mon Sep 17 00:00:00 2001 From: Jochem Smit Date: Mon, 27 Sep 2021 14:58:37 +0200 Subject: [PATCH] match fd/nd control to data using start/end as index in favour of using pd.merge --- pyhdx/models.py | 44 +++++++++++++++++++++++--------------------- tests/test_models.py | 7 ++++--- 2 files changed, 27 insertions(+), 24 deletions(-) diff --git a/pyhdx/models.py b/pyhdx/models.py index 4a2d7802..d108033c 100644 --- a/pyhdx/models.py +++ b/pyhdx/models.py @@ -232,11 +232,13 @@ def __init__(self, data, drop_first=1, ignore_prolines=True, d_percentage=100., assert 0 <= d_percentage <= 100., 'Deuteration percentage must be between 0 and 100' d_percentage /= 100. - self.data = data.copy() + self.data = data.copy().reset_index(drop=True) + self.data.index.name = 'peptide_index' + if remove_nan: self.data = self.data.dropna(subset=['uptake']) if sort: - self.data = self.data.sort_values(['start', 'end', 'sequence', 'exposure', 'state']) + self.data = self.data.sort_values(['start', 'end', 'sequence', 'state', 'exposure']) for col in ['start', 'end', 'sequence']: target = '_' + col @@ -329,33 +331,33 @@ def set_control(self, control_1, control_0=None): """ - fd_df = self.get_data(*control_1)[['start', 'end', 'uptake']] - fd_df.rename(columns={'uptake': 'FD_uptake'}, inplace=True) + try: + fd_df = self.get_data(*control_1)[['start', 'end', 'uptake']].set_index(['start', 'end'], verify_integrity=True) + except ValueError as e: + raise ValueError("FD control has duplicate entries") from e if fd_df.size == 0: raise ValueError(f'No matching peptides with state {control_1[0]} and exposure {control_1[1]}') - if control_0 is None: - nd_df = self.get_data(*control_1).copy()[['start', 'end', 'uptake']] - nd_df['uptake'] = 0 - if nd_df.size == 0: - raise ValueError(f'No matching peptides with state {control_0[0]} and exposure {control_0[1]}') - else: - nd_df = self.get_data(*control_0)[['start', 'end', 'uptake']] - - + try: + if control_0 is None: + nd_df = self.get_data(*control_1).copy()[['start', 'end', 'uptake']].set_index(['start', 'end'], verify_integrity=True) + nd_df['uptake'] = 0 - nd_df.rename(columns={'uptake': 'ND_uptake'}, inplace=True) + else: + nd_df = self.get_data(*control_0)[['start', 'end', 'uptake']].set_index(['start', 'end'], verify_integrity=True) + if nd_df.size == 0: + raise ValueError(f'No matching peptides with state {control_0[0]} and exposure {control_0[1]}') + except ValueError as e: + raise ValueError("ND control has duplicate entries") from e - # this should probably go to the log (but atm there isnt any for running without GUI) - # assert control_1.size > 0, f"No peptides found with state '{control_1[0]}' and exposure '{control_1[1]}'" - # assert control_0.size > 0, f"No peptides found with state '{control_0[0]}' and exposure '{control_0[1]}'" + self.data.set_index(['start', 'end'], append=True, inplace=True) + self.data.reset_index(level=0, inplace=True) - self.data = pd.merge(self.data, fd_df, on=['start', 'end'], how='left') - self.data = pd.merge(self.data, nd_df, on=['start', 'end'], how='left') + self.data['rfu'] = (self.data['uptake'] - nd_df['uptake']) / (fd_df['uptake'] - nd_df['uptake']) + self.data['uptake_corrected'] = self.data['rfu'] * self.data['ex_residues'] - self.data['rfu'] = (self.data['uptake'] - self.data['ND_uptake']) / (self.data['FD_uptake'] - self.data['ND_uptake']) - self.data['uptake_corrected'] = (self.data['uptake'] / self.data['FD_uptake'] * self.data['ex_residues']) + self.data = self.data.set_index('peptide_index', append=True).reset_index(level=[0, 1]) def select(self, **kwargs): """ diff --git a/tests/test_models.py b/tests/test_models.py index 6735f1d6..3dba3491 100644 --- a/tests/test_models.py +++ b/tests/test_models.py @@ -8,6 +8,7 @@ from operator import add from pathlib import Path import pandas as pd +from pandas.testing import assert_frame_equal import tempfile import pickle import pytest @@ -60,9 +61,9 @@ def test_tensors(self): def test_rfu(self): rfu_residues = self.hdxm.rfu_residues compare = csv_to_dataframe(output_dir / 'ecSecB_rfu_per_exposure.csv') - compare_array = compare.to_numpy() - - np.testing.assert_allclose(rfu_residues, compare_array) + compare.columns = compare.columns.astype(float) + compare.columns.name = 'exposure' + assert_frame_equal(rfu_residues, compare) def test_to_file(self): with tempfile.TemporaryDirectory() as tempdir: