From 6bdc32ccad1c917afafda06f588f10646e86bfb2 Mon Sep 17 00:00:00 2001
From: Jochem Smit <jhsmit@gmail.com>
Date: Mon, 27 Sep 2021 14:58:37 +0200
Subject: [PATCH] match fd/nd control to data using start/end as index

instead of using pd.merge
---
 pyhdx/models.py      | 44 +++++++++++++++++++++++---------------------
 tests/test_models.py |  7 ++++---
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/pyhdx/models.py b/pyhdx/models.py
index 4a2d7802..d108033c 100644
--- a/pyhdx/models.py
+++ b/pyhdx/models.py
@@ -232,11 +232,13 @@ def __init__(self, data, drop_first=1, ignore_prolines=True, d_percentage=100.,
         assert 0 <= d_percentage <= 100., 'Deuteration percentage must be between 0 and 100'
         d_percentage /= 100.
 
-        self.data = data.copy()
+        self.data = data.copy().reset_index(drop=True)
+        self.data.index.name = 'peptide_index'
+
         if remove_nan:
             self.data = self.data.dropna(subset=['uptake'])
         if sort:
-            self.data = self.data.sort_values(['start', 'end', 'sequence', 'exposure', 'state'])
+            self.data = self.data.sort_values(['start', 'end', 'sequence', 'state', 'exposure'])
 
         for col in ['start', 'end', 'sequence']:
             target = '_' + col
@@ -329,33 +331,33 @@ def set_control(self, control_1, control_0=None):
 
         """
 
-        fd_df = self.get_data(*control_1)[['start', 'end', 'uptake']]
-        fd_df.rename(columns={'uptake': 'FD_uptake'}, inplace=True)
+        try:
+            fd_df = self.get_data(*control_1)[['start', 'end', 'uptake']].set_index(['start', 'end'], verify_integrity=True)
+        except ValueError as e:
+            raise ValueError("FD control has duplicate entries") from e
 
         if fd_df.size == 0:
             raise ValueError(f'No matching peptides with state {control_1[0]} and exposure {control_1[1]}')
 
-        if control_0 is None:
-            nd_df = self.get_data(*control_1).copy()[['start', 'end', 'uptake']]
-            nd_df['uptake'] = 0
-            if nd_df.size == 0:
-                raise ValueError(f'No matching peptides with state {control_0[0]} and exposure {control_0[1]}')
-        else:
-            nd_df = self.get_data(*control_0)[['start', 'end', 'uptake']]
-
-
+        try:
+            if control_0 is None:
+                nd_df = self.get_data(*control_1).copy()[['start', 'end', 'uptake']].set_index(['start', 'end'], verify_integrity=True)
+                nd_df['uptake'] = 0
 
-        nd_df.rename(columns={'uptake': 'ND_uptake'}, inplace=True)
+            else:
+                nd_df = self.get_data(*control_0)[['start', 'end', 'uptake']].set_index(['start', 'end'], verify_integrity=True)
+                if nd_df.size == 0:
+                    raise ValueError(f'No matching peptides with state {control_0[0]} and exposure {control_0[1]}')
+        except ValueError as e:
+            raise ValueError("ND control has duplicate entries") from e
 
-        # this should probably go to the log (but atm there isnt any for running without GUI)
-        # assert control_1.size > 0, f"No peptides found with state '{control_1[0]}' and exposure '{control_1[1]}'"
-        # assert control_0.size > 0, f"No peptides found with state '{control_0[0]}' and exposure '{control_0[1]}'"
+        self.data.set_index(['start', 'end'], append=True, inplace=True)
+        self.data.reset_index(level=0, inplace=True)
 
-        self.data = pd.merge(self.data, fd_df, on=['start', 'end'], how='left')
-        self.data = pd.merge(self.data, nd_df, on=['start', 'end'], how='left')
+        self.data['rfu'] = (self.data['uptake'] - nd_df['uptake']) / (fd_df['uptake'] - nd_df['uptake'])
+        self.data['uptake_corrected'] = self.data['rfu'] * self.data['ex_residues']
 
-        self.data['rfu'] = (self.data['uptake'] - self.data['ND_uptake']) / (self.data['FD_uptake'] - self.data['ND_uptake'])
-        self.data['uptake_corrected'] = (self.data['uptake'] / self.data['FD_uptake'] * self.data['ex_residues'])
+        self.data = self.data.set_index('peptide_index', append=True).reset_index(level=[0, 1])
 
     def select(self, **kwargs):
         """
diff --git a/tests/test_models.py b/tests/test_models.py
index 6735f1d6..3dba3491 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -8,6 +8,7 @@
 from operator import add
 from pathlib import Path
 import pandas as pd
+from pandas.testing import assert_frame_equal
 import tempfile
 import pickle
 import pytest
@@ -60,9 +61,9 @@ def test_tensors(self):
     def test_rfu(self):
         rfu_residues = self.hdxm.rfu_residues
         compare = csv_to_dataframe(output_dir / 'ecSecB_rfu_per_exposure.csv')
-        compare_array = compare.to_numpy()
-
-        np.testing.assert_allclose(rfu_residues, compare_array)
+        compare.columns = compare.columns.astype(float)
+        compare.columns.name = 'exposure'
+        assert_frame_equal(rfu_residues, compare)
 
     def test_to_file(self):
         with tempfile.TemporaryDirectory() as tempdir: