From de0a13f5e7fb1dec6f1244e0d06921fff20ce4fb Mon Sep 17 00:00:00 2001
From: armaan-abraham
Date: Tue, 5 Mar 2024 19:49:02 -0500
Subject: [PATCH] improve sequences validation (#535)

---
 ddmc/clustering.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/ddmc/clustering.py b/ddmc/clustering.py
index 98eb87da..caaaeec5 100644
--- a/ddmc/clustering.py
+++ b/ddmc/clustering.py
@@ -96,12 +96,18 @@ def fit(self, p_signal: pd.DataFrame):
             p_signal, pd.DataFrame
         ), "`p_signal` must be a pandas dataframe."
         sequences = p_signal.index.values
-        assert (
-            isinstance(sequences[0], str) and len(sequences[0]) == 11
-        ), "The index of p_signal must be the peptide sequences of length 11"
-        assert all(
-            [token.upper() in AAlist for token in sequences[0]]
-        ), "Sequence(s) contain invalid characters"
+
+        for i, seq in enumerate(sequences):
+            assert isinstance(
+                seq, str
+            ), f"Sequence {seq} at index {i} is not a string. All sequences must be strings."
+            assert (
+                len(seq) == 11
+            ), f"Sequence {seq} at index {i} is of length {len(seq)}. All sequences must be of length 11."
+            assert all(
+                [token.upper() in AAlist for token in seq]
+            ), f"Sequence {seq} at index {i} contains invalid characters."
+
         assert (
             p_signal.select_dtypes(include=[np.number]).shape[1] == p_signal.shape[1]
         ), "All values in `p_signal` should be numerical"