Improve pcp_df handling and raise a clearer error message for invalid PCPs

matsengrp · Mar 1, 2025 · 0c1cef6
1 parent d44cc3d
commit 0c1cef6
Show file tree

Hide file tree

Showing 2 changed files with 30 additions and 5 deletions.
diff --git a/netam/dxsm.py b/netam/dxsm.py
@@ -145,9 +145,15 @@ def of_seriess(
                 nt_parents[i], nt_children[i], aa_length=max_aa_seq_len
             )
             aa_seq_len = len(aa_parent)
-            assert_pcp_valid(
-                nt_parents[i], nt_children[i], aa_mask=masks[i][:aa_seq_len]
-            )
+            try:
+                assert_pcp_valid(
+                    nt_parents[i], nt_children[i], aa_mask=masks[i][:aa_seq_len]
+                )
+            except ValueError as e:
+                raise ValueError(
+                    "Parent and child nucleotide sequences are identical after masking codons containing N's. "
+                    "You may want to filter your data by this condition using `netam.sequences.assert_pcp_valid`."
+                ) from e
 
             aa_parents_idxss[i, :aa_seq_len] = aa_idx_tensor_of_str_ambig(aa_parent)
             aa_children_idxss[i, :aa_seq_len] = aa_idx_tensor_of_str_ambig(aa_child)

diff --git a/netam/framework.py b/netam/framework.py
@@ -398,6 +398,21 @@ def standardize_heavy_light_columns(pcp_df):
             assert col + "_l" in cols, f"{col}_l column missing from pcp file!"
         pcp_df["v_family_h"] = pcp_df["v_gene_h"].str.split("-").str[0]
         pcp_df["v_family_l"] = pcp_df["v_gene_l"].str.split("-").str[0]
+        # Check that V gene families are in the correct columns:
+        if not pcp_df["v_family_l"].str[:3].isin(light_names).all():
+            _non_light_names = pcp_df[~pcp_df["v_family_l"].str[:3].isin(light_names)][
+                "v_family_l"
+            ].unique()
+            raise ValueError(
+                f"Unexpected light chain V gene families: {_non_light_names}"
+            )
+        if not (pcp_df["v_family_h"].str[:3] == "IGH").all():
+            _non_heavy_names = pcp_df[pcp_df["v_family_h"].str[:3] != "IGH"][
+                "v_family_h"
+            ].unique()
+            raise ValueError(
+                f"Unexpected heavy chain V gene families: {_non_heavy_names}"
+            )
     elif "parent" in cols:
         for col in differentiated_columns:
             assert col in cols, f"{col} column missing from pcp file!"
@@ -413,8 +428,12 @@ def standardize_heavy_light_columns(pcp_df):
         for col in differentiated_columns + ["v_gene"]:
             pcp_df[col + "_h"] = pcp_df[col]
             pcp_df[col + "_l"] = pcp_df[col]
-            pcp_df.loc[is_heavy_chain, col + "_l"] = ""
-            pcp_df.loc[~is_heavy_chain, col + "_h"] = ""
+            if pd.api.types.is_string_dtype(pcp_df[col]):
+                fill_value = ""
+            else:
+                fill_value = pd.NA
+            pcp_df.loc[is_heavy_chain, col + "_l"] = fill_value
+            pcp_df.loc[~is_heavy_chain, col + "_h"] = fill_value
 
     if (pcp_df["parent_h"].str.len() + pcp_df["parent_l"].str.len()).min() < 3:
         raise ValueError("At least one PCP has fewer than three nucleotides.")