Skip to content

Commit

Permalink
pcp_df handling and better error message
Browse files (browse the repository at this point in the history)
  • Loading branch information
willdumm committed Mar 1, 2025
1 parent d44cc3d commit 0c1cef6
Show file tree
Hide file tree
Showing 2 changed files with 30 additions and 5 deletions.
12 changes: 9 additions & 3 deletions netam/dxsm.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,9 +145,15 @@ def of_seriess(
nt_parents[i], nt_children[i], aa_length=max_aa_seq_len
)
aa_seq_len = len(aa_parent)
assert_pcp_valid(
nt_parents[i], nt_children[i], aa_mask=masks[i][:aa_seq_len]
)
try:
assert_pcp_valid(
nt_parents[i], nt_children[i], aa_mask=masks[i][:aa_seq_len]
)
except ValueError as e:
raise ValueError(
"Parent and child nucleotide sequences are identical after masking codons containing N's. "
"You may want to filter your data by this condition using `netam.sequences.assert_pcp_valid`."
) from e

aa_parents_idxss[i, :aa_seq_len] = aa_idx_tensor_of_str_ambig(aa_parent)
aa_children_idxss[i, :aa_seq_len] = aa_idx_tensor_of_str_ambig(aa_child)
Expand Down
23 changes: 21 additions & 2 deletions netam/framework.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,6 +398,21 @@ def standardize_heavy_light_columns(pcp_df):
assert col + "_l" in cols, f"{col}_l column missing from pcp file!"
pcp_df["v_family_h"] = pcp_df["v_gene_h"].str.split("-").str[0]
pcp_df["v_family_l"] = pcp_df["v_gene_l"].str.split("-").str[0]
# Check that V gene families are in the correct columns:
if not pcp_df["v_family_l"].str[:3].isin(light_names).all():
_non_light_names = pcp_df[~pcp_df["v_family_l"].str[:3].isin(light_names)][
"v_family_l"
].unique()
raise ValueError(
f"Unexpected light chain V gene families: {_non_light_names}"
)
if not (pcp_df["v_family_h"].str[:3] == "IGH").all():
_non_heavy_names = pcp_df[pcp_df["v_family_h"].str[:3] != "IGH"][
"v_family_h"
].unique()
raise ValueError(
f"Unexpected heavy chain V gene families: {_non_heavy_names}"
)
elif "parent" in cols:
for col in differentiated_columns:
assert col in cols, f"{col} column missing from pcp file!"
Expand All @@ -413,8 +428,12 @@ def standardize_heavy_light_columns(pcp_df):
for col in differentiated_columns + ["v_gene"]:
pcp_df[col + "_h"] = pcp_df[col]
pcp_df[col + "_l"] = pcp_df[col]
pcp_df.loc[is_heavy_chain, col + "_l"] = ""
pcp_df.loc[~is_heavy_chain, col + "_h"] = ""
if pd.api.types.is_string_dtype(pcp_df[col]):
fill_value = ""
else:
fill_value = pd.NA
pcp_df.loc[is_heavy_chain, col + "_l"] = fill_value
pcp_df.loc[~is_heavy_chain, col + "_h"] = fill_value

if (pcp_df["parent_h"].str.len() + pcp_df["parent_l"].str.len()).min() < 3:
raise ValueError("At least one PCP has fewer than three nucleotides.")
Expand Down

0 comments on commit 0c1cef6

Please sign in to comment.