From b581b73ebc458cd83179f8d85a331b7f358c9d08 Mon Sep 17 00:00:00 2001 From: Lisa M Tuttle Date: Thu, 13 Jun 2024 08:41:46 -0700 Subject: [PATCH 1/8] Update models.py Change HDXMeasurement to allow replicates in data --- pyhdx/models.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/pyhdx/models.py b/pyhdx/models.py index ce938257..0fef09c9 100644 --- a/pyhdx/models.py +++ b/pyhdx/models.py @@ -248,7 +248,8 @@ def __init__(self, data: pd.DataFrame, **metadata: Any): self.timepoints: np.ndarray = np.sort(np.unique(data["exposure"])) # todo sort happens twice now - data = data.sort_values(["start", "stop", "sequence", "exposure"]) + data = data.reset_index() + data = data.sort_values(["start", "stop", "sequence", "exposure","index"]) # Obtain the intersection of peptides per timepoint df_list = [(data[data["exposure"] == exposure]) for exposure in self.timepoints] @@ -274,14 +275,14 @@ def __init__(self, data: pd.DataFrame, **metadata: Any): self.data: pd.DataFrame = pd.concat( intersected_data, axis=0, ignore_index=True - ).sort_values(["start", "stop", "sequence", "exposure"]) + ).sort_values(["start", "stop", "sequence", "exposure","index"]) self.data["peptide_id"] = self.data.index % self.Np self.data.index.name = ( "peptide_index" # index is original index which continues along exposures ) self.data_wide = ( - self.data.pivot(index="peptide_id", columns=["exposure"]) - .reorder_levels([1, 0], axis=1) + self.data.pivot(index="peptide_id", columns=["exposure","index"]) + .reorder_levels([2, 1, 0], axis=1) .sort_index(axis=1, level=0, sort_remaining=False) ) From dd31995c1f948d1d405a3d12782ba8f85fe8e25e Mon Sep 17 00:00:00 2001 From: Lisa M Tuttle Date: Thu, 13 Jun 2024 08:43:25 -0700 Subject: [PATCH 2/8] convert HDExaminer to pyhdx converts HDExaminer outputs to a format usable by pyhdx to create an HDXMeasurement object --- pyhdx/convert_data.py | 135 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create 
mode 100644 pyhdx/convert_data.py diff --git a/pyhdx/convert_data.py b/pyhdx/convert_data.py new file mode 100644 index 00000000..0d045bb3 --- /dev/null +++ b/pyhdx/convert_data.py @@ -0,0 +1,135 @@ +''' convert_data.py 12june2024 LMT +Function to convert data tables exported from HDExaminer to the processed DynamX format pyHDX expects +this will leave any extra columns, but will chop out the MAX time points after processing + +Example use case: + +# 'all_results.csv' is a summary uptake table exported from HDExaminer +# requires a .fasta file for each Protein State + +import pandas as pd +import numpy as np +import os + +proj_dir = '' +hdexdf = pd.read_csv(os.path.join(proj_dir,'all_results.csv')) + +pepdata = pd.DataFrame() +pepdata = hdexa_to_pyhdx(hdexdf) + +hdxm = {} +for mutant in pepdata['state'].unique(): + fasta_sequence = SeqIO.parse(open(os.path.join(proj_dir,str(mutant)+'.fasta')),'fasta') + for fasta in fasta_sequence: + sequence = str(fasta.seq) + #sequence[mutant] = str(fasta.seq) + use_data = pepdata.copy()[(pepdata["state"]==mutant) & (pepdata["quality"]!="Low")] + hdxm[mutant] = HDXMeasurement(use_data, temperature=303.15, pH=8., sequence=sequence) + +''' + + +import pandas as pd +import numpy as np + + +def hdexa_to_pyhdx(data,d_percentage=0.85,protein='protein'): + drop_first=2 + + def _time_to_sec(tp,tpunit): + return tp * np.power(60.0,'smh'.find(tpunit[0])) + if '# Deut' in data.columns: + data = data.rename(columns={"# Deut":"#D"}) + data['#D'] = data['#D'].fillna(0.0) + data['#D'] = data['#D'].astype(float) + if 'Deut %' in data.columns: + data = data.rename(columns={"Deut %":"%D"}) + data['%D'] = data['%D'].fillna(0.0) + data['%D'] = data['%D'].astype(float) + if 'Deut Time' in data.columns: + data.loc[data['Deut Time'] == 'FD','Deut Time'] = '1e6s' + data['time unit'] = data['Deut Time'].str[-1] + data['Deut Time (sec)'] = data['Deut Time'].str[:-1].astype(float) + data['Deut Time (sec)'] = data.apply(lambda x: _time_to_sec(tp=x['Deut 
Time (sec)'],tpunit=x['time unit']),axis=1) + data.loc[data['Deut Time (sec)'] == 1e6,'Deut Time (sec)'] = 'MAX' + if 'Protein' not in data.columns: + data['Protein'] = protein + + + pyhdx_cols = ['start', 'end' ,'stop' ,'sequence', 'state', 'exposure' ,'uptake' ,'maxuptake', + 'fd_uptake' ,'fd_uptake_sd' ,'nd_uptake' ,'nd_uptake_sd' ,'rfu', 'protein', + 'modification', 'fragment', 'mhp' ,'center' ,'center_sd' ,'uptake_sd' ,'rt', + 'rt_sd' ,'rfu_sd' ,'_sequence' ,'_start' ,'_stop' ,'ex_residues', + 'uptake_corrected'] + data = data.rename(columns={ + "Protein State":"state", + "Protein":"protein", + "Start":"start", + "End":"end", + "Sequence":"_sequence", + "Peptide Mass":"mhp", + "RT (min)":"rt", + "Deut Time (sec)":"exposure", + "maxD":"maxuptake", + "Theor Uptake #D":"uptake_corrected", + "#D":"uptake", + "%D":"rfu", + "Conf Interval (#D)":"rfu_sd", + "#Rep":"rep", + "Confidence":"quality", + "Stddev":"center_sd", + #"p" + }) + + missing = list(set(pyhdx_cols)-set(data.columns)) + for mcol in missing: + data[mcol] = np.nan + if mcol == "rfu_sd": data[mcol] = 0.05 #set 5% error as dummy value + + data['rfu']=data['rfu']/100. 
+ data.loc[data['exposure']=="0",'rfu_sd']=0.0 + data['stop']=data['end']+1 + data['sequence']=data["_sequence"].copy() + data['sequence']=[s.replace("P", "p") for s in data["sequence"]] + # Find the total number of n terminal / c_terminal residues to remove from pyhdx/process.py + n_term = np.array([len(seq) - len(seq[drop_first:].lstrip("p")) for seq in data["sequence"]]) + c_term = np.array([len(seq) - len(seq.rstrip("p")) for seq in data["sequence"]]) + data["sequence"] = ["x" * nt + s[nt:] for nt, s in zip(n_term, data["sequence"])] + data["_start"] = data["start"] + n_term + data["_stop"] = data["stop"] - c_term + ex_residues = (np.array([len(s) - s.count("x") - s.count("p") for s in data["sequence"]])* d_percentage) + data["ex_residues"] = ex_residues + data["uptake_sd"]=data["center_sd"] + data["nd_uptake"]=0.0 + data["nd_uptake_sd"]=0.0 + data["modification"]=float("nan") + data["fragment"]=float("nan") + # upeps = data[data["exposure"]=="0"]["_sequence"].unique() + # fpeps = data[data["exposure"]=="MAX"]["_sequence"].unique() + # good_peps = np.array(list(set(upeps) & set(fpeps))) + #peps = data["_sequence"].unique() + states = data["state"].unique() + data["fd_uptake"]="novalue" + data["fd_uptake_sd"]="novalue" + + for state in states: + peps = data[data["state"]==state]["_sequence"].unique() + for pep in peps: + fd_up = data[(data["_sequence"]==pep) & (data["exposure"]=="MAX")& (data["state"]==state)]['uptake'].iat[0] + fd_up_sd = data[(data["_sequence"]==pep) & (data["exposure"]=="MAX")& (data["state"]==state)]['center_sd'].iat[0] + data.loc[data["_sequence"]==pep, "fd_uptake"]=fd_up + data.loc[data["_sequence"]==pep, "fd_uptake_sd"]=fd_up_sd + data["center"]=data["mhp"]+data["uptake"] + data["rt_sd"]=0.05 #dummy value + + data['uptake_corrected_orig'] = data['uptake_corrected'] #sometimes the HDExaminer output value is incorrect + data['uptake_corrected'] = data["rfu"]*data['maxuptake'] # so revert to conversion from the rfu + + + data = 
data[data["exposure"] != "MAX"] + data = data[data["fd_uptake"] != 0] + data = data[~data["uptake"].isna()] + data["exposure"]=data["exposure"].astype(float) + + new_columns = [col for col in pyhdx_cols if col in data.columns] + [col for col in data.columns if col not in pyhdx_cols] + return data[new_columns] From 96cbb67f1de0ecfc33ae52545e74ac6340f2c6fd Mon Sep 17 00:00:00 2001 From: Jochem Smit Date: Fri, 14 Jun 2024 11:12:48 +0200 Subject: [PATCH 3/8] style: ruff format --- pyhdx/convert_data.py | 203 +++++++++++++++++++++++++----------------- 1 file changed, 122 insertions(+), 81 deletions(-) diff --git a/pyhdx/convert_data.py b/pyhdx/convert_data.py index 0d045bb3..277e1ee4 100644 --- a/pyhdx/convert_data.py +++ b/pyhdx/convert_data.py @@ -1,4 +1,4 @@ -''' convert_data.py 12june2024 LMT +""" convert_data.py 12june2024 LMT Function to convert data tables exported from HDExaminer to the processed DynamX format pyHDX expects this will leave any extra columns, but will chop out the MAX time points after processing @@ -26,110 +26,151 @@ use_data = pepdata.copy()[(pepdata["state"]==mutant) & (pepdata["quality"]!="Low")] hdxm[mutant] = HDXMeasurement(use_data, temperature=303.15, pH=8., sequence=sequence) -''' +""" -import pandas as pd import numpy as np +import pandas as pd -def hdexa_to_pyhdx(data,d_percentage=0.85,protein='protein'): - drop_first=2 - - def _time_to_sec(tp,tpunit): - return tp * np.power(60.0,'smh'.find(tpunit[0])) - if '# Deut' in data.columns: - data = data.rename(columns={"# Deut":"#D"}) - data['#D'] = data['#D'].fillna(0.0) - data['#D'] = data['#D'].astype(float) - if 'Deut %' in data.columns: - data = data.rename(columns={"Deut %":"%D"}) - data['%D'] = data['%D'].fillna(0.0) - data['%D'] = data['%D'].astype(float) - if 'Deut Time' in data.columns: - data.loc[data['Deut Time'] == 'FD','Deut Time'] = '1e6s' - data['time unit'] = data['Deut Time'].str[-1] - data['Deut Time (sec)'] = data['Deut Time'].str[:-1].astype(float) - data['Deut Time 
(sec)'] = data.apply(lambda x: _time_to_sec(tp=x['Deut Time (sec)'],tpunit=x['time unit']),axis=1) - data.loc[data['Deut Time (sec)'] == 1e6,'Deut Time (sec)'] = 'MAX' - if 'Protein' not in data.columns: - data['Protein'] = protein - - - pyhdx_cols = ['start', 'end' ,'stop' ,'sequence', 'state', 'exposure' ,'uptake' ,'maxuptake', - 'fd_uptake' ,'fd_uptake_sd' ,'nd_uptake' ,'nd_uptake_sd' ,'rfu', 'protein', - 'modification', 'fragment', 'mhp' ,'center' ,'center_sd' ,'uptake_sd' ,'rt', - 'rt_sd' ,'rfu_sd' ,'_sequence' ,'_start' ,'_stop' ,'ex_residues', - 'uptake_corrected'] - data = data.rename(columns={ - "Protein State":"state", - "Protein":"protein", - "Start":"start", - "End":"end", - "Sequence":"_sequence", - "Peptide Mass":"mhp", - "RT (min)":"rt", - "Deut Time (sec)":"exposure", - "maxD":"maxuptake", - "Theor Uptake #D":"uptake_corrected", - "#D":"uptake", - "%D":"rfu", - "Conf Interval (#D)":"rfu_sd", - "#Rep":"rep", - "Confidence":"quality", - "Stddev":"center_sd", - #"p" - }) - - missing = list(set(pyhdx_cols)-set(data.columns)) +def hdexa_to_pyhdx(data, d_percentage=0.85, protein="protein"): + drop_first = 2 + + def _time_to_sec(tp, tpunit): + return tp * np.power(60.0, "smh".find(tpunit[0])) + + if "# Deut" in data.columns: + data = data.rename(columns={"# Deut": "#D"}) + data["#D"] = data["#D"].fillna(0.0) + data["#D"] = data["#D"].astype(float) + if "Deut %" in data.columns: + data = data.rename(columns={"Deut %": "%D"}) + data["%D"] = data["%D"].fillna(0.0) + data["%D"] = data["%D"].astype(float) + if "Deut Time" in data.columns: + data.loc[data["Deut Time"] == "FD", "Deut Time"] = "1e6s" + data["time unit"] = data["Deut Time"].str[-1] + data["Deut Time (sec)"] = data["Deut Time"].str[:-1].astype(float) + data["Deut Time (sec)"] = data.apply( + lambda x: _time_to_sec(tp=x["Deut Time (sec)"], tpunit=x["time unit"]), axis=1 + ) + data.loc[data["Deut Time (sec)"] == 1e6, "Deut Time (sec)"] = "MAX" + if "Protein" not in data.columns: + data["Protein"] = 
protein + + pyhdx_cols = [ + "start", + "end", + "stop", + "sequence", + "state", + "exposure", + "uptake", + "maxuptake", + "fd_uptake", + "fd_uptake_sd", + "nd_uptake", + "nd_uptake_sd", + "rfu", + "protein", + "modification", + "fragment", + "mhp", + "center", + "center_sd", + "uptake_sd", + "rt", + "rt_sd", + "rfu_sd", + "_sequence", + "_start", + "_stop", + "ex_residues", + "uptake_corrected", + ] + data = data.rename( + columns={ + "Protein State": "state", + "Protein": "protein", + "Start": "start", + "End": "end", + "Sequence": "_sequence", + "Peptide Mass": "mhp", + "RT (min)": "rt", + "Deut Time (sec)": "exposure", + "maxD": "maxuptake", + "Theor Uptake #D": "uptake_corrected", + "#D": "uptake", + "%D": "rfu", + "Conf Interval (#D)": "rfu_sd", + "#Rep": "rep", + "Confidence": "quality", + "Stddev": "center_sd", + # "p" + } + ) + + missing = list(set(pyhdx_cols) - set(data.columns)) for mcol in missing: data[mcol] = np.nan - if mcol == "rfu_sd": data[mcol] = 0.05 #set 5% error as dummy value - - data['rfu']=data['rfu']/100. 
- data.loc[data['exposure']=="0",'rfu_sd']=0.0 - data['stop']=data['end']+1 - data['sequence']=data["_sequence"].copy() - data['sequence']=[s.replace("P", "p") for s in data["sequence"]] + if mcol == "rfu_sd": + data[mcol] = 0.05 # set 5% error as dummy value + + data["rfu"] = data["rfu"] / 100.0 + data.loc[data["exposure"] == "0", "rfu_sd"] = 0.0 + data["stop"] = data["end"] + 1 + data["sequence"] = data["_sequence"].copy() + data["sequence"] = [s.replace("P", "p") for s in data["sequence"]] # Find the total number of n terminal / c_terminal residues to remove from pyhdx/process.py n_term = np.array([len(seq) - len(seq[drop_first:].lstrip("p")) for seq in data["sequence"]]) c_term = np.array([len(seq) - len(seq.rstrip("p")) for seq in data["sequence"]]) data["sequence"] = ["x" * nt + s[nt:] for nt, s in zip(n_term, data["sequence"])] data["_start"] = data["start"] + n_term data["_stop"] = data["stop"] - c_term - ex_residues = (np.array([len(s) - s.count("x") - s.count("p") for s in data["sequence"]])* d_percentage) + ex_residues = ( + np.array([len(s) - s.count("x") - s.count("p") for s in data["sequence"]]) * d_percentage + ) data["ex_residues"] = ex_residues - data["uptake_sd"]=data["center_sd"] - data["nd_uptake"]=0.0 - data["nd_uptake_sd"]=0.0 - data["modification"]=float("nan") - data["fragment"]=float("nan") + data["uptake_sd"] = data["center_sd"] + data["nd_uptake"] = 0.0 + data["nd_uptake_sd"] = 0.0 + data["modification"] = float("nan") + data["fragment"] = float("nan") # upeps = data[data["exposure"]=="0"]["_sequence"].unique() # fpeps = data[data["exposure"]=="MAX"]["_sequence"].unique() # good_peps = np.array(list(set(upeps) & set(fpeps))) - #peps = data["_sequence"].unique() + # peps = data["_sequence"].unique() states = data["state"].unique() - data["fd_uptake"]="novalue" - data["fd_uptake_sd"]="novalue" + data["fd_uptake"] = "novalue" + data["fd_uptake_sd"] = "novalue" for state in states: - peps = data[data["state"]==state]["_sequence"].unique() + 
peps = data[data["state"] == state]["_sequence"].unique() for pep in peps: - fd_up = data[(data["_sequence"]==pep) & (data["exposure"]=="MAX")& (data["state"]==state)]['uptake'].iat[0] - fd_up_sd = data[(data["_sequence"]==pep) & (data["exposure"]=="MAX")& (data["state"]==state)]['center_sd'].iat[0] - data.loc[data["_sequence"]==pep, "fd_uptake"]=fd_up - data.loc[data["_sequence"]==pep, "fd_uptake_sd"]=fd_up_sd - data["center"]=data["mhp"]+data["uptake"] - data["rt_sd"]=0.05 #dummy value - - data['uptake_corrected_orig'] = data['uptake_corrected'] #sometimes the HDExaminer output value is incorrect - data['uptake_corrected'] = data["rfu"]*data['maxuptake'] # so revert to conversion from the rfu + fd_up = data[ + (data["_sequence"] == pep) & (data["exposure"] == "MAX") & (data["state"] == state) + ]["uptake"].iat[0] + fd_up_sd = data[ + (data["_sequence"] == pep) & (data["exposure"] == "MAX") & (data["state"] == state) + ]["center_sd"].iat[0] + data.loc[data["_sequence"] == pep, "fd_uptake"] = fd_up + data.loc[data["_sequence"] == pep, "fd_uptake_sd"] = fd_up_sd + data["center"] = data["mhp"] + data["uptake"] + data["rt_sd"] = 0.05 # dummy value + + data["uptake_corrected_orig"] = data[ + "uptake_corrected" + ] # sometimes the HDExaminer output value is incorrect + data["uptake_corrected"] = ( + data["rfu"] * data["maxuptake"] + ) # so revert to conversion from the rfu - data = data[data["exposure"] != "MAX"] data = data[data["fd_uptake"] != 0] data = data[~data["uptake"].isna()] - data["exposure"]=data["exposure"].astype(float) + data["exposure"] = data["exposure"].astype(float) - new_columns = [col for col in pyhdx_cols if col in data.columns] + [col for col in data.columns if col not in pyhdx_cols] + new_columns = [col for col in pyhdx_cols if col in data.columns] + [ + col for col in data.columns if col not in pyhdx_cols + ] return data[new_columns] From 9b037205d81e27ffea00b3ea7e3039d02b45a13d Mon Sep 17 00:00:00 2001 From: Lisa M Tuttle Date: Mon, 1 Jul 2024 
11:19:27 -0700 Subject: [PATCH 4/8] Create hdexaminer_to_pyhdx.py function to convert HDExaminer outputs into a format usable by pyHDX. Additional changes to models.py are required to allow for the HDExaminer outputs to have multiple replicates --- pyhdx/hdexaminer_to_pyhdx.py | 107 +++++++++++++++++++++++++++++++++++ 1 file changed, 107 insertions(+) create mode 100644 pyhdx/hdexaminer_to_pyhdx.py diff --git a/pyhdx/hdexaminer_to_pyhdx.py b/pyhdx/hdexaminer_to_pyhdx.py new file mode 100644 index 00000000..3ca9f9ce --- /dev/null +++ b/pyhdx/hdexaminer_to_pyhdx.py @@ -0,0 +1,107 @@ +import os +import numpy as np +import pandas as pd + +### Function to convert data table exported from HDExaminer to the processed DynamX format pyHDX expects +### this will leave any extra columns, but will chop out the MAX time points after processing + +def hdexa_to_pyhdx(data,d_percentage=0.85,protein='protein'): + drop_first=2 + + def _time_to_sec(tp,tpunit): + return tp * np.power(60.0,'smh'.find(tpunit[0])) + if '# Deut' in data.columns: + data = data.rename(columns={"# Deut":"#D"}) + data['#D'] = data['#D'].fillna(0.0) + data['#D'] = data['#D'].astype(float) + if 'Deut %' in data.columns: + data = data.rename(columns={"Deut %":"%D"}) + data['%D'] = data['%D'].fillna(0.0) + data['%D'] = data['%D'].astype(float) + if 'Deut Time' in data.columns: + data.loc[data['Deut Time'] == 'FD','Deut Time'] = '1e6s' + data['time unit'] = data['Deut Time'].str[-1] + data['Deut Time (sec)'] = data['Deut Time'].str[:-1].astype(float) + data['Deut Time (sec)'] = data.apply(lambda x: _time_to_sec(tp=x['Deut Time (sec)'],tpunit=x['time unit']),axis=1) + data.loc[data['Deut Time (sec)'] == 1e6,'Deut Time (sec)'] = 'MAX' + if 'Protein' not in data.columns: + data['Protein'] = protein + + + pyhdx_cols = ['start', 'end' ,'stop' ,'sequence', 'state', 'exposure' ,'uptake' ,'maxuptake', + 'fd_uptake' ,'fd_uptake_sd' ,'nd_uptake' ,'nd_uptake_sd' ,'rfu', 'protein', + 'modification', 'fragment', 'mhp' 
,'center' ,'center_sd' ,'uptake_sd' ,'rt', + 'rt_sd' ,'rfu_sd' ,'_sequence' ,'_start' ,'_stop' ,'ex_residues', + 'uptake_corrected'] + data = data.rename(columns={ + "Protein State":"state", + "Protein":"protein", + "Start":"start", + "End":"end", + "Sequence":"_sequence", + "Peptide Mass":"mhp", + "RT (min)":"rt", + "Deut Time (sec)":"exposure", + "maxD":"maxuptake", + "Theor Uptake #D":"uptake_corrected", + "#D":"uptake", + "%D":"rfu", + "Conf Interval (#D)":"rfu_sd", + "#Rep":"rep", + "Confidence":"quality", + "Stddev":"center_sd", + #"p" + }) + + missing = list(set(pyhdx_cols)-set(data.columns)) + for mcol in missing: + data[mcol] = np.nan + if mcol == "rfu_sd": data[mcol] = 0.05 #set 5% error as dummy value + + data['rfu']=data['rfu']/100. + data.loc[data['exposure']=="0",'rfu_sd']=0.0 + data['stop']=data['end']+1 + data['sequence']=data["_sequence"].copy() + data['sequence']=[s.replace("P", "p") for s in data["sequence"]] + # Find the total number of n terminal / c_terminal residues to remove from pyhdx/process.py + n_term = np.array([len(seq) - len(seq[drop_first:].lstrip("p")) for seq in data["sequence"]]) + c_term = np.array([len(seq) - len(seq.rstrip("p")) for seq in data["sequence"]]) + data["sequence"] = ["x" * nt + s[nt:] for nt, s in zip(n_term, data["sequence"])] + data["_start"] = data["start"] + n_term + data["_stop"] = data["stop"] - c_term + ex_residues = (np.array([len(s) - s.count("x") - s.count("p") for s in data["sequence"]])* d_percentage) + data["ex_residues"] = ex_residues + data["uptake_sd"]=data["center_sd"] + data["nd_uptake"]=0.0 + data["nd_uptake_sd"]=0.0 + data["modification"]=float("nan") + data["fragment"]=float("nan") + # upeps = data[data["exposure"]=="0"]["_sequence"].unique() + # fpeps = data[data["exposure"]=="MAX"]["_sequence"].unique() + # good_peps = np.array(list(set(upeps) & set(fpeps))) + #peps = data["_sequence"].unique() + states = data["state"].unique() + data["fd_uptake"]="novalue" + data["fd_uptake_sd"]="novalue" + 
+ for state in states: + peps = data[data["state"]==state]["_sequence"].unique() + for pep in peps: + fd_up = data[(data["_sequence"]==pep) & (data["exposure"]=="MAX")& (data["state"]==state)]['uptake'].iat[0] + fd_up_sd = data[(data["_sequence"]==pep) & (data["exposure"]=="MAX")& (data["state"]==state)]['center_sd'].iat[0] + data.loc[data["_sequence"]==pep, "fd_uptake"]=fd_up + data.loc[data["_sequence"]==pep, "fd_uptake_sd"]=fd_up_sd + data["center"]=data["mhp"]+data["uptake"] + data["rt_sd"]=0.05 #dummy value + + data['uptake_corrected_orig'] = data['uptake_corrected'] + data['uptake_corrected'] = data["rfu"]*data['maxuptake'] + + + data = data[data["exposure"] != "MAX"] + data = data[data["fd_uptake"] != 0] + data = data[~data["uptake"].isna()] + data["exposure"]=data["exposure"].astype(float) + + new_columns = [col for col in pyhdx_cols if col in data.columns] + [col for col in data.columns if col not in pyhdx_cols] + return data[new_columns] From b45f91cfd088650f89d74c81a7827dd6b679fbbf Mon Sep 17 00:00:00 2001 From: Lisa M Tuttle Date: Mon, 1 Jul 2024 11:27:01 -0700 Subject: [PATCH 5/8] Create all_results.csv --- HDExaminer_examples/all_results.csv | 31 +++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 HDExaminer_examples/all_results.csv diff --git a/HDExaminer_examples/all_results.csv b/HDExaminer_examples/all_results.csv new file mode 100644 index 00000000..58a0c8f8 --- /dev/null +++ b/HDExaminer_examples/all_results.csv @@ -0,0 +1,31 @@ +Protein State,Deut Time,Experiment,Start,End,Sequence,Charge,Search RT,Actual RT,# Spectra,Peak Width,m/z Shift,Max Inty,Exp Cent,Theor Cent,Score,Cent Diff,# Deut,Deut %,Confidence +Protein State 1,0s,121520_WTUN1,80,86,DVKHFSP,2,6.3,6.23-6.40,10,0.036,0.003,3.46E+05,415.496,415.464,0.8447,n/a,n/a,n/a,Medium +Protein State 1,0s,121520_WTUN2,80,86,DVKHFSP,2,6.3,6.21-6.38,10,0.036,0.003,3.12E+05,415.502,415.464,0.8624,n/a,n/a,n/a,High +Protein State 
1,0s,121520_WT_UN3,80,86,DVKHFSP,2,6.3,6.23-6.36,8,0.037,0.003,2.73E+05,415.507,415.464,0.8529,n/a,n/a,n/a,High +Protein State 1,FD,121520_WT_TD1,80,86,DVKHFSP,2,6.3,6.36-6.47,7,0.032,0.001,1.34E+05,416.742,416.741,0.8449,1.277,2.936,73.392,Medium +Protein State 1,FD,121520_WT_TD2,80,86,DVKHFSP,2,6.3,6.35-6.48,9,0.035,0.002,1.19E+05,416.736,416.727,0.8778,1.263,2.904,72.601,High +Protein State 1,FD,121520_WT_TD3,80,86,DVKHFSP,2,6.3,6.35-6.43,6,0.036,0,9.85E+04,416.731,416.721,0.8967,1.257,2.889,72.226,High +Protein State 1,0.15s,121520_WT_150_1,80,86,DVKHFSP,2,6.3,6.21-6.40,12,0.039,0.001,2.68E+04,415.884,415.861,0.8967,0.397,0.913,32.345,High +Protein State 1,0.15s,121520_WT_150_2,80,86,DVKHFSP,2,6.3,6.36-6.47,7,0.038,0.001,1.25E+05,415.77,415.749,0.9019,0.285,0.654,23.187,High +Protein State 1,0.15s,121520_WT_150_3,80,86,DVKHFSP,2,6.3,6.36-6.48,8,0.041,0.002,1.65E+05,415.747,415.743,0.9268,0.279,0.641,22.715,High +Protein State 1,0.15s,121520_WT_150_4,80,86,DVKHFSP,2,6.3,6.23-6.35,7,0.038,-0.001,1.11E+05,415.761,415.758,0.8966,0.294,0.677,23.971,High +Protein State 1,0.75s,121520_WT_750_1,80,86,DVKHFSP,2,6.3,6.28-6.41,9,0.041,0.001,9.84E+04,415.83,415.833,0.9312,0.369,0.847,30.025,High +Protein State 1,0.75s,121520_WT_750_2,80,86,DVKHFSP,2,6.3,6.23-6.35,7,0.038,0.001,1.04E+05,415.795,415.785,0.9066,0.321,0.738,26.139,High +Protein State 1,0.75s,121520_WT_750_3,80,86,DVKHFSP,2,6.3,6.23-6.36,8,0.033,0.002,1.63E+05,415.815,415.788,0.8795,0.324,0.746,26.417,High +Protein State 1,0.75s,121520_WT_750_4,80,86,DVKHFSP,2,6.3,6.36-6.48,8,0.036,0,1.69E+05,415.802,415.795,0.9101,0.331,0.761,26.951,High +Protein State 1,4.00s,121520_WT_3S_1,80,86,DVKHFSP,2,6.3,6.36-6.48,8,0.031,0.002,2.61E+05,415.752,415.704,0.8565,0.241,0.553,19.592,High +Protein State 1,4.00s,121520_WT_3s_2,80,86,DVKHFSP,2,6.3,6.35-6.47,8,0.032,0.002,2.46E+05,415.733,415.686,0.8528,0.223,0.512,18.125,High +Protein State 
1,4.00s,121520_WT_3S_3,80,86,DVKHFSP,2,6.3,6.35-6.48,9,0.032,0.002,2.73E+05,415.725,415.712,0.8639,0.249,0.571,20.246,High +Protein State 1,4.00s,121520_WT_3s4,80,86,DVKHFSP,2,6.3,6.23-6.35,7,0.037,-0.001,1.58E+05,415.765,415.757,0.9074,0.293,0.673,23.839,High +Protein State 1,60.00s,121520_WT_1m_1,80,86,DVKHFSP,2,6.3,6.23-6.38,9,0.037,0.001,1.95E+05,415.832,415.825,0.9016,0.361,0.83,29.418,High +Protein State 1,60.00s,121520_WT_1m_2,80,86,DVKHFSP,2,6.3,6.23-6.35,7,0.039,0.002,1.66E+05,415.829,415.812,0.9219,0.348,0.8,28.335,High +Protein State 1,60.00s,121520_WT_1m_3,80,86,DVKHFSP,2,6.3,6.23-6.35,7,0.043,0.001,1.41E+05,415.826,415.827,0.9126,0.363,0.835,29.599,High +Protein State 1,60.00s,121520_WT_1m4,80,86,DVKHFSP,2,6.3,6.35-6.47,8,0.034,0,2.21E+05,415.829,415.82,0.8869,0.357,0.82,29.036,High +Protein State 1,1800.00s,121520_WT_30m_1,80,86,DVKHFSP,2,6.3,6.35-6.48,9,0.037,0.001,1.96E+05,416.072,416.063,0.9133,0.599,1.377,48.802,High +Protein State 1,1800.00s,121520_WT_30m_2,80,86,DVKHFSP,2,6.3,6.35-6.48,9,0.037,0.001,1.93E+05,416.077,416.067,0.9018,0.603,1.386,49.112,High +Protein State 1,1800.00s,121520_WT_30m_3,80,86,DVKHFSP,2,6.3,6.35-6.48,9,0.035,0,1.81E+05,416.067,416.069,0.8747,0.605,1.391,49.28,High +Protein State 1,1800.00s,121520_WT_30m4,80,86,DVKHFSP,2,6.3,6.21-6.35,8,0.039,-0.001,1.38E+05,416.077,416.066,0.9106,0.602,1.385,49.064,High +Protein State 1,72000.00s,121520_WT_20h_1,80,86,DVKHFSP,2,6.3,6.21-6.35,8,0.038,0.001,1.83E+05,416.275,416.276,0.888,0.812,1.866,66.101,High +Protein State 1,72000.00s,121520_WT_20h_2,80,86,DVKHFSP,2,6.3,6.21-6.36,9,0.038,0.001,1.71E+05,416.289,416.285,0.9048,0.821,1.888,66.871,High +Protein State 1,72000.00s,121520_WT_20h_3,80,86,DVKHFSP,2,6.3,6.23-6.36,8,0.038,0,1.63E+05,416.286,416.284,0.9012,0.82,1.885,66.772,High +Protein State 1,72000.00s,121520_WT_20h4,80,86,DVKHFSP,2,6.3,6.24-6.55,18,0.037,-0.001,2.08E+05,416.277,416.264,0.9055,0.8,1.84,65.173,High From 00f3e9e74f7f581311423c90f5a6f5309046faee Mon Sep 17 00:00:00 
2001 From: Lisa M Tuttle Date: Mon, 1 Jul 2024 11:28:54 -0700 Subject: [PATCH 6/8] Create uptake_summary.csv --- HDExaminer_examples/uptake_summary.csv | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 HDExaminer_examples/uptake_summary.csv diff --git a/HDExaminer_examples/uptake_summary.csv b/HDExaminer_examples/uptake_summary.csv new file mode 100644 index 00000000..7c625df4 --- /dev/null +++ b/HDExaminer_examples/uptake_summary.csv @@ -0,0 +1,7 @@ +Protein State,Protein,Start,End,Sequence,Peptide Mass,RT (min),Deut Time (sec),maxD,Theor Uptake #D,#D,%D,Conf Interval (#D),#Rep,Confidence,Stddev,p +"WT"," B5",1,4,MDIA,448.1992,5.5936,0,2,0,0,0,n/a,2,High,0 +"WT"," B5",1,4,MDIA,448.1992,5.5809,4,2,0.001,0.768,47.600,0.044,4,Medium,0.028, +"WT"," B5",1,4,MDIA,448.1992,5.6104,60,2,0.021,1.151,71.352,0.026,4,Medium,0.016, +"WT"," B5",1,4,MDIA,448.1992,5.6317,1800,2,0.546,1.546,95.850,0.031,4,Medium,0.020, +"WT"," B5",1,4,MDIA,448.1992,5.5894,72000,2,2.000,1.616,100.227,0.009,4,Medium,0.006, +"WT"," B5",1,4,MDIA,448.1992,5.6019,MAX,2,2,1.613,80.628,n/a,2,Medium,0.077 From ba48fa16b9f44b6e8a8b845bcefcf01f28497121 Mon Sep 17 00:00:00 2001 From: Lisa M Tuttle Date: Mon, 1 Jul 2024 11:40:33 -0700 Subject: [PATCH 7/8] Update uptake_summary.csv --- HDExaminer_examples/uptake_summary.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/HDExaminer_examples/uptake_summary.csv b/HDExaminer_examples/uptake_summary.csv index 7c625df4..4ed9003e 100644 --- a/HDExaminer_examples/uptake_summary.csv +++ b/HDExaminer_examples/uptake_summary.csv @@ -1,7 +1,7 @@ Protein State,Protein,Start,End,Sequence,Peptide Mass,RT (min),Deut Time (sec),maxD,Theor Uptake #D,#D,%D,Conf Interval (#D),#Rep,Confidence,Stddev,p -"WT"," B5",1,4,MDIA,448.1992,5.5936,0,2,0,0,0,n/a,2,High,0 +"WT"," B5",1,4,MDIA,448.1992,5.5936,0,2,0,0,0,n/a,2,High,0, "WT"," B5",1,4,MDIA,448.1992,5.5809,4,2,0.001,0.768,47.600,0.044,4,Medium,0.028, "WT"," 
B5",1,4,MDIA,448.1992,5.6104,60,2,0.021,1.151,71.352,0.026,4,Medium,0.016, "WT"," B5",1,4,MDIA,448.1992,5.6317,1800,2,0.546,1.546,95.850,0.031,4,Medium,0.020, "WT"," B5",1,4,MDIA,448.1992,5.5894,72000,2,2.000,1.616,100.227,0.009,4,Medium,0.006, -"WT"," B5",1,4,MDIA,448.1992,5.6019,MAX,2,2,1.613,80.628,n/a,2,Medium,0.077 +"WT"," B5",1,4,MDIA,448.1992,5.6019,MAX,2,2,1.613,80.628,n/a,2,Medium,0.077, From b78d3656147ee6a13fa6b88960704a53c48d87d3 Mon Sep 17 00:00:00 2001 From: Lisa M Tuttle Date: Wed, 18 Sep 2024 12:55:05 -0700 Subject: [PATCH 8/8] Update README.md --- README.md | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/README.md b/README.md index a9f3b01b..3faf8a35 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,14 @@ +tuttlelm fork includes modifications for downstream compatibility with pyHXExpress + +Additions include conversion of HDExaminer and pyHXExpress outputs to a format that +can be read in by pyHDX to create HDXMeasurement objects. This then allows computing +the RFU_residue values and creating coverage plots. + +**Note:** some original features may not be compatible. + + + + # PyHDX [![zenodo](https://zenodo.org/badge/206772076.svg)](https://zenodo.org/badge/latestdoi/206772076)