From 644c848e86cfdee021c09054ccbb5dffcc91781c Mon Sep 17 00:00:00 2001 From: Alejandro Velez Date: Fri, 22 Mar 2024 17:18:09 -0400 Subject: [PATCH] fixed broken protein sequences --- .../protein_feature_generator.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/tdc/feature_generators/protein_feature_generator.py b/tdc/feature_generators/protein_feature_generator.py index 0218aca2..9f2807de 100644 --- a/tdc/feature_generators/protein_feature_generator.py +++ b/tdc/feature_generators/protein_feature_generator.py @@ -17,6 +17,18 @@ class ProteinFeatureGenerator(DataFeatureGenerator): Goals are to make it easier to integrate custom datasets not yet in TDC format. Note: for running in sequence, this class inherits from data_processing_utils.DataFeatureGenerator """ + + _SEQUENCE_MAP = { + "ACE2": ( + "protein-coding", + "MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNNAGDKWSAFLKEQSTLAQMYPLQEIQNLTVKLQLQALQQNGSSVLSEDKSKRLNTILNTMSTIYSTGKVCNPDNPQECLLLEPGLNEIMANSLDYNERLWAWESWRSEVGKQLRPLYEEYVVLKNEMARANHYEDYGDYWRGDYEVNGVDGYDYSRGQLIEDVEHTFEEIKPLYEHLHAYVRAKLMNAYPSYISPIGCLPAHLLGDMWGRFWTNLYSLTVPFGQKPNIDVTDAMVDQAWDAQRIFKEAEKFFVSVGLPNMTQGFWENSMLTDPGNVQKAVCHPTAWDLGKGDFRILMCTKVTMDDFLTAHHEMGHIQYDMAYAAQPFLLRNGANEGFHEAVGEIMSLSAATPKHLKSIGLLSPDFQEDNETEINFLLKQALTIVGTLPFTYMLEKWRWMVFKGEIPKDQWMKKWWEMKREIVGVVEPVPHDETYCDPASLFHVSNDYSFIRYYTRTLYQFQFQEALCQAAKHEGPLHKCDISNSTEAGQKLFNMLRLGKSEPWTLALENVVGAKNMNVRPLLNYFEPLFTWLKDQNKNSFVGWSTDWSPYADQSIKVRISLKSALGDKAYEWNDNEMYLFRSSVAYAMRQYFLKVKNQMILFGEEDVRVANLKPRISFNFFVTAPKNVSDIIPRTEVEKAIRMSRSRINDAFRLNDNSLEFLGIQPTLGPPNQPPVSIWLIVFGVVMGVIVVGIVILIFTGIRDRKKKNKARSGENPYASIDISKGENNPGFQNTDDVQTSF" + ), + "12CA5": ( + "protein-coding", + "YPYDVPDYA" + ) + + } @classmethod def get_ncrna_sequence(cls, ncrna_id): @@ -85,12 +97,12 @@ def get_protein_sequence(cls, gene_name: str) -> str: # Query MyGene.info for the given gene name # You might need to adjust the fields based on the gene's specifics # 'fields': 'proteins' 
might vary depending on the data available for your gene + if gene_name in cls._SEQUENCE_MAP: # genes for which mygene seems to be mislabeled + return cls._SEQUENCE_MAP[gene_name][1] gene_info = mg.query(gene_name, fields='all', species='human') try: # Attempt to extract the protein sequence # The path to the protein sequence might need adjustment based on the response structure - if gene_name.upper() == "12CA5": - return "YPYDVPDYA" # hard-coded due to unavailability in mygene if gene_info['hits'][0]["type_of_gene"].lower() == "ncrna": ncbi_id = gene_info['hits'][0]['entrezgene'] return cls.get_ncrna_sequence(ncbi_id) @@ -108,6 +120,8 @@ def get_type_of_gene(cls, gene_name: str) -> str: # Query MyGene.info for the given gene name # You might need to adjust the fields based on the gene's specifics # 'fields': 'proteins' might vary depending on the data available for your gene + if gene_name in cls._SEQUENCE_MAP: + return cls._SEQUENCE_MAP[gene_name][0] gene_info = mg.query(gene_name, fields='all', species='human') try: return gene_info["hits"][0][