fixed broken protein sequences

mims-harvard · Mar 22, 2024 · 644c848 · 644c848
1 parent 38fff00
commit 644c848
Showing 1 changed file with 16 additions and 2 deletions.
diff --git a/tdc/feature_generators/protein_feature_generator.py b/tdc/feature_generators/protein_feature_generator.py
@@ -17,6 +17,18 @@ class ProteinFeatureGenerator(DataFeatureGenerator):
     Goals are to make it easier to integrate custom datasets not yet in TDC format.
     Note: for running in sequence, this class inherits from data_processing_utils.DataFeatureGenerator
     """
+
+    _SEQUENCE_MAP = {
+        "ACE2": (
+            "protein-coding",
+            "MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNNAGDKWSAFLKEQSTLAQMYPLQEIQNLTVKLQLQALQQNGSSVLSEDKSKRLNTILNTMSTIYSTGKVCNPDNPQECLLLEPGLNEIMANSLDYNERLWAWESWRSEVGKQLRPLYEEYVVLKNEMARANHYEDYGDYWRGDYEVNGVDGYDYSRGQLIEDVEHTFEEIKPLYEHLHAYVRAKLMNAYPSYISPIGCLPAHLLGDMWGRFWTNLYSLTVPFGQKPNIDVTDAMVDQAWDAQRIFKEAEKFFVSVGLPNMTQGFWENSMLTDPGNVQKAVCHPTAWDLGKGDFRILMCTKVTMDDFLTAHHEMGHIQYDMAYAAQPFLLRNGANEGFHEAVGEIMSLSAATPKHLKSIGLLSPDFQEDNETEINFLLKQALTIVGTLPFTYMLEKWRWMVFKGEIPKDQWMKKWWEMKREIVGVVEPVPHDETYCDPASLFHVSNDYSFIRYYTRTLYQFQFQEALCQAAKHEGPLHKCDISNSTEAGQKLFNMLRLGKSEPWTLALENVVGAKNMNVRPLLNYFEPLFTWLKDQNKNSFVGWSTDWSPYADQSIKVRISLKSALGDKAYEWNDNEMYLFRSSVAYAMRQYFLKVKNQMILFGEEDVRVANLKPRISFNFFVTAPKNVSDIIPRTEVEKAIRMSRSRINDAFRLNDNSLEFLGIQPTLGPPNQPPVSIWLIVFGVVMGVIVVGIVILIFTGIRDRKKKNKARSGENPYASIDISKGENNPGFQNTDDVQTSF"
+        ),
+        "12CA5": (
+            "protein-coding",
+            "YPYDVPDYA"
+        )
+
+    }
 
     @classmethod
     def get_ncrna_sequence(cls, ncrna_id):
@@ -85,12 +97,12 @@ def get_protein_sequence(cls, gene_name: str) -> str:
         # Query MyGene.info for the given gene name
         # You might need to adjust the fields based on the gene's specifics
         # 'fields': 'proteins' might vary depending on the data available for your gene
+        if gene_name in cls._SEQUENCE_MAP:  # genes for which mygene seems to be mislabeled
+            return cls._SEQUENCE_MAP[gene_name][1]
         gene_info = mg.query(gene_name, fields='all', species='human')
         try:
             # Attempt to extract the protein sequence
             # The path to the protein sequence might need adjustment based on the response structure
-            if gene_name.upper() == "12CA5":
-                return "YPYDVPDYA"  # hard-coded due to unavailability in mygene
             if gene_info['hits'][0]["type_of_gene"].lower() == "ncrna":
                 ncbi_id = gene_info['hits'][0]['entrezgene']
                 return cls.get_ncrna_sequence(ncbi_id)
@@ -108,6 +120,8 @@ def get_type_of_gene(cls, gene_name: str) -> str:
         # Query MyGene.info for the given gene name
         # You might need to adjust the fields based on the gene's specifics
         # 'fields': 'proteins' might vary depending on the data available for your gene
+        if gene_name in cls._SEQUENCE_MAP:
+            return cls._SEQUENCE_MAP[gene_name][0]
         gene_info = mg.query(gene_name, fields='all', species='human')
         try:
             return gene_info["hits"][0][