Skip to content

Commit

Permalink
fixed broken protein sequences
Browse files Browse the repository at this point in the history
  • Loading branch information
amva13 committed Mar 22, 2024
1 parent 38fff00 commit 644c848
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions tdc/feature_generators/protein_feature_generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,18 @@ class ProteinFeatureGenerator(DataFeatureGenerator):
Goals are to make it easier to integrate custom datasets not yet in TDC format.
Note: for running in sequence, this class inherits from data_processing_utils.DataFeatureGenerator
"""

# Hard-coded overrides consulted before any MyGene.info query is made.
# Each key is a gene name; each value is a 2-tuple:
#   index 0 -> gene type string (what get_type_of_gene returns for the key)
#   index 1 -> amino-acid sequence (what get_protein_sequence returns for the key)
# The visible callers test membership with a plain `in`, so keys must match
# the caller's gene_name exactly (case-sensitive).
_SEQUENCE_MAP = {
# ACE2: listed here because the MyGene.info result for this name appears to
# be mislabeled, so the sequence is pinned instead of being looked up.
"ACE2": (
"protein-coding",
"MSSSSWLLLSLVAVTAAQSTIEEQAKTFLDKFNHEAEDLFYQSSLASWNYNTNITEENVQNMNNAGDKWSAFLKEQSTLAQMYPLQEIQNLTVKLQLQALQQNGSSVLSEDKSKRLNTILNTMSTIYSTGKVCNPDNPQECLLLEPGLNEIMANSLDYNERLWAWESWRSEVGKQLRPLYEEYVVLKNEMARANHYEDYGDYWRGDYEVNGVDGYDYSRGQLIEDVEHTFEEIKPLYEHLHAYVRAKLMNAYPSYISPIGCLPAHLLGDMWGRFWTNLYSLTVPFGQKPNIDVTDAMVDQAWDAQRIFKEAEKFFVSVGLPNMTQGFWENSMLTDPGNVQKAVCHPTAWDLGKGDFRILMCTKVTMDDFLTAHHEMGHIQYDMAYAAQPFLLRNGANEGFHEAVGEIMSLSAATPKHLKSIGLLSPDFQEDNETEINFLLKQALTIVGTLPFTYMLEKWRWMVFKGEIPKDQWMKKWWEMKREIVGVVEPVPHDETYCDPASLFHVSNDYSFIRYYTRTLYQFQFQEALCQAAKHEGPLHKCDISNSTEAGQKLFNMLRLGKSEPWTLALENVVGAKNMNVRPLLNYFEPLFTWLKDQNKNSFVGWSTDWSPYADQSIKVRISLKSALGDKAYEWNDNEMYLFRSSVAYAMRQYFLKVKNQMILFGEEDVRVANLKPRISFNFFVTAPKNVSDIIPRTEVEKAIRMSRSRINDAFRLNDNSLEFLGIQPTLGPPNQPPVSIWLIVFGVVMGVIVVGIVILIFTGIRDRKKKNKARSGENPYASIDISKGENNPGFQNTDDVQTSF"
),
# 12CA5: hard-coded because no sequence is available for it from MyGene.info.
"12CA5": (
"protein-coding",
"YPYDVPDYA"
)

}

@classmethod
def get_ncrna_sequence(cls, ncrna_id):
Expand Down Expand Up @@ -85,12 +97,12 @@ def get_protein_sequence(cls, gene_name: str) -> str:
# Query MyGene.info for the given gene name
# You might need to adjust the fields based on the gene's specifics
# 'fields': 'proteins' might vary depending on the data available for your gene
if gene_name in cls._SEQUENCE_MAP: # genes for which mygene seems to be mislabeled
return cls._SEQUENCE_MAP[gene_name][1]
gene_info = mg.query(gene_name, fields='all', species='human')
try:
# Attempt to extract the protein sequence
# The path to the protein sequence might need adjustment based on the response structure
if gene_name.upper() == "12CA5":
return "YPYDVPDYA" # hard-coded due to unavailability in mygene
if gene_info['hits'][0]["type_of_gene"].lower() == "ncrna":
ncbi_id = gene_info['hits'][0]['entrezgene']
return cls.get_ncrna_sequence(ncbi_id)
Expand All @@ -108,6 +120,8 @@ def get_type_of_gene(cls, gene_name: str) -> str:
# Query MyGene.info for the given gene name
# You might need to adjust the fields based on the gene's specifics
# 'fields': 'proteins' might vary depending on the data available for your gene
if gene_name in cls._SEQUENCE_MAP:
return cls._SEQUENCE_MAP[gene_name][0]
gene_info = mg.query(gene_name, fields='all', species='human')
try:
return gene_info["hits"][0][
Expand Down

0 comments on commit 644c848

Please sign in to comment.