From 0b1ee62b91439326626f0d9d8823894f8aaa375c Mon Sep 17 00:00:00 2001 From: Quan Pham Date: Tue, 11 Jun 2024 10:04:11 +0700 Subject: [PATCH] Allow a PI's institution name to be searched with subdomains Now, for PIs whose emails contain multiple subdomains (e.g. `a@b.c.edu`), each of their subdomains will be matched against institute_map.json, from the most qualified to least qualified subdomain. In the example of `a@b.c.edu`, this means `b.c.edu` would be matched first, then `c.edu`, then `edu` --- process_report/institute_map.json | 9 ------ process_report/process_report.py | 7 +++-- process_report/tests/unit_tests.py | 46 ++++++++++++++---------------- 3 files changed, 27 insertions(+), 35 deletions(-) diff --git a/process_report/institute_map.json b/process_report/institute_map.json index d6930eb..810aaa5 100644 --- a/process_report/institute_map.json +++ b/process_report/institute_map.json @@ -10,15 +10,6 @@ "dfci.harvard.edu" : "Dana-Farber Cancer Institute", "bwh.harvard.edu" : "Brigham and Women's Hospital", "bidmc.harvard.edu" : "Beth Israel Deaconess Medical Center", - "fas.harvard.edu" : "Harvard University", - "cga.harvard.edu" : "Harvard University", - "iq.harvard.edu" : "Harvard University", - "hks.harvard.edu" : "Harvard University", - "hsph.harvard.edu" : "Harvard University", - "seas.harvard.edu" : "Harvard University", - "gse.harvard.edu" : "Harvard University", - "gov.harvard.edu" : "Harvard University", - "oeb.harvard.edu" : "Harvard University", "harvard.edu" : "Harvard University", "wpi.edu" : "Worcester Polytechnic Institute", "mit.edu" : "Massachusetts Institute of Technology", diff --git a/process_report/process_report.py b/process_report/process_report.py index 1b980f2..57e8dc6 100644 --- a/process_report/process_report.py +++ b/process_report/process_report.py @@ -44,8 +44,11 @@ def get_institution_from_pi(institute_map, pi_uname): - institution_key = pi_uname.split("@")[-1] - institution_name = institute_map.get(institution_key, "") + 
institution_domain = pi_uname.split("@")[-1] + for i in range(institution_domain.count(".") + 1): + if institution_name := institute_map.get(institution_domain, ""): + break + institution_domain = institution_domain[institution_domain.find(".") + 1 :] if institution_name == "": print(f"Warning: PI name {pi_uname} does not match any institution!") diff --git a/process_report/tests/unit_tests.py b/process_report/tests/unit_tests.py index 799b976..fc39884 100644 --- a/process_report/tests/unit_tests.py +++ b/process_report/tests/unit_tests.py @@ -229,34 +229,32 @@ def test_get_pi_institution(self): "bu.edu": "Boston University", "bentley.edu": "Bentley", "mclean.harvard.edu": "McLean Hospital", + "northeastern.edu": "Northeastern University", + "childrens.harvard.edu": "Boston Children's Hospital", "meei.harvard.edu": "Massachusetts Eye & Ear", "dfci.harvard.edu": "Dana-Farber Cancer Institute", - "northeastern.edu": "Northeastern University", + "bwh.harvard.edu": "Brigham and Women's Hospital", + "bidmc.harvard.edu": "Beth Israel Deaconess Medical Center", } - self.assertEqual( - process_report.get_institution_from_pi(institute_map, "quanmp@bu.edu"), - "Boston University", - ) - self.assertEqual( - process_report.get_institution_from_pi( - institute_map, "c@mclean.harvard.edu" - ), - "McLean Hospital", - ) - self.assertEqual( - process_report.get_institution_from_pi(institute_map, "b@harvard.edu"), - "Harvard University", - ) - self.assertEqual( - process_report.get_institution_from_pi(institute_map, "fake"), "" - ) - self.assertEqual( - process_report.get_institution_from_pi( - institute_map, "pi@northeastern.edu" - ), - "Northeastern University", - ) + answers = { + "q@bu.edu": "Boston University", + "c@mclean.harvard.edu": "McLean Hospital", + "b@harvard.edu": "Harvard University", + "e@edu": "", + "pi@northeastern.edu": "Northeastern University", + "h@a.b.c.harvard.edu": "Harvard University", + "c@a.childrens.harvard.edu": "Boston Children's Hospital", + 
"d@a-b.meei.harvard.edu": "Massachusetts Eye & Ear", + "e@dfci.harvard": "", + "f@bwh.harvard.edu": "Brigham and Women's Hospital", + "g@bidmc.harvard.edu": "Beth Israel Deaconess Medical Center", + } + + for pi_email, answer in answers.items(): + self.assertEqual( + process_report.get_institution_from_pi(institute_map, pi_email), answer + ) class TestAlias(TestCase):