diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py
index 8cb78f3..ca24d6f 100644
--- a/rialto_airflow/harvest/openalex.py
+++ b/rialto_airflow/harvest/openalex.py
@@ -6,7 +6,7 @@
 from urllib.parse import quote
 
 from more_itertools import batched
-from pyalex import Authors, Works, config
+from pyalex import Authors, Works, config, api
 
 from rialto_airflow.utils import invert_dict
 
@@ -89,9 +89,25 @@ def publications_from_dois(dois: list):
         time.sleep(1)
 
         doi_list = quote("|".join([doi for doi in doi_batch]))
-        for page in Works().filter(doi=doi_list).paginate(per_page=200):
-            for pub in page:
-                yield normalize_publication(pub)
+        try:
+            for page in Works().filter(doi=doi_list).paginate(per_page=200):
+                for pub in page:
+                    yield normalize_publication(pub)
+        except api.QueryError:
+            # the batch query was rejected as a whole (e.g. one DOI in it is
+            # malformed), so retry each DOI on its own and skip only the bad ones
+            for doi in doi_batch:
+                try:
+                    pubs = Works().filter(doi=doi).get()
+                except api.QueryError as e:
+                    logging.error(f"OpenAlex QueryError for {doi}: {e}")
+                    continue
+                if not pubs:
+                    # no match for this DOI: nothing to yield, and pubs[0]
+                    # below would raise IndexError and abort the generator
+                    continue
+                if len(pubs) > 1:
+                    logging.warning(f"Found multiple publications for DOI {doi}")
+                yield normalize_publication(pubs[0])
 
 
 def normalize_publication(pub) -> dict:
diff --git a/test/data/openalex-dois.csv b/test/data/openalex-dois.csv
index a56178f..65dc6c9 100644
--- a/test/data/openalex-dois.csv
+++ b/test/data/openalex-dois.csv
@@ -1,6 +1,6 @@
 doi
 10.1002/adma.202103646
-10.1001/jamacardio.2021.6059
+"10.1001/jamacardio,2021.6059"
 10.3389/fimmu.2022.832501
 10.1161/strokeaha.122.040540
 10.1001/jamainternmed.2023.2561
diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py
index fdd054c..7127942 100644
--- a/test/harvest/test_openalex.py
+++ b/test/harvest/test_openalex.py
@@ -48,7 +48,9 @@ def test_publications_from_dois():
 
     # look up the publication metadata for them
     pubs = list(openalex.publications_from_dois(dois))
-    assert len(pubs) == 231, "should paginate (page size=200)"
+    assert (
+        len(pubs) == 230
+    ), "should paginate (page size=200) and have skipped invalid DOI"
     assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique"
     assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for."
     assert len(pubs[0].keys()) == 51, "first publication has 51 columns"