diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py
index 8cb78f3..ca24d6f 100644
--- a/rialto_airflow/harvest/openalex.py
+++ b/rialto_airflow/harvest/openalex.py
@@ -6,7 +6,7 @@
 from urllib.parse import quote
 
 from more_itertools import batched
-from pyalex import Authors, Works, config
+from pyalex import Authors, Works, config, api
 
 from rialto_airflow.utils import invert_dict
 
@@ -89,9 +89,24 @@ def publications_from_dois(dois: list):
         time.sleep(1)
 
         doi_list = quote("|".join([doi for doi in doi_batch]))
-        for page in Works().filter(doi=doi_list).paginate(per_page=200):
-            for pub in page:
-                yield normalize_publication(pub)
+        try:
+            for page in Works().filter(doi=doi_list).paginate(per_page=200):
+                for pub in page:
+                    yield normalize_publication(pub)
+        except api.QueryError:
+            # the batch query was rejected: retry each DOI on its own so that
+            # one malformed DOI does not discard the rest of the batch
+            for doi in doi_batch:
+                try:
+                    pubs = Works().filter(doi=doi).get()
+                except api.QueryError as e:
+                    logging.error(f"OpenAlex QueryError for {doi}: {e}")
+                    continue
+                if not pubs:
+                    # no match for this DOI: nothing to yield (avoids IndexError)
+                    continue
+                if len(pubs) > 1:
+                    logging.warning(f"Found multiple publications for DOI {doi}")
+                yield normalize_publication(pubs[0])
 
 
 def normalize_publication(pub) -> dict:
diff --git a/test/harvest/test_openalex.py b/test/harvest/test_openalex.py
index fdd054c..71b4c01 100644
--- a/test/harvest/test_openalex.py
+++ b/test/harvest/test_openalex.py
@@ -55,6 +55,16 @@ def test_publications_from_dois():
     assert len(pubs[1].keys()) == 51, "second publication has 51 columns"
 
 
+def test_publications_from_invalid_dois(caplog):
+    # Error may change if OpenAlex API or pyalex changes
+    invalid_dois = ["doi-with-comma,a", "10.1145/3442188.3445922"]
+    assert len(list(openalex.publications_from_dois(invalid_dois))) == 1
+    assert (
+        "OpenAlex QueryError for doi-with-comma,a: Invalid query parameter"
+        in caplog.text
+    ), "logs error message"
+
+
 def test_publications_csv(tmp_path):
     pubs_csv = tmp_path / "openalex-pubs.csv"
     openalex.publications_csv(