Skip to content

Commit

Permalink
Retry DOI lookups when invalid DOI in batch
Browse files: browse the repository at this point in the history
  • Loading branch information
lwrubel committed Jul 8, 2024
1 parent 1ef8969 commit f899f23
Show file tree
Hide file tree
Showing 3 changed files with 20 additions and 6 deletions.
20 changes: 16 additions & 4 deletions rialto_airflow/harvest/openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from urllib.parse import quote

from more_itertools import batched
from pyalex import Authors, Works, config
from pyalex import Authors, Works, config, api

from rialto_airflow.utils import invert_dict

Expand Down Expand Up @@ -89,9 +89,21 @@ def publications_from_dois(dois: list):
time.sleep(1)

doi_list = quote("|".join([doi for doi in doi_batch]))
for page in Works().filter(doi=doi_list).paginate(per_page=200):
for pub in page:
yield normalize_publication(pub)
try:
for page in Works().filter(doi=doi_list).paginate(per_page=200):
for pub in page:
yield normalize_publication(pub)
except api.QueryError:
# try dois individually
for doi in doi_batch:
try:
pubs = Works().filter(doi=doi).get()
if len(pubs) > 1:
logging.warn(f"Found multiple publications for DOI {doi}")
yield normalize_publication(pubs[0])
except api.QueryError as e:
logging.error(f"OpenAlex QueryError for {doi}: {e}")
continue


def normalize_publication(pub) -> dict:
Expand Down
2 changes: 1 addition & 1 deletion test/data/openalex-dois.csv
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
doi
10.1002/adma.202103646
10.1001/jamacardio.2021.6059
"10.1001/jamacardio,2021.6059"
10.3389/fimmu.2022.832501
10.1161/strokeaha.122.040540
10.1001/jamainternmed.2023.2561
Expand Down
4 changes: 3 additions & 1 deletion test/harvest/test_openalex.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ def test_publications_from_dois():

# look up the publication metadata for them
pubs = list(openalex.publications_from_dois(dois))
assert len(pubs) == 231, "should paginate (page size=200)"
assert (
len(pubs) == 230
), "should paginate (page size=200) and have skipped invalid DOI"
assert len(pubs) == len(set([pub["doi"] for pub in pubs])), "DOIs are unique"
assert set(openalex.FIELDS) == set(pubs[0].keys()), "All fields accounted for."
assert len(pubs[0].keys()) == 51, "first publication has 51 columns"
Expand Down

0 comments on commit f899f23

Please sign in to comment.