From f4b7befa8d48a033739f325bfe5fa324efbb66db Mon Sep 17 00:00:00 2001 From: Laura Wrubel Date: Fri, 5 Jul 2024 08:56:18 -0400 Subject: [PATCH] Set openalex to retrieve 50 DOIs per batch --- rialto_airflow/harvest/openalex.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/rialto_airflow/harvest/openalex.py b/rialto_airflow/harvest/openalex.py index e1d9eeb..8cb78f3 100644 --- a/rialto_airflow/harvest/openalex.py +++ b/rialto_airflow/harvest/openalex.py @@ -78,11 +78,13 @@ def publications_csv(dois: list, csv_file: str) -> None: writer.writerow(pub) -def publications_from_dois(dois: list, batch_size=75): +def publications_from_dois(dois: list): """ Look up works by DOI in batches that fit within OpenAlex request size limits """ - for doi_batch in batched(dois, batch_size): + for doi_batch in batched(dois, 50): + # Setting batch size to 50 to avoid 400 errors from OpenAlex API when GET query string is greater than 4096 characters + # Based on experimentation, 75 is too high. 50 is the default per_page size, so we could consider removing pagination in the future. # TODO: do we need this to stay within 100,000 requests / day API quota? time.sleep(1)