From 3a3ff9b8cc4c4f484efd9bff938c1c8786c2d14e Mon Sep 17 00:00:00 2001 From: Ed Summers Date: Mon, 1 Jul 2024 15:05:03 -0500 Subject: [PATCH] Guard against cap_profile_id being missing Fixes #70 --- rialto_airflow/harvest/doi_sunet.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/rialto_airflow/harvest/doi_sunet.py b/rialto_airflow/harvest/doi_sunet.py index f0bc917..59c078b 100644 --- a/rialto_airflow/harvest/doi_sunet.py +++ b/rialto_airflow/harvest/doi_sunet.py @@ -2,6 +2,7 @@ import pickle from collections import defaultdict +# TODO: use polars instead? import pandas as pd @@ -47,7 +48,7 @@ def doi_sunetids(pickle_file: str, orcid_sunet: dict) -> dict: def sulpub_doi_sunetids(sul_pub_csv, cap_profile_sunet): - # create a dataframe for sul_pubs which has a column for cap_profile_id + # create a dataframe for sul_pub which has a column for cap_profile_id # extracted from the authorship column df = pd.read_csv(sul_pub_csv, usecols=["doi", "authorship"]) df = df[df["doi"].notna()] @@ -60,7 +61,17 @@ def extract_cap_ids(authors): df = df.explode("cap_profile_id") # create a column for sunet using the cap_profile_sunet dictionary - df["sunet"] = df["cap_profile_id"].apply(lambda cap_id: cap_profile_sunet[cap_id]) + df["sunet"] = df["cap_profile_id"].apply( + lambda cap_id: cap_profile_sunet.get(cap_id) + ) + + # NOTE: the sunet could be None if the cap_profile id isn't in the authors.csv + # log these so we get a sense of how much that happens + missing = df[df["sunet"].isna()]["doi"].values + if len(missing) > 0: + logging.warn( + f"found {len(missing)} DOI that have cap_profile_id missing from authors.csv: {','.join(missing)}." + ) return df.groupby("doi")["sunet"].apply(list).to_dict()