
Guard against cap_profile_id being missing
Fixes #70
edsu committed Jul 1, 2024
1 parent 75c3720 commit 40c793a
Showing 1 changed file with 9 additions and 2 deletions.
rialto_airflow/harvest/doi_sunet.py (11 changes: 9 additions & 2 deletions)
@@ -2,6 +2,7 @@
 import pickle
 from collections import defaultdict
 
+# TODO: use polars instead?
 import pandas as pd


@@ -47,7 +48,7 @@ def doi_sunetids(pickle_file: str, orcid_sunet: dict) -> dict:
 
 
 def sulpub_doi_sunetids(sul_pub_csv, cap_profile_sunet):
-    # create a dataframe for sul_pubs which has a column for cap_profile_id
+    # create a dataframe for sul_pub which has a column for cap_profile_id
     # extracted from the authorship column
     df = pd.read_csv(sul_pub_csv, usecols=["doi", "authorship"])
     df = df[df["doi"].notna()]
@@ -60,7 +61,13 @@ def extract_cap_ids(authors):
     df = df.explode("cap_profile_id")
 
     # create a column for sunet using the cap_profile_sunet dictionary
-    df["sunet"] = df["cap_profile_id"].apply(lambda cap_id: cap_profile_sunet[cap_id])
+    df["sunet"] = df["cap_profile_id"].apply(lambda cap_id: cap_profile_sunet.get(cap_id))
+
+    # NOTE: the sunet could be None if the cap_profile_id isn't in authors.csv;
+    # log these so we get a sense of how often that happens
+    missing = df[df["sunet"].isna()]["doi"].values
+    if len(missing) > 0:
+        logging.warning(f"found {len(missing)} DOIs that have cap_profile_id missing from authors.csv: {','.join(missing)}.")
 
     return df.groupby("doi")["sunet"].apply(list).to_dict()

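For a sense of what the guard changes, here is a minimal, self-contained sketch. It is not code from this repository; the cap_profile_sunet mapping and the DataFrame rows are hypothetical. With the old square-bracket lookup, a cap_profile_id missing from authors.csv would raise a KeyError; dict.get() returns None instead, and the affected DOIs are counted and logged while still appearing in the returned mapping.

import logging

import pandas as pd

# hypothetical mapping from authors.csv: cap_profile_id -> sunet
cap_profile_sunet = {"123": "jdoe"}

# hypothetical sul_pub rows after exploding cap_profile_id; "999" has no entry
# in cap_profile_sunet, so indexing with [] would raise KeyError for that row
df = pd.DataFrame({"doi": ["10.1234/a", "10.1234/b"], "cap_profile_id": ["123", "999"]})

# .get() returns None for the unknown id instead of raising
df["sunet"] = df["cap_profile_id"].apply(lambda cap_id: cap_profile_sunet.get(cap_id))

# collect and log the DOIs whose cap_profile_id had no matching sunet
missing = df[df["sunet"].isna()]["doi"].values
if len(missing) > 0:
    logging.warning(f"found {len(missing)} DOIs that have cap_profile_id missing from authors.csv: {','.join(missing)}.")

print(df.groupby("doi")["sunet"].apply(list).to_dict())
# {'10.1234/a': ['jdoe'], '10.1234/b': [None]}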
