From ab0b044d250cac1e5d5bd7fb95561b8e844395ba Mon Sep 17 00:00:00 2001 From: Laura Wrubel Date: Wed, 3 Jul 2024 13:13:43 -0400 Subject: [PATCH] Normalize DOIs in doi_sunet and merge_pubs tasks --- rialto_airflow/harvest/doi_sunet.py | 4 ++++ rialto_airflow/harvest/merge_pubs.py | 12 ++++++++---- rialto_airflow/utils.py | 9 +++++++++ test/harvest/test_doi_sunet.py | 7 +++++-- test/harvest/test_merge_pubs.py | 4 ++-- test/test_utils.py | 10 ++++++++++ 6 files changed, 38 insertions(+), 8 deletions(-) diff --git a/rialto_airflow/harvest/doi_sunet.py b/rialto_airflow/harvest/doi_sunet.py index 59c078b..61417c9 100644 --- a/rialto_airflow/harvest/doi_sunet.py +++ b/rialto_airflow/harvest/doi_sunet.py @@ -5,6 +5,8 @@ # TODO: use polars instead? import pandas as pd +from rialto_airflow.utils import normalize_doi + def create_doi_sunet_pickle( dimensions: str, openalex: str, sul_pub_csv: str, authors_csv: str, output_path @@ -42,6 +44,7 @@ def doi_sunetids(pickle_file: str, orcid_sunet: dict) -> dict: mapping = {} for doi, orcids in doi_orcids.items(): + doi = normalize_doi(doi) mapping[doi] = [orcid_sunet[orcid] for orcid in orcids] return mapping @@ -52,6 +55,7 @@ def sulpub_doi_sunetids(sul_pub_csv, cap_profile_sunet): # extracted from the authorship column df = pd.read_csv(sul_pub_csv, usecols=["doi", "authorship"]) df = df[df["doi"].notna()] + df["doi"] = df["doi"].apply(lambda doi: normalize_doi(doi)) def extract_cap_ids(authors): return [a["cap_profile_id"] for a in eval(authors) if a["status"] == "approved"] diff --git a/rialto_airflow/harvest/merge_pubs.py b/rialto_airflow/harvest/merge_pubs.py index 3477210..7ddccb4 100644 --- a/rialto_airflow/harvest/merge_pubs.py +++ b/rialto_airflow/harvest/merge_pubs.py @@ -1,5 +1,7 @@ import polars as pl +from rialto_airflow.utils import normalize_doi + def merge(sul_pub, openalex_pubs, dimensions_pubs, output): """ @@ -46,10 +48,10 @@ def dimensions_pubs_df(dimensions_pubs): # Polars is inferring volume is an integer, but it should be a string e.g. "97-B" df = pl.scan_csv(dimensions_pubs, schema_overrides={"volume": pl.String}) df = df.select( + pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String), pl.col( "authors", "document_type", - "doi", "funders", "funding_section", "open_access", @@ -59,7 +61,7 @@ def dimensions_pubs_df(dimensions_pubs): "title", "type", "year", - ) + ), ) df = df.rename(lambda column_name: "dim_" + column_name) return df @@ -71,7 +73,7 @@ def openalex_pubs_df(openalex_pubs): """ df = pl.scan_csv(openalex_pubs) df = df.select( - pl.col("doi").str.replace("https://doi.org/", ""), + pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String), pl.col( "apc_paid", "authorships", "grants", "publication_year", "title", "type" ), @@ -86,6 +88,8 @@ def sulpub_df(sul_pub): """ df = pl.scan_csv(sul_pub) df = df.drop_nulls("doi") - df = df.with_columns(pl.col("doi").str.replace("https://doi.org/", "")) + df = df.with_columns( + pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String) + ) df = df.rename(lambda column_name: "sul_pub_" + column_name) return df diff --git a/rialto_airflow/utils.py b/rialto_airflow/utils.py index 4a48597..a562032 100644 --- a/rialto_airflow/utils.py +++ b/rialto_airflow/utils.py @@ -1,6 +1,7 @@ import csv import datetime from pathlib import Path +import re def create_snapshot_dir(data_dir): @@ -54,3 +55,11 @@ def invert_dict(dict): inverted_dict[i] = [k for k, v in dict.items() if i in v] return inverted_dict + + +def normalize_doi(doi): + doi = doi.strip().lower() + doi = doi.replace("https://doi.org/", "").replace("https://dx.doi.org/", "") + doi = re.sub("^doi: ", "", doi) + + return doi diff --git a/test/harvest/test_doi_sunet.py b/test/harvest/test_doi_sunet.py index 5608753..8d0092f 100644 --- a/test/harvest/test_doi_sunet.py +++ b/test/harvest/test_doi_sunet.py @@ -57,8 +57,11 @@ def sul_pub_csv(tmp_path): with open(fixture_file, "w", newline="") as csvfile: writer = csv.writer(csvfile) writer.writerow(["authorship", "title", "doi"]) - writer.writerow([authorship("cap-01"), "A Publication", "10.0000/aaaa"]) - writer.writerow([authorship("cap-02"), "A Research Article", "10.0000/1234"]) + # include DOIs that will be normalized + writer.writerow([authorship("cap-01"), "A Publication", "10.0000/aAaA"]) + writer.writerow( + [authorship("cap-02"), "A Research Article", "https://doi.org/10.0000/1234"] + ) return fixture_file diff --git a/test/harvest/test_merge_pubs.py b/test/harvest/test_merge_pubs.py index b4d62fa..673c694 100644 --- a/test/harvest/test_merge_pubs.py +++ b/test/harvest/test_merge_pubs.py @@ -34,7 +34,7 @@ def dimensions_pubs_csv(tmp_path): "1", "[]", "ARTICLE", - "10.0000/aaaa", + "10.0000/aAaA", "[]", "[]", "True", @@ -130,7 +130,7 @@ def sul_pubs_csv(tmp_path): "[]", "A Published Research Article", "2024", - "https://doi.org/10.0000/dddd", + "doi: 10.0000/dDdD", ] ) return fixture_file diff --git a/test/test_utils.py b/test/test_utils.py index c7949cb..fea4891 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -57,3 +57,13 @@ def test_invert_dict(): "pub_id7", ] assert inverted_dict["pub_id2"] == ["person_id1", "person_id2"] + + +def test_normalize_doi(): + assert utils.normalize_doi("https://doi.org/10.1234/5678") == "10.1234/5678" + assert utils.normalize_doi("https://dx.doi.org/10.1234/5678") == "10.1234/5678" + assert ( + utils.normalize_doi("10.1103/PhysRevLett.96.07390") + == "10.1103/physrevlett.96.07390" + ) + assert utils.normalize_doi(" doi: 10.1234/5678 ") == "10.1234/5678"