Skip to content

Commit

Permalink
Merge pull request #73 from sul-dlss-labs/t72-normalize-dois
Browse files Browse the repository at this point in the history
Normalize DOIs in doi_sunet and merge_pubs tasks
  • Loading branch information
lwrubel authored Jul 3, 2024
2 parents fd25a8b + ab0b044 commit 7f1a256
Show file tree
Hide file tree
Showing 6 changed files with 38 additions and 8 deletions.
4 changes: 4 additions & 0 deletions rialto_airflow/harvest/doi_sunet.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@
# TODO: use polars instead?
import pandas as pd

from rialto_airflow.utils import normalize_doi


def create_doi_sunet_pickle(
dimensions: str, openalex: str, sul_pub_csv: str, authors_csv: str, output_path
Expand Down Expand Up @@ -42,6 +44,7 @@ def doi_sunetids(pickle_file: str, orcid_sunet: dict) -> dict:

mapping = {}
for doi, orcids in doi_orcids.items():
doi = normalize_doi(doi)
mapping[doi] = [orcid_sunet[orcid] for orcid in orcids]

return mapping
Expand All @@ -52,6 +55,7 @@ def sulpub_doi_sunetids(sul_pub_csv, cap_profile_sunet):
# extracted from the authorship column
df = pd.read_csv(sul_pub_csv, usecols=["doi", "authorship"])
df = df[df["doi"].notna()]
df["doi"] = df["doi"].apply(lambda doi: normalize_doi(doi))

def extract_cap_ids(authors):
    """
    Return the cap_profile_id of every entry in authors whose status is
    "approved".

    authors is a string that evaluates to an iterable of dict-like records,
    each with "cap_profile_id" and "status" keys (presumably the raw value
    of the sul_pub "authorship" column -- confirm against the caller).
    """
    # NOTE(review): eval() executes arbitrary Python found in the CSV cell.
    # If the column only ever holds Python literals, ast.literal_eval would
    # parse it without that risk -- confirm the serialization format before
    # switching.
    return [a["cap_profile_id"] for a in eval(authors) if a["status"] == "approved"]
Expand Down
12 changes: 8 additions & 4 deletions rialto_airflow/harvest/merge_pubs.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import polars as pl

from rialto_airflow.utils import normalize_doi


def merge(sul_pub, openalex_pubs, dimensions_pubs, output):
"""
Expand Down Expand Up @@ -46,10 +48,10 @@ def dimensions_pubs_df(dimensions_pubs):
# Polars is inferring volume is an integer, but it should be a string e.g. "97-B"
df = pl.scan_csv(dimensions_pubs, schema_overrides={"volume": pl.String})
df = df.select(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
pl.col(
"authors",
"document_type",
"doi",
"funders",
"funding_section",
"open_access",
Expand All @@ -59,7 +61,7 @@ def dimensions_pubs_df(dimensions_pubs):
"title",
"type",
"year",
)
),
)
df = df.rename(lambda column_name: "dim_" + column_name)
return df
Expand All @@ -71,7 +73,7 @@ def openalex_pubs_df(openalex_pubs):
"""
df = pl.scan_csv(openalex_pubs)
df = df.select(
pl.col("doi").str.replace("https://doi.org/", ""),
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
pl.col(
"apc_paid", "authorships", "grants", "publication_year", "title", "type"
),
Expand All @@ -86,6 +88,8 @@ def sulpub_df(sul_pub):
"""
df = pl.scan_csv(sul_pub)
df = df.drop_nulls("doi")
df = df.with_columns(pl.col("doi").str.replace("https://doi.org/", ""))
df = df.with_columns(
pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String)
)
df = df.rename(lambda column_name: "sul_pub_" + column_name)
return df
9 changes: 9 additions & 0 deletions rialto_airflow/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import csv
import datetime
from pathlib import Path
import re


def create_snapshot_dir(data_dir):
Expand Down Expand Up @@ -54,3 +55,11 @@ def invert_dict(dict):
inverted_dict[i] = [k for k, v in dict.items() if i in v]

return inverted_dict


def normalize_doi(doi):
    """
    Return a bare, lowercased DOI so that the same identifier reported in
    different forms by different sources compares equal.

    Surrounding whitespace is removed, the value is lowercased, and any
    leading resolver URL (http or https, doi.org or dx.doi.org) and/or
    "doi:" label is stripped. For example "https://doi.org/10.1234/ABC",
    " doi: 10.1234/abc " and "DOI:10.1234/abc" all become "10.1234/abc".

    None is passed through unchanged so callers do not have to filter
    missing values first.
    """
    if doi is None:
        return None

    doi = doi.strip().lower()

    # Only strip at the start of the string so a DOI suffix that happens to
    # contain one of these tokens is left intact. The group repeats so that
    # stacked prefixes such as "doi: https://doi.org/10..." are fully removed.
    doi = re.sub(r"^(?:(?:https?://(?:dx\.)?doi\.org/|doi:)\s*)+", "", doi)

    return doi
7 changes: 5 additions & 2 deletions test/harvest/test_doi_sunet.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,11 @@ def sul_pub_csv(tmp_path):
with open(fixture_file, "w", newline="") as csvfile:
writer = csv.writer(csvfile)
writer.writerow(["authorship", "title", "doi"])
writer.writerow([authorship("cap-01"), "A Publication", "10.0000/aaaa"])
writer.writerow([authorship("cap-02"), "A Research Article", "10.0000/1234"])
# include DOIs that will be normalized
writer.writerow([authorship("cap-01"), "A Publication", "10.0000/aAaA"])
writer.writerow(
[authorship("cap-02"), "A Research Article", "https://doi.org/10.0000/1234"]
)
return fixture_file


Expand Down
4 changes: 2 additions & 2 deletions test/harvest/test_merge_pubs.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def dimensions_pubs_csv(tmp_path):
"1",
"[]",
"ARTICLE",
"10.0000/aaaa",
"10.0000/aAaA",
"[]",
"[]",
"True",
Expand Down Expand Up @@ -130,7 +130,7 @@ def sul_pubs_csv(tmp_path):
"[]",
"A Published Research Article",
"2024",
"https://doi.org/10.0000/dddd",
"doi: 10.0000/dDdD",
]
)
return fixture_file
Expand Down
10 changes: 10 additions & 0 deletions test/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,3 +57,13 @@ def test_invert_dict():
"pub_id7",
]
assert inverted_dict["pub_id2"] == ["person_id1", "person_id2"]


def test_normalize_doi():
    """
    normalize_doi drops resolver URLs, the "doi: " label and surrounding
    whitespace, and lowercases what remains.
    """
    # raw input -> expected normalized DOI
    expectations = {
        "https://doi.org/10.1234/5678": "10.1234/5678",
        "https://dx.doi.org/10.1234/5678": "10.1234/5678",
        "10.1103/PhysRevLett.96.07390": "10.1103/physrevlett.96.07390",
        " doi: 10.1234/5678 ": "10.1234/5678",
    }

    for raw, normalized in expectations.items():
        assert utils.normalize_doi(raw) == normalized

0 comments on commit 7f1a256

Please sign in to comment.