Merge pull request #73 from sul-dlss-labs/t72-normalize-dois

Normalize DOIs in doi_sunet and merge_pubs tasks
sul-dlss-labs · Jul 3, 2024 · 7f1a256
2 parents fd25a8b + ab0b044
commit 7f1a256
Show file tree

Hide file tree

Showing 6 changed files with 38 additions and 8 deletions.
diff --git a/rialto_airflow/harvest/doi_sunet.py b/rialto_airflow/harvest/doi_sunet.py
@@ -5,6 +5,8 @@
 # TODO: use polars instead?
 import pandas as pd
 
+from rialto_airflow.utils import normalize_doi
+
 
 def create_doi_sunet_pickle(
     dimensions: str, openalex: str, sul_pub_csv: str, authors_csv: str, output_path
@@ -42,6 +44,7 @@ def doi_sunetids(pickle_file: str, orcid_sunet: dict) -> dict:
 
     mapping = {}
     for doi, orcids in doi_orcids.items():
+        doi = normalize_doi(doi)
         mapping[doi] = [orcid_sunet[orcid] for orcid in orcids]
 
     return mapping
@@ -52,6 +55,7 @@ def sulpub_doi_sunetids(sul_pub_csv, cap_profile_sunet):
     # extracted from the authorship column
     df = pd.read_csv(sul_pub_csv, usecols=["doi", "authorship"])
     df = df[df["doi"].notna()]
+    df["doi"] = df["doi"].apply(lambda doi: normalize_doi(doi))
 
     def extract_cap_ids(authors):
         return [a["cap_profile_id"] for a in eval(authors) if a["status"] == "approved"]

diff --git a/rialto_airflow/harvest/merge_pubs.py b/rialto_airflow/harvest/merge_pubs.py
@@ -1,5 +1,7 @@
 import polars as pl
 
+from rialto_airflow.utils import normalize_doi
+
 
 def merge(sul_pub, openalex_pubs, dimensions_pubs, output):
     """
@@ -46,10 +48,10 @@ def dimensions_pubs_df(dimensions_pubs):
     # Polars is inferring volume is an integer, but it should be a string e.g. "97-B"
     df = pl.scan_csv(dimensions_pubs, schema_overrides={"volume": pl.String})
     df = df.select(
+        pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
         pl.col(
             "authors",
             "document_type",
-            "doi",
             "funders",
             "funding_section",
             "open_access",
@@ -59,7 +61,7 @@ def dimensions_pubs_df(dimensions_pubs):
             "title",
             "type",
             "year",
-        )
+        ),
     )
     df = df.rename(lambda column_name: "dim_" + column_name)
     return df
@@ -71,7 +73,7 @@ def openalex_pubs_df(openalex_pubs):
     """
     df = pl.scan_csv(openalex_pubs)
     df = df.select(
-        pl.col("doi").str.replace("https://doi.org/", ""),
+        pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String),
         pl.col(
             "apc_paid", "authorships", "grants", "publication_year", "title", "type"
         ),
@@ -86,6 +88,8 @@ def sulpub_df(sul_pub):
     """
     df = pl.scan_csv(sul_pub)
     df = df.drop_nulls("doi")
-    df = df.with_columns(pl.col("doi").str.replace("https://doi.org/", ""))
+    df = df.with_columns(
+        pl.col("doi").map_elements(normalize_doi, return_dtype=pl.String)
+    )
     df = df.rename(lambda column_name: "sul_pub_" + column_name)
     return df
diff --git a/rialto_airflow/utils.py b/rialto_airflow/utils.py
@@ -1,6 +1,7 @@
 import csv
 import datetime
 from pathlib import Path
+import re
 
 
 def create_snapshot_dir(data_dir):
@@ -54,3 +55,11 @@ def invert_dict(dict):
         inverted_dict[i] = [k for k, v in dict.items() if i in v]
 
     return inverted_dict
+
+
+def normalize_doi(doi):
+    doi = doi.strip().lower()
+    doi = doi.replace("https://doi.org/", "").replace("https://dx.doi.org/", "")
+    doi = re.sub("^doi: ", "", doi)
+
+    return doi
diff --git a/test/harvest/test_doi_sunet.py b/test/harvest/test_doi_sunet.py
@@ -57,8 +57,11 @@ def sul_pub_csv(tmp_path):
     with open(fixture_file, "w", newline="") as csvfile:
         writer = csv.writer(csvfile)
         writer.writerow(["authorship", "title", "doi"])
-        writer.writerow([authorship("cap-01"), "A Publication", "10.0000/aaaa"])
-        writer.writerow([authorship("cap-02"), "A Research Article", "10.0000/1234"])
+        # include DOIs that will be normalized
+        writer.writerow([authorship("cap-01"), "A Publication", "10.0000/aAaA"])
+        writer.writerow(
+            [authorship("cap-02"), "A Research Article", "https://doi.org/10.0000/1234"]
+        )
     return fixture_file
 
 

diff --git a/test/harvest/test_merge_pubs.py b/test/harvest/test_merge_pubs.py
@@ -34,7 +34,7 @@ def dimensions_pubs_csv(tmp_path):
                 "1",
                 "[]",
                 "ARTICLE",
-                "10.0000/aaaa",
+                "10.0000/aAaA",
                 "[]",
                 "[]",
                 "True",
@@ -130,7 +130,7 @@ def sul_pubs_csv(tmp_path):
                 "[]",
                 "A Published Research Article",
                 "2024",
-                "https://doi.org/10.0000/dddd",
+                "doi: 10.0000/dDdD",
             ]
         )
     return fixture_file

diff --git a/test/test_utils.py b/test/test_utils.py
@@ -57,3 +57,13 @@ def test_invert_dict():
         "pub_id7",
     ]
     assert inverted_dict["pub_id2"] == ["person_id1", "person_id2"]
+
+
+def test_normalize_doi():
+    assert utils.normalize_doi("https://doi.org/10.1234/5678") == "10.1234/5678"
+    assert utils.normalize_doi("https://dx.doi.org/10.1234/5678") == "10.1234/5678"
+    assert (
+        utils.normalize_doi("10.1103/PhysRevLett.96.07390")
+        == "10.1103/physrevlett.96.07390"
+    )
+    assert utils.normalize_doi(" doi: 10.1234/5678 ") == "10.1234/5678"