Skip to content

Commit

Permalink
pragerdom/be-366: OpenAIRE authority provider (#199)
Browse files Browse the repository at this point in the history
* feat: schema file for awards

* feat: OpenAIRE Provider constructor

* feat: introduce OpenAIRE to tests and model

* feat: OpenAIRE Authority Provider implementation

* fix: incorrect (test) docker-compose.yml file

* refactor: remove unused import, return vocab item faster

* refactor: logger, token caching, config from current_app

* refactor: access app for keys in tests from context

* refactor: remove some try-except blocks

* refactor: make relations (organizations) get more readable

* fix: prevent None value access

* refactor: unused import, change program finding method to recursive call

* temporary dependency fix

* refactor: more consistent relations fetch, unite tests with ORCID

* format: reformat OpenAIRE provider

* refactor: more readable NoneType checking

* fix: ROR provider code update from main

* Version bump

* Ignoring warnings

* Splitting tests

* Fixing test

---------

Co-authored-by: Ronald Krist <[email protected]>
Co-authored-by: Mirek Simek <[email protected]>
  • Loading branch information
3 people authored Mar 4, 2025
1 parent 0906bef commit bfeab92
Show file tree
Hide file tree
Showing 16 changed files with 536 additions and 88 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ env:
OAREPO_VERSION: ${{ inputs.oarepo }}
INVENIO_ORCID_CLIENT_ID: ${{secrets.INVENIO_ORCID_CLIENT_ID}}
INVENIO_ORCID_CLIENT_SECRET: ${{secrets.INVENIO_ORCID_CLIENT_SECRET}}
INVENIO_OPENAIRE_CLIENT_ID: ${{secrets.INVENIO_OPENAIRE_CLIENT_ID}}
INVENIO_OPENAIRE_CLIENT_SECRET: ${{secrets.INVENIO_OPENAIRE_CLIENT_SECRET}}

jobs:
build:
Expand Down
38 changes: 9 additions & 29 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,17 +1,15 @@
version: '2.2'
name: test_services
services:
search:
image: bitnami/opensearch:2
restart: 'unless-stopped'
image: opensearchproject/opensearch:latest
restart: "unless-stopped"
environment:
# settings only for development. DO NOT use in production!
- bootstrap.memory_lock=true
- 'OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m'
- 'DISABLE_INSTALL_DEMO_CONFIG=true'
- 'DISABLE_SECURITY_PLUGIN=true'
- 'discovery.type=single-node'
- 'OPENSEARCH_PLUGINS=analysis-icu'
- "OPENSEARCH_JAVA_OPTS=-Xms512m -Xmx512m"
- "DISABLE_INSTALL_DEMO_CONFIG=true"
- "DISABLE_SECURITY_PLUGIN=true"
- "discovery.type=single-node"
ulimits:
memlock:
soft: -1
Expand All @@ -20,30 +18,12 @@ services:
soft: 65536
hard: 65536
mem_limit: 2g
expose:
- 9200
- 9600
ports:
- '127.0.0.1:9200:9200'
- "127.0.0.1:9200:9200"
- "127.0.0.1:9600:9600"
cache:
image: redis:7
restart: "unless-stopped"
read_only: true
ports:
- '127.0.0.1:6379:6379'
s3:
image: minio/minio:latest
restart: "unless-stopped"
environment:
MINIO_ROOT_USER: "tests"
MINIO_ROOT_PASSWORD: "teststests"
command: server /data --console-address :9001
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:9000/minio/health/live"]
interval: 30s
timeout: 20s
retries: 3
ports:
- "127.0.0.1:19000:9000"
- "127.0.0.1:19001:9001"

- "127.0.0.1:6379:6379"
4 changes: 2 additions & 2 deletions oarepo_vocabularies/authorities/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
from .providers import AuthorityProvider, RORProviderV2, ORCIDProvider
from .providers import AuthorityProvider, RORProviderV2, ORCIDProvider, OpenAIREProvider

__all__ = ("AuthorityProvider", "RORProviderV2", "ORCIDProvider")
__all__ = ("AuthorityProvider", "RORProviderV2", "ORCIDProvider", "OpenAIREProvider")
4 changes: 3 additions & 1 deletion oarepo_vocabularies/authorities/providers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
from .base import AuthorityProvider
from .ror_provider import RORProviderV2
from .orcid_provider import ORCIDProvider
from .openaire_provider import OpenAIREProvider

__all__ = (
"AuthorityProvider",
"RORProviderV2",
"ORCIDProvider"
"ORCIDProvider",
"OpenAIREProvider"
)
264 changes: 264 additions & 0 deletions oarepo_vocabularies/authorities/providers/openaire_provider.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
import base64
import logging
from flask import current_app
import requests

from oarepo_vocabularies.authorities.providers.base import AuthorityProvider


# Module-level logger shared by the OpenAIRE client and provider below.
logger = logging.getLogger("oarepo-vocabularies.providers.openaire")


class OpenAIREClient(object):
    """Thin HTTP client for the OpenAIRE token endpoint and project search API.

    The client authenticates with the client-credentials grant and queries
    the ``search/projects`` endpoint in JSON format.
    """

    TOKEN_URL = "https://aai.openaire.eu/oidc/token"
    PROJECTS_URL = "https://api.openaire.eu/search/projects"

    def __init__(
        self, client_id, client_secret, url=None, testing=False, timeout=None, **kwargs
    ):
        """Store the credentials and request settings.

        :param client_id: OpenAIRE client id used for HTTP Basic authentication.
        :param client_secret: OpenAIRE client secret.
        :param url: accepted for signature compatibility; currently unused.
        :param testing: stored on the instance; not otherwise used by this class.
        :param timeout: timeout handed to every ``requests`` call; a falsy
            value selects the default of 10000.
        :param kwargs: accepted and ignored.
        """
        self.client_id = client_id
        self.client_secret = client_secret
        self.testing = testing
        # NOTE(review): ``requests`` interprets the timeout in seconds, so the
        # default of 10000 looks like it was meant as milliseconds - confirm.
        self.timeout = timeout or 10000

    def _get_token(self):
        """Request an access token using the client-credentials grant.

        :return: the ``access_token`` value of the token response, or ``None``
            when the request fails (the error is logged, not raised).
        """
        credentials = f"{self.client_id}:{self.client_secret}"
        encoded_credentials = base64.b64encode(credentials.encode("utf-8")).decode(
            "utf-8"
        )
        headers = {"Authorization": f"Basic {encoded_credentials}"}
        data = {"grant_type": "client_credentials"}

        # Deliberately best-effort: callers treat a missing token as
        # "no results", so failures are logged and reported as None.
        try:
            response = requests.post(
                self.TOKEN_URL, headers=headers, data=data, timeout=self.timeout
            )
            response.raise_for_status()
            return response.json().get("access_token")
        except requests.exceptions.HTTPError as http_err:
            logger.error("HTTP error occurred: %s", http_err)
        except Exception as err:
            logger.error("Other error occurred: %s", err)
        return None

    def quick_search(self, access_token, search_query="", page=1, page_size=20):
        """Search OpenAIRE projects by name.

        :param access_token: bearer token; when falsy no request is made.
        :param search_query: project name to search for; when falsy no
            request is made.
        :param page: page number sent as the ``page`` query parameter.
        :param page_size: page size sent as the ``size`` query parameter.
        :return: the decoded JSON response, or ``{}`` when the token or the
            query is missing.
        :raises requests.exceptions.HTTPError: on a non-success HTTP status.
        """
        if not access_token or not search_query:
            return {}

        headers = {"Authorization": f"Bearer {access_token.strip()}"}
        params = {
            "format": "json",
            "name": search_query,
            "page": page,
            "size": page_size,
        }

        response = requests.get(
            self.PROJECTS_URL, headers=headers, params=params, timeout=self.timeout
        )
        if response.status_code != 200:
            logger.error("Error response: %s", response.status_code)
            logger.error("Response content: %s", response.text)
        response.raise_for_status()
        return response.json()

    def get_record(self, item_id, access_token):
        """Fetch the search response for a single OpenAIRE project id.

        :param item_id: value sent as the ``openaireProjectID`` query parameter.
        :param access_token: bearer token; when falsy no request is made.
        :return: the decoded JSON response, or ``None`` when no token is
            available (``_get_token`` returns ``None`` on failure).
        :raises requests.exceptions.HTTPError: on a non-success HTTP status.
        """
        if not access_token:
            return None

        headers = {"Authorization": f"Bearer {access_token.strip()}"}
        # Passing the id through ``params`` lets requests URL-encode it
        # instead of splicing raw caller input into the query string.
        params = {"openaireProjectID": item_id, "format": "json"}

        response = requests.get(
            self.PROJECTS_URL, headers=headers, params=params, timeout=self.timeout
        )
        response.raise_for_status()
        return response.json()


class OpenAIREProvider(AuthorityProvider):
    """Authority provider that exposes OpenAIRE projects as award items.

    Search and lookup results are converted by :meth:`to_vocabulary_item`
    into dictionaries following ``local://awards/award-v1.0.0.json``.
    """

    # Class-level default; ``get_access_token`` stores the fetched token on
    # the instance the first time it is needed.
    _cached_token = None

    # Sentinel so ``dict_get`` can tell "no default given" from ``default=None``.
    _MISSING = object()

    def __init__(self, url=None, testing=False, **kwargs):
        """Create the underlying client from the Flask application config.

        :param url: forwarded to :class:`OpenAIREClient`.
        :param testing: forwarded to :class:`OpenAIREClient`.
        :param kwargs: forwarded to :class:`OpenAIREClient`.
        :raises KeyError: when ``OPENAIRE_CLIENT_ID`` or
            ``OPENAIRE_CLIENT_SECRET`` is missing from the app config.
        """
        self.openaire_client = OpenAIREClient(
            current_app.config["OPENAIRE_CLIENT_ID"],
            current_app.config["OPENAIRE_CLIENT_SECRET"],
            url,
            testing,
            **kwargs,
        )

    def get_access_token(self):
        """Return the access token, fetching it on first use.

        NOTE(review): once obtained, the token is never refreshed by this
        class - confirm whether token expiry needs to be handled.
        """
        if self._cached_token is None:
            self._cached_token = self.openaire_client._get_token()
        return self._cached_token

    def search(self, identity, params, **kwargs):
        """Search OpenAIRE projects.

        :param identity: not used by this method.
        :param params: mapping with optional ``q``, ``page`` and ``page_size``.
        :return: tuple ``(items, total)`` where ``items`` is a list of
            vocabulary items and ``total`` comes from the response header
            (``0`` when absent).
        """
        params = params or {}
        access_token = self.get_access_token()

        response = self.openaire_client.quick_search(
            access_token=access_token,
            search_query=params.get("q", ""),
            page=params.get("page", 1),
            page_size=params.get("page_size", 20),
        )

        body = OpenAIREProvider.dict_get(response, "response", default=None)
        if not body:
            return [], 0

        items = [
            self.to_vocabulary_item(openaire_item)
            for openaire_item in OpenAIREProvider._extract_results(response)
            if isinstance(openaire_item, dict)
        ]
        total = OpenAIREProvider.dict_get(body, "header", "total", "$", default=0)

        return items, total

    def get(self, identity, item_id, **kwargs):
        """Return the vocabulary item for a single OpenAIRE project.

        :param identity: not used by this method.
        :param item_id: OpenAIRE project id passed to the client.
        :raises KeyError: when the lookup yields no result.
        """
        access_token = self.get_access_token()
        record = self.openaire_client.get_record(item_id, access_token)

        # The lookup returns the same envelope as a search, so the project is
        # the first entry of ``response.results.result`` - the envelope itself
        # has no ``metadata`` and would convert to an empty item.
        results = [
            item
            for item in OpenAIREProvider._extract_results(record)
            if isinstance(item, dict)
        ]
        if not results:
            raise KeyError(f"OpenAIRE record {item_id} not found.")

        return self.to_vocabulary_item(results[0])

    @staticmethod
    def dict_get(d, *args, default=_MISSING):
        """Walk ``args`` as a key path through nested dictionaries.

        :param d: object to start from.
        :param args: keys to follow, outermost first.
        :param default: value returned when the path cannot be followed;
            when omitted, a new empty dict is returned.
        :return: the value at the end of the path, or the default.
        """
        for path in args:
            if not isinstance(d, dict) or path not in d:
                # A fresh dict per call, so callers cannot share state.
                return {} if default is OpenAIREProvider._MISSING else default
            d = d[path]
        return d

    @staticmethod
    def _as_list(value):
        """Normalise ``None`` / single value / list into a list."""
        if value is None:
            return []
        if isinstance(value, list):
            return value
        return [value]

    @staticmethod
    def _text(node, *path):
        """Return the ``$`` value found under ``path`` in ``node``, or ``""``."""
        value = OpenAIREProvider.dict_get(node, *path, "$", default=None)
        return "" if value is None else value

    @staticmethod
    def _extract_results(payload):
        """Return ``payload["response"]["results"]["result"]`` as a list.

        Missing or null levels yield an empty list; a single result that is
        not wrapped in a list is wrapped.
        """
        found = OpenAIREProvider.dict_get(
            payload, "response", "results", "result", default=None
        )
        return OpenAIREProvider._as_list(found)

    @staticmethod
    def _first_funding_tree(funding_tree):
        """Return the funding tree as a dict (the first entry of a list)."""
        if isinstance(funding_tree, list):
            funding_tree = funding_tree[0] if funding_tree else None
        return funding_tree if isinstance(funding_tree, dict) else {}

    @staticmethod
    def get_program_from_funding(funding_tree):
        """Explicitly search for the first program in the funding tree.

        :param funding_tree: a funding tree dict or a list of them (only the
            first entry of a list is inspected).
        :return: the first program found among the tree's values, or
            ``"N/A"`` when there is none.
        """
        tree = OpenAIREProvider._first_funding_tree(funding_tree)

        for value in tree.values():
            program = OpenAIREProvider._extract_program(value)
            if program:
                return program

        return "N/A"

    @staticmethod
    def _extract_program(value):
        """Helper function to extract program from a value.

        Prefers the program found by recursing into ``value["parent"]`` and
        falls back to the ``$`` of ``value["class"]``.

        :return: the program string, or ``None`` when ``value`` carries none,
            so that the caller keeps looking at the remaining values.
        """
        if not isinstance(value, dict):
            return None

        parent = value.get("parent")
        if parent:
            program = OpenAIREProvider._extract_program(parent)
            if program:
                # The recursive call already returns the final string.
                return program

        return OpenAIREProvider.dict_get(value, "class", "$", default=None) or None

    @staticmethod
    def to_vocabulary_item(record):
        """Convert one OpenAIRE result record into an award vocabulary item.

        :param record: dict with ``header`` and ``metadata`` keys, the latter
            holding ``oaf:entity`` / ``oaf:project``.
        :return: dict with ``$schema``, ``tags``, ``identifiers``, ``number``,
            ``title``, ``funder``, ``acronym``, ``program``, ``subjects`` and
            ``organizations``. Missing or null source values become empty
            strings or lists.
        """
        dict_get = OpenAIREProvider.dict_get
        text = OpenAIREProvider._text
        as_list = OpenAIREProvider._as_list

        header = dict_get(record, "header", default=None)
        project = dict_get(record, "metadata", "oaf:entity", "oaf:project")

        # Tags (keywords): comma separated, either plain or wrapped in "$"
        keywords = dict_get(project, "keywords", default=None)
        if isinstance(keywords, dict):
            keywords = keywords.get("$")
        tags = (
            [tag.strip() for tag in keywords.split(",") if tag.strip()]
            if isinstance(keywords, str)
            else []
        )

        # Identifiers
        identifiers = [
            {
                "identifier": text(header, "dri:objIdentifier"),
                "scheme": "dri:objIdentifier",
            },
            {
                "identifier": text(project, "originalId"),
                "scheme": "openaire:originalId",
            },
        ]

        # Number (code), title (with locale) and acronym
        number = text(project, "code")
        locale = text(header, "locale") or "en"
        title = {str(locale)[:2]: text(project, "title")}
        acronym = text(project, "acronym")

        # Funder and according program; a list-shaped funding tree is reduced
        # to its first entry for both lookups.
        funding = dict_get(project, "fundingtree", default=None)
        first_tree = OpenAIREProvider._first_funding_tree(funding)
        funder = {
            "id": text(first_tree, "funder", "id"),
            "name": text(first_tree, "funder", "name"),
        }
        program = OpenAIREProvider.get_program_from_funding(funding)

        # Subjects
        subjects = [
            {"id": subject.get("@classid", ""), "subject": subject.get("$", "")}
            for subject in as_list(dict_get(project, "subject", default=None))
            if subject and isinstance(subject, dict)
        ]

        # Organizations (relations); a single relation arrives unwrapped
        organizations = []
        for relation in as_list(dict_get(project, "rels", "rel", default=None)):
            if not isinstance(relation, dict):
                continue
            relation_to = relation.get("to")
            if not isinstance(relation_to, dict):
                relation_to = {}
            organizations.append(
                {
                    "scheme": relation_to.get("@scheme", ""),
                    "id": relation_to.get("$", ""),
                    "organization": text(relation, "legalname"),
                }
            )

        return {
            "$schema": "local://awards/award-v1.0.0.json",
            "tags": tags,
            "identifiers": identifiers,
            "number": number,
            "title": title,
            "funder": funder,
            "acronym": acronym,
            "program": program,
            "subjects": subjects,
            "organizations": organizations,
        }
13 changes: 1 addition & 12 deletions oarepo_vocabularies/authorities/providers/orcid_provider.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,12 @@
import logging

from flask import current_app
import os
import idutils
from oarepo_vocabularies.authorities.providers import AuthorityProvider

from orcid import PublicAPI as PublicAPI




logger = logging.getLogger("oarepo-vocabularies.providers.orcid")

class ORCIDClient(PublicAPI):
Expand All @@ -35,15 +32,7 @@ def get_record(self, access_token, orcid_id):

class ORCIDProvider(AuthorityProvider):
def __init__(self, url=None, testing=False, **kwargs):
try:
client_id = current_app.config["ORCID_CLIENT_ID"]
client_secret = current_app.config["ORCID_CLIENT_SECRET"]
except RuntimeError:
client_id = os.environ["INVENIO_ORCID_CLIENT_ID"]
client_secret = os.environ["INVENIO_ORCID_CLIENT_SECRET"]
except KeyError:
raise KeyError("ORCID_CLIENT_ID and ORCID_CLIENT_SECRET must be set in the configuration or as environment variables.")
self.orcid_client = ORCIDClient(client_id, client_secret, testing, **kwargs)
self.orcid_client = ORCIDClient(current_app.config["ORCID_CLIENT_ID"], current_app.config["ORCID_CLIENT_SECRET"], testing, **kwargs)


def search(self, identity, params, **kwargs):
Expand Down
Loading

0 comments on commit bfeab92

Please sign in to comment.