Skip to content

Commit

Permalink
Remove irdi_parser dependency
Browse files Browse the repository at this point in the history
Previously, we used the [irdi_parser] project as
a dependency to parse IRDI identifiers.
However, we could not get a fully automated
build of the Docker image to run, as it failed
while installing this dependency for an unknown reason.

We therefore decided to remove the dependency, but
still reference the source of the regular
expression used to match IRDIs.

[irdi_parser](https://github.com/moritzsommer/irdi-parser)
  • Loading branch information
s-heppner committed Sep 11, 2024
1 parent d5c3d0f commit df88db9
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 5 deletions.
Binary file modified requirements.txt
Binary file not shown.
33 changes: 28 additions & 5 deletions semantic_id_resolver/resolver.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@
from typing import Optional, Dict
from urllib.parse import urlparse
import json
import re

import dns.resolver
from parser.irdi_parser import IRDIParser


class DebugSemanticMatchingServiceEndpoints:
Expand Down Expand Up @@ -35,16 +35,39 @@ def get_debug_endpoint(self, semantic_id: str) -> Optional[str]:
return self.debug_endpoints.get(semantic_id)


# (2024-09-11, s-heppner)
# This pattern stems from the wonderful IRDI-Parser project:
# https://github.com/moritzsommer/irdi-parser
# Sadly, we had problems with Docker installing and finding the package, so we decided to eliminate the dependency.
#
# The pattern is compiled once at import time and is meant to be used with `fullmatch`,
# which is why it carries no `^`/`$` anchors.
_IRDI_PATTERN = re.compile(
    # International Code Designator (4 digits)
    r'(?P<icd>\d{4})-'
    # Organization Identifier (4 safe characters)
    r'(?P<org_id>[a-zA-Z0-9]{4})'
    # Optional Additional Information (4 safe characters)
    r'(-(?P<add_info>[a-zA-Z0-9]{4}))?'
    # Separator Character
    r'#'
    # Code Space Identifier (2 safe characters)
    r'(?P<csi>[a-zA-Z0-9]{2})-'
    # Item Code (6 safe characters)
    r'(?P<item_code>[a-zA-Z0-9]{6})'
    # Separator Character
    r'#'
    # Version Identifier (1 digit)
    r'(?P<version>\d)',
    # Restrict `\d` to [0-9]; without this flag it also matches non-ASCII Unicode digits.
    re.ASCII,
)


def matches_irdi(s: str) -> bool:
    """
    Check whether the given string has the shape of an IRDI.

    The whole string must match the pattern: a 4-digit ICD, a 4-character organization identifier, an optional
    4-character additional information part, a 2-character code space identifier, a 6-character item code and
    a 1-digit version, e.g. `0112-ABCD#01-ABC123#1` or `0112-ABCD-EFGH#01-ABC123#1`.

    :param s: The string to check
    :return: `True`, if `s` is exactly one IRDI (no surrounding characters, no trailing newline), `False` otherwise
    """
    # `fullmatch` instead of `match` with `$`: `$` would also accept a trailing newline after the IRDI.
    return _IRDI_PATTERN.fullmatch(s) is not None


def is_iri_not_irdi(semantic_id: str) -> Optional[bool]:
"""
:return: `True`, if `semantic_id` is an IRI, False if it is an IRDI, None for neither
"""
# Check IRDI
try:
IRDIParser().parse(semantic_id)
if matches_irdi(semantic_id):
return False
except ValueError:
pass
# Check IRI
parsed_url = urlparse(semantic_id)
if parsed_url.scheme:
Expand Down

0 comments on commit df88db9

Please sign in to comment.