import re

# (2024-09-11, s-heppner)
# This pattern stems from the wonderful IRDI-Parser project:
# https://github.com/moritzsommer/irdi-parser
# Sadly, we had problems with Docker installing and finding the package, so we
# decided to eliminate the dependency.
#
# Compiled once at import time instead of on every call. `re.ASCII` restricts
# `\d` to the characters 0-9; without it `\d` also accepts non-ASCII Unicode
# digits.
_IRDI_PATTERN = re.compile(
    # International Code Designator (4 digits)
    r'(?P<icd>\d{4})-'
    # Organization Identifier (4 safe characters)
    r'(?P<org_id>[a-zA-Z0-9]{4})'
    # Optional Additional Information (4 safe characters)
    r'(-(?P<add_info>[a-zA-Z0-9]{4}))?'
    # Separator Character
    r'#'
    # Code Space Identifier (2 safe characters)
    r'(?P<csi>[a-zA-Z0-9]{2})-'
    # Item Code (6 safe characters)
    r'(?P<item_code>[a-zA-Z0-9]{6})'
    # Separator Character
    r'#'
    # Version Identifier (1 digit)
    r'(?P<version>\d)',
    re.ASCII,
)


def matches_irdi(s: str) -> bool:
    """
    Check whether `s` has the shape of an IRDI as described by the pattern above.

    The accepted form is fixed-width:
    `<4 digits>-<4 alnum>[-<4 alnum>]#<2 alnum>-<6 alnum>#<1 digit>`.
    The whole string has to match; leading or trailing characters (including a
    trailing newline) make the check fail.

    :param s: The string to check
    :return: `True`, if `s` matches the IRDI pattern, `False` otherwise
    """
    # `fullmatch` anchors both ends, so no `^`/`$` is needed in the pattern and
    # a trailing "\n" is not silently tolerated (which `$` would do).
    return _IRDI_PATTERN.fullmatch(s) is not None