From 583af8bf6aeff96af9dc2fc2900ccd09e6487a0f Mon Sep 17 00:00:00 2001 From: Haider Iqbal Date: Fri, 25 Oct 2024 11:55:26 +0100 Subject: [PATCH] - Update curie formation logic (#767) --- .../annotators/ShortFormAnnotator.java | 24 +++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java index bdb9b8809..953a72657 100644 --- a/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java +++ b/dataload/rdf2json/src/main/java/uk/ac/ebi/rdf2json/annotators/ShortFormAnnotator.java @@ -1,6 +1,7 @@ package uk.ac.ebi.rdf2json.annotators; import java.util.Set; +import java.util.regex.Pattern; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -39,16 +40,35 @@ public static void annotateShortForms(OntologyGraph graph) { /* CURIEs are formed by following rules: + If there is only one underscore "_" AND the characters before the underscore are PreferredPrefix then replace the underscore with colon ":" If there is only one underscore "_" AND the characters after the underscore are numbers then replace the underscore with colon ":" If there is only one underscore "_" and the characters after the underscore are not just numbers then just keep the curie same as shortform If there are multiple underscore but has only digits after the last underscore then the code replaces the last underscore with a colon */ + String curie; + // Pattern for: single underscore, prefix matches preferredPrefix + String preferredPrefixPattern = "^(?:" + Pattern.quote(preferredPrefix) + ")_([^_]+)$"; + // Pattern for: single underscore, suffix is all digits + String singleUnderscoreDigitsPattern = "^[^_]+_(\\d+)$"; + // Pattern for: multiple underscores, suffix is all digits + String multipleUnderscoresDigitsPattern = "^(.*)_(\\d+)$"; + if (shortForm.matches(preferredPrefixPattern)) { + curie = shortForm.replaceFirst("_", ":"); + } else if (shortForm.matches(singleUnderscoreDigitsPattern)) { + curie = shortForm.replaceFirst("_", ":"); + } else if (shortForm.matches(multipleUnderscoresDigitsPattern)) { + // Multiple underscores, suffix is digits + // Replace the last underscore with a colon + curie = shortForm.replaceFirst("_(?=\\d+$)", ":"); + } else { + // No transformation needed + curie = shortForm; + } - String curie = shortForm.replaceFirst("_(\\d+)$", ":$1"); c.properties.addProperty("shortForm", PropertyValueLiteral.fromString(shortForm)); c.properties.addProperty("curie", PropertyValueLiteral.fromString(curie)); - } } + } long endTime3 = System.nanoTime(); logger.info("annotate short forms: {}", ((endTime3 - startTime3) / 1000 / 1000 / 1000)); }