diff --git a/OntologyValidation/calculate_metrics_for_ttl.py b/OntologyValidation/calculate_metrics_for_ttl.py new file mode 100644 index 0000000..9d64c6d --- /dev/null +++ b/OntologyValidation/calculate_metrics_for_ttl.py @@ -0,0 +1,233 @@ +from src.metrics_calc import compute_metrics +from src.kg_rep import ClassRep, PropertyRep +from rdflib import Graph, BNode +import argparse +import json +import numpy as np + +class_query = """ +SELECT DISTINCT ?a +WHERE { + ?a a owl:Class . + FILTER NOT EXISTS { + ?a a owl:Restriction . + } +}""" +class_superclass_query = """ +SELECT DISTINCT ?b +WHERE { + ?name rdfs:subClassOf ?b . + FILTER NOT EXISTS { + ?b a owl:Restriction . + } +} +""" +class_comment_query = """ +SELECT DISTINCT ?b +WHERE { + ?name rdfs:comment ?b +} +""" +class_label_query = """ +SELECT DISTINCT ?b +WHERE { + ?name skos:prefLabel ?b +} +""" + +objprop_query = """ +SELECT DISTINCT ?a +WHERE { + ?a a owl:ObjectProperty . +}""" + +dprop_query = """ +SELECT DISTINCT ?a +WHERE { + ?a a owl:DatatypeProperty . +}""" + +prop_domain_query = """ +SELECT DISTINCT ?b +WHERE {{ + ?name a {proptype} . + ?name rdfs:domain ?b +}} +""" +prop_domain_union_query = """ +SELECT DISTINCT ?c +WHERE {{ + ?name a {proptype} . + ?name rdfs:domain/(owl:unionOf/rdf:rest*/rdf:first)* ?c . +}} +""" +prop_comment_query = """ +SELECT DISTINCT ?b +WHERE {{ + ?name a {proptype} . + ?name rdfs:comment ?b +}} +""" +prop_range_query = """ +SELECT DISTINCT ?b +WHERE {{ + ?name a {proptype} . + ?name rdfs:range ?b +}} + +""" +prop_range_union_query = """ +SELECT DISTINCT ?c +WHERE {{ + ?name a {proptype} . + ?name rdfs:range/(owl:unionOf/rdf:rest*/rdf:first)* ?c . +}} +""" +prop_pattern_query = """ +SELECT DISTINCT ?b +WHERE {{ + ?name a {proptype} . + ?name rdfs:pattern ?b +}} +""" + + +def get_class_reps_from_graph(g): + class_dict = {} + qres = g.query(class_query) + for row in qres: + if not isinstance(row.a, BNode): + name = str(row.a) + superclass_list = [ + str(row.b) + for row in g.query(class_superclass_query, initBindings={"name": row.a}) + ] + superclass_list = [s for s in superclass_list if (s != name)] + comment_list = [ + str(row.b) + for row in g.query(class_comment_query, initBindings={"name": row.a}) + ] + + label = "; ".join( + [ + str(row.b) + for row in g.query(class_label_query, initBindings={"name": row.a}) + ] + ) + + c = ClassRep( + name=name, + superclass_list=superclass_list, + comments=comment_list, + pref_label=label, + process_name_flag=False, + ) + class_dict[name] = c + return class_dict + + +def get_prop_reps_from_graph(g): + obj_qres = g.query(objprop_query) + + data_qres = g.query(dprop_query) + prop_type_dict = {obj.a: "owl:ObjectProperty" for obj in obj_qres} + prop_type_dict.update({data.a: "owl:DatatypeProperty" for data in data_qres}) + + prop_dict = {} + for name, type in prop_type_dict.items(): + domain_list = [ + row.b + for row in g.query( + prop_domain_query.format(proptype=type), initBindings={"name": name} + ) + ] + domain_union_list = [ + row.c + for row in g.query( + prop_domain_union_query.format(proptype=type), + initBindings={"name": name}, + ) + ] + if len(domain_union_list) > 0: + domain_list = [ + domain for domain in domain_union_list if not isinstance(domain, BNode) + ] + + range_list = [ + row.b + for row in g.query( + prop_range_query.format(proptype=type), initBindings={"name": name} + ) + ] + range_union_list = [ + row.c + for row in g.query( + prop_range_union_query.format(proptype=type), + initBindings={"name": name}, + ) + ] + if len(range_union_list) > 
0: + range_list = [ + range for range in range_union_list if not isinstance(range, BNode) + ] + + comment_list = [ + row.b + for row in g.query( + prop_comment_query.format(proptype=type), initBindings={"name": name} + ) + ] + patterns = [ + row.b + for row in g.query( + prop_pattern_query.format(proptype=type), initBindings={"name": name} + ) + ] + p = PropertyRep( + name=str(name), + prop_type=type, + domain_name_list=[str(d) for d in domain_list], + range_name="", + range_list=[str(r) for r in range_list], + comments=comment_list, + pattern="" if len(patterns) == 0 else patterns[0], + process_name_flag=False, + ) + prop_dict[str(name)] = p + + return prop_dict + + +def convert_ttl_to_kg_rep(ttl_path): + g = Graph() + g.parse(ttl_path) + class_dict = get_class_reps_from_graph(g) + prop_dict = get_prop_reps_from_graph(g) + return compute_metrics(class_dict, prop_dict) + + +def get_args(): + parser = argparse.ArgumentParser() + parser.add_argument( + "-p", + "--ttl_path", + type=str, + help="Path to a .ttl ontology, or other RDFLib-supported ontology file", + ) + parser.add_argument( + "-o", + "--output_path", + default="metrics_out.json", + type=str, + help="Path for the output json file containing metrics ", + ) + args = parser.parse_args() + return args + + +if __name__ == "__main__": + args = get_args() + metrics = convert_ttl_to_kg_rep(args.ttl_path) + print("\nMetrics:\n", metrics) + with open(args.output_path, "w") as f: + json.dump(metrics, f, default=int) diff --git a/OntologyValidation/src/__init__.py b/OntologyValidation/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/OntologyValidation/src/kg_rep.py b/OntologyValidation/src/kg_rep.py new file mode 100644 index 0000000..396f586 --- /dev/null +++ b/OntologyValidation/src/kg_rep.py @@ -0,0 +1,417 @@ +import regex as re +from enum import Enum +from .str_utils import * +import numpy as np + + +# Enumerating the OWL objects used in this ontology +class PropType(Enum): + Class = "owl:Class" + Object = "owl:ObjectProperty" + Datatype = "owl:DatatypeProperty" + + +# Dictionary to reference the literal types used in this ontology +literals_dict = { + "string": "xsd:string", + "integer": "xsd:integer", + "number": "xsd:decimal", + "object": "owl:Thing", + "boolean": "xsd:boolean", +} + + +def process_range(name: str) -> str: + if name in literals_dict: + return name + else: + return process_name(extract_classname_from_filename(name)) + + +class PropertyRep: + def __init__( + self, + name: str, + domain_name_list: list, + range_name: str, + range_list: list = [], + comments: list = [], + prop_type: str = PropType.Datatype, + pattern: str = "", + process_name_flag: bool = True, + ): + """Defines a representation of an OSDU property, + identified using the name field. + Associates one or multiple domains with the property, + and only one possible literal or Object range. + Type of property identified via the PropType enum, and + a possible regex pattern can be associated with the property value. + + Args: + name (str): Name for the class within the OSDU ontology. + domain_name_list (list): list of names describing OSDU classes + that may be described by this property + range_name (str): name of OSDU class, or literal type, that this Property will take on + comments (list, optional): List of string comments to associate with this property. + Defaults to []. + prop_type (str, optional): identify whether the Property is an Object, or Datatype. + Defaults to PropType.Datatype. 
+ pattern (str, optional): regex pattern associated with this property. Defaults to ''. + """ + if process_name_flag: + self.name = process_prop_name(name) + self.domain = [ + process_name(domain_name) for domain_name in domain_name_list + ] + if range_list: + self.range = [process_range(n) for n in range_list] + else: + self.range = [process_range(range_name)] + else: + self.name = name + self.domain = domain_name_list + if range_list: + self.range = range_list + else: + self.range = [range_name] + + self.comments = process_comments(comments) + self.type = prop_type + if np.any([f"#{key}" in self.range[0] for key in literals_dict.keys()]): + self.type = PropType.Datatype + newr = [] + for r in self.range: + added = False + for key in literals_dict.keys(): + if f"#{key}" in r: + newr.append(key) + added = True + if not added: + newr.append(r) + elif (self.range[0] not in literals_dict) or (self.range[0] == "object"): + self.type = PropType.Object + else: + self.type = PropType.Datatype + self.patterns = process_new_patterns(pattern) + self.sameas = [] + + def add_domain(self, domain_name: str) -> None: + """Process and add a domain if it is not already associated with the property. + Args: + domain_name (str): string identifier for an ODSU class domain + """ + processed_domain = process_name(domain_name) + if (domain_name != "") and (processed_domain not in self.domain): + self.domain.append(processed_domain) + + def change_range(self, range_name: str) -> None: + """Change the range for this property to a specified string + WARNING: given string will not be formatted or checked. + Any class name in OSDU used as the range must be formatted externally + + Args: + range_name (str): string identifying the new range of this property + """ + if len(self.range) == 1: + self.range = [range_name] + elif range_name not in self.range: + self.range.append(range_name) + + def add_comment(self, comment: str) -> None: + """Process and add a comment if it is not already associated with the property. + Args: + comment (str): string description of the property + """ + processed_comment = process_comment(comment) + if (comment != "") and (processed_comment not in self.comments): + self.comments.append(processed_comment) + + def add_sameas_link(self, identifier): + self.sameas.append(identifier) + + def add_pattern(self, pattern: str) -> None: + """Process and add a pattern if it is not already associated with the property. + Args: + pattern (str): regex pattern identifying the string format this property may take on + """ + processed_pattern = process_pattern(pattern) + if (pattern != "") and (processed_pattern not in self.patterns): + self.patterns.append(processed_pattern) + + def verify_match(self, range_name: str, prop_type: str, replace_range=True) -> None: + """Identify whether potential range and property types + are compatible with this property. If so, updates the range. + This includes updating literal range types to more specific versions, + and removing version numbers from the range name. + Intended property type must match the original exactly. + + Args: + range_name (str): name of OSDU class, or literal type, that this Property may take on + prop_type (str): identify whether the Property is an Object, or Datatype. + """ + + if self.type != prop_type: + raise Exception( + "Incorrect property type " + prop_type, ". 
Type should be " + self.type + ) + + new_range_name = process_range(range_name) + if new_range_name not in self.range: + if new_range_name not in literals_dict: + self.type = PropType.Object + if replace_range and (len(self.range) == 1): + self.range = [new_range_name] + else: + self.range.append(new_range_name) + + +class ClassRep: + def __init__( + self, + name: str, + superclass_list: list = [], + comments: list = [], + pref_label: str = "", + subclass_list: list = [], + process_name_flag: bool = True, + ): + """Defines a representation of an OSDU class node, + identified using the name field, + and having a list of inherited superclass pointing to this class. + Associates possible comments to the instance, and a possible alternate name for the class. + + Args: + name (str): Name for the class within the OSDU ontology. + superclass_list (list, optional): List of classes which this class should inherit. Defaults to []. + comments (list, optional): Comments about the nature of this property. Defaults to []. + pref_label (str, optional): Provides the option to specify an skos:prefLabel in the ontology + Defaults to '', does not generate the prefLabel given an empty string. + """ + self.name = process_name(name) + + # Option to specify an skos:prefLabel + self.pref_label = pref_label if (pref_label != self.name) else "" + + # Storage of ontology inheritance hierarchy using list of pointers to other classes + self.superclass_list = [] + self.add_superclasses(superclass_list, process_name_flag=process_name_flag) + + # Clean comments for compatibility with the TopBraid Ontology Composer + self.comments = process_comments(comments) + self.type = PropType.Class + + self.sameas = [] + + self.subclass_list = [] + self.add_subclasses(subclass_list, process_name_flag=process_name_flag) + + self.array_props = [] + + def add_comment(self, comment: str): + """Process and add a comment if it is not already associated with the class + Args: + comment (str): string description associated with the class + """ + processed_comment = process_comment(comment) + if (comment != "") and (processed_comment not in self.comments): + self.comments.append(processed_comment) + + def add_comments(self, comments: list): + """Process and add a list of comments if they are not already associated with the class + Args: + comments (list): list of string descriptions of the class + """ + if comments != []: + for comment in comments: + self.add_comment(comment) + + def add_sameas_link(self, identifier): + self.sameas.append(identifier) + + def add_superclass(self, superclass: str, process_name_flag=True) -> None: + """Add a class which is inherited by the current one, if not already listed + Args: + superclass (str): string name referencing an OSDU class + """ + if process_name_flag: + processed_superclass = process_name( + extract_classname_from_filename(superclass) + ) + else: + processed_superclass = superclass + + if ( + processed_superclass + in [ + "ReferenceData", + "MasterData", + "Dataset", + "WorkProductComponent", + "Abstract", + ] + ) and ("owl:Thing" in self.superclass_list): + self.superclass_list.remove("owl:Thing") + self.superclass_list.append(processed_superclass) + elif ( + (superclass == "owl:Thing") + and ("ReferenceData" not in self.superclass_list) + and ("MasterData" not in self.superclass_list) + and ("Dataset" not in self.superclass_list) + and ("WorkProductComponent" not in self.superclass_list) + and ("Abstract" not in self.superclass_list) + and ("owl:Thing" not in self.superclass_list) + ): + 
self.superclass_list.append(superclass) + elif ( + (superclass != "") + and (processed_superclass not in self.superclass_list) + and (self.name != processed_superclass) + ) and (superclass != "owl:Thing"): + if "owl:Thing" in self.superclass_list: + self.superclass_list.remove("owl:Thing") + self.superclass_list.append(processed_superclass) + + def add_superclasses(self, superclass_list: list, process_name_flag: bool = True): + """Add a list of classes which are inherited by the current one, if not already listed + Args: + superclass_list (list): string names referencing OSDU classes + """ + if superclass_list != []: + for superclass in superclass_list: + self.add_superclass(superclass, process_name_flag=process_name_flag) + + def add_subclass(self, subclass_name: str, process_name_flag=True) -> None: + if process_name_flag: + processed_subclass = process_name( + extract_classname_from_filename(subclass_name) + ) + else: + processed_subclass = subclass_name + if processed_subclass not in self.subclass_list: + self.subclass_list.append(processed_subclass) + + def add_subclasses(self, subclass_list: list, process_name_flag: bool = True): + """Add a list of classes which inherit the current one, if not already listed + Args: + subclass_list (list): string names referencing OSDU classes + """ + if subclass_list != []: + for subclass in subclass_list: + self.add_subclass(subclass, process_name_flag=process_name_flag) + + def add_array_property_restriction( + self, property_name: str, on_class: str, min_card: int = 1 + ): + array_prop = { + "prop_name": process_prop_name(property_name), + "min_card": min_card, + "on_class": process_name(on_class), + } + self.array_props.append(array_prop) + + +def add_property_from_parameters( + property_name: str, + domain_name: str, + range_name: str, + ontology_dict: dict, + property_type: str = PropType.Datatype, + comment: str = "", + pattern: str = "", + replace_range=True, + verbose: bool = False, +) -> dict: + """If a property has not yet been encountered, + creates a PropertyRep object based on externally-specified parameters. + If a property has been encountered, updates the previously specified PropertyRep object. + + Args: + property_name (str): Extracted name for a property within the OSDU ontology. + domain_name (str): Name of a class which this property can describe. + range_name (str): Name of a literal type or class which describes the data this property can take on. + ontology_dict (dict): Dictionary mapping explored OSDU property names to PropertyRep objects. + property_type (str, optional): Enum string for the type of property this should describe. + Defaults to PropType.Datatype. + comment (str, optional): Extracted comment about the nature of this property. Defaults to ''. + pattern (str, optional): Extracted regex pattern describing the format of this property. Defaults to ''. + verbose (bool, optional): Whether to report detailed errors. Defaults to False. + + Returns: + dict: Updated dictionary mapping explored OSDU property names to PropertyRep objects. 
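+
+    Example (class/property names below are illustrative only, not taken from the OSDU schemas):
+        ontology_dict = add_property_from_parameters(
+            property_name="WellboreName",
+            domain_name="Wellbore",
+            range_name="string",
+            ontology_dict={},
+            property_type=PropType.Datatype,
+            comment="Human-readable name of the wellbore.",
+        )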
+ """ + + if process_prop_name(property_name) in ontology_dict: + property_name = process_prop_name(property_name) + try: + ontology_dict[property_name].verify_match( + range_name, property_type, replace_range + ) + ontology_dict[property_name].add_domain(domain_name) + ontology_dict[property_name].add_comment(comment) + ontology_dict[property_name].add_pattern(pattern) + except Exception as e: + if verbose: + print(e) + print( + "\nTried to create property on top of a class:", + property_name, + domain_name, + ) + else: + ontology_dict[process_prop_name(property_name)] = PropertyRep( + name=property_name, + domain_name_list=[domain_name], + range_name=range_name, + comments=[comment], + prop_type=property_type, + pattern=pattern, + ) + return ontology_dict + + +def add_class_from_parameters( + class_name: str, + superclass_list: list, + ontology_dict: dict, + comments: list = [], + pref_label: str = "", + subclass_list: list = [], + verbose: bool = False, + process_name_flag: bool = True, +) -> dict: + """If a class has not yet been encountered, + creates a ClassRep object based on externally-specified parameters. + If a class has been encountered, updates the previously specified ClassRep object. + + Args: + class_name (str): Extracted name for a class within the OSDU ontology. + superclass_list (list): Extracted list of classes which this class should inherit. + ontology_dict (dict): Dictionary mapping explored OSDU class names to ClassRep objects. + comments (list, optional): Extracted comments about the nature of this property. Defaults to []. + pref_label (str, optional): Extracted alternate name for the Defaults to ''. + + Returns: + dict: Updated dictionary mapping explored OSDU class names to ClassRep objects. + """ + class_name_proc = process_name(class_name) if process_name_flag else class_name + if class_name_proc in ontology_dict: + try: + ontology_dict[class_name_proc].add_superclasses( + superclass_list, process_name_flag=process_name_flag + ) + ontology_dict[class_name_proc].add_subclasses( + subclass_list, process_name_flag=process_name_flag + ) + ontology_dict[class_name_proc].add_comments(comments) + except Exception: + if verbose: + print("Error in adding superclass or comment to:", class_name) + else: + ontology_dict[class_name_proc] = ClassRep( + class_name, + superclass_list, + comments, + pref_label=pref_label, + subclass_list=subclass_list, + ) + return ontology_dict diff --git a/OntologyValidation/src/metrics_calc.py b/OntologyValidation/src/metrics_calc.py new file mode 100644 index 0000000..a65da1c --- /dev/null +++ b/OntologyValidation/src/metrics_calc.py @@ -0,0 +1,242 @@ +import numpy as np +import networkx as nx +from .kg_rep import * + +import typing +from typing import Any, Dict, List, Optional, Tuple + + +def compute_metrics(class_ontology_dict: dict, prop_ontology_dict: dict) -> dict: + """Generates a dictionary of metrics + Keys: + "ADIT-LN" - the average length of a path from a root ontology node to a leaf node + "Average number of shortest paths" - the average number of shortest paths between nodes + "Number of classes" - the total number of classes/nodes in the ontology + "Number of inheritance relationships" - the total number of superclass edges in the ontology + "Number of property relationships" - the total number of property edges in the ontology + "Number of leaf classes" - number of classes that are not inherited by any other class + "Relationship richness" - non-inheritance edges out of all edges + "Inheritance richness" - number of 
inheritance relationships / number of classes
+        "Attribute richness" - number of non-inheritance properties / number of classes
+        "Average shortest path length" - the average length of a shortest path between connected nodes in the inheritance graph
+        "Diameter of inheritance graph" - the length of the shortest path between the two most distanced nodes in the inheritance graph
+        "Average number of shortest paths, full graph" - as "Average number of shortest paths", computed on the full ontology graph
+        "Average shortest path length, full graph" - as "Average shortest path length", computed on the full ontology graph
+        "Diameter of full graph" - the diameter of the full ontology graph (inheritance and property edges combined)
+
+    Args:
+        class_ontology_dict (dict): Complete dictionary mapping OSDU class names to ClassRep objects.
+        prop_ontology_dict (dict): Complete dictionary mapping explored OSDU property names to PropertyRep objects.
+
+    Returns:
+        metrics_dict: dictionary mapping metric key to calculated metric value for the OSDU ontology
+    """
+    # Construct ontology inheritance graph
+    inheritance_g = extract_inheritance_graph(
+        class_ontology_dict, prop_ontology_dict, False
+    )
+
+    # Construct ontology property (non-inheritance) graph
+    property_g = extract_property_graph(class_ontology_dict, prop_ontology_dict)
+
+    # Construct full graph
+    ont_graph = nx.compose(inheritance_g, property_g)
+
+    # Calculate ADIT-LN
+    adit_ln, nol = calc_adit_ln_metrics(
+        class_ontology_dict, prop_ontology_dict, inheritance_g
+    )
+
+    # Calculate Richness metrics
+    rel_richness = calc_relationship_richness(property_g, inheritance_g)
+    inheritance_richness = calc_inheritance_richness(inheritance_g)
+    attribute_richness = calc_attribute_richness(property_g)
+
+    # Calculate shortest paths metrics in both inheritance structure and full ontology graph
+    avg_num_shortest_paths, avg_shortest_path, diameter_inheritance_g = (
+        calc_shortest_path_metrics(inheritance_g)
+    )
+    avg_num_shortest_paths_full, avg_shortest_path_full, diameter_ont_g = (
+        calc_shortest_path_metrics(ont_graph)
+    )
+
+    return {
+        "Number of classes": calc_num_nodes(inheritance_g),
+        "Average number of shortest paths": avg_num_shortest_paths,
+        "Average shortest path length": avg_shortest_path,
+        "Diameter of inheritance graph": diameter_inheritance_g,
+        "Average number of shortest paths, full graph": avg_num_shortest_paths_full,
+        "Average shortest path length, full graph": avg_shortest_path_full,
+        "Diameter of full graph": diameter_ont_g,
+        "Number of inheritance relationships": calc_num_inheritance_edges(
+            inheritance_g
+        ),
+        "Number of property relationships": calc_num_noninheritance_edges(property_g),
+        "Number of leaf classes": nol,
+        "ADIT-LN": adit_ln,
+        "Relationship richness": rel_richness,
+        "Inheritance richness": inheritance_richness,
+        "Attribute richness": attribute_richness,
+    }
+
+
+def extract_inheritance_graph(
+    class_ontology_dict: dict, prop_ontology_dict: dict, extract_classname: bool = True
+) -> nx.MultiDiGraph:
+    # Build a directed graph - TODO: verify that it is acyclic?
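+    # Possible approach for the TODO above: once the graph is assembled, a call to
+    # nx.is_directed_acyclic_graph(inheritance_g) can verify acyclicity before the
+    # path-based metrics run; calc_adit_ln_metrics uses nx.all_simple_paths over the
+    # inheritance hierarchy and assumes it contains no inheritance cycles.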
+ inheritance_g = nx.MultiDiGraph() + + inheritance_g.add_nodes_from(list(class_ontology_dict.keys())) + + for class_key, class_rep in class_ontology_dict.items(): + for superclass_key in class_rep.superclass_list: + if extract_classname: + superclass_name = extract_classname_from_filename(superclass_key) + else: + superclass_name = superclass_key + inheritance_g.add_edge(superclass_name, class_key) + + return inheritance_g + + +def extract_property_graph( + class_ontology_dict: dict, prop_ontology_dict: dict +) -> nx.MultiDiGraph: + property_g = nx.MultiDiGraph() + + property_g.add_nodes_from(list(class_ontology_dict.keys())) + + # For each property, add an edge between each of the domain classes and the range class + for prop_key, prop_rep in prop_ontology_dict.items(): + for range_name in prop_rep.range: + if range_name not in property_g.nodes: + property_g.add_node(range_name) + e_bunch = zip( + prop_rep.domain, + [range_name] * len(prop_rep.domain), + [prop_rep.name] * len(prop_rep.domain), + ) + property_g.add_edges_from(e_bunch) + + return property_g + + +def calc_adit_ln_metrics( + class_ontology_dict: dict, prop_ontology_dict: dict, inheritance_g: nx.MultiDiGraph +) -> tuple[float, int]: + """Calculate the average length of a path from a root ontology node to a leaf node (ADIT-LN), + and the number of classes that are not inherited by any other class (number of leaf classes, NOL). + Uses the inheritance graph of an ontology. + Args: + inheritance_g (nx.MultiDiGraph): Inheritance graph of the ontology, with edges pointing from superclass source to subclass target + Returns: + float: value for ADIT-LN + int: value for NOL + """ + leaf_class_list = list(class_ontology_dict.keys()) + root_class_list = [] + + for class_key, class_rep in class_ontology_dict.items(): + if ( + ( + (len(class_rep.superclass_list) == 1) + and class_rep.superclass_list[0] == "owl:Thing" + ) + or ( + (len(class_rep.superclass_list) == 1) + and class_rep.superclass_list[0] + == "http://www.w3.org/2002/07/owl#Thing" + ) + or (len(class_rep.superclass_list) == 0) + ): + root_class_list.append(class_key) + else: + for superclass_key in class_rep.superclass_list: + superclass_name = superclass_key + + if (superclass_name not in literals_dict) and ( + superclass_name in leaf_class_list + ): + leaf_class_list.remove(superclass_name) + + noc = len(root_class_list) + nol = len(leaf_class_list) + + num_paths_matrix = np.zeros((noc, nol)) + 1e-10 + path_lengths_matrix = np.zeros((noc, nol)) + + for i, root_class in enumerate(root_class_list): + for j, leaf_class in enumerate(leaf_class_list): + if nx.has_path(inheritance_g, source=root_class, target=leaf_class): + + paths_gen = nx.all_simple_paths( + inheritance_g, source=root_class, target=leaf_class + ) + comp_paths = list(paths_gen) + paths = [path for path in comp_paths if len(path) != 1] + + num_paths = len(paths) + num_paths_matrix[i, j] = num_paths if num_paths != 0 else 1e-10 + + path_lengths = [len(path) for path in paths] if paths != [] else [0] + path_lengths_matrix[i, j] = sum(path_lengths) + num_paths = np.sum(num_paths_matrix) + adit_ln = np.sum(path_lengths_matrix) / (num_paths + 1e-8) + return adit_ln, nol + + +def calc_shortest_path_metrics(ont_graph: nx.MultiDiGraph) -> tuple[float, float, int]: + """Calculates shortest-paths-based topology metrics for a given graph. 
+ Includes + Average number of shortest paths in the graph (avg_num_shortest_paths), + Average length of a given shortest path in the graph (avg_shortest_path), + Diameter of the graph, or shortest path length between the most distanced nodes in the graph (diameter) + Args: + ont_graph (nx.MultiDiGraph): + Returns: + float: value for avg_num_shortest_paths + float: value for avg_shortest_path + int: value for diameter + """ + s_paths = dict(nx.shortest_path(ont_graph)) + num_s_paths = sum([len(list(s_paths[key].values())) for key in s_paths.keys()]) + + shortest_path_lengths = [ + [len(list(path)) for path in s_paths[key].values()] for key in s_paths.keys() + ] + diameter = np.max([np.max(path_list) for path_list in shortest_path_lengths]) + + return ( + num_s_paths / calc_num_nodes(ont_graph), + np.sum([np.sum(path) for path in shortest_path_lengths]) / num_s_paths, + diameter, + ) + + +def calc_num_nodes(ont_graph: nx.MultiDiGraph) -> float: + return ont_graph.number_of_nodes() + + +def calc_num_inheritance_edges(ont_graph: nx.MultiDiGraph) -> float: + return ont_graph.number_of_edges() + + +def calc_num_noninheritance_edges(prop_graph: nx.MultiDiGraph) -> float: + return prop_graph.number_of_edges() + + +def calc_relationship_richness( + prop_graph: nx.MultiDiGraph, ont_graph: nx.MultiDiGraph +) -> float: + p = calc_num_noninheritance_edges(prop_graph) + h = calc_num_inheritance_edges(ont_graph) + return p / (h + p + 1e-10) + + +def calc_inheritance_richness(ont_graph: nx.MultiDiGraph) -> float: + return calc_num_inheritance_edges(ont_graph) / calc_num_nodes(ont_graph) + + +def calc_attribute_richness(prop_graph: nx.MultiDiGraph) -> float: + return calc_num_noninheritance_edges(prop_graph) / calc_num_nodes(prop_graph) diff --git a/OntologyValidation/src/str_utils.py b/OntologyValidation/src/str_utils.py new file mode 100644 index 0000000..bc522a4 --- /dev/null +++ b/OntologyValidation/src/str_utils.py @@ -0,0 +1,155 @@ +import regex as re +from nltk.tokenize import word_tokenize + + +def strip_whitespace(name: str) -> str: + """Remove any instance of whitespace from a string + Args: + name (str): String to be stripped of whitespace. This string should be the name of a class or property. + Returns: + str: Whitespace-stripped string. + """ + return re.sub("\s", "", name) + + +def remove_punctuation(name: str) -> str: + return re.sub("\.", "", name) + + +def process_name(name: str) -> str: + """Removes asterisks and periods from a string, to be formatted as a name for a class or property. + Args: + name (str): String to be processed. + Returns: + str: Processed string. + """ + name = re.sub(r"\.", "", re.sub(r"\*", "", strip_whitespace(name))) + return upper_split_camelcase(name) + + +def process_prop_name(name: str) -> str: + name = re.sub(r"\.", "", re.sub(r"\*", "", strip_whitespace(name))) + return lower_split_camelcase(name) + + +def process_comment(comment: str) -> str: + """Removes slashes, quotes, and newline characters from a string, to be formatted as a comment. + Args: + comment (str): String to be processed. + Returns: + str: Processed string. + """ + return re.sub("\n", " ", re.sub('"', "", re.sub(r"\\", "", comment))) + + +def process_comments(comments: list) -> list: + """Removes slashes, quotes, and newline characters from a list of strings, all to be formatted as comments. + Args: + comments (list): Strings to be processed. + Returns: + list: Processed strings. 
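+
+    Example (illustrative input):
+        process_comments(['He said "stop".']) -> ['He said stop.']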
+    """
+    new_comments = (
+        [process_comment(comment) for comment in comments.copy()]
+        if comments != [""]
+        else []
+    )
+    return new_comments
+
+
+def process_pattern(pattern: str) -> str:
+    """Introduces a correct pattern of slashes to a string representing a regex pattern.
+    Args:
+        pattern (str): String to be processed.
+    Returns:
+        str: Processed string.
+    """
+    return re.sub(r"\\", r"\\\\\\\\", pattern)
+
+
+def process_new_patterns(pattern: str) -> list:
+    """Introduces a correct pattern of slashes to a single regex pattern string,
+    wrapping the result in a list.
+    Args:
+        pattern (str): Pattern string to be processed.
+    Returns:
+        list: One-element list containing the processed pattern, or an empty list
+            if the input is an empty string.
+    """
+    new_patterns = [process_pattern(pattern)] if pattern != "" else []
+    return new_patterns
+
+
+def lower_process_name(property_name: str) -> str:
+    """Process a name via process_name and return it fully lower-cased.
+    Args:
+        property_name (str): Raw property name to be processed.
+    Returns:
+        str: Processed, lower-cased name.
+    """
+    return str.lower(process_name(property_name))
+
+
+def split_camelcase(name: str) -> list:
+    """Split a camelCase or PascalCase name into its component words.
+    Args:
+        name (str): String to be split.
+    Returns:
+        list: Component words of the name, in order.
+    """
+    return re.findall(
+        "(?:[A-Z](?:[a-z]+|[A-Z]*(?=[A-Z]|$)))|[a-z]+|[A-Z]*(?=[A-Z]|$)", name
+    )
+
+
+def upper_split_camelcase(name: str) -> str:
+    """Rejoin the camelCase components of a name, upper-casing the first letter."""
+    name_comps = split_camelcase(remove_punctuation(name))
+    name_comps.remove("")
+
+    if (len(name_comps) > 0) and (not name_comps[0][0].isupper()):
+        name_comps[0] = name_comps[0][0].upper() + name_comps[0][1:]
+    return "".join(name_comps)
+
+
+def lower_split_camelcase(name: str) -> str:
+    """Rejoin the camelCase components of a name, lower-casing the first letter."""
+    name_comps = split_camelcase(remove_punctuation(name))
+    name_comps.remove("")
+
+    if (len(name_comps) > 0) and (not name_comps[0][0].islower()):
+        name_comps[0] = name_comps[0][0].lower() + name_comps[0][1:]
+    return "".join(name_comps)
+
+
+def extract_version(url: str) -> int:
+    """Extract a 3-digit integer representing the version number of the schema file,
+    from the filename.
+    Args:
+        url (str): a filename or relative url, ending in the format "filename.x1.x2.x3.json"
+    Returns:
+        int: version number encoded as x1*100 + x2*10 + x3, or None if no version
+            number is present in the filename.
+    """
+    version_str = re.search("(.+)\.\d\.\d\.\d\.json", url)
+    if version_str is not None:
+        version_num = int("".join(re.search("(\d)\.(\d)\.(\d)\.json", url).groups()))
+        return version_num
+    else:
+        return None
+
+
+def extract_classname_from_kind(value: str) -> str:
+    classname = "".join(re.search(":(\w+):\d\.\d\.\d", value).groups())
+    return classname
+
+
+def extract_classname_from_filename(value: str) -> str:
+    class_name = re.search("([A-z]+)\.\d\.\d\.\d\.json", value)
+    class_name = str(value) if class_name is None else "".join(class_name.groups())
+    return class_name
+
+
+def has_prefix(value: str) -> bool:
+    """Returns true if the string input takes a form 'prefix:classOrProperty'
+    Args:
+        value (str): string input to be checked
+    Returns:
+        bool: true if the string input takes a form 'prefix:classOrProperty'
+    """
+    return re.search("([A-z]+):", value) is not None
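+
+
+# Illustrative behaviour of the name-processing helpers (example strings only,
+# not taken from the OSDU schemas):
+#   process_name("Wellbore.Trajectory*")                    -> "WellboreTrajectory"
+#   process_prop_name("SpudDate")                           -> "spudDate"
+#   extract_version("Wellbore.1.0.0.json")                  -> 100
+#   extract_classname_from_filename("Wellbore.1.0.0.json")  -> "Wellbore"
+#   has_prefix("xsd:string")                                -> True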