diff --git a/tools/compare_versions/README.md b/tools/compare_versions/README.md index 739ef821..231a1dda 100644 --- a/tools/compare_versions/README.md +++ b/tools/compare_versions/README.md @@ -2,6 +2,9 @@ # How to use it? -- `python tools/compare_versions/compare_versions.py --help` for getting help -- `python --oldbrick 1.0.3 https://brickschema.org/schema/1.0.3/Brick.ttl --newbrick 1.1.0 ./Brick.ttl - - This will produce the comparison results inside `./history/{old_version}-{new_version}`. +``` +python tools/compare_versions/compare_versions.py --oldbrick 1.3.0 https://github.com/BrickSchema/Brick/releases/download/v1.3.0/Brick.ttl --newbrick 1.4.0 https://github.com/BrickSchema/Brick/releases/download/nightly/Brick.ttl +``` + +This will produce the comparison results inside `./history/{old_version}-{new_version}`. + diff --git a/tools/compare_versions/compare_versions.py b/tools/compare_versions/compare_versions.py index cb05e797..52abe7f4 100644 --- a/tools/compare_versions/compare_versions.py +++ b/tools/compare_versions/compare_versions.py @@ -1,11 +1,19 @@ import argparse +import numpy as np +from sentence_transformers import SentenceTransformer +from scipy.optimize import linear_sum_assignment import json import os from collections import defaultdict from pathlib import Path +import sys + +dirname = Path(__file__).resolve().parent.parent.parent +sys.path.append(str(dirname)) +from bricksrc.deprecations import deprecations import semver -from rdflib import Graph, OWL, RDF, RDFS, Namespace +from rdflib import Graph, OWL, RDF, RDFS, Namespace, SKOS from tqdm import tqdm @@ -77,6 +85,12 @@ def get_root(version): g.bind("rdf", RDF) g.bind("owl", OWL) +old_brick = Graph() +old_brick.parse(old_ttl, format="turtle") + +new_brick = Graph() +new_brick.parse(new_ttl, format="turtle") + def get_tag_sets(root): tag_sets = {} @@ -93,14 +107,38 @@ def get_tag_sets(root): return tag_sets -old_tag_sets = get_tag_sets(OLD_ROOT) -new_tag_sets = get_tag_sets(NEW_ROOT) +def get_concepts(graph): + # return everything in the brick: namespace + qstr = """SELECT ?s WHERE { + FILTER(STRSTARTS(STR(?s), "https://brickschema.org/schema")) + { ?s a owl:Class } + UNION + { ?s a owl:ObjectProperty } + UNION + { ?s a owl:DatatypeProperty } + UNION + { ?s a brick:Quantity } + UNION + { ?s a brick:EntityPropertyValue } + UNION + { ?s a brick:EntityProperty } + }""" + return set([row[0] for row in graph.query(qstr)]) + + +old_classes = get_concepts(old_brick) +new_classes = get_concepts(new_brick) + +print(f"Old classes: {len(old_classes)}") +print(f"New classes: {len(new_classes)}") history_dir = Path(f"history/{old_ver}-{new_ver}") os.makedirs(history_dir, exist_ok=True) -old_classes = set(old_tag_sets.keys()) -new_classes = set(new_tag_sets.keys()) +# old_classes = set(old_tag_sets.keys()) +# new_classes = set(new_tag_sets.keys()) + +print(f"Common classes: {len(old_classes & new_classes)}") with open(history_dir / "removed_classes.txt", "w") as fp: fp.write("\n".join(sorted(old_classes - new_classes))) @@ -112,20 +150,45 @@ def get_tag_sets(root): g.serialize(history_dir / "graph.ttl", format="turtle") -# List possible matches for removed classes -mapping_candidates = defaultdict(list) -for old_class, old_tag_set in tqdm(old_tag_sets.items()): - if old_class in new_tag_sets: +def prep_concept(graph, concept): + # remove BRICK namespace from concept, change '_' in to ' ' + name = concept.split("#")[-1].replace("_", " ") + definition = graph.value(concept, RDFS.comment) or graph.value( + concept, SKOS.definition + ) + # get the cbd of the concept + sentence = f"{name} - {definition}" + return sentence + + +THRESHOLD = 0.7 + +model = SentenceTransformer("all-MiniLM-L6-v2") +old_classes = list(old_classes) +old_classes_sentences = [prep_concept(old_brick, c) for c in old_classes] +old_embeddings = model.encode(old_classes_sentences) + +new_classes = list(new_classes) +new_classes_sentences = [prep_concept(new_brick, c) for c in new_classes] +new_embeddings = model.encode(new_classes_sentences) +similarities = np.dot(old_embeddings, new_embeddings.T) +distance_matrix = 1 - similarities +row_ind, col_ind = linear_sum_assignment(distance_matrix) + +mapping = {} +for i, j in zip(row_ind, col_ind): + score = similarities[i, j] + if score < THRESHOLD: continue - for new_class, new_tag_set in new_tag_sets.items(): - # If the delimited tags are similar in the old class and this new class, - # they might be mappable across the version. - if ( - len(old_tag_set.intersection(new_tag_set)) - / len(old_tag_set.union(new_tag_set)) - > 0.7 - ): - mapping_candidates[old_class].append(new_class) - -with open(history_dir / "possible_mapping.json", "w") as fp: - json.dump(mapping_candidates, fp, indent=2) + if old_classes[i] == new_classes[j]: + continue + if old_classes[i] in deprecations: + continue + mapping[old_classes[i]] = new_classes[j] + +with open(history_dir / "mapping.json", "w") as fp: + json.dump(mapping, fp) + +# write deprecations to json file +with open(history_dir / "deprecations.json", "w") as fp: + json.dump(deprecations, fp) diff --git a/tools/compare_versions/requirements.txt b/tools/compare_versions/requirements.txt new file mode 100644 index 00000000..00052217 --- /dev/null +++ b/tools/compare_versions/requirements.txt @@ -0,0 +1,6 @@ +sentence-transformers>=3.0.1 +scipy>=1.13.1 +numpy>=2.0.0 +semver>=3.0.2 +rdflib>=7.0.0 +tqdm>=4.66.4