Skip to content

Commit

Permalink
updating tool (#638)
Browse files Browse the repository at this point in the history
  • Loading branch information
gtfierro authored Jun 21, 2024
1 parent 74b87fc commit d873a4a
Show file tree
Hide file tree
Showing 3 changed files with 96 additions and 24 deletions.
9 changes: 6 additions & 3 deletions tools/compare_versions/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

# How to use it?

- `python tools/compare_versions/compare_versions.py --help` for getting help
- `python tools/compare_versions/compare_versions.py --oldbrick 1.0.3 https://brickschema.org/schema/1.0.3/Brick.ttl --newbrick 1.1.0 ./Brick.ttl`
- This will produce the comparison results inside `./history/{old_version}-{new_version}`.
```
python tools/compare_versions/compare_versions.py --oldbrick 1.3.0 https://github.com/BrickSchema/Brick/releases/download/v1.3.0/Brick.ttl --newbrick 1.4.0 https://github.com/BrickSchema/Brick/releases/download/nightly/Brick.ttl
```

This will produce the comparison results inside `./history/{old_version}-{new_version}`.

105 changes: 84 additions & 21 deletions tools/compare_versions/compare_versions.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,19 @@
import argparse
import numpy as np
from sentence_transformers import SentenceTransformer
from scipy.optimize import linear_sum_assignment
import json
import os
from collections import defaultdict
from pathlib import Path
import sys

# Make the repository root importable so the `bricksrc` package resolves
# no matter which working directory this script is launched from
# (tools/compare_versions/ is three levels below the repo root).
dirname = Path(__file__).resolve().parent.parent.parent
sys.path.append(str(dirname))
from bricksrc.deprecations import deprecations

import semver
from rdflib import Graph, OWL, RDF, RDFS, Namespace
from rdflib import Graph, OWL, RDF, RDFS, Namespace, SKOS
from tqdm import tqdm


Expand Down Expand Up @@ -77,6 +85,12 @@ def get_root(version):
g.bind("rdf", RDF)
g.bind("owl", OWL)

# Load the two Brick releases being compared as RDF graphs.
# old_ttl/new_ttl are file paths or URLs — presumably taken from the
# --oldbrick/--newbrick CLI arguments parsed earlier in the file (confirm).
old_brick = Graph()
old_brick.parse(old_ttl, format="turtle")

new_brick = Graph()
new_brick.parse(new_ttl, format="turtle")


def get_tag_sets(root):
tag_sets = {}
Expand All @@ -93,14 +107,38 @@ def get_tag_sets(root):
return tag_sets


old_tag_sets = get_tag_sets(OLD_ROOT)
new_tag_sets = get_tag_sets(NEW_ROOT)
def get_concepts(graph):
    """Return the set of Brick concepts defined in *graph*.

    A "concept" is any subject in the Brick namespace declared as an
    ``owl:Class``, ``owl:ObjectProperty``, ``owl:DatatypeProperty``,
    ``brick:Quantity``, ``brick:EntityPropertyValue``, or
    ``brick:EntityProperty``.

    :param graph: parsed Brick ontology graph (rdflib.Graph); the query
        relies on the ``owl:`` and ``brick:`` prefixes being bound on it
    :return: set of subject URIRefs, one per concept
    """
    # return everything in the brick: namespace
    qstr = """SELECT ?s WHERE {
    FILTER(STRSTARTS(STR(?s), "https://brickschema.org/schema"))
    { ?s a owl:Class }
    UNION
    { ?s a owl:ObjectProperty }
    UNION
    { ?s a owl:DatatypeProperty }
    UNION
    { ?s a brick:Quantity }
    UNION
    { ?s a brick:EntityPropertyValue }
    UNION
    { ?s a brick:EntityProperty }
    }"""
    # Set comprehension instead of set([...]) — same result without
    # materializing an intermediate list.
    return {row[0] for row in graph.query(qstr)}


old_classes = get_concepts(old_brick)
new_classes = get_concepts(new_brick)

print(f"Old classes: {len(old_classes)}")
print(f"New classes: {len(new_classes)}")

history_dir = Path(f"history/{old_ver}-{new_ver}")
os.makedirs(history_dir, exist_ok=True)

old_classes = set(old_tag_sets.keys())
new_classes = set(new_tag_sets.keys())
# old_classes = set(old_tag_sets.keys())
# new_classes = set(new_tag_sets.keys())

print(f"Common classes: {len(old_classes & new_classes)}")

with open(history_dir / "removed_classes.txt", "w") as fp:
fp.write("\n".join(sorted(old_classes - new_classes)))
Expand All @@ -112,20 +150,45 @@ def get_tag_sets(root):
g.serialize(history_dir / "graph.ttl", format="turtle")


# List possible matches for removed classes
mapping_candidates = defaultdict(list)
for old_class, old_tag_set in tqdm(old_tag_sets.items()):
if old_class in new_tag_sets:
def prep_concept(graph, concept):
    """Build the natural-language sentence embedded for *concept*.

    The sentence ``"<name> - <definition>"`` is what the sentence
    transformer encodes, so it should read like text: the namespace
    prefix is stripped and underscores become spaces.

    :param graph: graph to look the concept's definition up in
        (rdflib.Graph)
    :param concept: concept URI (rdflib.URIRef, a str subclass)
    :return: ``"<human-readable name> - <definition>"``; the definition
        part is empty when the concept has neither an rdfs:comment nor a
        skos:definition
    """
    # remove BRICK namespace from concept, change '_' in to ' '
    name = concept.split("#")[-1].replace("_", " ")
    # Prefer rdfs:comment, fall back to skos:definition. Default to ""
    # so a concept without either no longer injects the literal word
    # "None" into the embedding sentence (graph.value returns None when
    # the triple is absent).
    definition = (
        graph.value(concept, RDFS.comment)
        or graph.value(concept, SKOS.definition)
        or ""
    )
    sentence = f"{name} - {definition}"
    return sentence


# Minimum similarity score for accepting an old->new concept mapping;
# pairs matched by the assignment below but scoring under this are dropped.
THRESHOLD = 0.7

# Encode every concept's "<name> - <definition>" sentence with a small
# general-purpose sentence-embedding model.
model = SentenceTransformer("all-MiniLM-L6-v2")
# Fix iteration order: the embedding row index i must correspond to
# old_classes[i] (and likewise for new_classes) when reading the
# assignment result, so the sets are frozen into lists first.
old_classes = list(old_classes)
old_classes_sentences = [prep_concept(old_brick, c) for c in old_classes]
old_embeddings = model.encode(old_classes_sentences)

new_classes = list(new_classes)
new_classes_sentences = [prep_concept(new_brick, c) for c in new_classes]
new_embeddings = model.encode(new_classes_sentences)
# Pairwise dot-product similarity between all old and new concept
# embeddings (treated as cosine-like; assumes the model emits normalized
# vectors — TODO confirm for all-MiniLM-L6-v2).
similarities = np.dot(old_embeddings, new_embeddings.T)
# linear_sum_assignment minimizes total cost, so convert similarity to a
# distance; the result is the optimal one-to-one old<->new matching.
distance_matrix = 1 - similarities
row_ind, col_ind = linear_sum_assignment(distance_matrix)

mapping = {}
for i, j in zip(row_ind, col_ind):
score = similarities[i, j]
if score < THRESHOLD:
continue
for new_class, new_tag_set in new_tag_sets.items():
# If the delimited tags are similar in the old class and this new class,
# they might be mappable across the version.
if (
len(old_tag_set.intersection(new_tag_set))
/ len(old_tag_set.union(new_tag_set))
> 0.7
):
mapping_candidates[old_class].append(new_class)

with open(history_dir / "possible_mapping.json", "w") as fp:
json.dump(mapping_candidates, fp, indent=2)
if old_classes[i] == new_classes[j]:
continue
if old_classes[i] in deprecations:
continue
mapping[old_classes[i]] = new_classes[j]

# Persist the suggested old->new concept mapping produced by the
# embedding alignment above.
with open(history_dir / "mapping.json", "w") as fp:
    json.dump(mapping, fp)

# write deprecations to json file
with open(history_dir / "deprecations.json", "w") as fp:
    json.dump(deprecations, fp)
6 changes: 6 additions & 0 deletions tools/compare_versions/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
sentence-transformers>=3.0.1
scipy>=1.13.1
numpy>=2.0.0
semver>=3.0.2
rdflib>=7.0.0
tqdm>=4.66.4

0 comments on commit d873a4a

Please sign in to comment.