From daf70f2f029688cd8375ca26f4bd489598fabd12 Mon Sep 17 00:00:00 2001
From: rctauber
Date: Tue, 25 Aug 2020 09:56:27 -0700
Subject: [PATCH] Add extract & test

---
NOTE(review): this patch was re-flowed from a copy whose line breaks and
angle-bracketed text had been lost. Please confirm before applying:
- the author e-mail address on the From: line is missing;
- the IRIs in tests/resources/obi-extract.ttl were rebuilt from the "###"
  comments, the well-known owl/rdf/xml/xsd/rdfs namespaces and the only class
  chain that fits the six classes; the ":" prefix and @base IRIs are a guess;
- the position of the blank context line in the tests/test_tree.py hunk is
  assumed;
- the blob ids on the "index" lines predate the review changes.

 gizmos/extract.py                         | 288 ++++++++++++++++++++++
 tests/resources/obi-extract.ttl           |  50 ++++
 tests/resources/{obi.ttl => obi-tree.ttl} |   0
 tests/test_extract.py                     |  42 ++++
 tests/test_tree.py                        |   2 +-
 5 files changed, 381 insertions(+), 1 deletion(-)
 create mode 100644 gizmos/extract.py
 create mode 100644 tests/resources/obi-extract.ttl
 rename tests/resources/{obi.ttl => obi-tree.ttl} (100%)
 create mode 100644 tests/test_extract.py

diff --git a/gizmos/extract.py b/gizmos/extract.py
new file mode 100644
index 0000000..ce01bdd
--- /dev/null
+++ b/gizmos/extract.py
@@ -0,0 +1,288 @@
+"""Extract terms from an RDFTab SQLite database and print them as Turtle.
+
+Usage: python3 extract.py -d <sqlite-database> -t <curie> > <ttl-file>
+
+Creates a TTL file containing the term, annotations, and ancestors. TTL is written to stdout.
+You can include more than one `-t <curie>`/`--term <curie>`.
+
+You may also specify multiple CURIEs to extract with `-T <file>`/`--terms <file>`
+where the file contains a list of CURIEs to extract.
+
+You may also specify which annotations you would like to include with
+`-a <curie>`/`--annotation <curie>` and/or `-A <file>`/`--annotations <file>`
+where the file contains a list of annotation property CURIEs.
+
+Finally, if you don't wish to include the ancestors of the term/terms,
+include the `-n`/`--no-hierarchy` flag.
+
+The sqlite-database must be created by RDFTab (https://github.com/ontodev/rdftab.rs)
+and include 'statements' and 'prefix' tables.
+
+The CURIEs must use a prefix from the 'prefix' table.
+"""
+
+import logging
+import sqlite3
+import sys
+
+from argparse import ArgumentParser
+from contextlib import closing
+
+# Terms that already have an rdf:type row in the 'extract' table.
+# extract() clears this list at the start of every run.
+added = []
+
+
+def main():
+    """Parse the command line, run the extraction, and write the Turtle to stdout."""
+    p = ArgumentParser()
+    p.add_argument("-d", "--database", required=True, help="SQLite database")
+    p.add_argument("-t", "--term", action="append", help="CURIE of term to extract")
+    p.add_argument(
+        "-T", "--terms", help="File containing CURIES of terms to extract",
+    )
+    p.add_argument(
+        "-a",
+        "--annotation",
+        action="append",
+        help="CURIE of annotation property to include",
+    )
+    p.add_argument(
+        "-A",
+        "--annotations",
+        help="File containing CURIEs of annotation properties to include",
+    )
+    p.add_argument(
+        "-n",
+        "--no-hierarchy",
+        action="store_true",
+        help="If provided, do not create any rdfs:subClassOf statements",
+    )
+    args = p.parse_args()
+
+    # Get required terms from --term and/or the --terms file
+    terms = list(args.term or [])
+    if args.terms:
+        terms.extend(read_curies(args.terms))
+
+    if not terms:
+        logging.critical("One or more term(s) must be specified with --term or --terms")
+        sys.exit(1)
+
+    # Get optional annotations (otherwise, all annotations are included).
+    # --annotation and the --annotations file are combined, mirroring the terms.
+    annotations = list(args.annotation or [])
+    if args.annotations:
+        annotations.extend(read_curies(args.annotations))
+
+    ttl = "\n".join(
+        extract(args.database, terms, annotations or None, no_hierarchy=args.no_hierarchy)
+    )
+    sys.stdout.write(ttl)
+
+
+def read_curies(path):
+    """Return the stripped, non-blank lines of the file at `path`."""
+    with open(path, "r") as f:
+        return [line.strip() for line in f if line.strip()]
+
+
+def add_annotations(cur, annotations=None):
+    """Add annotations from the 'statements' table on all subjects in the 'extract' table.
+
+    Only statements with a non-NULL value are copied. When `annotations` is a non-empty
+    list of predicate CURIEs, only statements using one of those predicates are copied.
+    """
+    # Values are bound as parameters so that quotes in a CURIE cannot break the SQL
+    query = """INSERT INTO extract (stanza, subject, predicate, value, language, datatype)
+        SELECT DISTINCT
+          subject AS stanza,
+          subject,
+          predicate,
+          value,
+          language,
+          datatype
+        FROM statements WHERE subject = ? AND value IS NOT NULL"""
+    params = []
+    if annotations:
+        placeholders = ", ".join("?" for _ in annotations)
+        query += f" AND predicate IN ({placeholders})"
+        params = list(annotations)
+
+    # Read every subject before inserting: the loop writes to the table it is based on
+    cur.execute("SELECT DISTINCT subject FROM extract;")
+    subjects = [row["subject"] for row in cur.fetchall()]
+    for subject in subjects:
+        cur.execute(query, [subject] + params)
+
+
+def add_ancestors(cur, term_id):
+    """Add the hierarchy for a term ID starting with that term up to the top-level, assuming that
+    term ID exists in the database.
+
+    Each class is declared as an owl:Class once per run (tracked in the module-level `added`
+    list) and each parent/child pair found gets an rdfs:subClassOf row.
+    """
+    # NOTE(review): the second SELECT also returns the direct subclasses of the term, so
+    # they end up in the extract as well - confirm this is wanted for an "ancestors" walk.
+    cur.execute(
+        """
+      WITH RECURSIVE ancestors(parent, child) AS (
+        VALUES (?, NULL)
+        UNION
+        SELECT object AS parent, subject AS child
+        FROM statements
+        WHERE predicate = 'rdfs:subClassOf'
+          AND object = ?
+        UNION
+        SELECT object AS parent, subject AS child
+        FROM statements, ancestors
+        WHERE ancestors.parent = statements.stanza
+          AND statements.predicate = 'rdfs:subClassOf'
+          AND statements.object NOT LIKE '_:%'
+      )
+      SELECT * FROM ancestors;""",
+        (term_id, term_id),
+    )
+
+    declare_class = """INSERT INTO extract (stanza, subject, predicate, object)
+        VALUES (?, ?, 'rdf:type', 'owl:Class');"""
+    add_subclass = """INSERT INTO extract (stanza, subject, predicate, object)
+        VALUES (?, ?, 'rdfs:subClassOf', ?);"""
+    for row in cur.fetchall():
+        parent = row["parent"]
+        child = row["child"]
+        for term in (parent, child):
+            if term and term not in added:
+                # Only add rdf:type if it hasn't been added
+                added.append(term)
+                cur.execute(declare_class, (term, term))
+
+        if child and parent:
+            # Row has child & parent, add subclass statement
+            cur.execute(add_subclass, (child, child, parent))
+
+
+def add_term(cur, term_id):
+    """Add the class assertion for a term ID if it is the subject of any statement."""
+    cur.execute("SELECT 1 FROM statements WHERE subject = ? LIMIT 1;", (term_id,))
+    if cur.fetchone():
+        cur.execute(
+            """INSERT INTO extract (stanza, subject, predicate, object)
+            VALUES (?, ?, 'rdf:type', 'owl:Class');""",
+            (term_id, term_id),
+        )
+
+
+def dict_factory(cursor, row):
+    """Create a dict factory for sqlite cursor"""
+    return {col[0]: row[idx] for idx, col in enumerate(cursor.description)}
+
+
+def extract(database, terms, annotations, no_hierarchy=False):
+    """Extract terms from the ontology database and return the module as lines of Turtle.
+
+    :param database: path to the RDFTab SQLite database
+    :param terms: list of CURIEs to extract
+    :param annotations: list of annotation property CURIEs to include, or None for all
+    :param no_hierarchy: if True, add only the terms themselves and not their hierarchy
+    :return: list of Turtle lines (prefix declarations followed by statements)
+    """
+    # Start every run with a clean slate; entries left over from an earlier call would
+    # suppress the rdf:type rows of classes shared with that call
+    added.clear()
+
+    # Create a new table (extract) and copy the triples we care about
+    # Then write the triples from that table to the output file.
+    # The connection's own context manager only ends the transaction, so closing()
+    # is what actually releases the connection.
+    with closing(sqlite3.connect(database)) as conn:
+        conn.row_factory = dict_factory
+        cur = conn.cursor()
+        try:
+            # Create the extract table
+            cur.execute("DROP TABLE IF EXISTS extract;")
+            cur.execute(
+                """CREATE TABLE extract(stanza TEXT,
+                                        subject TEXT,
+                                        predicate TEXT,
+                                        object TEXT,
+                                        value TEXT,
+                                        datatype TEXT,
+                                        language TEXT);"""
+            )
+
+            if no_hierarchy:
+                # Only add the terms themselves (as long as they exist)
+                for t in terms:
+                    add_term(cur, t)
+            else:
+                # Get each term up to the top-level
+                for t in terms:
+                    add_ancestors(cur, t)
+
+            # Add declarations for any annotations used in 'extract'
+            # NOTE(review): no row has a value yet at this point (annotations are copied
+            # below), so this looks like a no-op - confirm the intended order.
+            cur.execute(
+                """INSERT INTO extract (stanza, subject, predicate, object)
+                SELECT DISTINCT
+                  predicate AS stanza,
+                  predicate AS subject,
+                  'rdf:type',
+                  'owl:AnnotationProperty'
+                FROM extract WHERE value IS NOT NULL;"""
+            )
+
+            # Add annotations for all subjects
+            add_annotations(cur, annotations=annotations)
+
+            # Reset row factory (get_ttl reads its rows by index)
+            conn.row_factory = sqlite3.Row
+            cur = conn.cursor()
+            return get_ttl(cur)
+        finally:
+            # Always drop the extract table, and commit so the drop also sticks after an error
+            cur.execute("DROP TABLE IF EXISTS extract;")
+            conn.commit()
+
+
+def get_ttl(cur):
+    """Get the 'extract' table as lines of Turtle (the lines are returned as a list).
+
+    The rows of `cur` must be indexable by position (tuples or sqlite3.Row).
+    """
+    # Prefixes are fetched on their own so they always precede the statements that use
+    # them; SQL string literals use single quotes, the only form standard SQL accepts.
+    cur.execute("SELECT '@prefix ' || prefix || ': <' || base || '> .' FROM prefix;")
+    rows = cur.fetchall()
+
+    cur.execute(
+        """WITH literal(value, escaped) AS (
+          SELECT DISTINCT
+            value,
+            replace(replace(replace(value, '\\', '\\\\'), '"', '\\"'), char(10), '\\n') AS escaped
+          FROM extract
+        )
+        SELECT DISTINCT
+           subject
+        || ' '
+        || predicate
+        || ' '
+        || coalesce(
+             object,
+             '"' || escaped || '"^^' || datatype,
+             '"' || escaped || '"@' || language,
+             '"' || escaped || '"'
+           )
+        || ' .'
+        FROM extract LEFT JOIN literal ON extract.value = literal.value;"""
+    )
+    rows.extend(cur.fetchall())
+
+    # Skip NULL results and keep every statement on a single line
+    return [row[0].replace("\n", "\\n") for row in rows if row[0]]
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/resources/obi-extract.ttl b/tests/resources/obi-extract.ttl
new file mode 100644
index 0000000..2bb110b
--- /dev/null
+++ b/tests/resources/obi-extract.ttl
@@ -0,0 +1,50 @@
+@prefix : <http://purl.obolibrary.org/obo/obi.owl#> .
+@prefix owl: <http://www.w3.org/2002/07/owl#> .
+@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
+@prefix xml: <http://www.w3.org/XML/1998/namespace> .
+@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
+@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
+@base <http://purl.obolibrary.org/obo/obi.owl> .
+
+
+#################################################################
+#    Classes
+#################################################################
+
+### http://purl.obolibrary.org/obo/BFO_0000001
+<http://purl.obolibrary.org/obo/BFO_0000001> rdf:type owl:Class ;
+    rdfs:subClassOf owl:Thing ;
+    rdfs:label "entity"@en .
+
+
+### http://purl.obolibrary.org/obo/BFO_0000002
+<http://purl.obolibrary.org/obo/BFO_0000002> rdf:type owl:Class ;
+    rdfs:subClassOf <http://purl.obolibrary.org/obo/BFO_0000001> ;
+    rdfs:label "continuant"@en .
+
+
+### http://purl.obolibrary.org/obo/BFO_0000004
+<http://purl.obolibrary.org/obo/BFO_0000004> rdf:type owl:Class ;
+    rdfs:subClassOf <http://purl.obolibrary.org/obo/BFO_0000002> ;
+    rdfs:label "independent continuant"@en .
+
+
+### http://purl.obolibrary.org/obo/BFO_0000040
+<http://purl.obolibrary.org/obo/BFO_0000040> rdf:type owl:Class ;
+    rdfs:subClassOf <http://purl.obolibrary.org/obo/BFO_0000004> ;
+    rdfs:label "material entity"@en .
+
+
+### http://purl.obolibrary.org/obo/OBI_0100046
+<http://purl.obolibrary.org/obo/OBI_0100046> rdf:type owl:Class ;
+    rdfs:subClassOf <http://purl.obolibrary.org/obo/OBI_0302729> ;
+    rdfs:label "phosphate buffered saline solution"@en .
+
+
+### http://purl.obolibrary.org/obo/OBI_0302729
+<http://purl.obolibrary.org/obo/OBI_0302729> rdf:type owl:Class ;
+    rdfs:subClassOf <http://purl.obolibrary.org/obo/BFO_0000040> ;
+    rdfs:label "chemical solution"@en .
+
+
+### Generated by the OWL API (version 4.5.9.2019-02-01T07:24:44Z) https://github.com/owlcs/owlapi
diff --git a/tests/resources/obi.ttl b/tests/resources/obi-tree.ttl
similarity index 100%
rename from tests/resources/obi.ttl
rename to tests/resources/obi-tree.ttl
diff --git a/tests/test_extract.py b/tests/test_extract.py
new file mode 100644
index 0000000..429fc74
--- /dev/null
+++ b/tests/test_extract.py
@@ -0,0 +1,42 @@
+import gizmos.extract
+
+from rdflib import Graph, Literal, URIRef
+
+OWL_THING = "http://www.w3.org/2002/07/owl#Thing"
+
+
+def unmatched(source, target):
+    """Return the triples of `source` that have no counterpart in `target`.
+
+    An object counts as matched when `target` holds it either as an English-tagged
+    literal or as an IRI.
+    """
+    return [
+        (s, p, o)
+        for s, p, o in source
+        if (s, p, Literal(str(o), lang="en")) not in target and (s, p, URIRef(o)) not in target
+    ]
+
+
+def test_extract():
+    db = "tests/resources/obi.db"
+    ttl = gizmos.extract.extract(db, ["OBI:0100046"], ["rdfs:label"])
+    ttl = "\n".join(ttl)
+
+    graph = Graph()
+    graph.parse(data=ttl, format="turtle")
+
+    expected_graph = Graph()
+    expected_graph.parse("tests/resources/obi-extract.ttl", format="turtle")
+
+    # Check that no triples are missing
+    missing = unmatched(expected_graph, graph)
+    assert not missing, f"Missing triples: {missing}"
+
+    # Check that no triples have been added (owl:Thing is not described in the expected file)
+    extra = [t for t in unmatched(graph, expected_graph) if str(t[0]) != OWL_THING]
+    assert not extra, f"Added triples: {extra}"
+
+
+if __name__ == "__main__":
+    test_extract()
diff --git a/tests/test_tree.py b/tests/test_tree.py
index 1ec04dd..256274c 100644
--- a/tests/test_tree.py
+++ b/tests/test_tree.py
@@ -56,7 +56,7 @@ def test_tree():
     # Read in the expected output to compare
     success = True
     expected_graph = Graph()
-    expected_graph.parse("tests/resources/obi.ttl", format="turtle")
+    expected_graph.parse("tests/resources/obi-tree.ttl", format="turtle")
 
     subject = URIRef("http://purl.obolibrary.org/obo/OBI_0100046")
     # Check that no triples are missing