Skip to content

Commit

Permalink
Merge pull request #209 from IFB-ElixirFr/better-stats
Browse files Browse the repository at this point in the history
Keep track of metadata harvesting and validation in stats
  • Loading branch information
albangaignard authored Dec 7, 2023
2 parents 7e857bf + c23c773 commit 3b31bf3
Show file tree
Hide file tree
Showing 8 changed files with 57 additions and 23 deletions.
42 changes: 38 additions & 4 deletions app.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@
from metrics import test_metric
from metrics.FAIRMetricsFactory import FAIRMetricsFactory
from metrics.WebResource import WebResource
from metrics.Evaluation import Result
from metrics.Evaluation import Result, Evaluation
from profiles.bioschemas_shape_gen import validate_any_from_KG
from profiles.bioschemas_shape_gen import validate_any_from_microdata
from metrics.util import SOURCE, inspect_onto_reg
Expand Down Expand Up @@ -502,9 +502,21 @@ def get(self):
args = reqparse.parse_args()
url = args["url"]

eval = Evaluation()
eval.set_start_time()
eval.set_target_uri(url)
eval.set_reason("metadata harvesting, success score == metadata size")

web_res = WebResource(url)
data_str = web_res.get_rdf().serialize(format="json-ld")
kg = web_res.get_rdf()
size = len(kg)
data_str = kg.serialize(format="json-ld")
data_json = json.loads(data_str)

eval.set_score(size)
eval.set_end_time()
eval.persist(source="API")

return data_json


Expand Down Expand Up @@ -617,6 +629,11 @@ def get(self):
args = reqparse.parse_args()
url = args["url"]

eval = Evaluation()
eval.set_start_time()
eval.set_target_uri(url)
eval.set_reason("bioschemas metadata validation")

web_res = WebResource(url)
kg = web_res.get_rdf()
results = {}
Expand All @@ -626,7 +643,6 @@ def get(self):

# Try to match and evaluate all found corresponding profiles
results_type = evaluate_profile_from_type(kg)
print(results_type)

for result_key in results_conformsto.keys():
results[result_key] = results_conformsto[result_key]
Expand All @@ -635,6 +651,9 @@ def get(self):
if result_key not in results:
results[result_key] = results_type[result_key]

eval.set_end_time()
eval.persist(source="API")

# TODO Try similarity match here for profiles that are not matched

return results
Expand All @@ -648,6 +667,11 @@ def get(self):
args = reqparse.parse_args()
url = args["url"]

eval = Evaluation()
eval.set_start_time()
eval.set_target_uri(url)
eval.set_reason("bioschemas metadata validation (from conforms_to)")

web_res = WebResource(url)
kg = web_res.get_rdf()

Expand All @@ -656,6 +680,9 @@ def get(self):

# TODO Try similarity match here for profiles that are not matched

eval.set_end_time()
eval.persist(source="API")

return results_conformsto


Expand All @@ -667,12 +694,19 @@ def get(self):
args = reqparse.parse_args()
url = args["url"]

eval = Evaluation()
eval.set_start_time()
eval.set_target_uri(url)
eval.set_reason("bioschemas metadata validation (from types)")

web_res = WebResource(url)
kg = web_res.get_rdf()

# Try to match and evaluate all found corresponding profiles
results_type = evaluate_profile_from_type(kg)
print(results_type)

eval.set_end_time()
eval.persist(source="API")

# TODO Try similarity match here for profiles that are not matched

Expand Down
2 changes: 1 addition & 1 deletion environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ dependencies:
- gitpython
- pip
- cachetools==5.0.0
- flask-restx
- flask-restx==1.0.3
- flask-swagger-ui
- numpy
- sphinx
Expand Down
6 changes: 3 additions & 3 deletions metrics/Evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,15 +99,15 @@ def set_start_time(self):
def set_end_time(self):
self.end_time = self.get_current_time()

def set_recommendations(self, recommendation_text):
def set_recommendations(self, recommendation_text: str):
self.recommendation = recommendation_text

# used by FAIRMetrics, will probably be replaced by logs
def set_reason(self, r):
def set_reason(self, r: str):
self.reason = r

# used by FAIRMetrics, will probably be replaced by logs
def append_reason(self, r):
def append_reason(self, r: str):
self.reason = self.reason + "\n" + r

def set_web_resource(self, web_resource):
Expand Down
8 changes: 3 additions & 5 deletions metrics/util.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from time import time
from SPARQLWrapper import SPARQLWrapper, N3
# from time import time
# from SPARQLWrapper import SPARQLWrapper, N3
from rdflib import Graph, ConjunctiveGraph, URIRef
import requests
import metrics.statistics as stats
Expand Down Expand Up @@ -63,6 +63,7 @@ def __str__(self):
# DOI regex
regex = r"10.\d{4,9}\/[-._;()\/:A-Z0-9]+"


# Dynamically generates a table with FAIR metrics implementations
def gen_metrics():
metrics = []
Expand Down Expand Up @@ -407,7 +408,6 @@ def inspect_onto_reg(kg, is_inspect_ui):
emit("done_check", table_content)

for c in table_content["classes"]:

c["tag"]["OLS"] = ask_OLS(c["name"])
if is_inspect_ui:
emit("done_check", table_content)
Expand All @@ -430,7 +430,6 @@ def inspect_onto_reg(kg, is_inspect_ui):
table_content["classes_false"].append(c["name"])

for p in table_content["properties"]:

p["tag"]["OLS"] = ask_OLS(p["name"])
if is_inspect_ui:
emit("done_check", table_content)
Expand Down Expand Up @@ -693,7 +692,6 @@ def extract_rdf_from_html(uri):


def extruct_to_rdf(extruct_str):

g = ConjunctiveGraph()

for md in extruct_str["json-ld"]:
Expand Down
10 changes: 5 additions & 5 deletions profiles/Profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def gen_SHACL_from_profile(self):
fc:{{shape_name}}
a sh:NodeShape ;
{% for c in target_classes %}
sh:targetClass {{c}}, {{c.replace("sc:", "scs:")}} ;
{% endfor %}
Expand Down Expand Up @@ -180,8 +180,8 @@ def validate_shape(self, knowledge_graph, shacl_shape):

results = results_graph.query(report_query)
# print("VALIDATION RESULTS")
print(knowledge_graph.serialize(format="turtle"))
print(shacl_shape)
# print(knowledge_graph.serialize(format="turtle"))
# print(shacl_shape)
# print(results_text)
# print(conforms)
# print(results_graph.serialize(format="turtle"))
Expand All @@ -208,8 +208,8 @@ def validate_shape(self, knowledge_graph, shacl_shape):
errors.append(f'{r["path"]}')
else:
errors.append(f'{r["path"]}')
print(errors)
print(warnings)
# print(errors)
# print(warnings)
return conforms, warnings, errors

def match_sub_kgs_from_profile(self, kg):
Expand Down
2 changes: 1 addition & 1 deletion profiles/ProfileFactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -472,7 +472,7 @@ def evaluate_profile_with_conformsto(kg):

if ct_profile is not None:
shacl_shape = ct_profile.get_shacl_shape()
print(shacl_shape)
# print(shacl_shape)
conforms, warnings, errors = ct_profile.validate_shape(
sub_kg, shacl_shape
)
Expand Down
2 changes: 1 addition & 1 deletion profiles/bioschemas_shape_gen.py
Original file line number Diff line number Diff line change
Expand Up @@ -590,7 +590,7 @@ def validate_any_from_KG(kg):
# print(o.n3(kg.namespace_manager))
if o.n3(kg.namespace_manager) in bs_profiles.keys():
# print()
# print(f"Trying to validate {s} as a(n) {o} resource")
print(f"Trying to validate {s} as a(n) {o} resource")
shacl_shape, ref_profile = gen_SHACL_from_target_class(
o.n3(kg.namespace_manager)
)
Expand Down
8 changes: 5 additions & 3 deletions tests/test_web_resource.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,10 +139,12 @@ def test_turtle(self):
# self.assertEqual(60, len(turtle_WR.get_rdf()))

def test_MassBank(self):
mb = WebResource("https://massbank.eu/MassBank/RecordDisplay?id=MSBNK-RIKEN_IMS-LQB00001")
mb = WebResource(
"https://massbank.eu/MassBank/RecordDisplay?id=MSBNK-RIKEN_IMS-LQB00001"
)
kg = mb.get_rdf()
#print(kg.serialize(format="turtle"))
#logging.info(f"{len(kg)} loaded RDF triples")
# print(kg.serialize(format="turtle"))
# logging.info(f"{len(kg)} loaded RDF triples")
self.assertGreater(len(kg), 70)

def test_n3(self):
Expand Down

0 comments on commit 3b31bf3

Please sign in to comment.