diff --git a/experiments/aggregate_results.py b/experiments/aggregate_results.py
index fd2aeba..88e7763 100644
--- a/experiments/aggregate_results.py
+++ b/experiments/aggregate_results.py
@@ -1,5 +1,6 @@
 import json
 from collections import defaultdict
+import csv
 
 # Content type mapping for each format
 content_type_mapping = {
@@ -8,11 +9,18 @@
     "rdfxml": ["application/rdf+xml", "application/xml"], # Updated to include multiple accepted types
 }
 
-rdf_mimtetypes = ['text/turtle', 'application/n-triples', 'application/rdf+xml', 'application/xml']
+# rdf_mimtetypes = ['text/turtle', 'application/n-triples', 'application/rdf+xml', 'application/xml']
+rdf_mimtetypes = [
+    'text/turtle', 'application/n-triples', 'application/rdf+xml', 'application/xml',
+    'application/owl+xml', 'application/ld+json', 'text/owl-functional', 'text/owl-manchester',
+    'text/n3', 'application/trig', 'application/x-turtle', 'application/x-trig',
+    'application/x-nquads', 'application/n-quads'
+]
+
 
 # File paths for the logs
-no_proxy_file_path = 'downloads-200ms-shuffled/download_log_extended.json'
-proxy_file_path = 'downloads_proxy_requests/download_proxy_log_extended.json'
+no_proxy_file_path = 'downloads-200ms-shuffled/download_log_extended_fixshort.json'
+proxy_file_path = 'downloads_proxy_requests/download_proxy_log_extended_fix.json'
 
 # Load the JSON data for no-proxy and with-proxy scenarios
 with open(no_proxy_file_path, 'r') as f:
@@ -39,8 +47,9 @@
     "0 bytes content",
     "no rdf content (0 triples parsable)",
     "partially parsable rdf-content",
+    # "pp describes requested ont.",
     "fully parsable rdf-content",
-    "describes requested ont.",
+    # "describes requested ont.",
     "no RDF mimetype",
     "confused RDF mimetype",
     "correct mimetype",
@@ -121,12 +130,12 @@ def process_data(data, proxy_key):
                 aggregation[proxy_key]["no rdf content (0 triples parsable)"][format] += 1
             elif parsed_triples > 0 and rapper_error:
                 aggregation[proxy_key]["partially parsable rdf-content"][format] += 1
-                if uri_in_subject_position:
-                    aggregation[proxy_key]["describes requested ont."][format] += 1
+                # if uri_in_subject_position:
+                #     aggregation[proxy_key]["pp describes requested ont."][format] += 1
             elif parsed_triples > 0 and not rapper_error:
                 aggregation[proxy_key]["fully parsable rdf-content"][format] += 1
-                if uri_in_subject_position:
-                    aggregation[proxy_key]["describes requested ont."][format] += 1
+                if True:
+                    # aggregation[proxy_key]["describes requested ont."][format] += 1
 
                     # Check MIME types only for ontologies that describe the requested ontology
                     if content_type and is_correct_mimetype(format, content_type):
@@ -141,10 +150,28 @@ def process_data(data, proxy_key):
         if formats_correct == {"ttl", "nt", "rdfxml"}:
             aggregation[proxy_key]["correct for all 3 formats"]["all"] += 1
 
+# Function to write aggregation results to TSV file
+def write_to_tsv(filename, proxy_key):
+    with open(filename, 'w', newline='') as tsvfile:
+        writer = csv.writer(tsvfile, delimiter='\t')
+        writer.writerow(["Accessibility Status", "turtle", "ntriples", "rdfxml"])
+        for category in categories:
+            row = [category]
+            for format in ["ttl", "nt", "rdfxml"]:
+                row.append(aggregation[proxy_key][category].get(format, 0))
+            writer.writerow(row)
+        # Write total for "correct for all 3 formats"
+        correct_all = aggregation[proxy_key]["correct for all 3 formats"]["all"]
+        writer.writerow(["correct for all 3 formats", correct_all])
+
 # Process both datasets
 process_data(no_proxy_data, "w/o proxy")
 process_data(proxy_data, "with proxy")
 
+# Write results to TSV files
+write_to_tsv('no_proxy_results.tsv', "w/o proxy")
+write_to_tsv('proxy_results.tsv', "with proxy")
+
 # Print the table
 table_headers = ["Accessibility Status", "turtle", "ntriples", "rdfxml"]
 for proxy_key in ["w/o proxy", "with proxy"]:
diff --git a/experiments/aggregate_results_NIRcheck.py b/experiments/aggregate_results_NIRcheck.py
new file mode 100644
index 0000000..8dcd035
--- /dev/null
+++ b/experiments/aggregate_results_NIRcheck.py
@@ -0,0 +1,187 @@
+import json
+from collections import defaultdict
+import csv
+
+# Content type mapping for each format
+content_type_mapping = {
+    "ttl": "text/turtle",
+    "nt": "application/n-triples",
+    "rdfxml": ["application/rdf+xml", "application/xml"], # Updated to include multiple accepted types
+}
+
+# rdf_mimtetypes = ['text/turtle', 'application/n-triples', 'application/rdf+xml', 'application/xml']
+rdf_mimtetypes = [
+    'text/turtle', 'application/n-triples', 'application/rdf+xml', 'application/xml',
+    'application/owl+xml', 'application/ld+json', 'text/owl-functional', 'text/owl-manchester',
+    'text/n3', 'application/trig', 'application/x-turtle', 'application/x-trig',
+    'application/x-nquads', 'application/n-quads'
+]
+
+
+# File paths for the logs
+no_proxy_file_path = 'downloads-200ms-shuffled/download_log_extended_fixshort.json'
+proxy_file_path = 'downloads_proxy_requests/download_proxy_log_extended_fix.json'
+
+# Load the JSON data for no-proxy and with-proxy scenarios
+with open(no_proxy_file_path, 'r') as f:
+    no_proxy_data = json.load(f)
+
+with open(proxy_file_path, 'r') as f:
+    proxy_data = json.load(f)
+
+# Initialize the aggregation dictionary for both proxy and no-proxy scenarios
+aggregation = {
+    "w/o proxy": defaultdict(lambda: defaultdict(int)),
+    "with proxy": defaultdict(lambda: defaultdict(int)),
+}
+
+# Define categories for table
+categories = [
+    "unsuccessful payload retrieval",
+    "DNS issue",
+    "Con. / transport issue",
+    "TLS cert issue",
+    "Too many redirects",
+    "Non-200 HTTP code",
+    "Successful request (code 200)",
+    "0 bytes content",
+    "no rdf content (0 triples parsable)",
+    "partially parsable rdf-content",
+    "pp describes requested ont.",
+    "fully parsable rdf-content",
+    "describes requested ont.",
+    "no RDF mimetype",
+    "confused RDF mimetype",
+    "correct mimetype",
+    "correct for all 3 formats",
+]
+
+# Error type to category mapping logic
+def map_error_to_category(error_type, type_more_specific):
+    if error_type == "TooManyRedirects":
+        return "Too many redirects"
+    elif error_type == "SSLError":
+        return "TLS cert issue"
+    elif error_type == "ConnectionError":
+        if type_more_specific == "NameResolutionError":
+            return "DNS issue"
+        else:
+            return "Con. / transport issue"
+    elif error_type == "ConnectTimeout":
+        return "Con. / transport issue"
+    else:
+        return "Con. / transport issue"
+
+# Check if MIME type is valid for the format
+def is_correct_mimetype(format, content_type):
+    expected_types = content_type_mapping.get(format, [])
+    if isinstance(expected_types, list):
+        for expected_type in expected_types:
+            if expected_type in content_type:
+                return True
+        return False
+    return expected_types in content_type
+
+def is_rdf_mimetype(content_type):
+    for rdf_mimetype in rdf_mimtetypes:
+        if rdf_mimetype in content_type:
+            return True
+    return False
+
+# Process data for aggregation
+def process_data(data, proxy_key):
+    for entry in data:
+        url = entry.get("url", "")
+        downloads = entry.get("downloads", {})
+        formats_correct = set()
+
+        for format, details in downloads.items():
+            # Extract details
+            status_code = details.get("status_code")
+            parsed_triples = details.get("parsed_triples", 0)
+            content_length = details.get("content_lenght_measured", 0)
+            content_type = details.get("content_type", "").lower() if details.get("content_type") else None
+            uri_in_subject_position = details.get("uri_in_subject_position", False)
+            rapper_error = details.get("rapper_error")
+            error = details.get("error", {})
+
+            # Check for errors and categorize them
+            if error and error.get("type"):
+                error_type = error["type"]
+                type_more_specific = error.get("type_more_specific")
+                category = map_error_to_category(error_type, type_more_specific)
+                aggregation[proxy_key][category][format] += 1
+                aggregation[proxy_key]["unsuccessful payload retrieval"][format] += 1
+                continue
+
+            # Handle non-200 status codes
+            if status_code != 200:
+                aggregation[proxy_key]["Non-200 HTTP code"][format] += 1
+                aggregation[proxy_key]["unsuccessful payload retrieval"][format] += 1
+                continue
+
+            # Successful request (status code 200)
+            aggregation[proxy_key]["Successful request (code 200)"][format] += 1
+
+            # Categorize successful ontologies
+            if content_length == 0:
+                aggregation[proxy_key]["0 bytes content"][format] += 1
+            elif parsed_triples == 0:
+                aggregation[proxy_key]["no rdf content (0 triples parsable)"][format] += 1
+            elif parsed_triples > 0 and rapper_error:
+                aggregation[proxy_key]["partially parsable rdf-content"][format] += 1
+                if uri_in_subject_position:
+                    aggregation[proxy_key]["pp describes requested ont."][format] += 1
+            elif parsed_triples > 0 and not rapper_error:
+                aggregation[proxy_key]["fully parsable rdf-content"][format] += 1
+                if uri_in_subject_position:
+                    aggregation[proxy_key]["describes requested ont."][format] += 1
+
+                    # Check MIME types only for ontologies that describe the requested ontology
+                    if content_type and is_correct_mimetype(format, content_type):
+                        aggregation[proxy_key]["correct mimetype"][format] += 1
+                        formats_correct.add(format)
+                    elif content_type and is_rdf_mimetype(content_type):
+                        aggregation[proxy_key]["confused RDF mimetype"][format] += 1
+                    else:
+                        aggregation[proxy_key]["no RDF mimetype"][format] += 1
+
+        # Check if ontology is correct for all 3 formats
+        if formats_correct == {"ttl", "nt", "rdfxml"}:
+            aggregation[proxy_key]["correct for all 3 formats"]["all"] += 1
+
+# Function to write aggregation results to TSV file
+def write_to_tsv(filename, proxy_key):
+    with open(filename, 'w', newline='') as tsvfile:
+        writer = csv.writer(tsvfile, delimiter='\t')
+        writer.writerow(["Accessibility Status", "turtle", "ntriples", "rdfxml"])
+        for category in categories:
+            row = [category]
+            for format in ["ttl", "nt", "rdfxml"]:
+                row.append(aggregation[proxy_key][category].get(format, 0))
+            writer.writerow(row)
+        # Write total for "correct for all 3 formats"
+        correct_all = aggregation[proxy_key]["correct for all 3 formats"]["all"]
+        writer.writerow(["correct for all 3 formats", correct_all])
+
+# Process both datasets
+process_data(no_proxy_data, "w/o proxy")
+process_data(proxy_data, "with proxy")
+
+# Write results to TSV files
+write_to_tsv('no_proxy_results.tsv', "w/o proxy")
+write_to_tsv('proxy_results.tsv', "with proxy")
+
+# Print the table
+table_headers = ["Accessibility Status", "turtle", "ntriples", "rdfxml"]
+for proxy_key in ["w/o proxy", "with proxy"]:
+    print(f"\nRequested format {proxy_key}")
+    print(f"{table_headers[0]:<40} {table_headers[1]:<10} {table_headers[2]:<10} {table_headers[3]:<10}")
+    for category in categories:
+        row = [category]
+        for format in ["ttl", "nt", "rdfxml"]:
+            row.append(aggregation[proxy_key][category].get(format, 0))
+        print(f"{row[0]:<40} {row[1]:<10} {row[2]:<10} {row[3]:<10}")
+    # Print total for "correct for all 3 formats"
+    correct_all = aggregation[proxy_key]["correct for all 3 formats"]["all"]
+    print(f"{'correct for all 3 formats':<40} {correct_all:<10}")
diff --git a/experiments/download_ontologies.py b/experiments/download_ontologies.py
index 032bdda..34190f9 100644
--- a/experiments/download_ontologies.py
+++ b/experiments/download_ontologies.py
@@ -58,18 +58,26 @@ def download_ontology(url, formats, base_folder):
     headers = {
         "Accept": "",
     }
-
-    session = requests.Session()
-    session.max_redirects = 10
-    retries = Retry(total=0, backoff_factor=1, status_forcelist=[427]) # wanted to use for 429 originally, but backoff is als applied to connection timeouts and such
-    session.mount('http://', HTTPAdapter(max_retries=retries))
-    session.mount('https://', HTTPAdapter(max_retries=retries))
+
+    proxies = {
+        "http": f"http://localhost:8898",
+        "https": f"http://localhost:8898",
+    }
+
+    cacert_path = "ca-cert.pem"
+
+    # session = requests.Session()
+    # session.max_redirects = 10
+    # retries = Retry(total=0, backoff_factor=1, status_forcelist=[427]) # wanted to use for 429 originally, but backoff is als applied to connection timeouts and such
+    # session.mount('http://', HTTPAdapter(max_retries=retries))
+    # session.mount('https://', HTTPAdapter(max_retries=retries))
 
     for format_name, mime_type in formats.items():
         try:
             headers["Accept"] = mime_type
             start_time = time.time()
-            response = session.get(url, headers=headers, timeout=10)
+            #response = session.get(url, proxies=proxies, headers=headers, verify=cacert_path, timeout=10)
+            response = requests.get(url, headers=headers, timeout=10)
             request_duration = time.time() - start_time
 
             file_path = ""
diff --git a/experiments/parse_ontologies.py b/experiments/parse_ontologies.py
index 0c53144..d101633 100644
--- a/experiments/parse_ontologies.py
+++ b/experiments/parse_ontologies.py
@@ -17,10 +17,19 @@ def is_uri_in_subject(triples, ontology_uri):
         subject_pattern = re.compile(rf"^<{re.escape(ontology_uri)}>")
         return any(subject_pattern.match(triple) for triple in triples)
 
+    def format_error_message(error_message):
+        lines = error_message.splitlines()
+        if len(lines) > 20:
+            return "\n".join(lines[:10] + ["\n\n\n............\n\n\n"] + lines[-10:])
+        return error_message
+
     # Load the JSON file
     with open(json_file_path, 'r') as f:
         ontologies = json.load(f)
 
+    base_folder = os.path.dirname(json_file_path)
+    input_base_folder = os.path.basename(base_folder)
+
     for ontology in ontologies:
         ontology_url = ontology["url"]
         print(f'URL: {ontology_url}')
@@ -33,7 +42,9 @@ def is_uri_in_subject(triples, ontology_uri):
                 format_data["uri_in_subject_position"] = None
                 format_data["rapper_error"] = None
             elif file_path and status_code == 200:
-                file_path = file_path.replace('downloads_proxy-test', 'downloads_proxy-fixedCA')
+                file_path_parts = file_path.split(os.sep)
+                file_path_parts[0] = input_base_folder
+                file_path = os.sep.join(file_path_parts)
                 # Prepare the command
                 command = [
                     "cat",
@@ -56,22 +67,19 @@ def is_uri_in_subject(triples, ontology_uri):
                         text=True
                     )
 
+                    # Check the result and update the JSON
+                    output = result.stdout
+                    triples = output.splitlines()
+                    num_triples = output.count("\n")
+                    uri_in_subject = is_uri_in_subject(triples, ontology_url)
+                    format_data["uri_in_subject_position"] = uri_in_subject
+                    format_data["parsed_triples"] = num_triples
 
-                    # Check the result and update the JSON
                     if result.returncode == 0:
-                        output = result.stdout
-                        triples = output.splitlines()
-                        num_triples = output.count("\n")
-
-                        uri_in_subject = is_uri_in_subject(triples, ontology_url)
-                        format_data["uri_in_subject_position"] = uri_in_subject
-                        format_data["parsed_triples"] = num_triples
 
                         format_data["rapper_error"] = None
                     else:
-                        format_data["parsed_triples"] = 0
-                        format_data["uri_in_subject_position"] = False
-                        format_data["rapper_error"] = result.stderr.strip()
+                        format_data["rapper_error"] = format_error_message(result.stderr.strip())
 
                 except Exception as e:
                     format_data["parsed_triples"] = 0
@@ -84,8 +92,8 @@ def is_uri_in_subject(triples, ontology_uri):
 
 if __name__ == "__main__":
     # Replace these paths with your actual file paths
-    input_json_path = "downloads_proxy-fixedCA/download_nt_proxy_log.json"
-    output_json_path = "downloads_proxy-fixedCA/download_nt_proxy_log_extended.json"
+    input_json_path = "downloads_direct_requests/download_log.json"
+    output_json_path = "downloads_direct_requests/download_log_fixshort.json"
 
     if os.path.exists(input_json_path):
         process_ontologies(input_json_path, output_json_path)