
Commit

added missing changes of experiment state
JJ-Author committed Dec 20, 2024
1 parent 0fa85e8 commit 7b962b7
Showing 4 changed files with 259 additions and 29 deletions.
43 changes: 35 additions & 8 deletions experiments/aggregate_results.py
@@ -1,5 +1,6 @@
import json
from collections import defaultdict
import csv

# Content type mapping for each format
content_type_mapping = {
@@ -8,11 +9,18 @@
"rdfxml": ["application/rdf+xml", "application/xml"], # Updated to include multiple accepted types
}

rdf_mimtetypes = ['text/turtle', 'application/n-triples', 'application/rdf+xml', 'application/xml']
# rdf_mimtetypes = ['text/turtle', 'application/n-triples', 'application/rdf+xml', 'application/xml']
rdf_mimtetypes = [
'text/turtle', 'application/n-triples', 'application/rdf+xml', 'application/xml',
'application/owl+xml', 'application/ld+json', 'text/owl-functional', 'text/owl-manchester',
'text/n3', 'application/trig', 'application/x-turtle', 'application/x-trig',
'application/x-nquads' , 'application/n-quads'
]


# File paths for the logs
no_proxy_file_path = 'downloads-200ms-shuffled/download_log_extended.json'
proxy_file_path = 'downloads_proxy_requests/download_proxy_log_extended.json'
no_proxy_file_path = 'downloads-200ms-shuffled/download_log_extended_fixshort.json'
proxy_file_path = 'downloads_proxy_requests/download_proxy_log_extended_fix.json'

# Load the JSON data for no-proxy and with-proxy scenarios
with open(no_proxy_file_path, 'r') as f:
@@ -39,8 +47,9 @@
"0 bytes content",
"no rdf content (0 triples parsable)",
"partially parsable rdf-content",
# "pp describes requested ont.",
"fully parsable rdf-content",
"describes requested ont.",
# "describes requested ont.",
"no RDF mimetype",
"confused RDF mimetype",
"correct mimetype",
@@ -121,12 +130,12 @@ def process_data(data, proxy_key):
aggregation[proxy_key]["no rdf content (0 triples parsable)"][format] += 1
elif parsed_triples > 0 and rapper_error:
aggregation[proxy_key]["partially parsable rdf-content"][format] += 1
if uri_in_subject_position:
aggregation[proxy_key]["describes requested ont."][format] += 1
# if uri_in_subject_position:
# aggregation[proxy_key]["pp describes requested ont."][format] += 1
elif parsed_triples > 0 and not rapper_error:
aggregation[proxy_key]["fully parsable rdf-content"][format] += 1
if uri_in_subject_position:
aggregation[proxy_key]["describes requested ont."][format] += 1
if True:
# aggregation[proxy_key]["describes requested ont."][format] += 1

# Check MIME types only for ontologies that describe the requested ontology
if content_type and is_correct_mimetype(format, content_type):
@@ -141,10 +150,28 @@
if formats_correct == {"ttl", "nt", "rdfxml"}:
aggregation[proxy_key]["correct for all 3 formats"]["all"] += 1

# Function to write aggregation results to TSV file
def write_to_tsv(filename, proxy_key):
with open(filename, 'w', newline='') as tsvfile:
writer = csv.writer(tsvfile, delimiter='\t')
writer.writerow(["Accessibility Status", "turtle", "ntriples", "rdfxml"])
for category in categories:
row = [category]
for format in ["ttl", "nt", "rdfxml"]:
row.append(aggregation[proxy_key][category].get(format, 0))
writer.writerow(row)
# Write total for "correct for all 3 formats"
correct_all = aggregation[proxy_key]["correct for all 3 formats"]["all"]
writer.writerow(["correct for all 3 formats", correct_all])

# Process both datasets
process_data(no_proxy_data, "w/o proxy")
process_data(proxy_data, "with proxy")

# Write results to TSV files
write_to_tsv('no_proxy_results.tsv', "w/o proxy")
write_to_tsv('proxy_results.tsv', "with proxy")

# Print the table
table_headers = ["Accessibility Status", "turtle", "ntriples", "rdfxml"]
for proxy_key in ["w/o proxy", "with proxy"]:
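
A quick sanity check of the two TSV files this commit starts writing (no_proxy_results.tsv and proxy_results.tsv) could look like the following minimal sketch. It is not part of this commit and only assumes the tab-separated layout produced by write_to_tsv above:

import csv

# Minimal read-back sketch (not part of this commit): print the category counts
# per requested format from a results TSV written by write_to_tsv.
def read_results_tsv(path):
    with open(path, newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        header = next(reader)  # ["Accessibility Status", "turtle", "ntriples", "rdfxml"]
        for row in reader:
            category, counts = row[0], row[1:]
            print(f"{category:<40} " + " ".join(f"{c:<10}" for c in counts))

read_results_tsv('no_proxy_results.tsv')
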
187 changes: 187 additions & 0 deletions experiments/aggregate_results_NIRcheck.py
@@ -0,0 +1,187 @@
import json
from collections import defaultdict
import csv

# Content type mapping for each format
content_type_mapping = {
"ttl": "text/turtle",
"nt": "application/n-triples",
"rdfxml": ["application/rdf+xml", "application/xml"], # Updated to include multiple accepted types
}

# rdf_mimtetypes = ['text/turtle', 'application/n-triples', 'application/rdf+xml', 'application/xml']
rdf_mimtetypes = [
'text/turtle', 'application/n-triples', 'application/rdf+xml', 'application/xml',
'application/owl+xml', 'application/ld+json', 'text/owl-functional', 'text/owl-manchester',
'text/n3', 'application/trig', 'application/x-turtle', 'application/x-trig',
'application/x-nquads' , 'application/n-quads'
]


# File paths for the logs
no_proxy_file_path = 'downloads-200ms-shuffled/download_log_extended_fixshort.json'
proxy_file_path = 'downloads_proxy_requests/download_proxy_log_extended_fix.json'

# Load the JSON data for no-proxy and with-proxy scenarios
with open(no_proxy_file_path, 'r') as f:
no_proxy_data = json.load(f)

with open(proxy_file_path, 'r') as f:
proxy_data = json.load(f)

# Initialize the aggregation dictionary for both proxy and no-proxy scenarios
aggregation = {
"w/o proxy": defaultdict(lambda: defaultdict(int)),
"with proxy": defaultdict(lambda: defaultdict(int)),
}

# Define categories for table
categories = [
"unsuccessful payload retrieval",
"DNS issue",
"Con. / transport issue",
"TLS cert issue",
"Too many redirects",
"Non-200 HTTP code",
"Successful request (code 200)",
"0 bytes content",
"no rdf content (0 triples parsable)",
"partially parsable rdf-content",
"pp describes requested ont.",
"fully parsable rdf-content",
"describes requested ont.",
"no RDF mimetype",
"confused RDF mimetype",
"correct mimetype",
"correct for all 3 formats",
]

# Error type to category mapping logic
def map_error_to_category(error_type, type_more_specific):
if error_type == "TooManyRedirects":
return "Too many redirects"
elif error_type == "SSLError":
return "TLS cert issue"
elif error_type == "ConnectionError":
if type_more_specific == "NameResolutionError":
return "DNS issue"
else:
return "Con. / transport issue"
elif error_type == "ConnectTimeout":
return "Con. / transport issue"
else:
return "Con. / transport issue"

# Check if MIME type is valid for the format
def is_correct_mimetype(format, content_type):
expected_types = content_type_mapping.get(format, [])
if isinstance(expected_types, list):
for expected_type in expected_types:
if expected_type in content_type:
return True
return False
return expected_types in content_type

def is_rdf_mimetype(content_type):
for rdf_mimetype in rdf_mimtetypes:
if rdf_mimetype in content_type:
return True
return False

# Process data for aggregation
def process_data(data, proxy_key):
for entry in data:
url = entry.get("url", "")
downloads = entry.get("downloads", {})
formats_correct = set()

for format, details in downloads.items():
# Extract details
status_code = details.get("status_code")
parsed_triples = details.get("parsed_triples", 0)
content_length = details.get("content_lenght_measured", 0)
content_type = details.get("content_type", "").lower() if details.get("content_type") else None
uri_in_subject_position = details.get("uri_in_subject_position", False)
rapper_error = details.get("rapper_error")
error = details.get("error", {})

# Check for errors and categorize them
if error and error.get("type"):
error_type = error["type"]
type_more_specific = error.get("type_more_specific")
category = map_error_to_category(error_type, type_more_specific)
aggregation[proxy_key][category][format] += 1
aggregation[proxy_key]["unsuccessful payload retrieval"][format] += 1
continue

# Handle non-200 status codes
if status_code != 200:
aggregation[proxy_key]["Non-200 HTTP code"][format] += 1
aggregation[proxy_key]["unsuccessful payload retrieval"][format] += 1
continue

# Successful request (status code 200)
aggregation[proxy_key]["Successful request (code 200)"][format] += 1

# Categorize successful ontologies
if content_length == 0:
aggregation[proxy_key]["0 bytes content"][format] += 1
elif parsed_triples == 0:
aggregation[proxy_key]["no rdf content (0 triples parsable)"][format] += 1
elif parsed_triples > 0 and rapper_error:
aggregation[proxy_key]["partially parsable rdf-content"][format] += 1
if uri_in_subject_position:
aggregation[proxy_key]["pp describes requested ont."][format] += 1
elif parsed_triples > 0 and not rapper_error:
aggregation[proxy_key]["fully parsable rdf-content"][format] += 1
if uri_in_subject_position:
aggregation[proxy_key]["describes requested ont."][format] += 1

# Check MIME types only for ontologies that describe the requested ontology
if content_type and is_correct_mimetype(format, content_type):
aggregation[proxy_key]["correct mimetype"][format] += 1
formats_correct.add(format)
elif content_type and is_rdf_mimetype(content_type):
aggregation[proxy_key]["confused RDF mimetype"][format] += 1
else:
aggregation[proxy_key]["no RDF mimetype"][format] += 1

# Check if ontology is correct for all 3 formats
if formats_correct == {"ttl", "nt", "rdfxml"}:
aggregation[proxy_key]["correct for all 3 formats"]["all"] += 1

# Function to write aggregation results to TSV file
def write_to_tsv(filename, proxy_key):
with open(filename, 'w', newline='') as tsvfile:
writer = csv.writer(tsvfile, delimiter='\t')
writer.writerow(["Accessibility Status", "turtle", "ntriples", "rdfxml"])
for category in categories:
row = [category]
for format in ["ttl", "nt", "rdfxml"]:
row.append(aggregation[proxy_key][category].get(format, 0))
writer.writerow(row)
# Write total for "correct for all 3 formats"
correct_all = aggregation[proxy_key]["correct for all 3 formats"]["all"]
writer.writerow(["correct for all 3 formats", correct_all])

# Process both datasets
process_data(no_proxy_data, "w/o proxy")
process_data(proxy_data, "with proxy")

# Write results to TSV files
write_to_tsv('no_proxy_results.tsv', "w/o proxy")
write_to_tsv('proxy_results.tsv', "with proxy")

# Print the table
table_headers = ["Accessibility Status", "turtle", "ntriples", "rdfxml"]
for proxy_key in ["w/o proxy", "with proxy"]:
print(f"\nRequested format {proxy_key}")
print(f"{table_headers[0]:<40} {table_headers[1]:<10} {table_headers[2]:<10} {table_headers[3]:<10}")
for category in categories:
row = [category]
for format in ["ttl", "nt", "rdfxml"]:
row.append(aggregation[proxy_key][category].get(format, 0))
print(f"{row[0]:<40} {row[1]:<10} {row[2]:<10} {row[3]:<10}")
# Print total for "correct for all 3 formats"
correct_all = aggregation[proxy_key]["correct for all 3 formats"]["all"]
print(f"{'correct for all 3 formats':<40} {correct_all:<10}")
22 changes: 15 additions & 7 deletions experiments/download_ontologies.py
@@ -58,18 +58,26 @@ def download_ontology(url, formats, base_folder):
headers = {
"Accept": "",
}

session = requests.Session()
session.max_redirects = 10
retries = Retry(total=0, backoff_factor=1, status_forcelist=[427]) # wanted to use for 429 originally, but backoff is also applied to connection timeouts and such
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))

proxies = {
"http": f"http://localhost:8898",
"https": f"http://localhost:8898",
}

cacert_path = "ca-cert.pem"

# session = requests.Session()
# session.max_redirects = 10
# retries = Retry(total=0, backoff_factor=1, status_forcelist=[427]) # wanted to use for 429 originally, but backoff is also applied to connection timeouts and such
# session.mount('http://', HTTPAdapter(max_retries=retries))
# session.mount('https://', HTTPAdapter(max_retries=retries))

for format_name, mime_type in formats.items():
try:
headers["Accept"] = mime_type
start_time = time.time()
response = session.get(url, headers=headers, timeout=10)
#response = session.get(url, proxies=proxies, headers=headers, verify=cacert_path, timeout=10)
response = requests.get(url, headers=headers, timeout=10)
request_duration = time.time() - start_time

file_path = ""
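
The commented-out session/Retry block above notes that the backoff was also applied to connection timeouts. For reference, a hedged sketch of one way to retry only on selected HTTP status codes while failing immediately on connection and read errors, assuming requests with a recent urllib3; the 429 status code and backoff_factor below are illustrative, not part of this commit:

import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

# Sketch only: retry solely on matching HTTP status codes (e.g. 429), so the
# backoff never applies to transport-level failures.
retries = Retry(
    total=None,              # let the per-type limits below decide
    connect=0,               # fail immediately on connection errors
    read=0,                  # fail immediately on read timeouts
    status=3,                # up to 3 retries on matching status codes
    status_forcelist=[429],
    backoff_factor=1,
    allowed_methods=["GET"],
)
session = requests.Session()
session.max_redirects = 10
session.mount('http://', HTTPAdapter(max_retries=retries))
session.mount('https://', HTTPAdapter(max_retries=retries))
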
36 changes: 22 additions & 14 deletions experiments/parse_ontologies.py
@@ -17,10 +17,19 @@ def is_uri_in_subject(triples, ontology_uri):
subject_pattern = re.compile(rf"^<{re.escape(ontology_uri)}>")
return any(subject_pattern.match(triple) for triple in triples)

def format_error_message(error_message):
lines = error_message.splitlines()
if len(lines) > 20:
return "\n".join(lines[:10] + ["\n\n\n............\n\n\n"] + lines[-10:])
return error_message

# Load the JSON file
with open(json_file_path, 'r') as f:
ontologies = json.load(f)

base_folder = os.path.dirname(json_file_path)
input_base_folder = os.path.basename(base_folder)

for ontology in ontologies:
ontology_url = ontology["url"]
print(f'URL: {ontology_url}')
@@ -33,7 +42,9 @@ def is_uri_in_subject(triples, ontology_uri):
format_data["uri_in_subject_position"] = None
format_data["rapper_error"] = None
elif file_path and status_code == 200:
file_path = file_path.replace('downloads_proxy-test', 'downloads_proxy-fixedCA')
file_path_parts = file_path.split(os.sep)
file_path_parts[0] = input_base_folder
file_path = os.sep.join(file_path_parts)
# Prepare the command
command = [
"cat",
@@ -56,22 +67,19 @@ def is_uri_in_subject(triples, ontology_uri):
text=True
)

# Check the result and update the JSON
output = result.stdout
triples = output.splitlines()
num_triples = output.count("\n")

uri_in_subject = is_uri_in_subject(triples, ontology_url)
format_data["uri_in_subject_position"] = uri_in_subject
format_data["parsed_triples"] = num_triples

# Check the result and update the JSON
if result.returncode == 0:
output = result.stdout
triples = output.splitlines()
num_triples = output.count("\n")

uri_in_subject = is_uri_in_subject(triples, ontology_url)
format_data["uri_in_subject_position"] = uri_in_subject
format_data["parsed_triples"] = num_triples
format_data["rapper_error"] = None
else:
format_data["parsed_triples"] = 0
format_data["uri_in_subject_position"] = False
format_data["rapper_error"] = result.stderr.strip()
format_data["rapper_error"] = format_error_message(result.stderr.strip())

except Exception as e:
format_data["parsed_triples"] = 0
@@ -84,8 +92,8 @@ def is_uri_in_subject(triples, ontology_uri):

if __name__ == "__main__":
# Replace these paths with your actual file paths
input_json_path = "downloads_proxy-fixedCA/download_nt_proxy_log.json"
output_json_path = "downloads_proxy-fixedCA/download_nt_proxy_log_extended.json"
input_json_path = "downloads_direct_requests/download_log.json"
output_json_path = "downloads_direct_requests/download_log_fixshort.json"

if os.path.exists(input_json_path):
process_ontologies(input_json_path, output_json_path)
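
The exact rapper invocation in parse_ontologies.py is truncated in the diff above. A hypothetical reconstruction of that step, assuming raptor2-utils' rapper CLI and N-Triples output; the helper name and arguments are illustrative, with file_path and ontology_url coming from the surrounding script:

import subprocess

# Hypothetical sketch -- the real command is not fully visible in this diff.
def parse_with_rapper(file_path, base_uri, input_syntax="ntriples"):
    """Run rapper on a downloaded file and return (triples, rapper_error)."""
    result = subprocess.run(
        ["rapper", "-i", input_syntax, "-o", "ntriples", file_path, base_uri],
        capture_output=True,
        text=True,
    )
    triples = result.stdout.splitlines()
    error = result.stderr.strip() if result.returncode != 0 else None
    return triples, error
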
