From 967ecba9cdb668f1b2bb94328f29e7a6b640379d Mon Sep 17 00:00:00 2001 From: Jenifer Tabita Ciuciu-Kiss Date: Mon, 21 Oct 2024 02:15:48 +0200 Subject: [PATCH] update path based on that is in Archivo --- .gitignore | 3 ++ ontologytimemachine/proxy_wrapper.py | 22 ++++++++--- ontologytimemachine/utils/proxy_logic.py | 48 +++++++++++++++--------- 3 files changed, 51 insertions(+), 22 deletions(-) diff --git a/.gitignore b/.gitignore index fc45164..72a7aac 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,9 @@ __pycache__/ *$py.class *.pem +ontologytimemachine/utils/archivo_ontologies_download.txt +ontologytimemachine/utils/archivo_ontologies_hash.txt + # C extensions *.so diff --git a/ontologytimemachine/proxy_wrapper.py b/ontologytimemachine/proxy_wrapper.py index c881238..37d6e74 100644 --- a/ontologytimemachine/proxy_wrapper.py +++ b/ontologytimemachine/proxy_wrapper.py @@ -39,6 +39,10 @@ def get_request_host(self) -> str: def get_request_path(self) -> str: pass + @abstractmethod + def set_request_path(self, new_path) -> None: + pass + @abstractmethod def get_request_headers(self) -> Dict[str, str]: pass @@ -84,6 +88,10 @@ def get_request_host(self) -> str: def get_request_path(self) -> str: return self.request.path.decode("utf-8") + def set_request_path(self, new_path: str) -> None: + self.request.path = new_path.encode("utf-8") + logger.info(f"Request path set to: {new_path}") + def get_request_headers(self) -> Dict[str, str]: headers: Dict[str, str] = {} for k, v in self.request.headers.items(): @@ -100,16 +108,20 @@ def set_request_accept_header(self, mime_type: str) -> None: def get_request_url_host_path(self) -> Tuple[str, str, str]: logger.info("Get ontology from request") - if (self.is_get_request or self.is_head_request) and not self.request.host: + if ( + (self.is_get_request or self.is_head_request) + and not self.request.host + and not self.get_request_host() + ): for k, v in self.request.headers.items(): if v[0].decode("utf-8") == "Host": host = 
v[1].decode("utf-8") - path = self.request.path.decode("utf-8") + path = self.get_request_path() url = f"https://{host}{path}" else: - host = self.request.host.decode("utf-8") - path = self.request.path.decode("utf-8") - url = str(self.request._url) + host = self.get_request_host() + path = self.get_request_path() + url = f"http://{host}{path}" logger.info(f"Ontology: {url}") return url, host, path diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py index fe79e45..62bebd5 100644 --- a/ontologytimemachine/utils/proxy_logic.py +++ b/ontologytimemachine/utils/proxy_logic.py @@ -109,6 +109,7 @@ def is_archivo_ontology_request(wrapped_request): if request_path.endswith("/"): request_path = request_path.rstrip("/") if (request_host, request_path) in ARCHIVO_PARSED_URLS: + wrapped_request.set_request_path(request_path) logger.info(f"Requested URL: {request_host+request_path} is in Archivo") return True @@ -117,17 +118,27 @@ def is_archivo_ontology_request(wrapped_request): path_parts = request_path.split("/") new_path = "/".join(path_parts[:-1]) - if ((request_host, new_path) in ARCHIVO_PARSED_URLS) or ( - (request_host, new_path + "/") in ARCHIVO_PARSED_URLS - ): - logger.info(f"Requested URL: {request_host+request_path} is in Archivo") + if (request_host, new_path) in ARCHIVO_PARSED_URLS: + wrapped_request.set_request_path(new_path) + logger.info(f"Requested URL: {request_host+new_path} is in Archivo") + return True + + new_path = new_path + "/" + if (request_host, new_path) in ARCHIVO_PARSED_URLS: + wrapped_request.set_request_path(new_path) + logger.info(f"Requested URL: {request_host+new_path} is in Archivo") return True new_path = "/".join(path_parts[:-2]) - if ((request_host, new_path) in ARCHIVO_PARSED_URLS) or ( - (request_host, new_path + "/") in ARCHIVO_PARSED_URLS - ): - logger.info(f"Requested URL: {request_host+request_path} is in Archivo") + if (request_host, new_path) in ARCHIVO_PARSED_URLS: + 
wrapped_request.set_request_path(new_path) logger.info(f"Requested URL: {request_host+new_path} is in Archivo") return True + + new_path = new_path + "/" + if (request_host, new_path) in ARCHIVO_PARSED_URLS: + wrapped_request.set_request_path(new_path) + logger.info(f"Requested URL: {request_host+new_path} is in Archivo") return True logger.info(f"Requested URL: {request_host+request_path} is NOT in Archivo") @@ -140,7 +151,7 @@ def request_ontology(url, headers, disableRemovingRedirects=False, timeout=5): response = requests.get( url=url, headers=headers, allow_redirects=allow_redirects, timeout=5 ) - logger.info("Successfully fetched original ontology") + logger.info("Successfully fetched ontology") return response except Exception as e: logger.error(f"Error fetching original ontology: {e}") @@ -154,7 +165,6 @@ def proxy_logic(wrapped_request, config): set_onto_format_headers(wrapped_request, config) headers = wrapped_request.get_request_headers() - ontology, _, _ = wrapped_request.get_request_url_host_path() # if the requested format is not in Archivo and the ontoVersion is not original # we can stop because the archivo request will not go through @@ -164,15 +174,16 @@ return mock_response_500 if config.ontoVersion == OntoVersion.ORIGINAL: + ontology, _, _ = wrapped_request.get_request_url_host_path() response = fetch_original(ontology, headers, config) elif config.ontoVersion == OntoVersion.ORIGINAL_FAILOVER_LIVE_LATEST: response = fetch_failover( - wrapped_request, ontology, headers, config.disableRemovingRedirects + wrapped_request, headers, config.disableRemovingRedirects ) elif config.ontoVersion == OntoVersion.LATEST_ARCHIVED: - response = fetch_latest_archived(wrapped_request, ontology, headers) + response = fetch_latest_archived(wrapped_request, headers) elif config.ontoVersion == OntoVersion.LATEST_ARCHIVED: - response = fetch_timestamp_archived(wrapped_request, ontology, headers, config) + response = fetch_timestamp_archived(wrapped_request, headers, config) #
Commenting the manifest related part because it is not supported in the current version # elif ontoVersion == 'dependencyManifest': # response = fetch_dependency_manifest(ontology, headers, manifest) @@ -187,7 +198,8 @@ def fetch_original(ontology, headers, disableRemovingRedirects): # Failover mode -def fetch_failover(wrapped_request, ontology, headers, disableRemovingRedirects): +def fetch_failover(wrapped_request, headers, disableRemovingRedirects): + ontology, _, _ = wrapped_request.get_request_url_host_path() logger.info(f"Fetching original ontology with failover from URL: {ontology}") original_response = request_ontology(ontology, headers, disableRemovingRedirects) if original_response.status_code in passthrough_status_codes: @@ -204,16 +216,16 @@ def fetch_failover(wrapped_request, ontology, headers, disableRemovingRedirects) return original_response else: logging.info(f"The returned type is not the same as the requested one") - return fetch_latest_archived(wrapped_request, ontology, headers) + return fetch_latest_archived(wrapped_request, headers) else: logger.info( - f"The returend status code is not accepted: {original_response.status_code}" + f"The returned status code is not accepted: {original_response.status_code}" ) - return fetch_latest_archived(wrapped_request, ontology, headers) + return fetch_latest_archived(wrapped_request, headers) -# Fetch the lates version from archivo (no timestamp defined) +# Fetch the latest version from archivo (no timestamp defined) -def fetch_latest_archived(wrapped_request, ontology, headers): +def fetch_latest_archived(wrapped_request, headers): if not is_archivo_ontology_request(wrapped_request): logger.info( "Data needs to be fetched from Archivo, but ontology is not available on Archivo."
@@ -221,12 +233,13 @@ def fetch_latest_archived(wrapped_request, ontology, headers): return mock_response_404() logger.info("Fetch latest archived") format = get_format_from_accept_header(headers) + ontology, _, _ = wrapped_request.get_request_url_host_path() dbpedia_url = f"{archivo_api}?o={ontology}&f={format}" logger.info(f"Fetching from DBpedia Archivo API: {dbpedia_url}") return request_ontology(dbpedia_url, headers) -def fetch_timestamp_archived(wrapped_request, ontology, headers, config): +def fetch_timestamp_archived(wrapped_request, headers, config): if not is_archivo_ontology_request(wrapped_request): logger.info( "Data needs to be fetched from Archivo, but ontology is not available on Archivo." @@ -234,6 +247,7 @@ def fetch_timestamp_archived(wrapped_request, ontology, headers, config): return mock_response_404() logger.info("Fetch archivo timestamp") format = get_format_from_accept_header(headers) + ontology, _, _ = wrapped_request.get_request_url_host_path() dbpedia_url = f"{archivo_api}?o={ontology}&f={format}&v={config.timestamp}" logger.info(f"Fetching from DBpedia Archivo API: {dbpedia_url}") return request_ontology(dbpedia_url, headers)