diff --git a/ontologytimemachine/custom_proxy.py b/ontologytimemachine/custom_proxy.py index d47cca5..a685474 100644 --- a/ontologytimemachine/custom_proxy.py +++ b/ontologytimemachine/custom_proxy.py @@ -1,5 +1,7 @@ from proxy.http.proxy import HttpProxyBasePlugin from proxy.http import httpHeaders +import gzip +from io import BytesIO from proxy.http.parser import HttpParser from proxy.common.utils import build_http_response from ontologytimemachine.utils.mock_responses import ( @@ -40,7 +42,7 @@ def __init__(self, *args, **kwargs): def before_upstream_connection(self, request: HttpParser) -> HttpParser | None: # self.client.config = QUOTE_NONE - logger.info("Before upstream connection hook") + logger.info("Before upstream connection hook") logger.info(f"Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}") wrapped_request = HttpRequestWrapper(request) @@ -66,10 +68,13 @@ def before_upstream_connection(self, request: HttpParser) -> HttpParser | None: config = self.client.config else: logger.info("Using the proxy configuration") - config = self.config - + config = self.config + if wrapped_request.is_connect_request(): logger.info(f"Handling CONNECT request: configured HTTPS interception mode: {config.httpsInterception}") + # Mark if there is a connect request + if not hasattr(self.client, "mark_connect"): + self.client.mark_connect = True # Check whether to allow CONNECT requests since they can impose a security risk if not do_block_CONNECT_request(config): @@ -80,13 +85,15 @@ def before_upstream_connection(self, request: HttpParser) -> HttpParser | None: return None response = get_response_from_request(wrapped_request, config) - if response: + if response.status_code: + logger.info(response.status_code) self.queue_response(response) return None return request def do_intercept(self, _request: HttpParser) -> bool: + logger.info('Do intercept hook') wrapped_request = 
HttpRequestWrapper(_request) # Check if any config was provided via the authentication parameters @@ -94,6 +101,7 @@ def do_intercept(self, _request: HttpParser) -> bool: if hasattr(self.client, "config"): logger.info("Using the configuration from the Auth") config = self.client.config + logger.info(f'Config: {config}') else: logger.info("Using the proxy configuration") config = self.config @@ -125,6 +133,21 @@ def handle_client_request(self, request: HttpParser) -> HttpParser: logger.info("Handle client request hook") logger.info(f"Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}") + wrapped_request = HttpRequestWrapper(request) + if (wrapped_request.is_head_request() or wrapped_request.is_get_request()) and hasattr(self.client, "mark_connect"): + if self.client.mark_connect: + if hasattr(self.client, "config"): + logger.info("Using the configuration from the Auth") + config = self.client.config + else: + logger.info("Using the proxy configuration") + config = self.config + response = get_response_from_request(wrapped_request, config) + if response.status_code: + logger.info(response.status_code) + self.queue_response(response) + return None + return request def handle_upstream_chunk(self, chunk: memoryview): @@ -140,7 +163,6 @@ def queue_response(self, response): }, body=response.content, ) - ) diff --git a/ontologytimemachine/utils/config.py b/ontologytimemachine/utils/config.py index 680d4ca..49dc39d 100644 --- a/ontologytimemachine/utils/config.py +++ b/ontologytimemachine/utils/config.py @@ -61,7 +61,7 @@ class OntoVersion(EnumValuePrint): ORIGINAL_FAILOVER_LIVE_LATEST = "originalFailoverLiveLatest" LATEST_ARCHIVED = "latestArchived" TIMESTAMP_ARCHIVED = "timestampArchived" - DEPENDENCY_MANIFEST = "dependencyManifest" + #DEPENDENCY_MANIFEST = "dependencyManifest" class HttpsInterception(EnumValuePrint): @@ -91,7 +91,7 @@ class Config: ontoFormatConf: OntoFormatConfig = 
field(default_factory=OntoFormatConfig) ontoVersion: OntoVersion = OntoVersion.ORIGINAL_FAILOVER_LIVE_LATEST restrictedAccess: bool = False - clientConfigViaProxyAuth: ClientConfigViaProxyAuth = ClientConfigViaProxyAuth.IGNORE + clientConfigViaProxyAuth: ClientConfigViaProxyAuth = ClientConfigViaProxyAuth.REQUIRED httpsInterception: HttpsInterception = HttpsInterception.ALL disableRemovingRedirects: bool = False timestamp: str = "" diff --git a/ontologytimemachine/utils/proxy_logic.py b/ontologytimemachine/utils/proxy_logic.py index 3587a69..af22f4b 100644 --- a/ontologytimemachine/utils/proxy_logic.py +++ b/ontologytimemachine/utils/proxy_logic.py @@ -45,6 +45,7 @@ def do_deny_request_due_non_archivo_ontology_uri(wrapped_request, config): def get_response_from_request(wrapped_request, config): + logger.info('Get response from request') do_deny = do_deny_request_due_non_archivo_ontology_uri(wrapped_request, config) if do_deny: logger.warning( @@ -63,6 +64,7 @@ def get_response_from_request(wrapped_request, config): # apply for current request def evaluate_configuration(wrapped_request, config): authentication_str = wrapped_request.get_authentication_from_request() + logger.info(f'Evaluate configuration, auth str: {authentication_str}') if authentication_str: logger.info("Authentication parameters provided, parsing the configuration.") username, password = authentication_str.split(":") @@ -148,18 +150,18 @@ def request_ontology( allow_redirects = not disableRemovingRedirects try: if wrapped_request.is_head_request(): - response = requests.head( - url=url, headers=headers, allow_redirects=allow_redirects, timeout=5 - ) + response = requests.head(url=url, headers=headers, allow_redirects=allow_redirects, timeout=3) + logger.info(response.content) + logger.info(response.status_code) else: - response = requests.get( - url=url, headers=headers, allow_redirects=allow_redirects, timeout=5 - ) + response = requests.get(url=url, headers=headers, 
allow_redirects=allow_redirects, timeout=3) + logger.info(response.content) + logger.info(response.status_code) logger.info("Successfully fetched ontology") return response except Exception as e: logger.error(f"Error fetching original ontology: {e}") - return mock_response_404() + return None # change the function definition and pass only the config @@ -189,7 +191,7 @@ def proxy_logic(wrapped_request, config): ) elif config.ontoVersion == OntoVersion.LATEST_ARCHIVED: logger.info('OntoVersion LATEST_ARCHIVED') - response = fetch_latest_archived(wrapped_request, ontology, headers) + response = fetch_latest_archived(wrapped_request, headers) elif config.ontoVersion == OntoVersion.TIMESTAMP_ARCHIVED: logger.info('OntoVersion TIMESTAMP_ARCHIVED') response = fetch_timestamp_archived(wrapped_request, headers, config) @@ -201,10 +203,10 @@ def proxy_logic(wrapped_request, config): # Fetch from the original source, no matter what -def fetch_original(wrapped_request, ontology, headers, disableRemovingRedirects): +def fetch_original(wrapped_request, ontology, headers, config): logger.info(f"Fetching original ontology from URL: {ontology}") return request_ontology( - wrapped_request, ontology, headers, disableRemovingRedirects + wrapped_request, ontology, headers, config.disableRemovingRedirects ) @@ -215,25 +217,29 @@ def fetch_failover(wrapped_request, headers, disableRemovingRedirects): original_response = request_ontology( wrapped_request, ontology, headers, disableRemovingRedirects ) - if original_response.status_code in passthrough_status_codes: - requested_mimetypes_with_priority = parse_accept_header_with_priority( - headers["Accept"] - ) - requested_mimetypes = [x[0] for x in requested_mimetypes_with_priority] - response_mime_type = original_response.headers.get("Content-Type", ";").split( - ";" - )[0] - logger.info(f"Requested mimetypes: {requested_mimetypes}") - logger.info(f"Response mimetype: {response_mime_type}") - if response_mime_type in requested_mimetypes: - 
return original_response + logger.info(f'Original response: {original_response}') + if original_response: + logger.info('Got an original response') + if original_response.status_code in passthrough_status_codes: + requested_mimetypes_with_priority = parse_accept_header_with_priority( + headers["Accept"] + ) + requested_mimetypes = [x[0] for x in requested_mimetypes_with_priority] + response_mime_type = original_response.headers.get("Content-Type", ";").split( + ";" + )[0] + logger.info(f"Requested mimetypes: {requested_mimetypes}") + logger.info(f"Response mimetype: {response_mime_type}") + if response_mime_type in requested_mimetypes: + return original_response + else: + logger.info(f"The returned type is not the same as the requested one") + return fetch_latest_archived(wrapped_request, headers) else: - logger.info(f"The returned type is not the same as the requested one") + logger.info(f"The returned status code is not accepted: {original_response.status_code}") return fetch_latest_archived(wrapped_request, headers) else: - logger.info( - f"The returend status code is not accepted: {original_response.status_code}" - ) + logger.info("No original response") return fetch_latest_archived(wrapped_request, headers) diff --git a/tests/archivo_test_IRIs.tsv b/tests/archivo_test_IRIs.tsv index 1132c24..17c7908 100644 --- a/tests/archivo_test_IRIs.tsv +++ b/tests/archivo_test_IRIs.tsv @@ -1,22 +1,22 @@ -iri error_dimension expected_error iri_type comment -http://buzzword.org.uk/rdf/personal-link-types# content text/html hash weird html instead of text/turtle -http://data.finlex.fi/schema/sfl/ content slash 0 bytes content-length -http://data.bigdatagrapes.eu/resource/ontology/ dns nxdomain slash -http://data.bigdatagrapes.eu/resource/ontology/MeasurementContext dns nxdomain term -http://data.ontotext.com/resource/leak/ http-code 502 slash -http://data.europa.eu/esco/flow http-code 406 slash -http://bdi.si.ehu.es/bdi/ontologies/ExtruOnt/ExtruOnt transport connect-timeout 
slash -http://catalogus-professorum.org/cpm/2/ transport connection-refused slash -http://www.w3.org/1999/02/22-rdf-syntax-ns# None hash -http://xmlns.com/foaf/0.1/ None slash -http://xmlns.com/foaf/0.1/Person None term -http://dbpedia.org/ontology/ None term -http://dbpedia.org/ontology/Person None term -https://bag2.basisregistraties.overheid.nl/bag/def/ http-code 404 slash -https://bag2.basisregistraties.overheid.nl/bag/def/Gebruiksdoel http-code 404 term -https://id.parliament.uk/schema http-code 404 slash slash onto without trailing slash / -https://id.parliament.uk/schema/Approval http-code 404 term slash onto without trailing slash / -https://bmake.th-brandenburg.de/spv# http-code 403 hash -https://bmake.th-brandenburg.de/spv http-code 403 hash just test whether Archivo API is used correctly -https://w3id.org/ttla/ transport cert-expired hash -http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf transport connection-refused hash +enable_testcase iri error_dimension expected_error iri_type comment +1 http://buzzword.org.uk/rdf/personal-link-types# content text/html hash weird html instead of text/turtle +1 http://data.finlex.fi/schema/sfl/ content 0-bytes slash 0 bytes content-length +1 http://data.bigdatagrapes.eu/resource/ontology/ dns nxdomain slash +1 http://data.bigdatagrapes.eu/resource/ontology/MeasurementContext dns nxdomain term +1 http://data.ontotext.com/resource/leak/ http-code 502 slash +1 http://data.europa.eu/esco/flow http-code 406 slash +1 http://bdi.si.ehu.es/bdi/ontologies/ExtruOnt/ExtruOnt transport connect-timeout slash +1 http://catalogus-professorum.org/cpm/2/ transport connection-refused slash +1 http://www.w3.org/1999/02/22-rdf-syntax-ns# None hash +1 http://xmlns.com/foaf/0.1/ None slash +1 http://xmlns.com/foaf/0.1/Person None term +1 http://dbpedia.org/ontology/ None term +1 http://dbpedia.org/ontology/Person None term +0 https://bag2.basisregistraties.overheid.nl/bag/def/ http-code 404 slash +0 
https://bag2.basisregistraties.overheid.nl/bag/def/Gebruiksdoel http-code 404 term +0 https://id.parliament.uk/schema http-code 404 slash slash onto without trailing slash / +0 https://id.parliament.uk/schema/Approval http-code 404 term slash onto without trailing slash / +0 https://bmake.th-brandenburg.de/spv# http-code 403 hash +0 https://bmake.th-brandenburg.de/spv http-code 403 hash just test whether Archivo API is used correctly +0 https://w3id.org/ttla/ transport cert-expired hash +1 http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf transport connection-refused hash diff --git a/tests/non_archivo_test_IRIs.tsv b/tests/non_archivo_test_IRIs.tsv index 86013ac..627c6bf 100644 --- a/tests/non_archivo_test_IRIs.tsv +++ b/tests/non_archivo_test_IRIs.tsv @@ -1,12 +1,12 @@ -iri error_dimension expected_error iri_type comment -https://data.ontotext.com/resource/leak/ http-code 401 https is not ID -https://www.w3.org/1999/02/22-rdf-syntax-ns# None https is not ID -http://example.org None -https://example.org None -http://1.1.1.1 None -https://1.1.1.1 None -https://data.globalchange.gov/gcis.owl http-code "403 " https is not ID -https://data.ordnancesurvey.co.uk/ontology/geometry/ http-code 404 https is not ID -https://data.ordnancesurvey.co.uk/ontology/ http-code 301 https is not ID -https://google.com None - +enable_testcase iri error_dimension expected_error iri_type comment +0 https://data.ontotext.com/resource/leak/ http-code 401 https is not ID +0 https://www.w3.org/1999/02/22-rdf-syntax-ns# None https is not ID +0 http://example.org None +0 https://example.org None +0 http://1.1.1.1 None +0 https://1.1.1.1 None +0 https://data.globalchange.gov/gcis.owl http-code 403 https is not ID +0 https://data.ordnancesurvey.co.uk/ontology/geometry/ http-code 404 https is not ID +0 https://data.ordnancesurvey.co.uk/ontology/ http-code 301 https is not ID +0 https://google.com None +0 \ No newline at end of file diff --git a/tests/test_proxypy_test_proxy_spinup.py 
b/tests/not_test_proxypy_test_proxy_spinup.py similarity index 100% rename from tests/test_proxypy_test_proxy_spinup.py rename to tests/not_test_proxypy_test_proxy_spinup.py diff --git a/tests/test_proxy_auth_header.py b/tests/test_proxy_auth_header.py index a7e37c8..21ee6e6 100644 --- a/tests/test_proxy_auth_header.py +++ b/tests/test_proxy_auth_header.py @@ -3,8 +3,10 @@ import logging import csv from typing import List, Tuple +from unittest.mock import Mock from requests.auth import HTTPBasicAuth from requests.auth import _basic_auth_str +from requests.exceptions import SSLError from ontologytimemachine.custom_proxy import IP, PORT # Proxy settings @@ -12,6 +14,7 @@ HTTP_PROXY = f"http://{PROXY}" HTTPS_PROXY = f"http://{PROXY}" PROXIES = {"http": HTTP_PROXY, "https": HTTPS_PROXY} +CA_CERT_PATH = "ca-cert.pem" logging.basicConfig( level=logging.ERROR, @@ -41,10 +44,31 @@ def make_request_without_proxy(iri: str) -> Tuple[int, str]: try: response = requests.get(iri, timeout=10, headers=headers, allow_redirects=True) return response + except SSLError as e: + mock_response = Mock() + mock_response.status_code = 'ssl-error' + return mock_response + except requests.exceptions.Timeout: + mock_response = Mock() + mock_response.status_code = 'timeout-error' + return mock_response + except requests.exceptions.ConnectionError as e: + if 'NameResolutionError' in str(e): + mock_response = Mock() + mock_response.status_code = 'nxdomain-error' + return mock_response + elif 'Connection refused' in str(e) or 'Errno 111' in str(e): + mock_response = Mock() + mock_response.status_code = 'connection-refused-error' + return mock_response + else: + mock_response = Mock() + mock_response.status_code = 'error' + return mock_response except Exception as e: - # logger.info(f'Error: {e}') - # logger.info('Error with the connection') - return create_fake_response() + mock_response = Mock() + mock_response.status_code = 'error' + return mock_response def make_request_with_proxy(iri: str, 
mode: str) -> Tuple[int, str]: logger.info('Run') @@ -53,57 +77,103 @@ def make_request_with_proxy(iri: str, mode: str) -> Tuple[int, str]: password = "my_password" headers = { "Accept": "text/turtle", + "Accept-Encoding": "identity", "Proxy-Authorization": _basic_auth_str(username, password) } try: - response = requests.get(iri, proxies=PROXIES, headers=headers, allow_redirects=True) + # There is an issue here for https requests + response = requests.get(iri, proxies=PROXIES, verify=CA_CERT_PATH, headers=headers, timeout=10) return response + except SSLError as e: + mock_response = Mock() + mock_response.content = '' + mock_response.status_code = 'ssl-error' + return mock_response + except requests.exceptions.Timeout: + mock_response = Mock() + mock_response.content = '' + mock_response.status_code = 'timeout-error' + return mock_response + except requests.exceptions.ConnectionError as e: + if 'NXDOMAIN' in str(e): + mock_response = Mock() + mock_response.content = '' + mock_response.status_code = 'nxdomain-error' + return mock_response + elif 'Connection refused' in str(e) or 'Errno 111' in str(e): + mock_response = Mock() + mock_response.content = '' + mock_response.status_code = 'connection-refused-error' + return mock_response + else: + mock_response = Mock() + mock_response.content = '' + mock_response.status_code = 'error' + return mock_response except Exception as e: - # logger.info(f'Error: {e}') - # logger.info('Error with the connection') - return create_fake_response() + mock_response = Mock() + mock_response.content = '' + mock_response.status_code = 'error' + return mock_response # Parametrize the test cases with data loaded from the TSV file @pytest.mark.parametrize("test_case", load_test_data('tests/archivo_test_IRIs.tsv')) def test_proxy_responses(test_case): + enabled = test_case['enable_testcase'] + iri = test_case['iri'] error_dimension = test_case['error_dimension'] expected_error = test_case['expected_error'] iri_type = test_case['iri_type'] 
comment = test_case['comment'] + + if enabled == '1': + # Make direct and proxy requests + direct_response = make_request_without_proxy(iri) + proxy_original_response = make_request_with_proxy(iri, 'original') + proxy_failover_response = make_request_with_proxy(iri, 'originalFailoverLiveLatest') + proxy_archivo_latest_response = make_request_with_proxy(iri, 'latestArchived') + + # Evaluation based on error_dimension + if error_dimension == 'http-code': + assert int(expected_error) == direct_response.status_code + assert int(expected_error) == proxy_original_response.status_code + + + elif error_dimension == 'None': + assert direct_response.status_code == 200 + assert proxy_original_response.status_code == 200 + + elif error_dimension == 'content': + if expected_error == 'text/html': + assert direct_response.headers.get('Content-Type') == 'text/html' + assert proxy_original_response.headers.get('Content-Type') == 'text/html' + elif expected_error == '0-bytes': + assert len(direct_response.content) == 0 + assert len(proxy_original_response.content) == 0 + + elif error_dimension == 'dns': + if expected_error == 'nxdomain': + assert direct_response.status_code == 'nxdomain-error' + assert proxy_original_response.status_code == 502 + + elif error_dimension == 'transport': + if expected_error == 'cert-expired': + assert direct_response.status_code == 'ssl-error' + assert proxy_original_response.status_code == 'ssl-error' + elif expected_error == 'connect-timeout': + assert direct_response.status_code == 'timeout-error' + assert proxy_original_response.status_code == 'timeout-error' + elif expected_error == 'connection-refused': + assert direct_response.status_code == 'connection-refused-error' + assert proxy_original_response.status_code == 'connection-refused-error' + + assert 200 == proxy_failover_response.status_code + assert 200 == proxy_archivo_latest_response.status_code + + else: + assert True - # Make direct and proxy requests - direct_response = 
make_request_without_proxy(iri) - proxy_response = make_request_with_proxy(iri, 'original') - #proxy_response = make_request_with_proxy(iri, 'original') - #proxy_response = make_request_with_proxy(iri, 'laters') - #proxy_response = make_request_with_proxy(iri, 'original') - - # Evaluation based on error_dimension - if error_dimension == 'http-code': - logger.info(f"Comparing direct response status code: expected {expected_error}, got {direct_response.status_code}") - assert int(expected_error) == direct_response.status_code - logger.info(f"Comparing proxy response status code: expected {expected_error}, got {proxy_response.status_code}") - assert int(expected_error) == proxy_response.status_code - - elif error_dimension == 'None': - logger.info(f"Comparing direct response status code for 'None' error dimension: expected 200, got {direct_response.status_code}") - assert direct_response.status_code == 200 - logger.info(f"Comparing proxy response status code for 'None' error dimension: expected 200, got {proxy_response.status_code}") - assert proxy_response.status_code == 200 - - elif error_dimension == 'content': - logger.info(f"Comparing direct response content length: expected 0, got {len(direct_response.content)}") - assert len(direct_response.content) == 0 - logger.info(f"Comparing proxy response content length: expected 0, got {len(proxy_response.content)}") - assert len(proxy_response.content) == 0 - - elif error_dimension == 'dns' or error_dimension == 'transport': - logger.info(f"Comparing direct response status code for unknown error dimension: expected 'error', got '{direct_response}'") - assert 'error' == direct_response.status_code - logger.info(f"Comparing proxy response status code for unknown error dimension: expected 'error', got '{proxy_response.status_code}'") - assert 'error' == proxy_response.status_code - if __name__ == "__main__":