Skip to content

Commit

Permalink
Merge pull request #116 from kuefmz/main
Browse files Browse the repository at this point in the history
fixes for https handling and https IRI tests.
  • Loading branch information
JJ-Author authored Oct 29, 2024
2 parents b8f1c28 + e33e7a5 commit 753c7e1
Show file tree
Hide file tree
Showing 9 changed files with 354 additions and 38 deletions.
48 changes: 41 additions & 7 deletions ontologytimemachine/custom_proxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,17 @@ def before_upstream_connection(self, request: HttpParser) -> HttpParser | None:
logger.info("Before upstcream connection hook")
logger.info(f"Request method: {request.method} - Request host: {request.host} - Request path: {request.path} - Request headers: {request.headers}")
wrapped_request = HttpRequestWrapper(request)

try:
self.client.request_host = wrapped_request.get_request_host()
except:
logger.info('No host')

try:
self.client.request_path = wrapped_request.get_request_path()
except:
logger.info('No path')


if (self.config.clientConfigViaProxyAuth == ClientConfigViaProxyAuth.REQUIRED or self.config.clientConfigViaProxyAuth == ClientConfigViaProxyAuth.OPTIONAL):
logger.info('Setting up config from auth')
Expand Down Expand Up @@ -83,12 +94,20 @@ def before_upstream_connection(self, request: HttpParser) -> HttpParser | None:
else:
logger.info("CONNECT request was blocked due to the configuration")
return None

response = get_response_from_request(wrapped_request, config)
if response.status_code:
logger.info(response.status_code)
self.queue_response(response)
return None

if not wrapped_request.is_connect_request():
logger.info('Skip for the connect request')
if not wrapped_request.get_request_host():
if hasattr(self.client, "request_host"):
wrapped_request.set_request_host(self.client.request_host)
if not wrapped_request.get_request_path():
if hasattr(self.client, "request_path"):
wrapped_request.set_request_path(self.client.request_path)
response = get_response_from_request(wrapped_request, config)
if response.status_code:
logger.info('Queue response from proxy logic')
self.queue_response(response)
return None

return request

Expand Down Expand Up @@ -117,6 +136,12 @@ def do_intercept(self, _request: HttpParser) -> bool:
# this should actually be not triggered as the CONNECT request should have been blocked before
return False
elif config.httpsInterception == HttpsInterception.ARCHIVO:
if not wrapped_request.get_request_host():
if hasattr(self.client, "request_host"):
wrapped_request.set_request_host(self.client.request_host)
if not wrapped_request.get_request_path():
if hasattr(self.client, "request_path"):
wrapped_request.set_request_path(self.client.request_path)
try:
if is_archivo_ontology_request(wrapped_request):
logger.info("Intercepting HTTPS request since it is an Archivo ontology request")
Expand All @@ -135,19 +160,28 @@ def handle_client_request(self, request: HttpParser) -> HttpParser:

wrapped_request = HttpRequestWrapper(request)
if (wrapped_request.is_head_request() or wrapped_request.is_get_request()) and hasattr(self.client, "mark_connect"):
logger.info('HEAD or GET and has mark_connect')
if self.client.mark_connect:
if hasattr(self.client, "config"):
logger.info("Using the configuration from the Auth")
config = self.client.config
else:
logger.info("Using the proxy configuration")
config = self.config
if not wrapped_request.get_request_host():
if hasattr(self.client, "request_host"):
wrapped_request.set_request_host(self.client.request_host)
if not wrapped_request.get_request_path():
if hasattr(self.client, "request_path"):
wrapped_request.set_request_path(self.client.request_path)

response = get_response_from_request(wrapped_request, config)
if response.status_code:
if response and response.status_code:
logger.info(response.status_code)
self.queue_response(response)
return None

logger.info('Return original request')
return request

def handle_upstream_chunk(self, chunk: memoryview):
Expand Down
13 changes: 12 additions & 1 deletion ontologytimemachine/proxy_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ def get_request_path(self) -> str:
@abstractmethod
def set_request_path(self, new_path) -> None:
    """Replace the wrapped request's path with *new_path*.

    Concrete wrappers encode the string form into whatever
    representation the underlying request object uses.
    """
    pass

@abstractmethod
def set_request_host(self, new_host) -> None:
    """Replace the wrapped request's host with *new_host*.

    Concrete wrappers encode the string form into whatever
    representation the underlying request object uses.
    """
    pass

@abstractmethod
def get_request_headers(self) -> Dict[str, str]:
Expand Down Expand Up @@ -77,7 +81,10 @@ def is_https_request(self) -> bool:
).startswith(b"https")

def get_request_host(self) -> str:
return self.request.host.decode("utf-8")
if self.request.host:
return self.request.host.decode("utf-8")
else:
return ""

def get_request_path(self) -> str:
if self.request.path:
Expand All @@ -88,6 +95,10 @@ def get_request_path(self) -> str:
def set_request_path(self, new_path: str) -> None:
    """Overwrite the wrapped request's path with *new_path* (stored UTF-8 encoded)."""
    encoded_path = new_path.encode("utf-8")
    self.request.path = encoded_path
    logger.info(f"Request path set to: {new_path}")

def set_request_host(self, new_host: str) -> None:
    """Overwrite the wrapped request's host with *new_host* (stored UTF-8 encoded)."""
    self.request.host = new_host.encode("utf-8")
    # Bug fix: log message was copy-pasted from set_request_path and
    # incorrectly said "Request path set to" while setting the host.
    logger.info(f"Request host set to: {new_host}")

def get_request_headers(self) -> Dict[str, str]:
headers: Dict[str, str] = {}
Expand Down
23 changes: 12 additions & 11 deletions ontologytimemachine/utils/proxy_logic.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,20 +144,15 @@ def is_archivo_ontology_request(wrapped_request):
return False


def request_ontology(
wrapped_request, url, headers, disableRemovingRedirects=False, timeout=5
):
def request_ontology(wrapped_request, url, headers, disableRemovingRedirects=False, timeout=3):
allow_redirects = not disableRemovingRedirects
logger.info(f'Request parameters: url - {url}, headers - {headers}, allow_redirects - {allow_redirects}')
try:
if wrapped_request.is_head_request():
response = requests.head(url=url, headers=headers, allow_redirects=allow_redirects, timeout=3)
logger.info(response.content)
logger.info(response.status_code)
response = requests.head(url=url, headers=headers, allow_redirects=allow_redirects, timeout=timeout)
else:
response = requests.get(url=url, headers=headers, allow_redirects=allow_redirects, timeout=3)
logger.info(response.content)
logger.info(response.status_code)
logger.info("Successfully fetched ontology")
response = requests.get(url=url, headers=headers, allow_redirects=allow_redirects, timeout=timeout)
logger.info(f"Successfully fetched ontology - status_code: {response.status_code}")
return response
except Exception as e:
logger.error(f"Error fetching original ontology: {e}")
Expand Down Expand Up @@ -255,9 +250,15 @@ def fetch_latest_archived(wrapped_request, headers):
ontology, _, _ = wrapped_request.get_request_url_host_path()
dbpedia_url = f"{archivo_api}?o={ontology}&f={format}"
logger.info(f"Fetching from DBpedia Archivo API: {dbpedia_url}")
response = request_ontology(wrapped_request, dbpedia_url, headers)
if response.status_code != 500:
return response
ontology = ontology.replace('http://', 'https://')
logger.info(f'HTTPS ontology: {ontology}')
dbpedia_url = f"{archivo_api}?o={ontology}&f={format}"
logger.info(f"Fetching from DBpedia Archivo API - https: {dbpedia_url}")
return request_ontology(wrapped_request, dbpedia_url, headers)


def fetch_timestamp_archived(wrapped_request, headers, config):
if not is_archivo_ontology_request(wrapped_request):
logger.info(
Expand Down
103 changes: 102 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ proxy-py = "^2.4.4"
rdflib = "^7.0.0"
werkzeug = "^3.0.4"
schedule = "^1.2.2"
httpx = "^0.27.2"


[build-system]
Expand Down
14 changes: 7 additions & 7 deletions tests/archivo_test_IRIs.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ enable_testcase iri error_dimension expected_error iri_type comment
1 http://xmlns.com/foaf/0.1/Person None term
1 http://dbpedia.org/ontology/ None term
1 http://dbpedia.org/ontology/Person None term
0 https://bag2.basisregistraties.overheid.nl/bag/def/ http-code 404 slash
0 https://bag2.basisregistraties.overheid.nl/bag/def/Gebruiksdoel http-code 404 term
0 https://id.parliament.uk/schema http-code 404 slash slash onto without trailing slash /
0 https://id.parliament.uk/schema/Approval http-code 404 term slash onto without trailing slash /
0 https://bmake.th-brandenburg.de/spv# http-code 403 hash
0 https://bmake.th-brandenburg.de/spv http-code 403 hash just test whether Archivo API is used correctly
0 https://w3id.org/ttla/ transport cert-expired hash
1 https://bag2.basisregistraties.overheid.nl/bag/def/ http-code 404 slash
1 https://bag2.basisregistraties.overheid.nl/bag/def/Gebruiksdoel http-code 404 term
1 https://id.parliament.uk/schema http-code 404 slash slash onto without trailing slash /
1 https://id.parliament.uk/schema/Approval http-code 404 term slash onto without trailing slash /
1 https://bmake.th-brandenburg.de/spv# http-code 403 hash
1 https://bmake.th-brandenburg.de/spv http-code 403 hash just test whether Archivo API is used correctly
1 https://w3id.org/ttla/ transport cert-expired hash
1 http://data-gov.tw.rpi.edu/2009/data-gov-twc.rdf transport connection-refused hash
5 changes: 2 additions & 3 deletions tests/non_archivo_test_IRIs.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,9 @@ enable_testcase iri error_dimension expected_error iri_type comment
0 https://www.w3.org/1999/02/22-rdf-syntax-ns# None https is not ID
0 http://example.org None
0 https://example.org None
0 http://1.1.1.1 None
1 http://1.1.1.1 None
0 https://1.1.1.1 None
0 https://data.globalchange.gov/gcis.owl http-code 403 https is not ID
0 https://data.ordnancesurvey.co.uk/ontology/geometry/ http-code 404 https is not ID
0 https://data.ordnancesurvey.co.uk/ontology/ http-code 301 https is not ID
0 https://google.com None
0
0 https://google.com None
Loading

0 comments on commit 753c7e1

Please sign in to comment.