From b708d6a2a8040dfeb99b51ffe70b16a0f3952595 Mon Sep 17 00:00:00 2001 From: Philipp Heller <10487949+hellerphilipp@users.noreply.github.com> Date: Sun, 10 Nov 2024 12:28:22 +0100 Subject: [PATCH 1/3] Add test for reading PDF from URL in LayoutPDFReader - Implement test_read_pdf_with_url in test_file_reader.py to verify that LayoutPDFReader can successfully read and parse a PDF from a web URL - Ensures read_pdf method interacts with the actual API endpoint and returns a valid Document object --- llmsherpa/readers/tests/test_file_reader.py | 31 +++++++++++++++++++++ 1 file changed, 31 insertions(+) create mode 100644 llmsherpa/readers/tests/test_file_reader.py diff --git a/llmsherpa/readers/tests/test_file_reader.py b/llmsherpa/readers/tests/test_file_reader.py new file mode 100644 index 0000000..b250b06 --- /dev/null +++ b/llmsherpa/readers/tests/test_file_reader.py @@ -0,0 +1,31 @@ +import unittest +from llmsherpa.readers.file_reader import LayoutPDFReader +from llmsherpa.readers import Document + + +class TestFileReader(unittest.TestCase): + + def setUp(self): + """ + Set up a LayoutPDFReader instance with an actual parser API URL. + """ + self.parser_api_url = "http://localhost:5001/api/parseDocument" # Replace with the actual endpoint + self.reader = LayoutPDFReader(self.parser_api_url) + + def test_read_pdf_with_url(self): + """ + Test reading a PDF from a URL by calling the actual service. + Ensures that read_pdf successfully returns a Document object when given a valid URL. 
+ """ + # Replace with a real URL to a PDF file that the API can parse + pdf_url = "https://getsamplefiles.com/download/pdf/sample-1.pdf" # Replace with an accessible PDF URL, or mock + + # Call read_pdf with a URL + document = self.reader.read_pdf(pdf_url) + + # Check if a Document object is returned + self.assertIsInstance(document, Document) + self.assertGreater(len(document.chunks()), 0, "Document should contain chunks") + +if __name__ == '__main__': + unittest.main() From e0dd905441af63c6837487d21107712ebd6d0ebc Mon Sep 17 00:00:00 2001 From: Philipp Heller <10487949+hellerphilipp@users.noreply.github.com> Date: Sun, 10 Nov 2024 12:30:41 +0100 Subject: [PATCH 2/3] Refactor: Replace urllib3 with httpx for HTTP requests - Updated _download_pdf and _parse_pdf methods to use httpx for HTTP requests, replacing urllib3 to enable easier implementation of asynchronous operations in the future - Maintains existing functionality for PDF downloading and parsing, preserving API interactions and response handling --- llmsherpa/readers/file_reader.py | 35 ++++++++++++++++---------------- setup.py | 2 +- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/llmsherpa/readers/file_reader.py b/llmsherpa/readers/file_reader.py index 08bdc93..2b78502 100644 --- a/llmsherpa/readers/file_reader.py +++ b/llmsherpa/readers/file_reader.py @@ -1,6 +1,5 @@ -import urllib3 +import httpx import os -import json from urllib.parse import urlparse from llmsherpa.readers import Document @@ -24,26 +23,27 @@ def __init__(self, parser_api_url): API url for LLM Sherpa. 
Use customer url for your private instance here """ self.parser_api_url = parser_api_url - self.download_connection = urllib3.PoolManager() - self.api_connection = urllib3.PoolManager() def _download_pdf(self, pdf_url): - # some servers only allow browers user_agent to download user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36" # add authorization headers if using external API (see upload_pdf for an example) download_headers = {"User-Agent": user_agent} - download_response = self.download_connection.request("GET", pdf_url, headers=download_headers) - file_name = os.path.basename(urlparse(pdf_url).path) - # note you can change the file name here if you'd like to something else - if download_response.status == 200: - pdf_file = (file_name, download_response.data, 'application/pdf') - return pdf_file + + with httpx.Client() as client: + response = client.get(pdf_url, headers=download_headers) + response.raise_for_status() + + # note you can change the file name here if you'd like to something else + file_name = os.path.basename(urlparse(pdf_url).path) + pdf_file = (file_name, response.content, 'application/pdf') + return pdf_file def _parse_pdf(self, pdf_file): auth_header = {} - parser_response = self.api_connection.request("POST", self.parser_api_url, fields={'file': pdf_file}) - return parser_response + with httpx.Client() as client: + response = client.post(self.parser_api_url, files={'file': pdf_file}, headers=auth_header) + return response.json(), response.status_code def read_pdf(self, path_or_url, contents=None): """ @@ -68,9 +68,8 @@ def read_pdf(self, path_or_url, contents=None): with open(path_or_url, "rb") as f: file_data = f.read() pdf_file = (file_name, file_data, 'application/pdf') - parser_response = self._parse_pdf(pdf_file) - if parser_response.status > 200: - raise ValueError(f"{parser_response.data}") - response_json = json.loads(parser_response.data.decode("utf-8")) - blocks = 
response_json['return_dict']['result']['blocks'] + parser_response_json, parser_response_status = self._parse_pdf(pdf_file) + if parser_response_status > 200: + raise ValueError(f"{parser_response_json}") + blocks = parser_response_json['return_dict']['result']['blocks'] return Document(blocks) diff --git a/setup.py b/setup.py index fddcdea..6e68763 100644 --- a/setup.py +++ b/setup.py @@ -12,7 +12,7 @@ license='MIT', packages=find_packages(), install_requires=[ - "urllib3" + "httpx", ], classifiers=[ 'Development Status :: 5 - Production/Stable', From a3e658d2df47cd81014894fc935452c29eccb4b3 Mon Sep 17 00:00:00 2001 From: Philipp Heller <10487949+hellerphilipp@users.noreply.github.com> Date: Sun, 10 Nov 2024 12:38:02 +0100 Subject: [PATCH 3/3] Add async alternatives for PDF download, reading, and parsing - Introduced async methods _download_pdf_async and _parse_pdf_async for non-blocking HTTP requests using httpx - Added read_pdf_async alongside read_pdf, supporting asynchronous local file reading with aiofiles and URL downloads - Added additional unit test for read_pdf_async (with PDF from URL) Addresses issue #44 --- llmsherpa/readers/file_reader.py | 51 +++++++++++++++++++++ llmsherpa/readers/tests/test_file_reader.py | 16 +++++++ setup.py | 1 + 3 files changed, 68 insertions(+) diff --git a/llmsherpa/readers/file_reader.py b/llmsherpa/readers/file_reader.py index 2b78502..84d277d 100644 --- a/llmsherpa/readers/file_reader.py +++ b/llmsherpa/readers/file_reader.py @@ -1,5 +1,6 @@ import httpx import os +import aiofiles from urllib.parse import urlparse from llmsherpa.readers import Document @@ -34,6 +35,21 @@ def _download_pdf(self, pdf_url): response = client.get(pdf_url, headers=download_headers) response.raise_for_status() + # note you can change the file name here if you'd like to something else + file_name = os.path.basename(urlparse(pdf_url).path) + pdf_file = (file_name, response.content, 'application/pdf') + return pdf_file + + async def 
_download_pdf_async(self, pdf_url): + # some servers only allow browsers user_agent to download + user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36" + # add authorization headers if using external API (see upload_pdf for an example) + download_headers = {"User-Agent": user_agent} + + async with httpx.AsyncClient() as client: + response = await client.get(pdf_url, headers=download_headers) + response.raise_for_status() + # note you can change the file name here if you'd like to something else file_name = os.path.basename(urlparse(pdf_url).path) pdf_file = (file_name, response.content, 'application/pdf') @@ -44,6 +60,12 @@ def _parse_pdf(self, pdf_file): with httpx.Client() as client: response = client.post(self.parser_api_url, files={'file': pdf_file}, headers=auth_header) return response.json(), response.status_code + + async def _parse_pdf_async(self, pdf_file): + auth_header = {} + async with httpx.AsyncClient() as client: + response = await client.post(self.parser_api_url, files={'file': pdf_file}, headers=auth_header) + return response.json(), response.status_code def read_pdf(self, path_or_url, contents=None): """ @@ -73,3 +95,32 @@ def read_pdf(self, path_or_url, contents=None): raise ValueError(f"{parser_response_json}") blocks = parser_response_json['return_dict']['result']['blocks'] return Document(blocks) + + async def read_pdf_async(self, path_or_url, contents=None): + """ + Asynchronously reads pdf from a url or path + + Parameters + ---------- + path_or_url: str + path or url to the pdf file e.g. https://someexample.com/myfile.pdf or /home/user/myfile.pdf + contents: bytes + contents of the pdf file. If contents is given, path_or_url is ignored. This is useful when you already have the pdf file contents in memory such as if you are using streamlit or flask. 
+ """ + # file contents were given + if contents is not None: + pdf_file = (path_or_url, contents, 'application/pdf') + else: + is_url = (urlparse(path_or_url).scheme in ["http", "https"]) + if is_url: + pdf_file = await self._download_pdf_async(path_or_url) + else: + file_name = os.path.basename(path_or_url) + async with aiofiles.open(path_or_url, "rb") as f: + file_data = await f.read() + pdf_file = (file_name, file_data, 'application/pdf') + parser_response_json, parser_response_status = await self._parse_pdf_async(pdf_file) + if parser_response_status > 200: + raise ValueError(f"{parser_response_json}") + blocks = parser_response_json['return_dict']['result']['blocks'] + return Document(blocks) diff --git a/llmsherpa/readers/tests/test_file_reader.py b/llmsherpa/readers/tests/test_file_reader.py index b250b06..0d0ab23 100644 --- a/llmsherpa/readers/tests/test_file_reader.py +++ b/llmsherpa/readers/tests/test_file_reader.py @@ -1,4 +1,5 @@ import unittest +import asyncio from llmsherpa.readers.file_reader import LayoutPDFReader from llmsherpa.readers import Document @@ -27,5 +28,20 @@ def test_read_pdf_with_url(self): self.assertIsInstance(document, Document) self.assertGreater(len(document.chunks()), 0, "Document should contain chunks") + def test_read_pdf_with_url_async(self): + """ + Test asynchronously reading a PDF from a URL by calling the actual service. + Ensures that read_pdf_async successfully returns a Document object when given a valid URL. 
+ """ + # Replace with a real URL to a PDF file that the API can parse + pdf_url = "https://getsamplefiles.com/download/pdf/sample-1.pdf" # Replace with an accessible PDF URL, or mock + + # Call read_pdf_async with a URL + document = asyncio.run(self.reader.read_pdf_async(pdf_url)) + + # Check if a Document object is returned + self.assertIsInstance(document, Document) + self.assertGreater(len(document.chunks()), 0, "Document should contain chunks") + if __name__ == '__main__': unittest.main() diff --git a/setup.py b/setup.py index 6e68763..c514453 100644 --- a/setup.py +++ b/setup.py @@ -13,6 +13,7 @@ packages=find_packages(), install_requires=[ "httpx", + "aiofiles", ], classifiers=[ 'Development Status :: 5 - Production/Stable',