From b708d6a2a8040dfeb99b51ffe70b16a0f3952595 Mon Sep 17 00:00:00 2001
From: Philipp Heller <10487949+hellerphilipp@users.noreply.github.com>
Date: Sun, 10 Nov 2024 12:28:22 +0100
Subject: [PATCH 1/3] Add test for reading PDF from URL in LayoutPDFReader

- Implement test_read_pdf_with_url in test_file_reader.py to verify that LayoutPDFReader can successfully read and parse a PDF from a web URL
- Ensures read_pdf method interacts with the actual API endpoint and returns a valid Document object
---
 llmsherpa/readers/tests/test_file_reader.py | 31 +++++++++++++++++++++
 1 file changed, 31 insertions(+)
 create mode 100644 llmsherpa/readers/tests/test_file_reader.py

diff --git a/llmsherpa/readers/tests/test_file_reader.py b/llmsherpa/readers/tests/test_file_reader.py
new file mode 100644
index 0000000..b250b06
--- /dev/null
+++ b/llmsherpa/readers/tests/test_file_reader.py
@@ -0,0 +1,31 @@
+import unittest
+from llmsherpa.readers.file_reader import LayoutPDFReader
+from llmsherpa.readers import Document
+
+
+class TestFileReader(unittest.TestCase):
+
+    def setUp(self):
+        """
+        Set up a LayoutPDFReader instance with an actual parser API URL.
+        """
+        self.parser_api_url = "http://localhost:5001/api/parseDocument"  # Replace with the actual endpoint
+        self.reader = LayoutPDFReader(self.parser_api_url)
+
+    def test_read_pdf_with_url(self):
+        """
+        Test reading a PDF from a URL by calling the actual service.
+        Ensures that read_pdf successfully returns a Document object when given a valid URL.
+        """
+        # Replace with a real URL to a PDF file that the API can parse
+        pdf_url = "https://getsamplefiles.com/download/pdf/sample-1.pdf"  # Replace with an accessible PDF URL, or mock
+        
+        # Call read_pdf with a URL
+        document = self.reader.read_pdf(pdf_url)
+        
+        # Check if a Document object is returned
+        self.assertIsInstance(document, Document)
+        self.assertGreater(len(document.chunks()), 0, "Document should contain chunks")
+
+if __name__ == '__main__':
+    unittest.main()

From e0dd905441af63c6837487d21107712ebd6d0ebc Mon Sep 17 00:00:00 2001
From: Philipp Heller <10487949+hellerphilipp@users.noreply.github.com>
Date: Sun, 10 Nov 2024 12:30:41 +0100
Subject: [PATCH 2/3] Refactor: Replace urllib3 with httpx for HTTP requests

- Updated _download_pdf and _parse_pdf methods to use httpx for HTTP requests, replacing urllib3 to enable easier implementation of asynchronous operations in the future
- Maintains existing functionality for PDF downloading and parsing, preserving API interactions and response handling
---
 llmsherpa/readers/file_reader.py | 35 ++++++++++++++++----------------
 setup.py                         |  2 +-
 2 files changed, 18 insertions(+), 19 deletions(-)

diff --git a/llmsherpa/readers/file_reader.py b/llmsherpa/readers/file_reader.py
index 08bdc93..2b78502 100644
--- a/llmsherpa/readers/file_reader.py
+++ b/llmsherpa/readers/file_reader.py
@@ -1,6 +1,5 @@
-import urllib3
+import httpx
 import os
-import json
 from urllib.parse import urlparse
 from llmsherpa.readers import Document
 
@@ -24,26 +23,27 @@ def __init__(self, parser_api_url):
                 API url for LLM Sherpa. Use customer url for your private instance here            
         """
         self.parser_api_url = parser_api_url
-        self.download_connection = urllib3.PoolManager()
-        self.api_connection = urllib3.PoolManager()
 
     def _download_pdf(self, pdf_url):
-        
         # some servers only allow browers user_agent to download
         user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
         # add authorization headers if using external API (see upload_pdf for an example)
         download_headers = {"User-Agent": user_agent}
-        download_response = self.download_connection.request("GET", pdf_url, headers=download_headers)
-        file_name = os.path.basename(urlparse(pdf_url).path)
-        # note you can change the file name here if you'd like to something else
-        if download_response.status == 200:
-            pdf_file = (file_name, download_response.data, 'application/pdf')
-        return pdf_file
+
+        with httpx.Client() as client:
+            response = client.get(pdf_url, headers=download_headers)
+            response.raise_for_status()
+
+            # note you can change the file name here if you'd like to something else
+            file_name = os.path.basename(urlparse(pdf_url).path)
+            pdf_file = (file_name, response.content, 'application/pdf')
+            return pdf_file
 
     def _parse_pdf(self, pdf_file):
         auth_header = {}
-        parser_response = self.api_connection.request("POST", self.parser_api_url, fields={'file': pdf_file})
-        return parser_response
+        with httpx.Client() as client:
+            response = client.post(self.parser_api_url, files={'file': pdf_file}, headers=auth_header)
+            return response.json(), response.status_code
 
     def read_pdf(self, path_or_url, contents=None):
         """
@@ -68,9 +68,8 @@ def read_pdf(self, path_or_url, contents=None):
                 with open(path_or_url, "rb") as f:
                     file_data = f.read()
                     pdf_file = (file_name, file_data, 'application/pdf')
-        parser_response = self._parse_pdf(pdf_file)
-        if parser_response.status > 200:
-            raise ValueError(f"{parser_response.data}")
-        response_json = json.loads(parser_response.data.decode("utf-8"))
-        blocks = response_json['return_dict']['result']['blocks']
+        parser_response_json, parser_response_status = self._parse_pdf(pdf_file)
+        if parser_response_status > 200:
+            raise ValueError(f"{parser_response_json}")
+        blocks = parser_response_json['return_dict']['result']['blocks']
         return Document(blocks)
diff --git a/setup.py b/setup.py
index fddcdea..6e68763 100644
--- a/setup.py
+++ b/setup.py
@@ -12,7 +12,7 @@
     license='MIT',
     packages=find_packages(),
     install_requires=[
-        "urllib3"
+        "httpx",
     ],
     classifiers=[
         'Development Status :: 5 - Production/Stable',

From a3e658d2df47cd81014894fc935452c29eccb4b3 Mon Sep 17 00:00:00 2001
From: Philipp Heller <10487949+hellerphilipp@users.noreply.github.com>
Date: Sun, 10 Nov 2024 12:38:02 +0100
Subject: [PATCH 3/3] Add async alternatives for PDF download, reading, and
 parsing

- Introduced async methods _download_pdf_async and _parse_pdf_async for non-blocking HTTP requests using httpx
- Added read_pdf_async alongside read_pdf, supporting asynchronous local file reading with aiofiles and URL downloads
- Added additional unit test for read_pdf_async (with PDF from URL)

Addresses issue #44
---
 llmsherpa/readers/file_reader.py            | 51 +++++++++++++++++++++
 llmsherpa/readers/tests/test_file_reader.py | 16 +++++++
 setup.py                                    |  1 +
 3 files changed, 68 insertions(+)

diff --git a/llmsherpa/readers/file_reader.py b/llmsherpa/readers/file_reader.py
index 2b78502..84d277d 100644
--- a/llmsherpa/readers/file_reader.py
+++ b/llmsherpa/readers/file_reader.py
@@ -1,5 +1,6 @@
 import httpx
 import os
+import aiofiles
 from urllib.parse import urlparse
 from llmsherpa.readers import Document
 
@@ -34,6 +35,21 @@ def _download_pdf(self, pdf_url):
             response = client.get(pdf_url, headers=download_headers)
             response.raise_for_status()
 
+            # note you can change the file name here if you'd like to something else
+            file_name = os.path.basename(urlparse(pdf_url).path)
+            pdf_file = (file_name, response.content, 'application/pdf')
+            return pdf_file 
+
+    async def _download_pdf_async(self, pdf_url):
+        # some servers only allow a browser user agent to download
+        user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
+        # add authorization headers if using external API (see upload_pdf for an example)
+        download_headers = {"User-Agent": user_agent}
+
+        async with httpx.AsyncClient() as client:
+            response = await client.get(pdf_url, headers=download_headers)
+            response.raise_for_status()
+
             # note you can change the file name here if you'd like to something else
             file_name = os.path.basename(urlparse(pdf_url).path)
             pdf_file = (file_name, response.content, 'application/pdf')
@@ -44,6 +60,12 @@ def _parse_pdf(self, pdf_file):
         with httpx.Client() as client:
             response = client.post(self.parser_api_url, files={'file': pdf_file}, headers=auth_header)
             return response.json(), response.status_code
+        
+    async def _parse_pdf_async(self, pdf_file):
+        auth_header = {}
+        async with httpx.AsyncClient() as client:
+            response = await client.post(self.parser_api_url, files={'file': pdf_file}, headers=auth_header)
+            return response.json(), response.status_code
 
     def read_pdf(self, path_or_url, contents=None):
         """
@@ -73,3 +95,32 @@ def read_pdf(self, path_or_url, contents=None):
             raise ValueError(f"{parser_response_json}")
         blocks = parser_response_json['return_dict']['result']['blocks']
         return Document(blocks)
+    
+    async def read_pdf_async(self, path_or_url, contents=None):
+        """
+        Asynchronously reads pdf from a url or path
+
+        Parameters
+        ----------
+        path_or_url: str
+            path or url to the pdf file e.g. https://someexample.com/myfile.pdf or /home/user/myfile.pdf
+        contents: bytes
+            contents of the pdf file. If contents is given, path_or_url is ignored. This is useful when you already have the pdf file contents in memory such as if you are using streamlit or flask.
+        """
+        # file contents were given
+        if contents is not None:
+            pdf_file = (path_or_url, contents, 'application/pdf')
+        else:
+            is_url = (urlparse(path_or_url).scheme in ["http", "https"])
+            if is_url:
+                pdf_file = await self._download_pdf_async(path_or_url)
+            else:
+                file_name = os.path.basename(path_or_url)
+                async with aiofiles.open(path_or_url, "rb") as f:
+                    file_data = await f.read()
+                    pdf_file = (file_name, file_data, 'application/pdf')
+        parser_response_json, parser_response_status = await self._parse_pdf_async(pdf_file)
+        if parser_response_status > 200:
+            raise ValueError(f"{parser_response_json}")
+        blocks = parser_response_json['return_dict']['result']['blocks']
+        return Document(blocks)
diff --git a/llmsherpa/readers/tests/test_file_reader.py b/llmsherpa/readers/tests/test_file_reader.py
index b250b06..0d0ab23 100644
--- a/llmsherpa/readers/tests/test_file_reader.py
+++ b/llmsherpa/readers/tests/test_file_reader.py
@@ -1,4 +1,5 @@
 import unittest
+import asyncio
 from llmsherpa.readers.file_reader import LayoutPDFReader
 from llmsherpa.readers import Document
 
@@ -27,5 +28,20 @@ def test_read_pdf_with_url(self):
         self.assertIsInstance(document, Document)
         self.assertGreater(len(document.chunks()), 0, "Document should contain chunks")
 
+    def test_read_pdf_with_url_async(self):
+        """
+        Test asynchronously reading a PDF from a URL by calling the actual service.
+        Ensures that read_pdf_async successfully returns a Document object when given a valid URL.
+        """
+        # Replace with a real URL to a PDF file that the API can parse
+        pdf_url = "https://getsamplefiles.com/download/pdf/sample-1.pdf"  # Replace with an accessible PDF URL, or mock
+        
+        # Run read_pdf_async with a URL to completion via asyncio.run
+        document = asyncio.run(self.reader.read_pdf_async(pdf_url))
+        
+        # Check if a Document object is returned
+        self.assertIsInstance(document, Document)
+        self.assertGreater(len(document.chunks()), 0, "Document should contain chunks")
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/setup.py b/setup.py
index 6e68763..c514453 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,7 @@
     packages=find_packages(),
     install_requires=[
         "httpx",
+        "aiofiles",
     ],
     classifiers=[
         'Development Status :: 5 - Production/Stable',