nlmatics · hellerphilipp · Nov 10, 2024 · Nov 10, 2024 · Nov 10, 2024
diff --git a/llmsherpa/readers/file_reader.py b/llmsherpa/readers/file_reader.py
@@ -1,6 +1,6 @@
-import urllib3
+import httpx
 import os
-import json
+import aiofiles
 from urllib.parse import urlparse
 from llmsherpa.readers import Document
 
@@ -24,26 +24,48 @@ def __init__(self, parser_api_url):
                 API url for LLM Sherpa. Use customer url for your private instance here            
         """
         self.parser_api_url = parser_api_url
-        self.download_connection = urllib3.PoolManager()
-        self.api_connection = urllib3.PoolManager()
 
     def _download_pdf(self, pdf_url):
-
+        """Download pdf_url and return a (file_name, content_bytes, 'application/pdf') tuple."""
         # some servers only allow browers user_agent to download
         user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
         # add authorization headers if using external API (see upload_pdf for an example)
         download_headers = {"User-Agent": user_agent}
-        download_response = self.download_connection.request("GET", pdf_url, headers=download_headers)
-        file_name = os.path.basename(urlparse(pdf_url).path)
-        # note you can change the file name here if you'd like to something else
-        if download_response.status == 200:
-            pdf_file = (file_name, download_response.data, 'application/pdf')
-        return pdf_file
+        # urllib3 followed redirects by default; httpx (>= 0.20) does not, so opt in explicitly
+        with httpx.Client(follow_redirects=True) as client:
+            response = client.get(pdf_url, headers=download_headers)
+            # raises httpx.HTTPStatusError for any non-2xx response instead of returning bad data
+            response.raise_for_status()
+            # note you can change the file name here if you'd like to something else
+            file_name = os.path.basename(urlparse(pdf_url).path)
+            return (file_name, response.content, 'application/pdf')
+
+    async def _download_pdf_async(self, pdf_url):
+        """Asynchronously download pdf_url; return a (file_name, content_bytes, 'application/pdf') tuple."""
+        # some servers only allow a browser user_agent to download
+        user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
+        # add authorization headers if using external API (see upload_pdf for an example)
+        download_headers = {"User-Agent": user_agent}
+        # httpx (>= 0.20) does not follow redirects by default, so opt in explicitly
+        async with httpx.AsyncClient(follow_redirects=True) as client:
+            response = await client.get(pdf_url, headers=download_headers)
+            # raises httpx.HTTPStatusError for any non-2xx response instead of returning bad data
+            response.raise_for_status()
+            # note you can change the file name here if you'd like to something else
+            file_name = os.path.basename(urlparse(pdf_url).path)
+            return (file_name, response.content, 'application/pdf')
 
     def _parse_pdf(self, pdf_file):
         auth_header = {}
-        parser_response = self.api_connection.request("POST", self.parser_api_url, fields={'file': pdf_file})
-        return parser_response
+        with httpx.Client(timeout=None) as client:  # urllib3 had no timeout; httpx's 5 s default is too short for slow parses
+            response = client.post(self.parser_api_url, files={'file': pdf_file}, headers=auth_header)
+            return (response.json() if response.status_code <= 200 else response.text), response.status_code  # error bodies may not be JSON
+
+    async def _parse_pdf_async(self, pdf_file):
+        auth_header = {}
+        async with httpx.AsyncClient(timeout=None) as client:  # httpx's 5 s default timeout is too short for slow parses
+            response = await client.post(self.parser_api_url, files={'file': pdf_file}, headers=auth_header)
+            return (response.json() if response.status_code <= 200 else response.text), response.status_code  # error bodies may not be JSON
 
     def read_pdf(self, path_or_url, contents=None):
         """
@@ -68,9 +90,37 @@ def read_pdf(self, path_or_url, contents=None):
                 with open(path_or_url, "rb") as f:
                     file_data = f.read()
                     pdf_file = (file_name, file_data, 'application/pdf')
-        parser_response = self._parse_pdf(pdf_file)
-        if parser_response.status > 200:
-            raise ValueError(f"{parser_response.data}")
-        response_json = json.loads(parser_response.data.decode("utf-8"))
-        blocks = response_json['return_dict']['result']['blocks']
+        parser_response_json, parser_response_status = self._parse_pdf(pdf_file)
+        if parser_response_status > 200:
+            raise ValueError(f"{parser_response_json}")
+        blocks = parser_response_json['return_dict']['result']['blocks']
+        return Document(blocks)
+
+    async def read_pdf_async(self, path_or_url, contents=None):
+        """
+        Asynchronously reads pdf from a url or path and returns a Document built from the parser's blocks
+
+        Parameters
+        ----------
+        path_or_url: str
+            path or url to the pdf file e.g. https://someexample.com/myfile.pdf or /home/user/myfile.pdf
+        contents: bytes
+            contents of the pdf file. If contents is given, path_or_url is ignored. This is useful when you already have the pdf file contents in memory such as if you are using streamlit or flask.
+        """
+        # file contents were given; path_or_url is only used as the uploaded file name
+        if contents is not None:
+            pdf_file = (path_or_url, contents, 'application/pdf')
+        else:
+            is_url = (urlparse(path_or_url).scheme in ["http", "https"])
+            if is_url:
+                pdf_file = await self._download_pdf_async(path_or_url)
+            else:
+                file_name = os.path.basename(path_or_url)
+                async with aiofiles.open(path_or_url, "rb") as f:
+                    file_data = await f.read()
+                    pdf_file = (file_name, file_data, 'application/pdf')
+        parser_response_json, parser_response_status = await self._parse_pdf_async(pdf_file)
+        if parser_response_status > 200:  # any status above 200 is reported as a ValueError carrying the response body
+            raise ValueError(f"{parser_response_json}")
+        blocks = parser_response_json['return_dict']['result']['blocks']
         return Document(blocks)
diff --git a/llmsherpa/readers/tests/test_file_reader.py b/llmsherpa/readers/tests/test_file_reader.py
@@ -0,0 +1,47 @@
+import unittest
+import asyncio
+from llmsherpa.readers.file_reader import LayoutPDFReader
+from llmsherpa.readers import Document
+
+
+class TestFileReader(unittest.TestCase):
+
+    def setUp(self):
+        """
+        Create a LayoutPDFReader pointed at a parser API endpoint on localhost (no mocking is done here).
+        """
+        self.parser_api_url = "http://localhost:5001/api/parseDocument"  # Replace with the actual endpoint
+        self.reader = LayoutPDFReader(self.parser_api_url)
+
+    def test_read_pdf_with_url(self):
+        """
+        Test reading a PDF from a URL by calling the actual service (needs the PDF URL and parser API to be reachable).
+        Ensures that read_pdf successfully returns a Document object when given a valid URL.
+        """
+        # Replace with a real URL to a PDF file that the API can parse
+        pdf_url = "https://getsamplefiles.com/download/pdf/sample-1.pdf"  # Replace with an accessible PDF URL, or mock
+
+        # Call read_pdf with a URL
+        document = self.reader.read_pdf(pdf_url)
+
+        # Check if a Document object is returned
+        self.assertIsInstance(document, Document)
+        self.assertGreater(len(document.chunks()), 0, "Document should contain chunks")
+
+    def test_read_pdf_with_url_async(self):
+        """
+        Test reading a PDF from a URL asynchronously by calling the actual service (same reachability needs as above).
+        Ensures that read_pdf_async successfully returns a Document object when given a valid URL.
+        """
+        # Replace with a real URL to a PDF file that the API can parse
+        pdf_url = "https://getsamplefiles.com/download/pdf/sample-1.pdf"  # Replace with an accessible PDF URL, or mock
+
+        # Call read_pdf_async with a URL, running the coroutine to completion with asyncio.run
+        document = asyncio.run(self.reader.read_pdf_async(pdf_url))
+
+        # Check if a Document object is returned
+        self.assertIsInstance(document, Document)
+        self.assertGreater(len(document.chunks()), 0, "Document should contain chunks")
+
+if __name__ == '__main__':  # allow running this test module directly as a script
+    unittest.main()
diff --git a/setup.py b/setup.py
@@ -12,7 +12,8 @@
     license='MIT',
     packages=find_packages(),
     install_requires=[
-        "urllib3"
+        "httpx",
+        "aiofiles",
     ],
     classifiers=[
         'Development Status :: 5 - Production/Stable',