Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add async support #109

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
86 changes: 68 additions & 18 deletions llmsherpa/readers/file_reader.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import urllib3
import httpx
import os
import json
import aiofiles
from urllib.parse import urlparse
from llmsherpa.readers import Document

Expand All @@ -24,26 +24,48 @@ def __init__(self, parser_api_url):
API url for LLM Sherpa. Use customer url for your private instance here
"""
self.parser_api_url = parser_api_url
self.download_connection = urllib3.PoolManager()
self.api_connection = urllib3.PoolManager()

def _download_pdf(self, pdf_url):
    """
    Download a pdf over http(s) and package it for upload to the parser.

    Parameters
    ----------
    pdf_url: str
        http or https url of the pdf file

    Returns
    -------
    tuple
        (file_name, file_bytes, 'application/pdf'); file_name is the last path segment of pdf_url

    Raises
    ------
    httpx.HTTPStatusError
        if the final response (after following redirects) is not a success status
    """
    # some servers only allow a browser user agent to download
    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    # add authorization headers if using external API (see upload_pdf for an example)
    download_headers = {"User-Agent": user_agent}

    # httpx does not follow redirects unless asked to, and raise_for_status()
    # treats a 3xx answer as an error, so redirected download links would fail
    # without follow_redirects=True
    with httpx.Client(follow_redirects=True) as client:
        response = client.get(pdf_url, headers=download_headers)
        response.raise_for_status()

    # note you can change the file name here if you'd like to something else
    file_name = os.path.basename(urlparse(pdf_url).path)
    pdf_file = (file_name, response.content, 'application/pdf')
    return pdf_file

async def _download_pdf_async(self, pdf_url):
    """
    Asynchronously download a pdf over http(s) and package it for upload to the parser.

    Parameters
    ----------
    pdf_url: str
        http or https url of the pdf file

    Returns
    -------
    tuple
        (file_name, file_bytes, 'application/pdf'); file_name is the last path segment of pdf_url

    Raises
    ------
    httpx.HTTPStatusError
        if the final response (after following redirects) is not a success status
    """
    # some servers only allow a browser user agent to download
    user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    # add authorization headers if using external API (see upload_pdf for an example)
    download_headers = {"User-Agent": user_agent}

    # httpx does not follow redirects unless asked to, and raise_for_status()
    # treats a 3xx answer as an error, so redirected download links would fail
    # without follow_redirects=True
    async with httpx.AsyncClient(follow_redirects=True) as client:
        response = await client.get(pdf_url, headers=download_headers)
        response.raise_for_status()

    # note you can change the file name here if you'd like to something else
    file_name = os.path.basename(urlparse(pdf_url).path)
    pdf_file = (file_name, response.content, 'application/pdf')
    return pdf_file

def _parse_pdf(self, pdf_file):
    """
    Upload a pdf to the parser API and return the decoded reply with its status.

    Parameters
    ----------
    pdf_file: tuple
        (file_name, file_bytes, content_type) sent as the multipart field 'file'

    Returns
    -------
    tuple
        (body, status_code). body is the decoded JSON reply; when the status is
        above 200 and the reply is not JSON, body is the raw response text instead

    Raises
    ------
    ValueError
        if a reply with status 200 or below cannot be decoded as JSON
    """
    auth_header = {}
    # httpx gives up after 5 seconds without receiving data by default; lift only
    # the read limit so a slow parse is not cut off while an unreachable server
    # still fails quickly
    request_timeout = httpx.Timeout(5.0, read=None)
    with httpx.Client(timeout=request_timeout) as client:
        response = client.post(self.parser_api_url, files={'file': pdf_file}, headers=auth_header)

    try:
        body = response.json()
    except ValueError:
        # an error reply may not be JSON (e.g. an html error page); hand the raw
        # text back so the caller can report it together with the status code
        if response.status_code <= 200:
            raise
        body = response.text
    return body, response.status_code

async def _parse_pdf_async(self, pdf_file):
    """
    Asynchronously upload a pdf to the parser API and return the decoded reply with its status.

    Parameters
    ----------
    pdf_file: tuple
        (file_name, file_bytes, content_type) sent as the multipart field 'file'

    Returns
    -------
    tuple
        (body, status_code). body is the decoded JSON reply; when the status is
        above 200 and the reply is not JSON, body is the raw response text instead

    Raises
    ------
    ValueError
        if a reply with status 200 or below cannot be decoded as JSON
    """
    auth_header = {}
    # httpx gives up after 5 seconds without receiving data by default; lift only
    # the read limit so a slow parse is not cut off while an unreachable server
    # still fails quickly
    request_timeout = httpx.Timeout(5.0, read=None)
    async with httpx.AsyncClient(timeout=request_timeout) as client:
        response = await client.post(self.parser_api_url, files={'file': pdf_file}, headers=auth_header)

    try:
        body = response.json()
    except ValueError:
        # an error reply may not be JSON (e.g. an html error page); hand the raw
        # text back so the caller can report it together with the status code
        if response.status_code <= 200:
            raise
        body = response.text
    return body, response.status_code

def read_pdf(self, path_or_url, contents=None):
"""
Expand All @@ -68,9 +90,37 @@ def read_pdf(self, path_or_url, contents=None):
with open(path_or_url, "rb") as f:
file_data = f.read()
pdf_file = (file_name, file_data, 'application/pdf')
parser_response = self._parse_pdf(pdf_file)
if parser_response.status > 200:
raise ValueError(f"{parser_response.data}")
response_json = json.loads(parser_response.data.decode("utf-8"))
blocks = response_json['return_dict']['result']['blocks']
parser_response_json, parser_response_status = self._parse_pdf(pdf_file)
if parser_response_status > 200:
raise ValueError(f"{parser_response_json}")
blocks = parser_response_json['return_dict']['result']['blocks']
return Document(blocks)

async def read_pdf_async(self, path_or_url, contents=None):
    """
    Coroutine that loads a pdf from memory, a url or a local path and parses it into a Document

    Parameters
    ----------
    path_or_url: str
        path or url to the pdf file e.g. https://someexample.com/myfile.pdf or /home/user/myfile.pdf
    contents: bytes
        contents of the pdf file. When given, nothing is downloaded or read from disk and path_or_url is only used as the file name sent to the parser. Handy when the pdf is already in memory, e.g. in a streamlit or flask app.

    Returns
    -------
    Document
        built from the 'blocks' list of the parser reply

    Raises
    ------
    ValueError
        if the parser answers with a status code above 200
    """
    if contents is not None:
        # bytes were handed in directly
        pdf_file = (path_or_url, contents, 'application/pdf')
    elif urlparse(path_or_url).scheme in ["http", "https"]:
        # remote file: fetch it first
        pdf_file = await self._download_pdf_async(path_or_url)
    else:
        # local file: read it without blocking the event loop
        local_name = os.path.basename(path_or_url)
        async with aiofiles.open(path_or_url, "rb") as pdf_handle:
            raw_bytes = await pdf_handle.read()
        pdf_file = (local_name, raw_bytes, 'application/pdf')

    reply_json, reply_status = await self._parse_pdf_async(pdf_file)
    if reply_status > 200:
        raise ValueError(f"{reply_json}")
    return Document(reply_json['return_dict']['result']['blocks'])
47 changes: 47 additions & 0 deletions llmsherpa/readers/tests/test_file_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import unittest
import asyncio
from llmsherpa.readers.file_reader import LayoutPDFReader
from llmsherpa.readers import Document


class TestFileReader(unittest.TestCase):
    """
    Integration tests for LayoutPDFReader.

    These tests call real services: a parser API must be listening at
    PARSER_API_URL and SAMPLE_PDF_URL must be downloadable.
    """

    PARSER_API_URL = "http://localhost:5001/api/parseDocument"  # Replace with the actual endpoint
    SAMPLE_PDF_URL = "https://getsamplefiles.com/download/pdf/sample-1.pdf"  # Replace with an accessible PDF URL, or mock

    def setUp(self):
        """
        Set up a LayoutPDFReader instance with an actual parser API URL.
        """
        self.parser_api_url = self.PARSER_API_URL
        self.reader = LayoutPDFReader(self.parser_api_url)

    def _assert_parsed_document(self, document):
        """
        Check that the reader returned a Document holding at least one chunk.
        """
        self.assertIsInstance(document, Document)
        self.assertGreater(len(document.chunks()), 0, "Document should contain chunks")

    def test_read_pdf_with_url(self):
        """
        Test reading a PDF from a URL by calling the actual service.
        Ensures that read_pdf successfully returns a Document object when given a valid URL.
        """
        document = self.reader.read_pdf(self.SAMPLE_PDF_URL)
        self._assert_parsed_document(document)

    def test_read_pdf_with_url_async(self):
        """
        Test reading a PDF from a URL asynchronously by calling the actual service.
        Ensures that read_pdf_async successfully returns a Document object when given a valid URL.
        """
        # read_pdf_async is a coroutine function, so drive it to completion with an event loop
        document = asyncio.run(self.reader.read_pdf_async(self.SAMPLE_PDF_URL))
        self._assert_parsed_document(document)

# Run the test cases when this module is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
3 changes: 2 additions & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
license='MIT',
packages=find_packages(),
install_requires=[
"urllib3"
"httpx",
"aiofiles",
],
classifiers=[
'Development Status :: 5 - Production/Stable',
Expand Down