Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Clean up parsers imports #1959

Merged
merged 2 commits into from
Feb 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 3 additions & 9 deletions py/core/parsers/media/audio_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import tempfile
from typing import AsyncGenerator

from litellm import atranscription

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -25,15 +27,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config
try:
from litellm import atranscription

self.atranscription = atranscription
except ImportError:
logger.error("Failed to import LiteLLM transcription")
raise ImportError(
"Please install the `litellm` package to use the AudioParser."
)
self.atranscription = atranscription

async def ingest( # type: ignore
self, data: bytes, **kwargs
Expand Down
13 changes: 3 additions & 10 deletions py/core/parsers/media/doc_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from io import BytesIO
from typing import AsyncGenerator

import olefile

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -23,16 +25,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
import olefile

self.olefile = olefile
except ImportError:
raise ImportError(
"Error: 'olefile' is required to run DOCParser. "
"Please install it using pip: pip install olefile"
)
self.olefile = olefile

async def ingest(
self, data: str | bytes, **kwargs
Expand Down
12 changes: 3 additions & 9 deletions py/core/parsers/media/docx_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from io import BytesIO
from typing import AsyncGenerator

from docx import Document

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -21,15 +23,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
from docx import Document

self.Document = Document
except ImportError:
raise ValueError(
"Error, `python-docx` is required to run `DOCXParser`. Please install it using `pip install python-docx`."
)
self.Document = Document

async def ingest(self, data: str | bytes, *args, **kwargs) -> AsyncGenerator[str, None]: # type: ignore
"""Ingest DOCX data and yield text from each paragraph."""
Expand Down
21 changes: 6 additions & 15 deletions py/core/parsers/media/img_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
from io import BytesIO
from typing import AsyncGenerator

import pillow_heif
from PIL import Image

from core.base.abstractions import GenerationConfig
from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
Expand All @@ -26,21 +29,9 @@ def __init__(
self.llm_provider = llm_provider
self.config = config
self.vision_prompt_text = None

try:
import pillow_heif # for HEIC support
from litellm import supports_vision
from PIL import Image

self.supports_vision = supports_vision
self.Image = Image
self.pillow_heif = pillow_heif
self.pillow_heif.register_heif_opener()
except ImportError as e:
logger.error(f"Failed to import required packages: {str(e)}")
raise ImportError(
"Please install the required packages: litellm, Pillow, pillow-heif"
)
self.Image = Image
self.pillow_heif = pillow_heif
self.pillow_heif.register_heif_opener()

def _is_heic(self, data: bytes) -> bool:
"""More robust HEIC detection using magic numbers and patterns."""
Expand Down
13 changes: 4 additions & 9 deletions py/core/parsers/media/odt_parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# type: ignore
import xml.etree.ElementTree as ET
import zipfile
from typing import AsyncGenerator

from core.base.parsers.base_parser import AsyncParser
Expand All @@ -19,15 +21,8 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
import xml.etree.ElementTree as ET
import zipfile

self.zipfile = zipfile
self.ET = ET
except ImportError:
raise ImportError("XML parsing libraries not available")
self.zipfile = zipfile
self.ET = ET

async def ingest(
self, data: str | bytes, **kwargs
Expand Down
30 changes: 2 additions & 28 deletions py/core/parsers/media/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,17 @@
import asyncio
import base64
import logging
import os
import string
import tempfile
import time
import unicodedata
import uuid
from io import BytesIO
from typing import AsyncGenerator

# Third-party imports
import aiofiles
from pdf2image import convert_from_bytes, convert_from_path
from pdf2image.exceptions import PDFInfoNotInstalledError
from PIL import Image
from pypdf import PdfReader

# Local application imports
from core.base.abstractions import GenerationConfig
Expand Down Expand Up @@ -46,16 +43,6 @@ def __init__(
self.config = config
self.vision_prompt_text = None

try:
from litellm import supports_vision

self.supports_vision = supports_vision
except ImportError:
logger.error("Failed to import LiteLLM vision support")
raise ImportError(
"Please install the litellm package to use the VLMPDFParser."
)

async def convert_pdf_to_images(
self, data: str | bytes
) -> list[Image.Image]:
Expand Down Expand Up @@ -229,14 +216,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config
try:
from pypdf import PdfReader

self.PdfReader = PdfReader
except ImportError:
raise ValueError(
"Error, `pypdf` is required to run `PyPDFParser`. Please install it using `pip install pypdf`."
)
self.PdfReader = PdfReader

async def ingest(
self, data: str | bytes, **kwargs
Expand Down Expand Up @@ -293,12 +273,6 @@ def __init__(

except ImportError as e:
logger.error("PDFParserUnstructured ImportError : ", e)
logger.error(
"""Please install missing modules using :
pip install unstructured unstructured_pytesseract unstructured_inference
pip install pdfplumber matplotlib pillow_heif toml
"""
)

async def ingest(
self,
Expand Down
13 changes: 3 additions & 10 deletions py/core/parsers/media/ppt_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from io import BytesIO
from typing import AsyncGenerator

import olefile

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -23,16 +25,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
import olefile

self.olefile = olefile
except ImportError:
raise ImportError(
"Error: 'olefile' is required to run PPTParser. "
"Please install it using pip: pip install olefile"
)
self.olefile = olefile

def _extract_text_from_record(self, data: bytes) -> str:
"""Extract text from a PPT text record."""
Expand Down
11 changes: 3 additions & 8 deletions py/core/parsers/media/pptx_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from io import BytesIO
from typing import AsyncGenerator

from pptx import Presentation

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -22,14 +24,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config
try:
from pptx import Presentation

self.Presentation = Presentation
except ImportError:
raise ValueError(
"Error, `python-pptx` is required to run `PPTXParser`. Please install it using `pip install python-pptx`."
)
self.Presentation = Presentation

async def ingest(self, data: str | bytes, **kwargs) -> AsyncGenerator[str, None]: # type: ignore
"""Ingest PPT data and yield text from each slide."""
Expand Down
13 changes: 3 additions & 10 deletions py/core/parsers/media/rtf_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# type: ignore
from typing import AsyncGenerator

from striprtf.striprtf import rtf_to_text

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -21,16 +23,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
from striprtf.striprtf import rtf_to_text

self.striprtf = rtf_to_text
except ImportError:
raise ImportError(
"Error: 'striprtf' is required to run RTFParser. "
"Please install it using pip: pip install striprtf"
)
self.striprtf = rtf_to_text

async def ingest(
self, data: str | bytes, **kwargs
Expand Down
13 changes: 3 additions & 10 deletions py/core/parsers/structured/epub_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import logging
from typing import AsyncGenerator

import epub

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -24,16 +26,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
import epub

self.epub = epub
except ImportError:
raise ImportError(
"Error: 'epub' is required to run EPUBParser. "
"Please install it using pip: pip install epub"
)
self.epub = epub

def _safe_get_metadata(self, book, field: str) -> str | None:
"""Safely extract metadata field from epub book."""
Expand Down
13 changes: 3 additions & 10 deletions py/core/parsers/structured/msg_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# type: ignore
from typing import AsyncGenerator

import extract_msg

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -21,16 +23,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
import extract_msg

self.extract_msg = extract_msg
except ImportError:
raise ImportError(
"Error: 'extract-msg' is required to run MSGParser. "
"Please install it using pip: pip install extract-msg"
)
self.extract_msg = extract_msg

async def ingest(
self, data: str | bytes, **kwargs
Expand Down
13 changes: 3 additions & 10 deletions py/core/parsers/structured/org_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# type: ignore
from typing import AsyncGenerator

import orgparse

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -21,16 +23,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
import orgparse

self.orgparse = orgparse
except ImportError:
raise ImportError(
"Error: 'orgparse' is required to run ORGParser. "
"Please install it using pip: pip install orgparse"
)
self.orgparse = orgparse

def _process_node(self, node) -> list[str]:
"""Process an org-mode node and return its content."""
Expand Down
Loading
Loading