Skip to content

Commit

Permalink
Clean up parsers imports
Browse files: browse the repository at this point in the history
  • Loading branch information
NolanTrem committed Feb 11, 2025
1 parent b83348c commit 73bb973
Show file tree
Hide file tree
Showing 28 changed files with 84 additions and 775 deletions.
12 changes: 3 additions & 9 deletions py/core/parsers/media/audio_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
import tempfile
from typing import AsyncGenerator

from litellm import atranscription

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -25,15 +27,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config
try:
from litellm import atranscription

self.atranscription = atranscription
except ImportError:
logger.error("Failed to import LiteLLM transcription")
raise ImportError(
"Please install the `litellm` package to use the AudioParser."
)
self.atranscription = atranscription

async def ingest( # type: ignore
self, data: bytes, **kwargs
Expand Down
13 changes: 3 additions & 10 deletions py/core/parsers/media/doc_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from io import BytesIO
from typing import AsyncGenerator

import olefile

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -23,16 +25,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
import olefile

self.olefile = olefile
except ImportError:
raise ImportError(
"Error: 'olefile' is required to run DOCParser. "
"Please install it using pip: pip install olefile"
)
self.olefile = olefile

async def ingest(
self, data: str | bytes, **kwargs
Expand Down
12 changes: 3 additions & 9 deletions py/core/parsers/media/docx_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from io import BytesIO
from typing import AsyncGenerator

from docx import Document

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -21,15 +23,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
from docx import Document

self.Document = Document
except ImportError:
raise ValueError(
"Error, `python-docx` is required to run `DOCXParser`. Please install it using `pip install python-docx`."
)
self.Document = Document

async def ingest(self, data: str | bytes, *args, **kwargs) -> AsyncGenerator[str, None]: # type: ignore
"""Ingest DOCX data and yield text from each paragraph."""
Expand Down
21 changes: 6 additions & 15 deletions py/core/parsers/media/img_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@
from io import BytesIO
from typing import AsyncGenerator

import pillow_heif
from PIL import Image

from core.base.abstractions import GenerationConfig
from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
Expand All @@ -26,21 +29,9 @@ def __init__(
self.llm_provider = llm_provider
self.config = config
self.vision_prompt_text = None

try:
import pillow_heif # for HEIC support
from litellm import supports_vision
from PIL import Image

self.supports_vision = supports_vision
self.Image = Image
self.pillow_heif = pillow_heif
self.pillow_heif.register_heif_opener()
except ImportError as e:
logger.error(f"Failed to import required packages: {str(e)}")
raise ImportError(
"Please install the required packages: litellm, Pillow, pillow-heif"
)
self.Image = Image
self.pillow_heif = pillow_heif
self.pillow_heif.register_heif_opener()

def _is_heic(self, data: bytes) -> bool:
"""More robust HEIC detection using magic numbers and patterns."""
Expand Down
13 changes: 4 additions & 9 deletions py/core/parsers/media/odt_parser.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
# type: ignore
import xml.etree.ElementTree as ET
import zipfile
from typing import AsyncGenerator

from core.base.parsers.base_parser import AsyncParser
Expand All @@ -19,15 +21,8 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
import xml.etree.ElementTree as ET
import zipfile

self.zipfile = zipfile
self.ET = ET
except ImportError:
raise ImportError("XML parsing libraries not available")
self.zipfile = zipfile
self.ET = ET

async def ingest(
self, data: str | bytes, **kwargs
Expand Down
30 changes: 2 additions & 28 deletions py/core/parsers/media/pdf_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,20 +4,17 @@
import asyncio
import base64
import logging
import os
import string
import tempfile
import time
import unicodedata
import uuid
from io import BytesIO
from typing import AsyncGenerator

# Third-party imports
import aiofiles
from pdf2image import convert_from_bytes, convert_from_path
from pdf2image.exceptions import PDFInfoNotInstalledError
from PIL import Image
from pypdf import PdfReader

# Local application imports
from core.base.abstractions import GenerationConfig
Expand Down Expand Up @@ -46,16 +43,6 @@ def __init__(
self.config = config
self.vision_prompt_text = None

try:
from litellm import supports_vision

self.supports_vision = supports_vision
except ImportError:
logger.error("Failed to import LiteLLM vision support")
raise ImportError(
"Please install the litellm package to use the VLMPDFParser."
)

async def convert_pdf_to_images(
self, data: str | bytes
) -> list[Image.Image]:
Expand Down Expand Up @@ -229,14 +216,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config
try:
from pypdf import PdfReader

self.PdfReader = PdfReader
except ImportError:
raise ValueError(
"Error, `pypdf` is required to run `PyPDFParser`. Please install it using `pip install pypdf`."
)
self.PdfReader = PdfReader

async def ingest(
self, data: str | bytes, **kwargs
Expand Down Expand Up @@ -293,12 +273,6 @@ def __init__(

except ImportError as e:
logger.error("PDFParserUnstructured ImportError : ", e)
logger.error(
"""Please install missing modules using :
pip install unstructured unstructured_pytesseract unstructured_inference
pip install pdfplumber matplotlib pillow_heif toml
"""
)

async def ingest(
self,
Expand Down
13 changes: 3 additions & 10 deletions py/core/parsers/media/ppt_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
from io import BytesIO
from typing import AsyncGenerator

import olefile

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -23,16 +25,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
import olefile

self.olefile = olefile
except ImportError:
raise ImportError(
"Error: 'olefile' is required to run PPTParser. "
"Please install it using pip: pip install olefile"
)
self.olefile = olefile

def _extract_text_from_record(self, data: bytes) -> str:
"""Extract text from a PPT text record."""
Expand Down
11 changes: 3 additions & 8 deletions py/core/parsers/media/pptx_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
from io import BytesIO
from typing import AsyncGenerator

from pptx import Presentation

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -22,14 +24,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config
try:
from pptx import Presentation

self.Presentation = Presentation
except ImportError:
raise ValueError(
"Error, `python-pptx` is required to run `PPTXParser`. Please install it using `pip install python-pptx`."
)
self.Presentation = Presentation

async def ingest(self, data: str | bytes, **kwargs) -> AsyncGenerator[str, None]: # type: ignore
"""Ingest PPT data and yield text from each slide."""
Expand Down
13 changes: 3 additions & 10 deletions py/core/parsers/media/rtf_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# type: ignore
from typing import AsyncGenerator

from striprtf.striprtf import rtf_to_text

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -21,16 +23,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
from striprtf.striprtf import rtf_to_text

self.striprtf = rtf_to_text
except ImportError:
raise ImportError(
"Error: 'striprtf' is required to run RTFParser. "
"Please install it using pip: pip install striprtf"
)
self.striprtf = rtf_to_text

async def ingest(
self, data: str | bytes, **kwargs
Expand Down
13 changes: 3 additions & 10 deletions py/core/parsers/structured/epub_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import logging
from typing import AsyncGenerator

import epub

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -24,16 +26,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
import epub

self.epub = epub
except ImportError:
raise ImportError(
"Error: 'epub' is required to run EPUBParser. "
"Please install it using pip: pip install epub"
)
self.epub = epub

def _safe_get_metadata(self, book, field: str) -> str | None:
"""Safely extract metadata field from epub book."""
Expand Down
13 changes: 3 additions & 10 deletions py/core/parsers/structured/msg_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# type: ignore
from typing import AsyncGenerator

import extract_msg

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -21,16 +23,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
import extract_msg

self.extract_msg = extract_msg
except ImportError:
raise ImportError(
"Error: 'extract-msg' is required to run MSGParser. "
"Please install it using pip: pip install extract-msg"
)
self.extract_msg = extract_msg

async def ingest(
self, data: str | bytes, **kwargs
Expand Down
13 changes: 3 additions & 10 deletions py/core/parsers/structured/org_parser.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
# type: ignore
from typing import AsyncGenerator

import orgparse

from core.base.parsers.base_parser import AsyncParser
from core.base.providers import (
CompletionProvider,
Expand All @@ -21,16 +23,7 @@ def __init__(
self.database_provider = database_provider
self.llm_provider = llm_provider
self.config = config

try:
import orgparse

self.orgparse = orgparse
except ImportError:
raise ImportError(
"Error: 'orgparse' is required to run ORGParser. "
"Please install it using pip: pip install orgparse"
)
self.orgparse = orgparse

def _process_node(self, node) -> list[str]:
"""Process an org-mode node and return its content."""
Expand Down
Loading

0 comments on commit 73bb973

Please sign in to comment.