Skip to content

Commit

Permalink
Merge pull request #170 from enoch3712/169-add-markitdown-to-init
Browse files Browse the repository at this point in the history
add markdown DL
  • Loading branch information
enoch3712 authored Dec 30, 2024
2 parents b671989 + 2fcb207 commit db1cce1
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 8 deletions.
2 changes: 1 addition & 1 deletion docs/core-concepts/document-loaders/markitdown.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# MarkItDown Document Loader

MarkItDown is a versatile document processing library that can handle multiple file formats. ExtractThinker's MarkItDown loader provides a robust interface for text extraction with optional vision mode support.
[MarkItDown](https://github.com/microsoft/markitdown) is a versatile document processing library from Microsoft that can handle multiple file formats. ExtractThinker's MarkItDown loader provides a robust interface for text extraction with optional vision mode support.

## Basic Usage

Expand Down
2 changes: 2 additions & 0 deletions extract_thinker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from .document_loader.document_loader_pypdf import DocumentLoaderPyPdf
from .document_loader.document_loader_pdfplumber import DocumentLoaderPdfPlumber
from .document_loader.document_loader_beautiful_soup import DocumentLoaderBeautifulSoup
from .document_loader.document_loader_markitdown import DocumentLoaderMarkItDown
from .models.classification import Classification
from .models.classification_response import ClassificationResponse
from .process import Process
Expand Down Expand Up @@ -44,6 +45,7 @@
'DocumentLoaderAWSTextract',
'DocumentLoaderGoogleDocumentAI',
'DocumentLoaderDocumentAI',
'DocumentLoaderMarkItDown',
'Classification',
'ClassificationResponse',
'Process',
Expand Down
31 changes: 25 additions & 6 deletions extract_thinker/document_loader/document_loader_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,6 @@
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
from extract_thinker.utils import MIME_TYPE_MAPPING

try:
from markitdown import MarkItDown
except ImportError:
raise ImportError("MarkItDown library is not installed. Please install it with 'pip install markitdown'.")

class DocumentLoaderMarkItDown(CachedDocumentLoader):
"""
Document loader that uses MarkItDown to extract content from various file formats.
Expand All @@ -26,8 +21,32 @@ class DocumentLoaderMarkItDown(CachedDocumentLoader):
]

# Constructor: verifies that markitdown is importable, initialises the
# CachedDocumentLoader base with `content` and `cache_ttl`, then stores a
# MarkItDown instance built with the optional `llm_client` / `llm_model`.
def __init__(self, content: Any = None, cache_ttl: int = 300, llm_client=None, llm_model=None):
# Check dependencies before initializing
self._check_dependencies()
super().__init__(content, cache_ttl)
# NOTE(review): the two assignments below look like the removed/added pair of
# this diff hunk (the module-level `from markitdown import MarkItDown` is
# dropped in the same hunk), so only the second, lazily-resolved form is
# presumably present in the committed file - confirm against the repository.
self.markitdown = MarkItDown(llm_client=llm_client, llm_model=llm_model)
self.markitdown = self._get_markitdown()(llm_client=llm_client, llm_model=llm_model)

@staticmethod
def _check_dependencies():
    """Verify that the optional ``markitdown`` package can be imported.

    Raises:
        ImportError: If ``markitdown`` cannot be imported. The original
            import failure is chained as ``__cause__`` so the underlying
            reason (missing package, broken transitive dependency, ...)
            stays visible in the traceback.
    """
    try:
        # Imported purely as an availability probe; the name is not used.
        import markitdown  # noqa: F401
    except ImportError as exc:
        raise ImportError(
            "Could not import markitdown package. "
            "Please install it with `pip install markitdown`."
        ) from exc

def _get_markitdown(self):
    """Import ``MarkItDown`` at call time and return the class.

    The import is deferred to this method so that importing the enclosing
    module does not require ``markitdown`` to be installed.

    Returns:
        The ``MarkItDown`` class itself (not an instance); the caller is
        responsible for instantiating it.

    Raises:
        ImportError: If ``markitdown`` cannot be imported. The original
            import failure is chained as ``__cause__``.
    """
    # Keep the guarded region to the import alone so that nothing else can
    # be mistaken for a missing-dependency error.
    try:
        from markitdown import MarkItDown
    except ImportError as exc:
        raise ImportError(
            "Could not import markitdown python package. "
            "Please install it with `pip install markitdown`."
        ) from exc
    return MarkItDown

@cachedmethod(cache=attrgetter('cache'),
key=lambda self, source: hashkey(source if isinstance(source, str) else source.getvalue(), self.vision_mode))
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "extract_thinker"
version = "0.0.32"
version = "0.0.33"
description = "Library to extract data from files and documents agnostically using LLMs"
authors = ["Júlio Almeida <[email protected]>"]
readme = "README.md"
Expand Down

0 comments on commit db1cce1

Please sign in to comment.