Skip to content

Commit

Permalink
Initial commit: PDF to Markdown converter with Gradio interface
Browse files Browse the repository at this point in the history
  • Loading branch information
siyryu committed Dec 5, 2024
1 parent 9f18b52 commit 09981f4
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 0 deletions.
23 changes: 23 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# PDF to Markdown Converter

A simple Gradio web application that converts PDF files to Markdown format using the Marker library.

## Features

- Upload PDF files through a user-friendly interface
- Extract text content from PDF documents
- Display extracted text with copy functionality
- Real-time progress tracking

## Requirements

- Python 3.x
- Marker library (via GitHub)
- Gradio

## Installation

1. Clone the repository
2. Install dependencies:
```bash
pip install -r requirements.txt
pip install gradio
```
47 changes: 47 additions & 0 deletions app.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import gradio as gr
from marker.converters.pdf import PdfConverter
from marker.models import create_model_dict
from marker.output import text_from_rendered
import os


# Marker's models are expensive to load; keep one copy per process.
_MODEL_CACHE = {}


def _get_model_dict():
    """Return Marker's model dictionary, building it on the first call only."""
    if "artifacts" not in _MODEL_CACHE:
        _MODEL_CACHE["artifacts"] = create_model_dict()
    return _MODEL_CACHE["artifacts"]


def process_pdf(pdf_file, progress=gr.Progress()):
    """Convert an uploaded PDF to Markdown text for the Gradio UI.

    Args:
        pdf_file: Value delivered by the ``gr.File`` input. May be ``None``
            (nothing uploaded), an object exposing the file path as
            ``.name``, or a plain path string.
        progress: Gradio progress tracker, called with a fraction and a
            description at each stage.

    Returns:
        A ``(text, status)`` tuple: the extracted text (or a short error
        message) and a status string for the status box.
    """
    if pdf_file is None:
        return "Please upload a PDF file", "Waiting for file..."

    # Accept both file-like wrappers (path in ``.name``) and bare path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)

    # Validate the path before loading any models so a bad upload fails fast.
    if not os.path.exists(pdf_path):
        return "File not found", "Error"

    progress(0, desc="正在初始化...")
    converter = PdfConverter(
        artifact_dict=_get_model_dict(),
    )

    progress(0.2, desc="正在加载PDF文件...")

    progress(0.4, desc="正在转换PDF...")
    rendered = converter(pdf_path)

    progress(0.6, desc="正在处理页面内容...")
    progress(0.8, desc="正在提取文本...")
    # Only the text is shown in the UI; the remaining outputs are not used.
    text, *_ = text_from_rendered(rendered)

    progress(1.0, desc="完成!")
    return text, "处理完成"


# Build the single-page UI: a PDF upload, a status line, and the rendered text.
with gr.Blocks() as demo:
    # Page heading and short usage hint.
    gr.Markdown("# PDF Text Extractor")
    gr.Markdown("Upload a PDF file to extract its text content")

    with gr.Column():
        pdf_input = gr.File(
            label="Upload PDF",
            file_types=[".pdf"],
        )
        status_output = gr.Textbox(
            label="状态",
            value="等待上传文件...",
            interactive=False,
        )
        text_output = gr.Markdown(
            label="提取的文本",
            show_copy_button=True,
        )

    # Run the conversion whenever the uploaded file changes; process_pdf
    # returns (text, status), matching the order of `outputs`.
    pdf_input.change(
        fn=process_pdf,
        inputs=pdf_input,
        outputs=[text_output, status_output],
        show_progress=True,
    )

# Start the web server only when executed as a script.
if __name__ == "__main__":
    demo.launch()
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
git+https://github.com/VikParuchuri/marker.git

0 comments on commit 09981f4

Please sign in to comment.