Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add pdf-extract loader #202

Merged
merged 1 commit into from
Aug 17, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,8 @@ text-splitter = { version = "0.15", features = ["tiktoken-rs", "markdown"] }
surrealdb = { version = "1.4.2", optional = true, default-features = false }
csv = "1.3.0"
urlencoding = "2.1.3"
lopdf = { version = "0.33.0", features = ["pom", "pom_parser"], optional = true }
lopdf = { version = "0.32.0", features = ["nom_parser"], optional = true }
pdf-extract = { version = "0.7.7", optional = true }
thiserror = "1.0.59"
futures-util = "0.3.30"
async-stream = "0.3.5"
Expand Down Expand Up @@ -85,6 +86,7 @@ fastembed = ["dep:fastembed"]
git = ["gix", "flume"]
mistralai = ["mistralai-client"]
lopdf = ["dep:lopdf"]
pdf-extract = ["dep:lopdf", "dep:pdf-extract"]
ollama = ["ollama-rs"]
opensearch = ["dep:opensearch", "aws-config"]
postgres = ["pgvector", "sqlx", "uuid"]
Expand Down
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,8 @@ This is the Rust language implementation of [LangChain](https://github.com/langc
async fn main() {
let path = "./src/document_loaders/test_data/sample.pdf";

let loader = LoPdfLoader::from_path(path).expect("Failed to create PdfLoader");
let loader = PdfExtractLoader::from_path(path).expect("Failed to create PdfExtractLoader");
// let loader = LoPdfLoader::from_path(path).expect("Failed to create LoPdfLoader");

let docs = loader
.load()
Expand Down
9 changes: 8 additions & 1 deletion src/document_loaders/error.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use std::io;
use std::{io, string::FromUtf8Error};

use thiserror::Error;

Expand All @@ -15,13 +15,20 @@ pub enum LoaderError {
#[error(transparent)]
IOError(#[from] io::Error),

#[error(transparent)]
FromUtf8Error(#[from] FromUtf8Error),

#[error(transparent)]
CSVError(#[from] csv::Error),

// The `pdf-extract` feature also enables the `lopdf` dependency, and `PdfExtractLoader`
// propagates `lopdf::Error` with `?`, so this conversion must exist for either feature.
#[cfg(any(feature = "lopdf", feature = "pdf-extract"))]
#[error(transparent)]
LoPdfError(#[from] lopdf::Error),

#[cfg(feature = "pdf-extract")]
#[error(transparent)]
PdfExtractOutputError(#[from] pdf_extract::OutputError),

#[error(transparent)]
ReadabilityError(#[from] readability::error::Error),

Expand Down
4 changes: 2 additions & 2 deletions src/document_loaders/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,9 @@
mod pandoc_loader;
pub use pandoc_loader::*;

#[cfg(feature = "lopdf")]
// The Cargo feature is named `pdf-extract` (hyphen); `pdf_extract` is only the crate identifier.
#[cfg(any(feature = "lopdf", feature = "pdf-extract"))]

Check warning on line 18 in src/document_loaders/mod.rs

View workflow job for this annotation

GitHub Actions / build

unexpected `cfg` condition value: `pdf_extract`

Check warning on line 18 in src/document_loaders/mod.rs

View workflow job for this annotation

GitHub Actions / build

unexpected `cfg` condition value: `pdf_extract`
mod pdf_loader;
#[cfg(feature = "lopdf")]
// The Cargo feature is named `pdf-extract` (hyphen); `pdf_extract` is only the crate identifier.
#[cfg(any(feature = "lopdf", feature = "pdf-extract"))]

Check warning on line 20 in src/document_loaders/mod.rs

View workflow job for this annotation

GitHub Actions / build

unexpected `cfg` condition value: `pdf_extract`

Check warning on line 20 in src/document_loaders/mod.rs

View workflow job for this annotation

GitHub Actions / build

unexpected `cfg` condition value: `pdf_extract`
pub use pdf_loader::*;

mod html_loader;
Expand Down
8 changes: 4 additions & 4 deletions src/document_loaders/pdf_loader/lo_loader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ impl LoPdfLoader {
/// ```rust,ignore
/// use std::io::Cursor;
/// let data = Cursor::new(vec![...] /* some PDF data */);
/// let loader = PdfLoader::new(data)?;
/// let loader = LoPdfLoader::new(data)?;
/// ```
///
pub fn new<R: Read>(reader: R) -> Result<Self, LoaderError> {
Expand All @@ -38,7 +38,7 @@ impl LoPdfLoader {
/// # Example
///
/// ```rust,ignore
/// let loader = PdfLoader::from_path("/path/to/my.pdf")?;
/// let loader = LoPdfLoader::from_path("/path/to/my.pdf")?;
/// ```
///
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, LoaderError> {
Expand Down Expand Up @@ -96,7 +96,7 @@ mod tests {
async fn test_lo_pdf_loader() {
let path = "./src/document_loaders/test_data/sample.pdf";

let loader = LoPdfLoader::from_path(path).expect("Failed to create PdfLoader");
let loader = LoPdfLoader::from_path(path).expect("Failed to create LoPdfLoader");

let docs = loader
.load()
Expand All @@ -121,7 +121,7 @@ mod tests {
file.read_to_end(&mut buffer).unwrap();
let reader = Cursor::new(buffer);

let loader = LoPdfLoader::new(reader).expect("Failed to create PdfLoader");
let loader = LoPdfLoader::new(reader).expect("Failed to create LoPdfLoader");

let docs = loader
.load()
Expand Down
3 changes: 3 additions & 0 deletions src/document_loaders/pdf_loader/mod.rs
Original file line number Diff line number Diff line change
@@ -1,2 +1,5 @@
/// PDF loader backed directly by the `lopdf` crate; compiled only with the `lopdf` feature.
#[cfg(feature = "lopdf")]
pub mod lo_loader;

/// PDF loader backed by the `pdf-extract` crate; compiled only with the `pdf-extract` feature.
#[cfg(feature = "pdf-extract")]
pub mod pdf_extract_loader;
130 changes: 130 additions & 0 deletions src/document_loaders/pdf_loader/pdf_extract_loader.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
use std::{io::Read, path::Path, pin::Pin};

use async_stream::stream;
use async_trait::async_trait;
use futures::Stream;
use pdf_extract::{output_doc, PlainTextOutput};

use crate::{
document_loaders::{process_doc_stream, Loader, LoaderError},
schemas::Document,
text_splitter::TextSplitter,
};

/// Document loader that turns a PDF into plain text using the `pdf-extract` crate.
///
/// The PDF is parsed into a `lopdf::Document` when the loader is constructed
/// (`new` / `from_path`); the text itself is extracted when `load` is called.
#[derive(Debug, Clone)]
pub struct PdfExtractLoader {
/// Parsed PDF that `load` hands to `pdf_extract::output_doc`.
document: lopdf::Document,
}

impl PdfExtractLoader {
/// Creates a new PdfLoader from anything that implements the Read trait.
/// This is a generic constructor which can be used with any type of reader.
///
/// # Example
///
/// ```rust,ignore
/// use std::io::Cursor;
/// let data = Cursor::new(vec![...] /* some PDF data */);
/// let loader = PdfExtractLoader::new(data)?;
/// ```
///
pub fn new<R: Read>(reader: R) -> Result<Self, LoaderError> {
let document = lopdf::Document::load_from(reader)?;
Ok(Self { document })
}
/// Creates a new PdfLoader from a path to a PDF file.
/// This loads the PDF document and creates a PdfLoader from it.
///
/// # Example
///
/// ```rust,ignore
/// let loader = PdfExtractLoader::from_path("/path/to/my.pdf")?;
/// ```
///
pub fn from_path<P: AsRef<Path>>(path: P) -> Result<Self, LoaderError> {
let document = lopdf::Document::load(path)?;
Ok(Self { document })
}
}

#[async_trait]
impl Loader for PdfExtractLoader {
    /// Extracts the text of the whole PDF and returns it as a stream that yields a single
    /// [`Document`].
    ///
    /// # Errors
    ///
    /// Returns a [`LoaderError`] when `pdf_extract::output_doc` fails, or when the bytes it
    /// wrote are not valid UTF-8.
    async fn load(
        self,
    ) -> Result<
        Pin<Box<dyn Stream<Item = Result<Document, LoaderError>> + Send + 'static>>,
        LoaderError,
    > {
        let mut raw_text: Vec<u8> = Vec::new();
        {
            // The output sink borrows the buffer mutably; scoping it ends that borrow
            // before the buffer is moved into `String::from_utf8` below.
            let mut output = PlainTextOutput::new(&mut raw_text as &mut dyn std::io::Write);
            output_doc(&self.document, &mut output)?;
        }
        let doc = Document::new(String::from_utf8(raw_text)?);

        let stream = stream! {
            yield Ok(doc);
        };

        Ok(Box::pin(stream))
    }

    /// Loads the PDF via [`Loader::load`] and passes the resulting stream, together with
    /// `splitter`, to `process_doc_stream`.
    ///
    /// # Errors
    ///
    /// Returns a [`LoaderError`] when the initial `load` fails.
    async fn load_and_split<TS: TextSplitter + 'static>(
        self,
        splitter: TS,
    ) -> Result<
        Pin<Box<dyn Stream<Item = Result<Document, LoaderError>> + Send + 'static>>,
        LoaderError,
    > {
        let doc_stream = self.load().await?;
        let stream = process_doc_stream(doc_stream, splitter).await;
        Ok(Box::pin(stream))
    }
}

#[cfg(test)]
mod tests {
    use std::io::Cursor;

    use futures_util::StreamExt;

    use super::*;

    /// Sample PDF shared by both tests.
    const SAMPLE_PDF: &str = "./src/document_loaders/test_data/sample.pdf";

    /// Expected first bytes of the text extracted from the sample PDF.
    const EXPECTED_PREFIX: &str = "\n\nSample PDF Document\n\nRobert Maron\nGrzegorz Grudzi´nski\n\nFebruary 20, 1999\n\n2\n\nContents\n\n1 Templat";

    /// Runs the loader and collects every document, panicking on the first error so the
    /// test fails loudly.
    async fn collect_docs(loader: PdfExtractLoader) -> Vec<Document> {
        loader
            .load()
            .await
            .unwrap()
            .map(|d| d.unwrap())
            .collect::<Vec<_>>()
            .await
    }

    /// Checks that exactly one document was produced and that it starts with the expected
    /// text.
    fn assert_sample_document(docs: &[Document]) {
        // Check the count first so an empty result is reported as such rather than as an
        // out-of-bounds index.
        assert_eq!(docs.len(), 1);
        // `get` returns `None` instead of panicking if the content is shorter than the
        // prefix or the cut falls inside a multi-byte character.
        assert_eq!(
            docs[0].page_content.get(..EXPECTED_PREFIX.len()),
            Some(EXPECTED_PREFIX)
        );
    }

    #[tokio::test]
    async fn test_pdf_extract_loader() {
        let loader =
            PdfExtractLoader::from_path(SAMPLE_PDF).expect("Failed to create PdfExtractLoader");

        let docs = collect_docs(loader).await;

        assert_sample_document(&docs);
    }

    #[tokio::test]
    async fn test_pdf_extract_loader_reader() {
        let bytes = std::fs::read(SAMPLE_PDF).unwrap();
        let reader = Cursor::new(bytes);

        let loader = PdfExtractLoader::new(reader).expect("Failed to create PdfExtractLoader");

        let docs = collect_docs(loader).await;

        assert_sample_document(&docs);
    }
}
Loading