From d60f221d57d073e3310aceef4aff214334be721b Mon Sep 17 00:00:00 2001 From: benshi <807629978@qq.com> Date: Mon, 15 Apr 2024 11:33:08 +0800 Subject: [PATCH] add extract txt with page example --- examples/extract_txt_with_page.rs | 98 +++++++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) create mode 100644 examples/extract_txt_with_page.rs diff --git a/examples/extract_txt_with_page.rs b/examples/extract_txt_with_page.rs new file mode 100644 index 0000000..4d181d7 --- /dev/null +++ b/examples/extract_txt_with_page.rs @@ -0,0 +1,98 @@ +use std::{cell::RefCell, collections::HashMap, env, fmt, rc::Rc}; + +use lopdf::Document; +use pdf_extract::{ConvertToFmt, OutputDev, OutputError, PlainTextOutput}; + +fn main() { + let file = env::args().nth(1).unwrap(); + let doc = Document::load(file).unwrap(); + let mut output = PagePlainTextOutput::new(); + pdf_extract::output_doc(&doc, &mut output).unwrap(); + + // print the text of each page + for (page_num, text) in output.pages { + println!("Page {}: {}", page_num, text); + } +} + +struct PagePlainTextOutput { + inner: PlainTextOutput, + pages: HashMap, + current_page: u32, + reader: Rc>, +} + +struct OutputWarpper(Rc>); + +impl std::fmt::Write for OutputWarpper { + fn write_str(&mut self, s: &str) -> std::fmt::Result { + self.0.borrow_mut().write_str(s).map_err(|_| fmt::Error) + } +} + +impl ConvertToFmt for OutputWarpper { + type Writer = OutputWarpper; + + fn convert(self) -> Self::Writer { + self + } +} + +impl PagePlainTextOutput { + fn new() -> Self { + let s = Rc::new(RefCell::new(String::new())); + let writer = Rc::clone(&s); + Self { + pages: HashMap::new(), + current_page: 0, + reader: s, + inner: PlainTextOutput::new(OutputWarpper(writer)), + } + } +} + +impl OutputDev for PagePlainTextOutput { + fn begin_page( + &mut self, + page_num: u32, + media_box: &pdf_extract::MediaBox, + art_box: Option<(f64, f64, f64, f64)>, + ) -> Result<(), OutputError> { + self.current_page = page_num; + self.inner.begin_page(page_num, media_box, art_box) + } + + fn end_page(&mut self) -> Result<(), OutputError> { + self.inner.end_page()?; + + let buf = self.reader.borrow().clone(); + self.pages.insert(self.current_page, buf); + self.reader.borrow_mut().clear(); + + Ok(()) + } + + fn output_character( + &mut self, + trm: &pdf_extract::Transform, + width: f64, + spacing: f64, + font_size: f64, + char: &str, + ) -> Result<(), OutputError> { + self.inner + .output_character(trm, width, spacing, font_size, char) + } + + fn begin_word(&mut self) -> Result<(), OutputError> { + self.inner.begin_word() + } + + fn end_word(&mut self) -> Result<(), OutputError> { + self.inner.end_word() + } + + fn end_line(&mut self) -> Result<(), OutputError> { + self.inner.end_line() + } +}