Skip to content
This repository has been archived by the owner on Feb 12, 2024. It is now read-only.

Commit

Permalink
extractor: update to pdf upstream
Browse files Browse the repository at this point in the history
  • Loading branch information
omkar-mohanty committed Dec 9, 2023
1 parent dc75753 commit 2421e27
Show file tree
Hide file tree
Showing 3 changed files with 129 additions and 22 deletions.
136 changes: 119 additions & 17 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
pdf = {git = "https://github.com/omkar-mohanty/pdf.git" , branch = "png_flate"}
pdf = "0.9.0"
clap = { version = "4.2.4" , features = ["derive"] }
log = "0.4.0"
env_logger = "0.9.0"
Expand Down
13 changes: 9 additions & 4 deletions src/extractor/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use pdf::backend::Backend;
use pdf::file::Cache;
use pdf::file::File;
use pdf::file::FileOptions;
use pdf::file::Log;
use pdf::object::PageRc;
use pdf::object::{Resolve, XObject};
use pdf::PdfError;
Expand All @@ -18,33 +19,37 @@ pub enum Method<'a> {
Bytes(&'a [u8]),
}

pub fn get_pages<T, K, Y>(file: &File<T, K, Y>) -> Result<Vec<PageRc>>
/// Collects every page of `file` into a vector, in the order `File::pages` yields them.
///
/// The generic parameters are simply forwarded to [`File`]: `T` is its backend, `K` and `Y`
/// are its two caches, and `L` is its logger.
///
/// # Errors
///
/// Returns the first error encountered while iterating the pages. A page that cannot be
/// loaded is reported to the caller instead of aborting the process with a panic.
pub fn get_pages<T, K, Y, L>(file: &File<T, K, Y, L>) -> Result<Vec<PageRc>>
where
    T: Backend,
    K: Cache<std::result::Result<AnySync, Arc<PdfError>>>,
    Y: Cache<std::result::Result<Arc<[u8]>, Arc<PdfError>>>,
    L: Log,
{
    // Collecting into a `Result` short-circuits on the first failing page; `?` then converts
    // the error into this module's error type, exactly as `get_raw_images` does for its own
    // pdf calls.
    let pages = file
        .pages()
        .collect::<std::result::Result<Vec<PageRc>, _>>()?;
    Ok(pages)
}

pub fn get_raw_images<T, K, Y>(page: PageRc, file: &File<T, K, Y>) -> Result<Vec<RawImage>>
pub fn get_raw_images<T, K, Y, L>(page: PageRc, file: &File<T, K, Y, L>) -> Result<Vec<RawImage>>
where
T: Backend,
K: Cache<std::result::Result<AnySync, Arc<PdfError>>>,
Y: Cache<std::result::Result<Arc<[u8]>, Arc<PdfError>>>,
L: Log
{
let mut images = vec![];

let resources = page.resources()?;

let resolver = file.resolver();

images.extend(
resources
.xobjects
.iter()
.map(|(_name, &r)| file.get(r).unwrap())
.map(|(_name, &r)| resolver.get(r).unwrap())
.filter(|o| matches!(**o, pdf::object::XObject::Image(_))),
);

Expand All @@ -58,7 +63,7 @@ where
_ => continue,
};

let data = img.image_data(file)?;
let data = img.image_data(&resolver)?;

let img_dict = img.deref().to_owned();

Expand Down

0 comments on commit 2421e27

Please sign in to comment.