Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support reading from buffer & other improvements #8

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 1 addition & 6 deletions examples/readdocx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,11 @@
*/
extern crate dotext;

use dotext::*;

use std::env;
use std::io::Read;

fn main() {
if let Some(path) = env::args().nth(1) {
let mut file = Docx::open(path).expect("Cannot open file");
let mut isi = String::new();
let _ = file.read_to_string(&mut isi);
let isi = dotext::extract_file(path).unwrap();
println!("CONTENT:");
println!("----------BEGIN----------");
println!("{}", isi);
Expand Down
90 changes: 0 additions & 90 deletions src/doc.rs

This file was deleted.

169 changes: 169 additions & 0 deletions src/document.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,169 @@
use zip::ZipArchive;

use xml::events::Event;
use xml::reader::Reader;

use std::clone::Clone;
use std::fs::File;
use std::io;
use std::io::prelude::*;
use std::path::{Path, PathBuf};
use std::str::FromStr;
use zip::read::ZipFile;

use crate::{Docx, Odp, Ods, Odt, Pptx, Xlsx};

/// The document formats this crate can extract text from.
///
/// The variants are plain tags without payload, so the type is cheap to copy,
/// compare, hash and print.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum DocumentKind {
    /// Word document (`docx`).
    Docx,
    /// Open Office presentation (`odp`).
    Odp,
    /// Open Office spreadsheet (`ods`).
    Ods,
    /// Open Office text document (`odt`).
    Odt,
    /// Power Point presentation (`pptx`).
    Pptx,
    /// Excel spreadsheet (`xlsx`).
    Xlsx,
}

impl DocumentKind {
    /// Returns a human-readable name for the document format.
    pub fn as_str(&self) -> &'static str {
        match self {
            Self::Docx => "Word Document",
            Self::Odp => "Open Office Presentation",
            Self::Ods => "Open Office Spreadsheet",
            Self::Odt => "Open Office Document",
            Self::Pptx => "Power Point",
            Self::Xlsx => "Excel",
        }
    }

    /// Returns the lowercase file extension (without the leading dot) of the format.
    pub fn extension(&self) -> &'static str {
        match self {
            Self::Docx => "docx",
            Self::Odp => "odp",
            Self::Ods => "ods",
            // Lowercase like every other extension; this used to be "Odt".
            Self::Odt => "odt",
            Self::Pptx => "pptx",
            Self::Xlsx => "xlsx",
        }
    }

    /// Read the document from a reader, like a buffer, and return its text.
    ///
    /// # Errors
    ///
    /// Returns an error when the matching `from_reader` constructor fails or
    /// when reading the extracted text into the output string fails.
    pub fn extract<R>(&self, reader: R) -> io::Result<String>
    where
        R: Read + io::Seek,
    {
        let mut isi = String::new();

        // Propagate read failures instead of returning partial text as `Ok`.
        match self {
            Self::Docx => Docx::from_reader(reader)?.read_to_string(&mut isi)?,
            Self::Odp => Odp::from_reader(reader)?.read_to_string(&mut isi)?,
            Self::Ods => Ods::from_reader(reader)?.read_to_string(&mut isi)?,
            Self::Odt => Odt::from_reader(reader)?.read_to_string(&mut isi)?,
            Self::Pptx => Pptx::from_reader(reader)?.read_to_string(&mut isi)?,
            Self::Xlsx => Xlsx::from_reader(reader)?.read_to_string(&mut isi)?,
        };

        Ok(isi)
    }
}

impl FromStr for DocumentKind {
    type Err = io::Error;

    /// Parses a file extension (without the leading dot) into a document kind.
    ///
    /// Matching is ASCII case-insensitive, so `"odt"`, `"Odt"` and `"ODT"` all
    /// resolve to the same kind.
    ///
    /// # Errors
    ///
    /// Returns an [`io::ErrorKind::Other`] error when the extension does not
    /// belong to a supported format.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Normalise first: extensions found on disk come in any letter case.
        match s.to_ascii_lowercase().as_str() {
            "docx" => Ok(Self::Docx),
            "odp" => Ok(Self::Odp),
            "ods" => Ok(Self::Ods),
            "odt" => Ok(Self::Odt),
            "pptx" => Ok(Self::Pptx),
            "xlsx" => Ok(Self::Xlsx),
            _ => Err(io::Error::new(
                io::ErrorKind::Other,
                "File format not supported",
            )),
        }
    }
}

/// Interface shared by the document readers; every implementor is also a [`Read`].
pub trait Document<T>: Read {
    /// Reports which [`DocumentKind`] the implementor represents.
    fn kind(&self) -> DocumentKind;

    /// Opens the file at `path` and passes it to [`Document::from_reader`].
    ///
    /// Fails when the file cannot be opened or when `from_reader` fails.
    fn open<P: AsRef<Path>>(path: P) -> io::Result<T> {
        File::open(path.as_ref()).and_then(Self::from_reader::<File>)
    }

    /// Builds the document from any seekable reader, such as an in-memory buffer.
    fn from_reader<R: Read + io::Seek>(reader: R) -> io::Result<T>;
}

/// Collects the text of selected XML elements from one entry of a zip archive.
///
/// `reader` is opened as a zip archive and the entry named `content_name` is
/// read as XML. For every start tag whose name is listed in `tags`, the next
/// text event is appended to the output. A `text:p` start tag additionally
/// inserts a blank line (`"\n\n"`) before its text.
///
/// Returns an empty string when the archive has no entry named `content_name`
/// or when that entry is empty.
///
/// # Errors
///
/// Returns an error when the archive or one of its entries cannot be read,
/// when the entry is not valid UTF-8, or when the XML cannot be parsed or
/// unescaped. Corrupt input is reported as an error rather than a panic.
pub(crate) fn open_doc_read_data<R>(
    reader: R,
    content_name: &str,
    tags: &[&str],
) -> io::Result<String>
where
    R: Read + io::Seek,
{
    let mut archive = ZipArchive::new(reader)?;

    let mut xml_data = String::new();

    for i in 0..archive.len() {
        // A damaged entry header is an I/O problem of the input, not a bug.
        let mut c_file = archive.by_index(i)?;
        if c_file.name() == content_name {
            c_file.read_to_string(&mut xml_data)?;
            break;
        }
    }

    // Nothing to parse: entry missing or empty.
    if xml_data.is_empty() {
        return Ok(String::new());
    }

    let mut xml_reader = Reader::from_str(xml_data.as_str());

    let mut buf = Vec::new();
    let mut txt: Vec<String> = Vec::new();

    // Set by a matching start tag; consumed by the next text event.
    let mut to_read = false;

    loop {
        match xml_reader.read_event(&mut buf) {
            Ok(Event::Start(ref e)) => {
                if tags.iter().any(|tag| e.name() == tag.as_bytes()) {
                    to_read = true;
                    if e.name() == b"text:p" {
                        txt.push("\n\n".to_string());
                    }
                }
            }
            Ok(Event::Text(e)) => {
                if to_read {
                    let text = e.unescape_and_decode(&xml_reader).map_err(|err| {
                        io::Error::new(
                            io::ErrorKind::Other,
                            format!(
                                "Error at position {}: {:?}",
                                xml_reader.buffer_position(),
                                err
                            ),
                        )
                    })?;
                    txt.push(text);
                    to_read = false;
                }
            }
            Ok(Event::Eof) => break,
            Err(e) => {
                return Err(io::Error::new(
                    io::ErrorKind::Other,
                    format!(
                        "Error at position {}: {:?}",
                        xml_reader.buffer_position(),
                        e
                    ),
                ))
            }
            _ => (),
        }

        // The event no longer borrows `buf`; clear it so it does not grow
        // with every event read.
        buf.clear();
    }

    Ok(txt.concat())
}
25 changes: 9 additions & 16 deletions src/docx.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,27 +11,22 @@ use std::io::Cursor;
use std::path::{Path, PathBuf};
use zip::read::ZipFile;

use doc::{HasKind, MsDoc};
use crate::document::{Document, DocumentKind};

pub struct Docx {
path: PathBuf,
data: Cursor<String>,
}

impl HasKind for Docx {
fn kind(&self) -> &'static str {
"Word Document"
impl Document<Docx> for Docx {
fn kind(&self) -> DocumentKind {
DocumentKind::Docx
}

fn ext(&self) -> &'static str {
"docx"
}
}

impl MsDoc<Docx> for Docx {
fn open<P: AsRef<Path>>(path: P) -> io::Result<Docx> {
let file = File::open(path.as_ref())?;
let mut archive = ZipArchive::new(file)?;
fn from_reader<R>(reader: R) -> io::Result<Docx>
where
R: Read + io::Seek,
{
let mut archive = ZipArchive::new(reader)?;

let mut xml_data = String::new();

Expand Down Expand Up @@ -81,9 +76,7 @@ impl MsDoc<Docx> for Docx {
}
}
}

Ok(Docx {
path: path.as_ref().to_path_buf(),
data: Cursor::new(txt.join("")),
})
}
Expand Down
39 changes: 36 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
#![allow(unused_imports, dead_code, unused_must_use)]

extern crate quick_xml as xml;
/**
* Copyright 2017 Robin Syihab. All rights reserved.
*
Expand All @@ -21,20 +20,54 @@ extern crate quick_xml as xml;
* IN THE SOFTWARE.
*
*/
extern crate quick_xml as xml;
extern crate zip;

pub mod doc;
pub mod document;
pub mod docx;
pub mod odp;
pub mod ods;
pub mod odt;
pub mod pptx;
pub mod xlsx;

pub use doc::MsDoc;
pub use document::Document;
pub use document::DocumentKind;
pub use docx::Docx;
pub use odp::Odp;
pub use ods::Ods;
pub use odt::Odt;
pub use pptx::Pptx;
pub use xlsx::Xlsx;

/// This function tries to extract the text from a stream.
/// The filename extension is used to detect the right extraction method.
pub fn extract<R>(reader: R, filename: &str) -> std::io::Result<String>
where
R: std::io::Read + std::io::Seek,
{
use std::str::FromStr;

let extension = filename
.rsplit_once('.')
.map(|(_, e)| e)
.ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "No file extension found"))?;

DocumentKind::from_str(extension)?.extract(reader)
}

/// Extracts the text of the document stored at `path`.
///
/// The final path component is passed on as the file name, so its extension
/// selects the extraction method. An error is returned when the path has no
/// final component that is valid UTF-8, when the file cannot be opened, or
/// when the extraction fails.
pub fn extract_file<P>(path: P) -> std::io::Result<String>
where
    P: AsRef<std::path::Path>,
{
    let path = path.as_ref();

    let filename = match path.file_name().and_then(|name| name.to_str()) {
        Some(name) => name,
        None => {
            return Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                "No filename found",
            ))
        }
    };

    extract(std::fs::File::open(path)?, filename)
}
Loading