diff --git a/examples/readdocx.rs b/examples/readdocx.rs index 6a0f9ba..fe52644 100644 --- a/examples/readdocx.rs +++ b/examples/readdocx.rs @@ -20,16 +20,11 @@ */ extern crate dotext; -use dotext::*; - use std::env; -use std::io::Read; fn main() { if let Some(path) = env::args().nth(1) { - let mut file = Docx::open(path).expect("Cannot open file"); - let mut isi = String::new(); - let _ = file.read_to_string(&mut isi); + let isi = dotext::extract_file(path).unwrap(); println!("CONTENT:"); println!("----------BEGIN----------"); println!("{}", isi); diff --git a/src/doc.rs b/src/doc.rs deleted file mode 100644 index 40abd26..0000000 --- a/src/doc.rs +++ /dev/null @@ -1,90 +0,0 @@ -use zip::ZipArchive; - -use xml::events::Event; -use xml::reader::Reader; - -use std::clone::Clone; -use std::fs::File; -use std::io; -use std::io::prelude::*; -use std::path::{Path, PathBuf}; -use zip::read::ZipFile; - -pub trait HasKind { - // kind - fn kind(&self) -> &'static str; - - // extension - fn ext(&self) -> &'static str; -} - -pub trait MsDoc: Read + HasKind { - fn open>(path: P) -> io::Result; -} - -pub trait OpenOfficeDoc: Read + HasKind { - fn open>(path: P) -> io::Result; -} - -pub(crate) fn open_doc_read_data>( - path: P, - content_name: &str, - tags: &[&str], -) -> io::Result { - let file = File::open(path.as_ref())?; - let mut archive = ZipArchive::new(file)?; - - let mut xml_data = String::new(); - - for i in 0..archive.len() { - let mut c_file = archive.by_index(i).unwrap(); - if c_file.name() == content_name { - c_file.read_to_string(&mut xml_data); - break; - } - } - - let mut xml_reader = Reader::from_str(xml_data.as_ref()); - - let mut buf = Vec::new(); - let mut txt = Vec::new(); - - if xml_data.len() > 0 { - let mut to_read = false; - loop { - match xml_reader.read_event(&mut buf) { - Ok(Event::Start(ref e)) => { - for tag in tags { - if e.name() == tag.as_bytes() { - to_read = true; - if e.name() == b"text:p" { - txt.push("\n\n".to_string()); - } - break; - } - } - } - Ok(Event::Text(e)) => { - if to_read { - txt.push(e.unescape_and_decode(&xml_reader).unwrap()); - to_read = false; - } - } - Ok(Event::Eof) => break, - Err(e) => { - return Err(io::Error::new( - io::ErrorKind::Other, - format!( - "Error at position {}: {:?}", - xml_reader.buffer_position(), - e - ), - )) - } - _ => (), - } - } - } - - Ok(txt.join("")) -} diff --git a/src/document.rs b/src/document.rs new file mode 100644 index 0000000..9213e0e --- /dev/null +++ b/src/document.rs @@ -0,0 +1,169 @@ +use zip::ZipArchive; + +use xml::events::Event; +use xml::reader::Reader; + +use std::clone::Clone; +use std::fs::File; +use std::io; +use std::io::prelude::*; +use std::path::{Path, PathBuf}; +use std::str::FromStr; +use zip::read::ZipFile; + +use crate::{Docx, Odp, Ods, Odt, Pptx, Xlsx}; + +pub enum DocumentKind { + Docx, + Odp, + Ods, + Odt, + Pptx, + Xlsx, +} + +impl DocumentKind { + pub fn as_str(&self) -> &'static str { + match self { + Self::Docx => "Word Document", + Self::Odp => "Open Office Presentation", + Self::Ods => "Open Office Spreadsheet", + Self::Odt => "Open Office Document", + Self::Pptx => "Power Point", + Self::Xlsx => "Excel", + } + } + + pub fn extension(&self) -> &'static str { + match self { + Self::Docx => "docx", + Self::Odp => "odp", + Self::Ods => "ods", + Self::Odt => "Odt", + Self::Pptx => "pptx", + Self::Xlsx => "xlsx", + } + } + + /// Read the document from a reader, like a buffer + pub fn extract(&self, reader: R) -> io::Result + where + R: Read + io::Seek, + { + let mut isi = String::new(); + + match self { + DocumentKind::Docx => Docx::from_reader(reader)?.read_to_string(&mut isi), + DocumentKind::Odp => Odp::from_reader(reader)?.read_to_string(&mut isi), + DocumentKind::Ods => Ods::from_reader(reader)?.read_to_string(&mut isi), + DocumentKind::Odt => Odt::from_reader(reader)?.read_to_string(&mut isi), + DocumentKind::Pptx => Pptx::from_reader(reader)?.read_to_string(&mut isi), + DocumentKind::Xlsx => Xlsx::from_reader(reader)?.read_to_string(&mut isi), + }; + + Ok(isi) + } +} + +impl FromStr for DocumentKind { + type Err = io::Error; + + fn from_str(s: &str) -> Result { + match s { + "docx" => Ok(Self::Docx), + "odp" => Ok(Self::Odp), + "ods" => Ok(Self::Ods), + "Odt" => Ok(Self::Odt), + "pptx" => Ok(Self::Pptx), + "xlsx" => Ok(Self::Xlsx), + _ => Err(io::Error::new( + io::ErrorKind::Other, + "File format not supported", + )), + } + } +} + +pub trait Document: Read { + /// Returns the document type + fn kind(&self) -> DocumentKind; + + /// Read the document from the disk + fn open

(path: P) -> io::Result + where + P: AsRef, + { + let file = File::open(path.as_ref())?; + Self::from_reader(file) + } + + /// Read the document from a reader, like a buffer + fn from_reader(reader: R) -> io::Result + where + R: Read + io::Seek; +} + +pub(crate) fn open_doc_read_data( + reader: R, + content_name: &str, + tags: &[&str], +) -> io::Result +where + R: Read + io::Seek, +{ + let mut archive = ZipArchive::new(reader)?; + + let mut xml_data = String::new(); + + for i in 0..archive.len() { + let mut c_file = archive.by_index(i).unwrap(); + if c_file.name() == content_name { + c_file.read_to_string(&mut xml_data); + break; + } + } + + let mut xml_reader = Reader::from_str(xml_data.as_ref()); + + let mut buf = Vec::new(); + let mut txt = Vec::new(); + + if xml_data.len() > 0 { + let mut to_read = false; + loop { + match xml_reader.read_event(&mut buf) { + Ok(Event::Start(ref e)) => { + for tag in tags { + if e.name() == tag.as_bytes() { + to_read = true; + if e.name() == b"text:p" { + txt.push("\n\n".to_string()); + } + break; + } + } + } + Ok(Event::Text(e)) => { + if to_read { + txt.push(e.unescape_and_decode(&xml_reader).unwrap()); + to_read = false; + } + } + Ok(Event::Eof) => break, + Err(e) => { + return Err(io::Error::new( + io::ErrorKind::Other, + format!( + "Error at position {}: {:?}", + xml_reader.buffer_position(), + e + ), + )) + } + _ => (), + } + } + } + + Ok(txt.join("")) +} diff --git a/src/docx.rs b/src/docx.rs index 294e7ce..2155195 100644 --- a/src/docx.rs +++ b/src/docx.rs @@ -11,27 +11,22 @@ use std::io::Cursor; use std::path::{Path, PathBuf}; use zip::read::ZipFile; -use doc::{HasKind, MsDoc}; +use crate::document::{Document, DocumentKind}; pub struct Docx { - path: PathBuf, data: Cursor, } -impl HasKind for Docx { - fn kind(&self) -> &'static str { - "Word Document" +impl Document for Docx { + fn kind(&self) -> DocumentKind { + DocumentKind::Docx } - fn ext(&self) -> &'static str { - "docx" - } -} - -impl MsDoc for Docx { - fn open>(path: P) -> io::Result { - let file = File::open(path.as_ref())?; - let mut archive = ZipArchive::new(file)?; + fn from_reader(reader: R) -> io::Result + where + R: Read + io::Seek, + { + let mut archive = ZipArchive::new(reader)?; let mut xml_data = String::new(); @@ -81,9 +76,7 @@ impl MsDoc for Docx { } } } - Ok(Docx { - path: path.as_ref().to_path_buf(), data: Cursor::new(txt.join("")), }) } diff --git a/src/lib.rs b/src/lib.rs index 8211f9f..65808f3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,5 @@ #![allow(unused_imports, dead_code, unused_must_use)] -extern crate quick_xml as xml; /** * Copyright 2017 Robin Syihab. All rights reserved. * @@ -21,9 +20,10 @@ extern crate quick_xml as xml; * IN THE SOFTWARE. * */ +extern crate quick_xml as xml; extern crate zip; -pub mod doc; +pub mod document; pub mod docx; pub mod odp; pub mod ods; @@ -31,10 +31,43 @@ pub mod odt; pub mod pptx; pub mod xlsx; -pub use doc::MsDoc; +pub use document::Document; +pub use document::DocumentKind; pub use docx::Docx; pub use odp::Odp; pub use ods::Ods; pub use odt::Odt; pub use pptx::Pptx; pub use xlsx::Xlsx; + +/// This function tries to extract the text from a stream. +/// The filename extension is used to detect the right extraction method. +pub fn extract(reader: R, filename: &str) -> std::io::Result +where + R: std::io::Read + std::io::Seek, +{ + use std::str::FromStr; + + let extension = filename + .rsplit_once('.') + .map(|(_, e)| e) + .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "No file extension found"))?; + + DocumentKind::from_str(extension)?.extract(reader) +} + +/// This function tries to extract the text from a file. +/// The filename extension is used to detect the right extraction method. +pub fn extract_file

(path: P) -> std::io::Result +where + P: AsRef, +{ + let filename = path + .as_ref() + .file_name() + .and_then(|s| s.to_str()) + .ok_or_else(|| std::io::Error::new(std::io::ErrorKind::Other, "No filename found"))?; + + let file = std::fs::File::open(path.as_ref())?; + extract(file, filename) +} diff --git a/src/odp.rs b/src/odp.rs index 989f425..942b737 100644 --- a/src/odp.rs +++ b/src/odp.rs @@ -11,86 +11,24 @@ use std::io::Cursor; use std::path::{Path, PathBuf}; use zip::read::ZipFile; -use doc; -use doc::{HasKind, OpenOfficeDoc}; +use crate::document::{open_doc_read_data, Document, DocumentKind}; pub struct Odp { - path: PathBuf, data: Cursor, } -impl HasKind for Odp { - fn kind(&self) -> &'static str { - "Open Office Presentation" +impl Document for Odp { + fn kind(&self) -> DocumentKind { + DocumentKind::Odp } - fn ext(&self) -> &'static str { - "odp" - } -} - -impl OpenOfficeDoc for Odp { - fn open>(path: P) -> io::Result { - let text = doc::open_doc_read_data(path.as_ref(), "content.xml", &["text:p", "text:span"])?; - - // let file = File::open(path.as_ref())?; - // let mut archive = ZipArchive::new(file)?; - - // let mut xml_data = String::new(); - - // for i in 0..archive.len(){ - // let mut c_file = archive.by_index(i).unwrap(); - // if c_file.name() == "content.xml" { - // c_file.read_to_string(&mut xml_data); - // break - // } - // } - - // let mut xml_reader = Reader::from_str(xml_data.as_ref()); - - // let mut buf = Vec::new(); - // let mut txt = Vec::new(); - - // if xml_data.len() > 0 { - // let mut to_read = false; - // loop { - // match xml_reader.read_event(&mut buf){ - // Ok(Event::Start(ref e)) => { - // match e.name() { - // b"text:p" => { - // to_read = true; - // txt.push("\n\n".to_string()); - // }, - // b"text:span" => { - // to_read = true; - // } - // _ => (), - // } - // }, - // Ok(Event::Text(e)) => { - // if to_read { - // txt.push(e.unescape_and_decode(&xml_reader).unwrap()); - // to_read = false; - // } - // }, - // Ok(Event::Eof) => break, - // Err(e) => { - // return Err(io::Error::new( - // io::ErrorKind::Other, - // format!( - // "Error at position {}: {:?}", - // xml_reader.buffer_position(), - // e - // ), - // )) - // } - // _ => (), - // } - // } - // } + fn from_reader(reader: R) -> io::Result + where + R: Read + io::Seek, + { + let text = open_doc_read_data(reader, "content.xml", &["text:p", "text:span"])?; Ok(Odp { - path: path.as_ref().to_path_buf(), data: Cursor::new(text), }) } diff --git a/src/ods.rs b/src/ods.rs index 8bf7a56..ee43fc8 100644 --- a/src/ods.rs +++ b/src/ods.rs @@ -11,30 +11,24 @@ use std::io::Cursor; use std::path::{Path, PathBuf}; use zip::read::ZipFile; -use doc; -use doc::{HasKind, OpenOfficeDoc}; +use crate::document::{open_doc_read_data, Document, DocumentKind}; pub struct Ods { - path: PathBuf, data: Cursor, } -impl HasKind for Ods { - fn kind(&self) -> &'static str { - "Open Office Spreadsheet" +impl Document for Ods { + fn kind(&self) -> DocumentKind { + DocumentKind::Ods } - fn ext(&self) -> &'static str { - "ods" - } -} - -impl OpenOfficeDoc for Ods { - fn open>(path: P) -> io::Result { - let text = doc::open_doc_read_data(path.as_ref(), "content.xml", &["text:p"])?; + fn from_reader(reader: R) -> io::Result + where + R: Read + io::Seek, + { + let text = open_doc_read_data(reader, "content.xml", &["text:p"])?; Ok(Ods { - path: path.as_ref().to_path_buf(), data: Cursor::new(text), }) } diff --git a/src/odt.rs b/src/odt.rs index 20641e2..e29cb1c 100644 --- a/src/odt.rs +++ b/src/odt.rs @@ -11,30 +11,24 @@ use std::io::Cursor; use std::path::{Path, PathBuf}; use zip::read::ZipFile; -use doc; -use doc::{HasKind, OpenOfficeDoc}; +use crate::document::{open_doc_read_data, Document, DocumentKind}; pub struct Odt { - path: PathBuf, data: Cursor, } -impl HasKind for Odt { - fn kind(&self) -> &'static str { - "Open Office Document" +impl Document for Odt { + fn kind(&self) -> DocumentKind { + DocumentKind::Odt } - fn ext(&self) -> &'static str { - "odt" - } -} - -impl OpenOfficeDoc for Odt { - fn open>(path: P) -> io::Result { - let text = doc::open_doc_read_data(path.as_ref(), "content.xml", &["text:p"])?; + fn from_reader(reader: R) -> io::Result + where + R: Read + io::Seek, + { + let text = open_doc_read_data(reader, "content.xml", &["text:p"])?; Ok(Odt { - path: path.as_ref().to_path_buf(), data: Cursor::new(text), }) } diff --git a/src/pptx.rs b/src/pptx.rs index 3935d75..3f8ce1b 100644 --- a/src/pptx.rs +++ b/src/pptx.rs @@ -11,27 +11,22 @@ use std::io::Cursor; use std::path::{Path, PathBuf}; use zip::read::ZipFile; -use doc::{HasKind, MsDoc}; +use crate::document::{Document, DocumentKind}; pub struct Pptx { - path: PathBuf, data: Cursor, } -impl HasKind for Pptx { - fn kind(&self) -> &'static str { - "Power Point" +impl Document for Pptx { + fn kind(&self) -> DocumentKind { + DocumentKind::Pptx } - fn ext(&self) -> &'static str { - "pptx" - } -} - -impl MsDoc for Pptx { - fn open>(path: P) -> io::Result { - let file = File::open(path.as_ref())?; - let mut archive = ZipArchive::new(file)?; + fn from_reader(reader: R) -> io::Result + where + R: Read + io::Seek, + { + let mut archive = ZipArchive::new(reader)?; let mut xml_data = String::new(); @@ -86,7 +81,6 @@ impl MsDoc for Pptx { } Ok(Pptx { - path: path.as_ref().to_path_buf(), data: Cursor::new(txt.join("")), }) } diff --git a/src/xlsx.rs b/src/xlsx.rs index a8c1938..8b15633 100644 --- a/src/xlsx.rs +++ b/src/xlsx.rs @@ -11,30 +11,24 @@ use std::io::Cursor; use std::path::{Path, PathBuf}; use zip::read::ZipFile; -use doc::{HasKind, MsDoc}; +use crate::document::{Document, DocumentKind}; pub struct Xlsx { - path: PathBuf, data: Cursor, } -impl HasKind for Xlsx { - fn kind(&self) -> &'static str { - "Excel" +impl Document for Xlsx { + fn kind(&self) -> DocumentKind { + DocumentKind::Xlsx } - fn ext(&self) -> &'static str { - "xlsx" - } -} - -impl MsDoc for Xlsx { - fn open>(path: P) -> io::Result { - let file = File::open(path.as_ref())?; - let mut archive = ZipArchive::new(file)?; + fn from_reader(reader: R) -> io::Result + where + R: Read + io::Seek, + { + let mut archive = ZipArchive::new(reader)?; let mut xml_data = String::new(); - // let xml_data_list = Vec::new(); for i in 0..archive.len() { let mut c_file = archive.by_index(i).unwrap(); @@ -91,7 +85,6 @@ impl MsDoc for Xlsx { } Ok(Xlsx { - path: path.as_ref().to_path_buf(), data: Cursor::new(txt.join("")), }) }