+/*
+ * Copyright 2016 King's College London, Richard Jackson.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package tika.legacy;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Properties;
+
+/**
+ * Configuration for the ImageMagick PDF-to-TIFF conversion step used by LegacyPdfProcessorParser
+ * (the field names mirror Tika's TesseractOCRConfig, from which this class was derived).
+ *
+ * This allows enabling the parser and setting its parameters:
+ *
+ * ImageMagickConfig config = new ImageMagickConfig();
+ * config.setImageMagickPath(imageMagickFolder);
+ * parseContext.set(ImageMagickConfig.class, config);
+ *
+ * Parameters can also be set by editing the existing ImageMagickConfig.properties file in the
+ * tika/legacy resources package, or overriding it by creating your own copy
+ * and placing it in the package tika/legacy on the classpath.
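+ *
+ * An illustrative ImageMagickConfig.properties (the keys map to the setters below; the
+ * values shown are this class's built-in defaults, so treat it as a sketch):
+ *
+ * imageMagickPath=
+ * density=300
+ * depth=8
+ * quality=1
+ * timeout=120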
+ *
+ */
+public class ImageMagickConfig implements Serializable {
+
+ private static final long serialVersionUID = -4861942486845757891L;
+
+ // Path to tesseract installation folder, if not on system path.
+ private String tesseractPath = "";
+
+ // Language dictionary to be used.
+ private String language = "eng";
+
+ // Tesseract page segmentation mode.
+ private String pageSegMode = "1";
+
+ // Minimum file size to submit file to ocr.
+ private int minFileSizeToOcr = 0;
+
+ // Maximum file size to submit file to ocr.
+ private int maxFileSizeToOcr = Integer.MAX_VALUE;
+
+ // Maximum time (seconds) to wait for the OCR process to terminate
+ private int timeout = 120;
+ private String imageMagickPath = "";
+ private String density = "300";
+ private String depth = "8";
+ private String quality = "1";
+ private int maxTiffSize = Integer.MAX_VALUE;
+ private int minTiffSize = 0;
+
+
+ /**
+ * Default constructor.
+ */
+ public ImageMagickConfig() {
+ init(this.getClass().getResourceAsStream("ImageMagickConfig.properties"));
+ }
+
+ /**
+ * Loads properties from InputStream and then tries to close InputStream.
+ * If there is an IOException, this silently swallows the exception
+ * and goes back to the default.
+ *
+ * @param is the stream to load the properties from
+ */
+ public ImageMagickConfig(InputStream is) {
+ init(is);
+ }
+
+ private void init(InputStream is) {
+ if (is == null) {
+ return;
+ }
+ Properties props = new Properties();
+ try {
+ props.load(is);
+ } catch (IOException ignored) {
+ } finally {
+ if (is != null) {
+ try {
+ is.close();
+ } catch (IOException e) {
+ //swallow
+ }
+ }
+ }
+ setTesseractPath(
+ getProp(props, "tesseractPath", getTesseractPath()));
+ setLanguage(
+ getProp(props, "language", getLanguage()));
+ setPageSegMode(
+ getProp(props, "pageSegMode", getPageSegMode()));
+ setMinFileSizeToOcr(
+ getProp(props, "minFileSizeToOcr", getMinFileSizeToOcr()));
+ setMaxFileSizeToOcr(
+ getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
+ setImageMagickPath(
+ getProp(props, "imageMagickPath", getImageMagickPath()));
+
+ setTimeout(
+ getProp(props, "timeout", getTimeout()));
+ setDensity(
+ getProp(props, "density", getDensity()));
+ setQuality(
+ getProp(props, "quality", getQuality()));
+ setDepth(
+ getProp(props, "depth", getDepth()));
+ setMinTiffSize(
+ getProp(props, "minTiffSize", getMinTiffSize()));
+ setMaxTiffSize(
+ getProp(props, "maxTiffSize", getMaxTiffSize()));
+ }
+
+ /** @see #setTesseractPath(String tesseractPath)*/
+ public String getTesseractPath() {
+ return tesseractPath;
+ }
+
+ /**
+ * Set tesseract installation folder, needed if it is not on system path.
+ */
+ public void setTesseractPath(String tesseractPath) {
+ if (!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator))
+ tesseractPath += File.separator;
+
+ this.tesseractPath = tesseractPath;
+ }
+
+ /** @see #setLanguage(String language)*/
+ public String getLanguage() {
+ return language;
+ }
+
+ /**
+ * Set tesseract language dictionary to be used. Default is "eng".
+ * Multiple languages may be specified, separated by plus characters (e.g. "eng+fra").
+ */
+ public void setLanguage(String language) {
+ if (!language.matches("([A-Za-z](\\+?))*")) {
+ throw new IllegalArgumentException("Invalid language code");
+ }
+ this.language = language;
+ }
+
+ /** @see #setPageSegMode(String pageSegMode)*/
+ public String getPageSegMode() {
+ return pageSegMode;
+ }
+
+ /**
+ * Set tesseract page segmentation mode.
+ * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
+ */
+ public void setPageSegMode(String pageSegMode) {
+ if (!pageSegMode.matches("[1-9]|10")) {
+ throw new IllegalArgumentException("Invalid language code");
+ }
+ this.pageSegMode = pageSegMode;
+ }
+
+ /** @see #setMinFileSizeToOcr(int minFileSizeToOcr)*/
+ public int getMinFileSizeToOcr() {
+ return minFileSizeToOcr;
+ }
+
+ /**
+ * Set minimum file size to submit file to ocr.
+ * Default is 0.
+ */
+ public void setMinFileSizeToOcr(int minFileSizeToOcr) {
+ this.minFileSizeToOcr = minFileSizeToOcr;
+ }
+
+ /** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)*/
+ public int getMaxFileSizeToOcr() {
+ return maxFileSizeToOcr;
+ }
+
+ /**
+ * Set maximum file size to submit file to ocr.
+ * Default is Integer.MAX_VALUE.
+ */
+ public void setMaxFileSizeToOcr(int maxFileSizeToOcr) {
+ this.maxFileSizeToOcr = maxFileSizeToOcr;
+ }
+
+ /**
+ * Set maximum time (seconds) to wait for the OCR process to terminate.
+ * Default value is 120s.
+ */
+ public void setTimeout(int timeout) {
+ this.timeout = timeout;
+ }
+
+ /** @see #setTimeout(int timeout)*/
+ public int getTimeout() {
+ return timeout;
+ }
+
+ /**
+ * Get an int property from the Properties object passed in.
+ * @param properties the Properties object to read from.
+ * @param property the property to fetch.
+ * @param defaultMissing default value to use when the property is missing.
+ * @return the value.
+ */
+ private int getProp(Properties properties, String property, int defaultMissing) {
+ String p = properties.getProperty(property);
+ if (p == null || p.isEmpty()){
+ return defaultMissing;
+ }
+ try {
+ return Integer.parseInt(p);
+ } catch (NumberFormatException ex) {
+ throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse ImageMagickConfig variable %s, invalid integer value",
+ property), ex);
+ }
+ }
+
+ /**
+ * Get a String property from the Properties object passed in.
+ * @param properties the Properties object to read from.
+ * @param property the property to fetch.
+ * @param defaultMissing default value to use when the property is missing.
+ * @return the value.
+ */
+ private String getProp(Properties properties, String property, String defaultMissing) {
+ return properties.getProperty(property, defaultMissing);
+ }
+
+ public String getImageMagickPath() {
+ return imageMagickPath;
+ }
+
+ public String getDensity() {
+ return density;
+ }
+
+ public String getDepth() {
+ return depth;
+ }
+
+ public String getQuality() {
+ return quality;
+ }
+
+ public int getMaxTiffSize() {
+ return maxTiffSize;
+ }
+
+ public void setMaxTiffSize(int maxTiffSize) {
+ this.maxTiffSize = maxTiffSize;
+ }
+
+ public void setImageMagickPath(String imageMagickPath) {
+ this.imageMagickPath = imageMagickPath;
+ }
+
+ public void setDensity(String density) {
+ this.density = density;
+ }
+
+ public void setDepth(String depth) {
+ this.depth = depth;
+ }
+
+ public void setQuality(String quality) {
+ this.quality = quality;
+ }
+
+ public int getMinTiffSize() {
+ return minTiffSize;
+ }
+
+ public void setMinTiffSize(int minTiffSize) {
+ this.minTiffSize = minTiffSize;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/tika/legacy/LegacyPdfProcessorConfig.java b/src/main/java/tika/legacy/LegacyPdfProcessorConfig.java
new file mode 100644
index 0000000..6c316f6
--- /dev/null
+++ b/src/main/java/tika/legacy/LegacyPdfProcessorConfig.java
@@ -0,0 +1,51 @@
+package tika.legacy;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonView;
+import lombok.Data;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Configuration;
+import org.xml.sax.SAXException;
+import common.JsonPropertyAccessView;
+import javax.annotation.PostConstruct;
+import java.io.IOException;
+
+
+/**
+ * The legacy PDF processor configuration, as used in CogStack-Pipeline
+ * with some minor additions.
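+ *
+ * The values are bound from application.yaml, e.g. (defaults shown):
+ *
+ * tika.parsers.legacy-pdf-parser:
+ *   image-magick.timeout: 120
+ *   tesseract-ocr.timeout: 120
+ *   min-doc-text-length: 100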
+ */
+@Data
+@Configuration
+public class LegacyPdfProcessorConfig {
+
+ @JsonIgnore
+ private TikaConfig tikaConfig;
+
+ // the timeout value (s) when performing PDF->TIFF conversion of the documents
+ // the default value in Tika is 120s, but this may be too short for multi-page documents
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.legacy-pdf-parser.image-magick.timeout:120}")
+ private int conversionTimeout;
+
+ // the timeout value (s) when performing OCR over the documents
+ // the default value in Tika is 120s, but this may be too short for multi-page documents
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.legacy-pdf-parser.tesseract-ocr.timeout:120}")
+ private int ocrTimeout;
+
+ // apply OCR only when the text extracted from a previously parsed document (without OCR)
+ // was shorter than N characters
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.legacy-pdf-parser.min-doc-text-length:100}")
+ private int pdfMinDocTextLength;
+
+
+ @PostConstruct
+ public void init() throws IOException, SAXException, TikaException {
+ tikaConfig = new TikaConfig(this.getClass().getClassLoader()
+ .getResourceAsStream("tika-config/legacy-parser-config.xml"));
+ }
+}
diff --git a/src/main/java/tika/legacy/LegacyPdfProcessorParser.java b/src/main/java/tika/legacy/LegacyPdfProcessorParser.java
new file mode 100644
index 0000000..58ed0a2
--- /dev/null
+++ b/src/main/java/tika/legacy/LegacyPdfProcessorParser.java
@@ -0,0 +1,256 @@
+/*
+ * Copyright 2016 King's College London, Richard Jackson.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package tika.legacy;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import java.io.*;
+import java.util.*;
+import java.util.concurrent.*;
+
+
+public class LegacyPdfProcessorParser extends AbstractParser {
+
+ private static final long serialVersionUID = -8167538283213097265L;
+ private static Map<String, Boolean> IMAGEMAGICK_PRESENT = new HashMap<>();
+ private static final ImageMagickConfig DEFAULT_IMAGEMAGICK_CONFIG = new ImageMagickConfig();
+
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+ new HashSet<>(Arrays.asList(new MediaType[]{
+ MediaType.application("pdf")
+ })));
+ private static final Logger LOG = LoggerFactory.getLogger(LegacyPdfProcessorParser.class);
+
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ // If ImageMagick is installed, offer our supported image types
+ ImageMagickConfig imconfig = context.get(ImageMagickConfig.class, DEFAULT_IMAGEMAGICK_CONFIG);
+ if (hasImageMagick(imconfig)) {
+ return SUPPORTED_TYPES;
+ }
+
+ // Otherwise don't advertise anything, so the other parsers
+ // can be selected instead
+ return Collections.emptySet();
+ }
+
+ private boolean hasImageMagick(ImageMagickConfig config) {
+ // Fetch where the config says to find ImageMagick
+ String imageMagick = config.getImageMagickPath() + getImageMagickProg();
+
+ // Have we already checked for a copy of ImageMagick there?
+ if (IMAGEMAGICK_PRESENT.containsKey(imageMagick)) {
+ return IMAGEMAGICK_PRESENT.get(imageMagick);
+ }
+
+ // Try running ImageMagick from there, and see if it exists + works
+ String[] checkCmd = {imageMagick};
+ try {
+ boolean hasImageMagick = ExternalParser.check(checkCmd);
+ IMAGEMAGICK_PRESENT.put(imageMagick, hasImageMagick);
+ return hasImageMagick;
+ } catch (NoClassDefFoundError e) {
+ // This happens under OSGi + Fork Parser - see TIKA-1507
+ // As a workaround for now, just say we can't use OCR
+ // TODO Resolve it so we don't need this try/catch block
+ IMAGEMAGICK_PRESENT.put(imageMagick, false);
+ return false;
+ }
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ ImageMagickConfig config = context.get(ImageMagickConfig.class, DEFAULT_IMAGEMAGICK_CONFIG);
+
+ // If ImageMagick is not on the path with the current config, do not try to run OCR
+ // getSupportedTypes shouldn't have listed us as handling it, so this should only
+ // occur if someone directly calls this parser, not via DefaultParser or similar
+// TemporaryResources tmp = new TemporaryResources();
+ //TikaInputStream pdfStream = TikaInputStream.get(stream);
+ PDFParser pdfParser = new PDFParser();
+
+ //create temp handlers to investigate object
+ BodyContentHandler body = new BodyContentHandler();
+ Metadata pdfMetadata = new Metadata();
+
+ //needed to reset stream
+ if (stream.markSupported()) {
+ stream.mark(Integer.MAX_VALUE);
+ }
+
+ //first do an initial parse to see if there's substantial content in the pdf already
+ pdfParser.parse(stream, body, pdfMetadata, context);
+ stream.reset();
+ //if there's enough content, re-parse with the official handlers/metadata; also check that ImageMagick is available
+
+ LegacyPdfProcessorConfig generalConfig = context.get(LegacyPdfProcessorConfig.class);
+
+ if (body.toString().length() > generalConfig.getPdfMinDocTextLength() || !hasImageMagick(config)) {
+ pdfParser.parse(stream, handler, metadata, context);
+ //metadata.set("X-PDFPREPROC-OCR-APPLIED", "NA");
+ return;
+ }
+
+ //metadata.set("X-PDFPREPROC-ORIGINAL", body.toString());
+ // "FAIL" will be overwritten if it succeeds later
+
+ //add the PDF metadata to the official metadata object
+ Arrays.asList(pdfMetadata.names()).forEach(name -> {
+ metadata.add(name, pdfMetadata.get(name));
+ });
+
+ //objects to hold file references for manipulation outside of Java
+ File tiffFileOfPDF = null;
+ File pdfFileFromStream = File.createTempFile("tempPDF", ".pdf");
+ try {
+ FileUtils.copyInputStreamToFile(stream, pdfFileFromStream);
+ tiffFileOfPDF = File.createTempFile("tempTIFF", ".tiff");
+ makeTiffFromPDF(pdfFileFromStream,tiffFileOfPDF, config);
+ if (tiffFileOfPDF.exists()) {
+ long tessStartTime = System.currentTimeMillis();
+ TesseractOCRParser tesseract = new TesseractOCRParser();
+
+ tesseract.parse(FileUtils.openInputStream(tiffFileOfPDF), handler, metadata, context);
+
+ //metadata.set("X-OCR-Applied", "true");
+ metadata.add("X-Parsed-By", TesseractOCRParser.class.getName());
+
+ LOG.debug("Document parsing -- OCR processing time: {} ms", System.currentTimeMillis() - tessStartTime);
+ }
+ } catch (Exception e) {
+ LOG.warn("Error while running OCR over the document");
+ throw e;
+ }
+ finally {
+ if (tiffFileOfPDF != null && tiffFileOfPDF.exists()) {
+ tiffFileOfPDF.delete();
+ }
+ if (pdfFileFromStream.exists()) {
+ pdfFileFromStream.delete();
+ }
+ }
+ }
+
+ static String getImageMagickProg() {
+ return System.getProperty("os.name").startsWith("Windows") ? "convert.exe" : "convert";
+ }
+
+ private File makeTiffFromPDF(File input, File output, ImageMagickConfig config) throws IOException, TikaException {
+ String[] cmd = {config.getImageMagickPath() + getImageMagickProg(),
+ "-density", config.getDensity(), input.getPath(),
+ "-depth", config.getDepth(),
+ "-quality", config.getQuality(),
+ "-background", "white", "+matte",
+ output.getPath()};
+
+ ProcessBuilder pb = new ProcessBuilder(cmd);
+ //setEnv(config, pb);
+ final Process process = pb.start();
+
+ process.getOutputStream().close();
+ InputStream out = process.getInputStream();
+ InputStream err = process.getErrorStream();
+
+ logStream("ImageMagick-stdout", out, input);
+ logStream("ImageMagick-stderr", err, input);
+
+ FutureTask<Integer> waitTask = new FutureTask<>(new Callable<Integer>() {
+ public Integer call() throws Exception {
+ return process.waitFor();
+ }
+ });
+
+ Thread waitThread = new Thread(waitTask);
+ waitThread.start();
+
+ try {
+ waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+ return output;
+ } catch (InterruptedException e) {
+ waitThread.interrupt();
+ process.destroy();
+ Thread.currentThread().interrupt();
+ throw new TikaException("ImageMagick-OCR-PDFParser: interrupted", e);
+
+ } catch (ExecutionException e) {
+ // should not be thrown
+
+ } catch (TimeoutException e) {
+ waitThread.interrupt();
+ process.destroy();
+ throw new TikaException("ImageMagick-OCR-PDFParser: timeout", e);
+ }
+ return null;
+ }
+
+ /**
+ * Starts a thread that reads the contents of the standard output or error
+ * stream of the given process to not block the process. The stream is
+ * closed once fully processed.
+ */
+ private void logStream(final String logType, final InputStream stream, final File file) {
+ new Thread() {
+ public void run() {
+ Reader reader = new InputStreamReader(stream);
+ StringBuilder out = new StringBuilder();
+ char[] buffer = new char[1024];
+ try {
+ for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+ out.append(buffer, 0, n);
+ }
+ } catch (IOException e) {
+ // swallow - this thread only drains the stream for logging
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+
+ String msg = out.toString();
+ LogFactory.getLog(LegacyPdfProcessorParser.class).debug(msg);
+ }
+ }.start();
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/tika/legacy/LegacyTikaProcessor.java b/src/main/java/tika/legacy/LegacyTikaProcessor.java
new file mode 100644
index 0000000..5346e82
--- /dev/null
+++ b/src/main/java/tika/legacy/LegacyTikaProcessor.java
@@ -0,0 +1,108 @@
+package tika.legacy;
+
+import java.io.ByteArrayOutputStream;
+import java.time.OffsetDateTime;
+import java.util.*;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+import tika.model.TikaProcessingResult;
+import tika.processor.AbstractTikaProcessor;
+import javax.annotation.PostConstruct;
+
+
+/**
+ * The "legacy" Tika processor, using parser from CogStack-Pipeline
+ * to provide compatibility with the migration of the pipeline.
+ *
+ * Processes PDF documents by running manually:
+ * - 1x ImageMagick - to create one large temporary TIFF image
+ * - 1x Tesseract - to extract the text from the TIFF
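+ *
+ * Conceptually this corresponds to the following external commands (file names are
+ * illustrative; the convert flags mirror LegacyPdfProcessorParser.makeTiffFromPDF):
+ *   convert -density 300 input.pdf -depth 8 -quality 1 -background white +matte output.tiff
+ *   tesseract output.tiff output -l eng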
+ */
+@Component("legacyTikaProcessor")
+public class LegacyTikaProcessor extends AbstractTikaProcessor {
+
+ @Autowired
+ private LegacyPdfProcessorConfig config;
+
+ /**
+ * Document-type based automatic detection of the parser to be used by Tika
+ */
+ private AutoDetectParser defaultParser;
+ private ParseContext defaultParseContext;
+
+ private Logger log = LoggerFactory.getLogger(LegacyTikaProcessor.class);
+
+
+ /**
+ * Initializes the processor using provided (autowired) configuration
+ */
+ @PostConstruct
+ @Override
+ public void init() throws Exception {
+ defaultParseContext = new ParseContext();
+ defaultParseContext.set(TikaConfig.class, config.getTikaConfig());
+ defaultParseContext.set(LegacyPdfProcessorConfig.class, config);
+
+ TesseractOCRConfig tessConfig = new TesseractOCRConfig();
+ tessConfig.setTimeout(config.getOcrTimeout());
+ defaultParseContext.set(TesseractOCRConfig.class, tessConfig);
+
+ ImageMagickConfig imgConfig = new ImageMagickConfig();
+ imgConfig.setTimeout(config.getConversionTimeout());
+ defaultParseContext.set(ImageMagickConfig.class, imgConfig);
+
+ defaultParser = new AutoDetectParser(config.getTikaConfig());
+ }
+
+ /**
+ * Resets the component with any intermediate data used
+ */
+ @Override
+ public void reset() throws Exception {
+ // actually, we only need to re-initialize all the resources apart from the configuration
+ init();
+ }
+
+ /**
+ * Processes the input stream returning the extracted text
+ */
+ protected TikaProcessingResult processStream(TikaInputStream stream) {
+ TikaProcessingResult result;
+
+ try {
+ ByteArrayOutputStream outStream = new ByteArrayOutputStream(64 * 1024);
+ BodyContentHandler handler = new BodyContentHandler(outStream);
+ Metadata metadata = new Metadata();
+
+ defaultParser.parse(stream, handler, metadata, defaultParseContext);
+
+ // parse the metadata and store the result
+ Map<String, Object> resultMetadata = extractMetadata(metadata);
+ result = TikaProcessingResult.builder()
+ .text(outStream.toString())
+ .metadata(resultMetadata)
+ .success(true)
+ .timestamp(OffsetDateTime.now())
+ .build();
+ }
+ catch (Exception e) {
+ log.error(e.getMessage());
+
+ result = TikaProcessingResult.builder()
+ .error("Exception caught while processing the document: " + e.getMessage())
+ .success(false)
+ .build();
+ }
+
+ return result;
+ }
+}
diff --git a/src/main/java/tika/model/MetadataKeys.java b/src/main/java/tika/model/MetadataKeys.java
new file mode 100644
index 0000000..8c607e7
--- /dev/null
+++ b/src/main/java/tika/model/MetadataKeys.java
@@ -0,0 +1,15 @@
+package tika.model;
+
+/**
+ * Metadata keys that are to be used to extract relevant information
+ * from the document alongside the text.
+ * Note that some of these keys may not be available, depending on the document type.
+ */
+public class MetadataKeys {
+ public final static String CONTENT_TYPE = "Content-Type";
+ public final static String CREATION_DATE = "Creation-Date";
+ public final static String LAST_MODIFIED = "Last-Modified";
+ public final static String OCR_APPLIED = "X-OCR-Applied";
+ public final static String PARSED_BY = "X-Parsed-By";
+ public final static String PAGE_COUNT = "Page-Count";
+}
diff --git a/src/main/java/tika/model/TikaBinaryDocument.java b/src/main/java/tika/model/TikaBinaryDocument.java
new file mode 100644
index 0000000..1c4bf2e
--- /dev/null
+++ b/src/main/java/tika/model/TikaBinaryDocument.java
@@ -0,0 +1,12 @@
+package tika.model;
+
+import lombok.Data;
+
+/**
+ * A simplified representation of a binary document
+ * that can be used as a request payload
+ */
+@Data
+public class TikaBinaryDocument {
+ byte[] content;
+}
diff --git a/src/main/java/tika/model/TikaPackageInformation.java b/src/main/java/tika/model/TikaPackageInformation.java
new file mode 100644
index 0000000..23367e2
--- /dev/null
+++ b/src/main/java/tika/model/TikaPackageInformation.java
@@ -0,0 +1,28 @@
+package tika.model;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonView;
+import common.JsonPropertyAccessView;
+import lombok.Data;
+import org.apache.tika.Tika;
+
+/**
+ * A helper class providing information about the implementation details of the Tika package in use
+ */
+@Data
+@JsonIgnoreProperties(value={"specification_version", "implementation_version"}, allowGetters=true)
+public class TikaPackageInformation {
+
+ @JsonProperty("specification_version")
+ @JsonView(JsonPropertyAccessView.Public.class)
+ String getTikaSpecificationVersion() {
+ return Tika.class.getPackage().getSpecificationVersion();
+ }
+
+ @JsonProperty("implementation_version")
+ @JsonView(JsonPropertyAccessView.Public.class)
+ final String getTikaImplementationVersion() {
+ return Tika.class.getPackage().getImplementationVersion();
+ }
+}
diff --git a/src/main/java/tika/model/TikaProcessingResult.java b/src/main/java/tika/model/TikaProcessingResult.java
new file mode 100644
index 0000000..a8802d2
--- /dev/null
+++ b/src/main/java/tika/model/TikaProcessingResult.java
@@ -0,0 +1,38 @@
+package tika.model;
+
+import com.fasterxml.jackson.annotation.JsonFormat;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.Builder;
+import lombok.Data;
+import org.springframework.format.annotation.DateTimeFormat;
+
+import java.time.OffsetDateTime;
+import java.util.Map;
+
+
+/**
+ * Tika processing result payload
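+ *
+ * An illustrative serialized form (all values are examples only):
+ * {
+ *   "text": "extracted document text",
+ *   "metadata": { "Content-Type": "application/pdf", "X-OCR-Applied": "true" },
+ *   "success": true,
+ *   "timestamp": "2020-01-01T12:00:00.000Z"
+ * }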
+ */
+@Data
+@Builder
+//@JsonAutoDetect(fieldVisibility = JsonAutoDetect.Visibility.ANY)
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class TikaProcessingResult {
+
+ // extracted text from the document
+ String text;
+
+ // document metadata
+ Map<String, Object> metadata;
+
+ // processing status
+ Boolean success;
+
+ // the error message in case processing failed
+ String error;
+
+ // when the document was processed
+ @DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
+ @JsonFormat(pattern = "yyyy-MM-dd'T'HH:mm:ss.SSSXXX")
+ OffsetDateTime timestamp;
+}
diff --git a/src/main/java/tika/processor/AbstractTikaProcessor.java b/src/main/java/tika/processor/AbstractTikaProcessor.java
new file mode 100644
index 0000000..fc93368
--- /dev/null
+++ b/src/main/java/tika/processor/AbstractTikaProcessor.java
@@ -0,0 +1,126 @@
+package tika.processor;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import tika.model.MetadataKeys;
+import tika.model.TikaBinaryDocument;
+import tika.model.TikaProcessingResult;
+
+
+/**
+ * An abstract class for a Tika Processor
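+ *
+ * A minimal subclass sketch (hypothetical class name, not part of the service):
+ *
+ * public class NoOpTikaProcessor extends AbstractTikaProcessor {
+ *     protected TikaProcessingResult processStream(TikaInputStream stream) {
+ *         return TikaProcessingResult.builder()
+ *                 .success(false).error("not implemented").build();
+ *     }
+ * }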
+ */
+public abstract class AbstractTikaProcessor {
+
+ /**
+ * The metadata keys that should be extracted by the processor
+ */
+ private static final String[] metaKeysSingleValue = {MetadataKeys.CONTENT_TYPE, MetadataKeys.CREATION_DATE,
+ MetadataKeys.LAST_MODIFIED, MetadataKeys.OCR_APPLIED};
+ private static final String[] metaKeysMultiValue = {MetadataKeys.PARSED_BY};
+
+
+ /**
+ * Processor lifecycle methods
+ */
+ public void init() throws Exception {}
+
+ public void reset() throws Exception {}
+
+
+ /**
+ * The main documents processing method
+ */
+ protected abstract TikaProcessingResult processStream(TikaInputStream stream);
+
+
+ /**
+ * Wrappers over the main document processing method
+ */
+ public TikaProcessingResult process(final TikaBinaryDocument binaryDoc) {
+ return processStream(TikaInputStream.get(binaryDoc.getContent()));
+ }
+
+ public TikaProcessingResult process(InputStream stream) {
+ return processStream(TikaInputStream.get(stream));
+ }
+
+
+
+ /**
+ * Helper methods
+ * TODO: can be moved to utils
+ */
+ public static int getPageCount(final Metadata docMeta) {
+ Map<String, Object> resultMeta = new HashMap<>();
+ extractPageCount(docMeta, resultMeta);
+
+ if (resultMeta.containsKey(MetadataKeys.PAGE_COUNT)) {
+ return Integer.parseInt(resultMeta.get(MetadataKeys.PAGE_COUNT).toString());
+ }
+ return -1;
+ }
+
+ public static boolean isValidDocumentType(final Map<String, Object> resultMeta) {
+ return !( !resultMeta.containsKey(MetadataKeys.CONTENT_TYPE) ||
+ resultMeta.get(MetadataKeys.CONTENT_TYPE).equals(MediaType.OCTET_STREAM.toString()) ||
+ resultMeta.get(MetadataKeys.CONTENT_TYPE).equals(MediaType.EMPTY.toString()));
+ }
+
+ private static void extractPageCount(final Metadata docMeta, Map<String, Object> resultMeta) {
+ String pgValue = null;
+ if (docMeta.get("xmpTPg:NPages") != null) {
+ pgValue = docMeta.get("xmpTPg:NPages");
+ }
+ else if (docMeta.get("meta:page-count") != null) {
+ pgValue = docMeta.get("meta:page-count");
+ }
+ else if (docMeta.get("exif:PageCount") != null) {
+ pgValue = docMeta.get("exif:PageCount");
+ }
+ else if (docMeta.get("Page-Count") != null) {
+ pgValue = docMeta.get("Page-Count");
+ }
+
+ if (pgValue != null) {
+ resultMeta.put(MetadataKeys.PAGE_COUNT, pgValue);
+ }
+ }
+
+ private static void extractOcrApplied(final Metadata docMeta, Map<String, Object> resultMeta) {
+ if (docMeta.get("X-Parsed-By") != null
+ && (Arrays.asList(docMeta.getValues("X-Parsed-By")).contains(TesseractOCRParser.class.getName())
+ // note that some parsers also add a 'class ' prefix to the name: 'class org...
+ || Arrays.asList(docMeta.getValues("X-Parsed-By")).contains(TesseractOCRParser.class.toString()))) {
+ resultMeta.put(MetadataKeys.OCR_APPLIED, "true");
+ }
+ else {
+ resultMeta.put(MetadataKeys.OCR_APPLIED, "false");
+ }
+ }
+
+ protected Map<String, Object> extractMetadata(final Metadata docMeta) {
+ Map<String, Object> resultMeta = new HashMap<>();
+ Arrays.stream(metaKeysSingleValue).forEach(name -> {
+ if (docMeta.get(name) != null)
+ resultMeta.put(name, docMeta.get(name));
+ });
+
+ Arrays.stream(metaKeysMultiValue).forEach(name -> {
+ if (docMeta.getValues(name) != null)
+ resultMeta.put(name, docMeta.getValues(name));
+ });
+
+ extractPageCount(docMeta, resultMeta);
+
+ extractOcrApplied(docMeta, resultMeta);
+
+ return resultMeta;
+ }
+}
diff --git a/src/main/java/tika/processor/CompositeTikaProcessor.java b/src/main/java/tika/processor/CompositeTikaProcessor.java
new file mode 100644
index 0000000..531529f
--- /dev/null
+++ b/src/main/java/tika/processor/CompositeTikaProcessor.java
@@ -0,0 +1,276 @@
+package tika.processor;
+
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.time.OffsetDateTime;
+import java.util.*;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+import tika.legacy.ImageMagickConfig;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.legacy.LegacyPdfProcessorParser;
+import tika.model.TikaProcessingResult;
+import javax.annotation.PostConstruct;
+
+
+/**
+ * A default, composite Tika processor.
+ *
+ * In contrast to "legacy" processor it uses the default approach implemented in Tika, i.e. when
+ * parsing PDF documents, it runs the processing independently per each PDF page,
+ * and hence running Tesseract Page-Count times.
+ */
+@Component("compositeTikaProcessor")
+public class CompositeTikaProcessor extends AbstractTikaProcessor {
+
+ @Autowired
+ private CompositeTikaProcessorConfig compositeTikaProcessorConfig;
+
+ @Autowired
+ private LegacyPdfProcessorConfig legacyPdfProcessorConfig;
+
+ /**
+ In order to properly handle PDF documents and OCR we need three separate parsers:
+ - a generic parser (for any, non-PDF document type),
+ - one that will extract text only from PDFs,
+ - one that will apply OCR on PDFs (when stored only images).
+
+ In the default configuration of PDFParser, OCR is disabled when extracting text from PDFs. However, OCR is
+ enabled when extracting text from documents of image type. When using the default parser with OCR enabled
+ (strategy: extract both text and OCR), it will always apply OCR on PDFs, even when they contain only text.
+
+ We would also like to know when OCR was applied as it will affect the accuracy of the extracted text that will be
+ passed to the downstream analysis applications.
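+
+ A sketch of the resulting flow for a PDF input (names refer to the parser fields below):
+ 1. run pdfTextParser (OCR disabled) to extract any embedded text;
+ 2. if fewer than pdfMinDocTextLength characters were extracted and more than
+ pdfMinDocByteSize bytes were read, re-parse with pdfOcrParser (OCR), or with
+ pdfSinglePageOcrParser for single-page documents (when enabled).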
+ */
+
+ // common tika and parsers configuration
+ private TikaConfig tikaConfig;
+ private TesseractOCRConfig tessConfig;
+
+ // the default, generic parser for handling all document types (except PDF)
+ private AutoDetectParser defaultParser;
+ private ParseContext defaultParseContext;
+
+ // the default parser for PDFs (no OCR)
+ private PDFParser pdfTextParser;
+ private ParseContext pdfTextParseContext;
+
+ // the parser to extract text from PDFs using OCR
+ private PDFParser pdfOcrParser;
+ private ParseContext pdfOcrParseContext;
+
+ // the parser to extract text from single-page PDFs using OCR
+ // (used to strip off clutter from LibreOffice-generated PDFs that contain just images)
+ private LegacyPdfProcessorParser pdfSinglePageOcrParser;
+ private ParseContext pdfSinglePageOcrParseContext;
+
+
+ private Logger log = LoggerFactory.getLogger(CompositeTikaProcessor.class);
+
+
+ @PostConstruct
+ @Override
+ public void init() throws Exception {
+
+ tikaConfig = new TikaConfig();
+
+ initializeTesseractConfig();
+
+ initializeDefaultParser();
+
+ initializePdfTextOnlyParser();
+
+ initializePdfOcrParser();
+
+ if (compositeTikaProcessorConfig.isUseLegacyOcrParserForSinglePageDocuments()) {
+ initializePdfLegacyOcrParser();
+ }
+ }
+
+ @Override
+ public void reset() throws Exception {
+ // actually, we only need to re-initialize all the resources apart from the configuration
+ init();
+ }
+
+ protected TikaProcessingResult processStream(TikaInputStream stream) {
+ final int MIN_TEXT_BUFFER_SIZE = 1024;
+
+ TikaProcessingResult result;
+ try {
+ ByteArrayOutputStream outStream = new ByteArrayOutputStream(MIN_TEXT_BUFFER_SIZE);
+ BodyContentHandler handler = new BodyContentHandler(outStream);
+ Metadata metadata = new Metadata();
+
+ // mark the stream for multi-pass processing
+ if (stream.markSupported()) {
+ stream.mark(Integer.MAX_VALUE);
+ }
+
+ // try to detect whether the document is PDF
+ if (isDocumentOfPdfType(stream)) {
+
+ // firstly try the default parser
+ pdfTextParser.parse(stream, handler, metadata, pdfTextParseContext);
+
+ // check whether enough characters were read / extracted and whether we read enough bytes from the stream
+ // (images embedded in the document occupy much more space than raw text)
+ if (outStream.size() < compositeTikaProcessorConfig.getPdfMinDocTextLength()
+ && stream.getPosition() > compositeTikaProcessorConfig.getPdfMinDocByteSize()) {
+
+ // since we are performing a second pass over the document, we need to reset the cursor position
+ // in both input and output streams
+ stream.reset();
+ outStream.reset();
+
+ final boolean useOcrLegacyParser = compositeTikaProcessorConfig.isUseLegacyOcrParserForSinglePageDocuments()
+ && getPageCount(metadata) == 1;
+
+ // TODO: Q: shall we use a clean metadata or re-use some of the previously parsed fields???
+ handler = new BodyContentHandler(outStream);
+ metadata = new Metadata();
+
+ if (useOcrLegacyParser) {
+ pdfSinglePageOcrParser.parse(stream, handler, metadata, pdfSinglePageOcrParseContext);
+
+ // since we use the parser manually, update the metadata with the name of the parser class used
+ metadata.add("X-Parsed-By", LegacyPdfProcessorParser.class.getName());
+ }
+ else {
+ pdfOcrParser.parse(stream, handler, metadata, pdfOcrParseContext);
+
+ // since we use the parser manually, update the metadata with the name of the parser class used
+ metadata.add("X-Parsed-By", PDFParser.class.getName());
+ }
+ }
+ else {
+ // since we use the parser manually, update the metadata with the name of the parser class used
+ metadata.add("X-Parsed-By", PDFParser.class.getName());
+ }
+ }
+ else {
+ // otherwise, run default documents parser
+ defaultParser.parse(stream, handler, metadata, defaultParseContext);
+ }
+
+ // parse the metadata and store the result
+ Map<String, Object> resultMeta = extractMetadata(metadata);
+ result = TikaProcessingResult.builder()
+ .text(outStream.toString())
+ .metadata(resultMeta)
+ .success(true)
+ .timestamp(OffsetDateTime.now())
+ .build();
+ }
+ catch (Exception e) {
+ log.error(e.getMessage());
+
+ result = TikaProcessingResult.builder()
+ .error("Exception caught while processing the document: " + e.getMessage())
+ .success(false)
+ .build();
+ }
+
+ return result;
+ }
+
+
+ private boolean isDocumentOfPdfType(InputStream stream) throws Exception {
+ Metadata metadata = new Metadata();
+ MediaType mediaType = defaultParser.getDetector().detect(stream, metadata);
+
+ return mediaType.equals(MediaType.application("pdf"));
+ }
+
+
+ private void initializeTesseractConfig() {
+ tessConfig = new TesseractOCRConfig();
+
+ tessConfig.setTimeout(compositeTikaProcessorConfig.getOcrTimeout());
+ tessConfig.setApplyRotation(compositeTikaProcessorConfig.isOcrApplyRotation());
+ if (compositeTikaProcessorConfig.isOcrEnableImageProcessing()) {
+ tessConfig.setEnableImageProcessing(1);
+ }
+ else {
+ tessConfig.setEnableImageProcessing(0);
+ }
+ tessConfig.setLanguage(compositeTikaProcessorConfig.getOcrLanguage());
+ }
+
+
+ private void initializeDefaultParser() {
+ defaultParser = new AutoDetectParser(tikaConfig);
+
+ defaultParseContext = new ParseContext();
+ defaultParseContext.set(TikaConfig.class, tikaConfig);
+ defaultParseContext.set(TesseractOCRConfig.class, tessConfig);
+ defaultParseContext.set(Parser.class, defaultParser); //need to add this to make sure recursive parsing happens!
+ }
+
+
+ private void initializePdfTextOnlyParser() {
+ PDFParserConfig pdfTextOnlyConfig = new PDFParserConfig();
+ pdfTextOnlyConfig.setExtractInlineImages(false);
+ pdfTextOnlyConfig.setExtractUniqueInlineImagesOnly(false); // do not extract multiple inline images
+ pdfTextOnlyConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+
+ pdfTextParser = new PDFParser();
+ pdfTextParseContext = new ParseContext();
+ pdfTextParseContext.set(TikaConfig.class, tikaConfig);
+ pdfTextParseContext.set(PDFParserConfig.class, pdfTextOnlyConfig);
+ //pdfTextParseContext.set(Parser.class, defaultParser); //need to add this to make sure recursive parsing happens!
+ }
+
+
+ private void initializePdfOcrParser() {
+ PDFParserConfig pdfOcrConfig = new PDFParserConfig();
+ pdfOcrConfig.setExtractUniqueInlineImagesOnly(false); // do not extract multiple inline images
+ if (compositeTikaProcessorConfig.isPdfOcrOnlyStrategy()) {
+ pdfOcrConfig.setExtractInlineImages(false);
+ pdfOcrConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+ }
+ else {
+ pdfOcrConfig.setExtractInlineImages(true);
+ // warning: note that with 'OCR_AND_TEXT_EXTRACTION' the content can be duplicated
+ pdfOcrConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION);
+ }
+
+ pdfOcrParser = new PDFParser();
+ pdfOcrParseContext = new ParseContext();
+ pdfOcrParseContext.set(TikaConfig.class, tikaConfig);
+ pdfOcrParseContext.set(PDFParserConfig.class, pdfOcrConfig);
+ pdfOcrParseContext.set(TesseractOCRConfig.class, tessConfig);
+ //pdfOcrParseContext.set(Parser.class, defaultParser); //need to add this to make sure recursive parsing happens!
+ }
+
+ private void initializePdfLegacyOcrParser() {
+ pdfSinglePageOcrParser = new LegacyPdfProcessorParser();
+
+ pdfSinglePageOcrParseContext = new ParseContext();
+ pdfSinglePageOcrParseContext.set(TikaConfig.class, tikaConfig);
+ pdfSinglePageOcrParseContext.set(LegacyPdfProcessorConfig.class, legacyPdfProcessorConfig);
+
+ TesseractOCRConfig tessConfig = new TesseractOCRConfig();
+ tessConfig.setTimeout(legacyPdfProcessorConfig.getOcrTimeout());
+ pdfSinglePageOcrParseContext.set(TesseractOCRConfig.class, tessConfig);
+
+ ImageMagickConfig imgConfig = new ImageMagickConfig();
+ imgConfig.setTimeout(legacyPdfProcessorConfig.getConversionTimeout());
+ pdfSinglePageOcrParseContext.set(ImageMagickConfig.class, imgConfig);
+
+ //pdfOcrParseContext.set(Parser.class, defaultParser); //need to add this to make sure recursive parsing happens!
+ }
+}
diff --git a/src/main/java/tika/processor/CompositeTikaProcessorConfig.java b/src/main/java/tika/processor/CompositeTikaProcessorConfig.java
new file mode 100644
index 0000000..2a88507
--- /dev/null
+++ b/src/main/java/tika/processor/CompositeTikaProcessorConfig.java
@@ -0,0 +1,63 @@
+package tika.processor;
+
+import com.fasterxml.jackson.annotation.JsonView;
+import common.JsonPropertyAccessView;
+import lombok.Data;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Configuration;
+
+
+/**
+ * The composite PDF processor configuration
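+ *
+ * The values are bound from application.yaml, e.g. (defaults shown):
+ *
+ * tika.parsers:
+ *   tesseract-ocr.timeout: 120
+ *   pdf-ocr-parser.ocr-only-strategy: true
+ *   pdf-ocr-parser.min-doc-text-length: 100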
+ */
+@Data
+@Configuration
+public class CompositeTikaProcessorConfig {
+
+ // the timeout value (s) when performing OCR over documents
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.tesseract-ocr.timeout:120}")
+ private int ocrTimeout;
+
+ // apply image processing techniques during documents conversion (using ImageMagick)
+ // required to enable applying rotation (see below)
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.tesseract-ocr.enable-image-processing:false}")
+ private boolean ocrEnableImageProcessing;
+
+ // apply de-rotation of documents before processing
+ // can be quite computationally expensive (runs as an external python script)
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.tesseract-ocr.apply-rotation:false}")
+ private boolean ocrApplyRotation;
+
+ // the language used in the OCR for corrections
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.tesseract-ocr.language:eng}")
+ private String ocrLanguage;
+
+ // whether to apply OCR only on the documents or to also extract the embedded text (if present)
+ // warning: note that with 'OCR_AND_TEXT_EXTRACTION' the content can be duplicated
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.pdf-ocr-parser.ocr-only-strategy:true}")
+ private boolean pdfOcrOnlyStrategy;
+
+ // apply OCR only when the text extracted from a previously parsed document (without OCR)
+ // was shorter than N characters
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.pdf-ocr-parser.min-doc-text-length:100}")
+ private int pdfMinDocTextLength;
+
+ // apply OCR only when at least N bytes were read from the previously parsed document (without OCR)
+ // (images embedded in a document occupy much more space than raw text)
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.pdf-ocr-parser.min-doc-byte-size:10000}")
+ private int pdfMinDocByteSize;
+
+ // use a legacy parser for applying OCR for single-page PDF documents
+ // (NB: when exporting single-page PDFs from LibreOffice that contain only one image,
+ // some additional clutter may be embedded in the PDF content)
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.use-legacy-ocr-parser-for-single-page-doc:false}")
+ private boolean useLegacyOcrParserForSinglePageDocuments;
+}
diff --git a/src/main/resources/application.yaml b/src/main/resources/application.yaml
new file mode 100644
index 0000000..907c149
--- /dev/null
+++ b/src/main/resources/application.yaml
@@ -0,0 +1,47 @@
+# application configuration
+#
+application:
+ version: 0.1.0
+
+
+# general spring boot configuration
+#
+server:
+ port: 8090
+
+spring:
+ servlet:
+ multipart.max-file-size: 100MB
+ multipart.max-request-size: 100MB
+
+
+# tika configuration
+#
+tika:
+ parsers:
+ tesseract-ocr:
+ language: eng
+ timeout: 300
+ enable-image-processing: false
+ apply-rotation: false
+
+ pdf-ocr-parser:
+ ocr-only-strategy: true
+ min-doc-text-length: 100
+ min-doc-byte-size: 10000
+ use-legacy-ocr-parser-for-single-page-doc: false
+
+ legacy-pdf-parser:
+ image-magick:
+ timeout: 300
+ tesseract-ocr:
+ timeout: 300
+ min-doc-text-length: 100
+
+
+# documents processing configuration
+#
+processing:
+ use-legacy-tika-processor-as-default: true
+ fail-on-empty-files: false
+ fail-on-non-document-types: true
diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml
new file mode 100644
index 0000000..2527c44
--- /dev/null
+++ b/src/main/resources/logback.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%d{dd-MM-yyyy HH:mm:ss.SSS} -- %highlight(%-5level) : %magenta([%thread]) %logger{36}.%M - %msg%n</pattern>
+        </encoder>
+    </appender>
+    <root level="INFO">
+        <appender-ref ref="STDOUT"/>
+    </root>
+</configuration>
\ No newline at end of file
diff --git a/src/main/resources/tika-config/legacy-parser-config.xml b/src/main/resources/tika-config/legacy-parser-config.xml
new file mode 100644
index 0000000..c414d03
--- /dev/null
+++ b/src/main/resources/tika-config/legacy-parser-config.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+            <mime-exclude>application/pdf</mime-exclude>
+        </parser>
+
+        <parser class="tika.legacy.LegacyPdfProcessorParser">
+            <mime>application/pdf</mime>
+        </parser>
+    </parsers>
+</properties>
\ No newline at end of file
diff --git a/src/test/java/service/ServiceControllerDocumentMultipartFileTests.java b/src/test/java/service/ServiceControllerDocumentMultipartFileTests.java
new file mode 100644
index 0000000..1c92363
--- /dev/null
+++ b/src/test/java/service/ServiceControllerDocumentMultipartFileTests.java
@@ -0,0 +1,69 @@
+package service;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.http.HttpStatus;
+import org.springframework.mock.web.MockMultipartFile;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import org.springframework.test.web.servlet.MockMvc;
+import org.springframework.test.web.servlet.MvcResult;
+import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
+import service.controller.TikaServiceConfig;
+import service.model.ServiceResponseContent;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.model.TikaProcessingResult;
+import tika.processor.CompositeTikaProcessorConfig;
+import java.io.InputStream;
+
+import static org.junit.Assert.*;
+import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
+
+
+/**
+ * Implements document processing tests for the Service Controller.
+ * A document is passed as a multi-part file.
+ */
+@SpringBootTest(classes = TikaServiceApplication.class)
+@RunWith(SpringRunner.class)
+@AutoConfigureMockMvc
+@ContextConfiguration(classes = {TikaServiceConfig.class, LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public class ServiceControllerDocumentMultipartFileTests extends ServiceControllerDocumentTests {
+
+ @Autowired
+ private MockMvc mockMvc;
+
+ final private String PROCESS_FILE_ENDPOINT_URL = "/api/process_file";
+
+ @Override
+ protected TikaProcessingResult sendProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception {
+ return sendMultipartFileProcessingRequest(docPath, expectedStatus);
+ }
+
+ private TikaProcessingResult sendMultipartFileProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception {
+ InputStream stream = utils.getDocumentStream(docPath);
+ MockMultipartFile multipartFile = new MockMultipartFile("file", docPath, "multipart/form-data", stream);
+
+ MvcResult result = mockMvc.perform(MockMvcRequestBuilders.multipart(PROCESS_FILE_ENDPOINT_URL)
+ .file(multipartFile))
+ //.param("some-random", "4"))
+ .andExpect(status().is(expectedStatus.value()))
+ .andReturn();
+ //.andExpect(content().string("success"));
+
+ assertEquals(expectedStatus.value(), result.getResponse().getStatus());
+ assertNotNull(result.getResponse().getContentAsString());
+
+ // parse content
+ ObjectMapper mapper = new ObjectMapper();
+ mapper.registerModule(new JavaTimeModule());
+ TikaProcessingResult tikaResult = mapper.readValue(result.getResponse().getContentAsString(),
+ ServiceResponseContent.class).getResult();
+
+ return tikaResult;
+ }
+}
diff --git a/src/test/java/service/ServiceControllerDocumentStreamTests.java b/src/test/java/service/ServiceControllerDocumentStreamTests.java
new file mode 100644
index 0000000..6bf1ff3
--- /dev/null
+++ b/src/test/java/service/ServiceControllerDocumentStreamTests.java
@@ -0,0 +1,68 @@
+package service;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.http.HttpStatus;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import org.springframework.test.web.servlet.MockMvc;
+import org.springframework.test.web.servlet.MvcResult;
+import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
+import service.controller.TikaServiceConfig;
+import service.model.ServiceResponseContent;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.model.TikaProcessingResult;
+import tika.processor.CompositeTikaProcessorConfig;
+import java.io.InputStream;
+
+import static org.junit.Assert.*;
+import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
+
+
+/**
+ * Implements document processing tests for the Service Controller.
+ * A document is passed as an octet stream.
+ */
+@SpringBootTest(classes = TikaServiceApplication.class)
+@RunWith(SpringRunner.class)
+@AutoConfigureMockMvc
+@ContextConfiguration(classes = {TikaServiceConfig.class, LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public class ServiceControllerDocumentStreamTests extends ServiceControllerDocumentTests {
+
+ @Autowired
+ private MockMvc mockMvc;
+
+ final private String PROCESS_ENDPOINT_URL = "/api/process";
+
+ @Override
+ protected TikaProcessingResult sendProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception {
+ return sendFileProcessingRequest(docPath, expectedStatus);
+ }
+
+ private TikaProcessingResult sendFileProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception {
+ InputStream stream = utils.getDocumentStream(docPath);
+
+ byte[] content = stream.readAllBytes();
+
+ MvcResult result = mockMvc.perform(MockMvcRequestBuilders.post(PROCESS_ENDPOINT_URL)
+ .content(content))
+ //.param("some-random", "4"))
+ .andExpect(status().is(expectedStatus.value()))
+ .andReturn();
+
+ assertEquals(expectedStatus.value(), result.getResponse().getStatus());
+ assertNotNull(result.getResponse().getContentAsString());
+
+ // parse content
+ ObjectMapper mapper = new ObjectMapper();
+ mapper.registerModule(new JavaTimeModule());
+ TikaProcessingResult tikaResult = mapper.readValue(result.getResponse().getContentAsString(),
+ ServiceResponseContent.class).getResult();
+
+ return tikaResult;
+ }
+}
diff --git a/src/test/java/service/ServiceControllerDocumentTests.java b/src/test/java/service/ServiceControllerDocumentTests.java
new file mode 100644
index 0000000..aa3fd88
--- /dev/null
+++ b/src/test/java/service/ServiceControllerDocumentTests.java
@@ -0,0 +1,71 @@
+package service;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.http.HttpStatus;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import service.controller.TikaServiceConfig;
+import tika.DocumentProcessorTests;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.model.TikaProcessingResult;
+import tika.processor.CompositeTikaProcessorConfig;
+
+import static org.junit.Assert.*;
+
+
+/**
+ * Implements document processing tests for the Service Controller, extending the set of available tests
+ * present in DocumentProcessorTests
+ */
+@SpringBootTest(classes = TikaServiceApplication.class)
+@RunWith(SpringRunner.class)
+@AutoConfigureMockMvc
+@ContextConfiguration(classes = {TikaServiceConfig.class, LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public abstract class ServiceControllerDocumentTests extends DocumentProcessorTests {
+
+ @Autowired
+ TikaServiceConfig serviceConfig;
+
+
+ protected abstract TikaProcessingResult sendProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception;
+
+ @Override
+ protected TikaProcessingResult processDocument(final String docPath) throws Exception {
+ return sendProcessingRequest(docPath, HttpStatus.OK);
+ }
+
+
+ /**
+ * The actual tests start from here.
+ */
+
+ @Override
+ public void testExtractPdfEx1Encrypted() throws Exception {
+ final String docPath = "pdf/ex1_enc.pdf";
+
+ TikaProcessingResult result = sendProcessingRequest(docPath, HttpStatus.BAD_REQUEST);
+
+ // extraction from encrypted PDF will fail with the proper error message
+ assertFalse(result.getSuccess());
+ assertTrue(result.getError().contains("document is encrypted"));
+ }
+
+
+ @Test
+ public void testExtractEmptyPdfFile() throws Exception {
+ final String docPath = "invalid/pdf_empty.pdf";
+
+ assertFalse(serviceConfig.isFailOnEmptyFiles());
+
+ // extraction should pass but with error
+ TikaProcessingResult result = sendProcessingRequest(docPath, HttpStatus.OK);
+ assertFalse(result.getSuccess());
+ assertTrue(result.getError().contains("Empty"));
+ }
+}
diff --git a/src/test/java/service/ServiceControllerTests.java b/src/test/java/service/ServiceControllerTests.java
new file mode 100644
index 0000000..f55c3b2
--- /dev/null
+++ b/src/test/java/service/ServiceControllerTests.java
@@ -0,0 +1,62 @@
+package service;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.http.HttpStatus;
+import org.springframework.http.MediaType;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import org.springframework.test.web.servlet.MockMvc;
+import org.springframework.test.web.servlet.MvcResult;
+import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
+import service.controller.TikaServiceConfig;
+import service.model.ServiceInformation;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.processor.CompositeTikaProcessorConfig;
+
+import static org.junit.Assert.assertEquals;
+
+
+/**
+ * Implements general tests for the Service Controller
+ * (no document processing).
+ */
+@SpringBootTest(classes = TikaServiceApplication.class)
+@RunWith(SpringRunner.class)
+@AutoConfigureMockMvc
+@ContextConfiguration(classes = {TikaServiceConfig.class, LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public class ServiceControllerTests {
+
+ @Autowired
+ private MockMvc mockMvc;
+
+ @Autowired
+ private ServiceInformation serviceInfo;
+
+ private final String INFO_ENDPOINT_URL = "/api/info";
+
+
+ @Test
+ public void testGetApplicationInfo() throws Exception {
+ MvcResult result = mockMvc.perform(MockMvcRequestBuilders
+ .get(INFO_ENDPOINT_URL)
+ .accept(MediaType.APPLICATION_JSON_UTF8))
+ .andReturn();
+
+ // check response status
+ int status = result.getResponse().getStatus();
+ assertEquals(HttpStatus.OK.value(), status);
+
+ // parse content
+ ObjectMapper mapper = new ObjectMapper();
+ ServiceInformation response = mapper.readValue(result.getResponse().getContentAsString(),
+ ServiceInformation.class);
+
+ // check example content
+ assertEquals(serviceInfo.getServiceConfig().getAppVersion(), response.getServiceConfig().getAppVersion());
+ }
+}
diff --git a/src/test/java/tika/CompositeTikaProcessorTests.java b/src/test/java/tika/CompositeTikaProcessorTests.java
new file mode 100644
index 0000000..b528219
--- /dev/null
+++ b/src/test/java/tika/CompositeTikaProcessorTests.java
@@ -0,0 +1,43 @@
+package tika;
+
+import org.junit.*;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.processor.AbstractTikaProcessor;
+import tika.processor.CompositeTikaProcessor;
+import tika.processor.CompositeTikaProcessorConfig;
+
+
+/**
+ * Implements the tests using CompositeTikaProcessor as the document processor.
+ */
+@SpringBootTest(classes = CompositeTikaProcessor.class)
+@RunWith(SpringRunner.class)
+@ContextConfiguration(classes = {LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public class CompositeTikaProcessorTests extends DocumentProcessorTests {
+
+ @Autowired
+ LegacyPdfProcessorConfig legacyProcessorConfig;
+
+ @Autowired
+ CompositeTikaProcessorConfig compositeProcessorConfig;
+
+ @Autowired
+ CompositeTikaProcessor processor;
+
+
+ @Override
+ protected AbstractTikaProcessor getProcessor() {
+ return processor;
+ }
+
+ @After
+ public void reset() throws Exception {
+ processor.reset();
+ }
+}
+
diff --git a/src/test/java/tika/DocumentProcessorTests.java b/src/test/java/tika/DocumentProcessorTests.java
new file mode 100644
index 0000000..d752636
--- /dev/null
+++ b/src/test/java/tika/DocumentProcessorTests.java
@@ -0,0 +1,234 @@
+package tika;
+
+import org.junit.Ignore;
+import org.junit.Test;
+import tika.model.TikaProcessingResult;
+import tika.processor.AbstractTikaProcessor;
+import java.io.InputStream;
+
+import static org.junit.Assert.*;
+
+
+/**
+ * All the document processor tests are implemented in this abstract class in order to keep the
+ * rationale behind the tests and the expected results in a single place.
+ */
+public abstract class DocumentProcessorTests {
+
+ protected DocumentTestUtils utils = new DocumentTestUtils();
+
+ /**
+ * Helper methods used in tests that can be overridden in child classes.
+ */
+ protected AbstractTikaProcessor getProcessor() { return null; }
+
+ protected TikaProcessingResult processDocument(final String docPath) throws Exception {
+ AbstractTikaProcessor processor = getProcessor();
+ assertNotNull(processor);
+
+ InputStream stream = utils.getDocumentStream(docPath);
+ return processor.process(stream);
+ }
+
+
+ /**
+ * The actual tests start from here.
+ */
+
+ @Test
+ public void testGenericExtractPattern1SourceTxt() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".txt";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ // test parsing status
+ String parsedString = result.getText();
+ assertEquals(310, parsedString.length());
+
+ // test metadata
+ utils.assertOcrApplied(false, result);
+ }
+
+ @Test
+ public void testGenericExtractPattern1Doc() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".doc";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ utils.testContentMatch(result, docPathPrefix);
+
+ // test metadata
+ utils.assertPageCount(1, result);
+ utils.assertOcrApplied(false, result);
+ }
+
+ @Test
+ public void testGenericExtractPattern1Docx() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".docx";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ utils.testContentMatch(result, docPathPrefix);
+
+ // test metadata
+ utils.assertPageCount(1, result);
+ utils.assertOcrApplied(false, result);
+ }
+
+ @Test
+ public void testGenericExtractPattern1Odt() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".odt";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ utils.testContentMatch(result, docPathPrefix);
+
+ // test metadata
+ utils.assertPageCount(1, result);
+ utils.assertOcrApplied(false, result);
+ }
+
+ @Test
+ public void testGenericExtractPattern1Rtf() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".rtf";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ utils.testContentMatch(result, docPathPrefix);
+
+ // test metadata
+ // rtf does not contain page count
+ utils.assertOcrApplied(false, result);
+ }
+
+ @Test
+ public void testGenericExtractPattern1Png() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".png";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ utils.testContentMatch(result, docPathPrefix);
+
+ // test metadata
+ // png does not contain page count
+ utils.assertOcrApplied(true, result);
+ }
+
+ @Test
+ public void testGenericExtractPattern1Pdf() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".pdf";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ utils.testContentMatch(result, docPathPrefix);
+
+ // test metadata
+ utils.assertPageCount(1, result);
+ utils.assertOcrApplied(false, result); // this pdf contains text-only
+ }
+
+ @Test
+ public void testExtractPdfEx1WithoutOcr() throws Exception {
+ final String docPath = "pdf/ex1.pdf";
+
+ TikaProcessingResult result = processDocument(docPath);
+
+ // check an example string
+ assertTrue(result.getSuccess());
+ assertTrue(result.getText().contains("An Example Paper"));
+
+ // test metadata
+ utils.assertPageCount(10, result);
+ utils.assertOcrApplied(false, result); // this pdf contains text-only
+ }
+
+ @Test
+ public void testExtractPdfEx1Encrypted() throws Exception {
+ final String docPath = "pdf/ex1_enc.pdf";
+
+ TikaProcessingResult result = processDocument(docPath);
+
+ // extraction from encrypted PDF will fail with the proper error message
+ assertFalse(result.getSuccess());
+ assertTrue(result.getError().contains("document is encrypted"));
+ }
+
+ @Test
+ public void testExtractPdfEx2WithOcr() throws Exception {
+ final String docPath = "pdf/ex2_ocr.pdf";
+
+ TikaProcessingResult result = processDocument(docPath);
+
+ // check the content
+ assertTrue(result.getSuccess());
+ final String parsedString = result.getText();
+ assertTrue(parsedString.length() > 0);
+
+ // example text from the first page
+ assertTrue(parsedString.contains("Father or mother"));
+ // example text from the second page
+ assertTrue(parsedString.contains("how you have determined who is the Nearest"));
+
+ // test metadata
+ utils.assertPageCount(2, result);
+ utils.assertOcrApplied(true, result);
+ }
+
+
+ // TODO: need to double-check how to handle invalid TIFFs or image files
+ @Ignore
+ @Test
+ public void testExtractTiffWithOCR() throws Exception {
+ InputStream stream = utils.getDocumentZipStream("invalid/tiff_multipage_spp2.tiff.zip", "tiff_multipage_spp2.tiff");
+
+ AbstractTikaProcessor processor = getProcessor();
+ TikaProcessingResult result = processor.process(stream);
+ assertTrue(result.getSuccess());
+
+ // HINT: the test is expected to fail, either because the TIFF file is invalid
+ // or because an additional pre-processing step for the image would be required
+
+ // test parsing status
+ String parsedString = result.getText();
+ assertTrue(parsedString.length() > 0);
+
+ // test metadata
+ utils.assertPageCount(6, result);
+
+ // test example content
+ // - from first page
+ assertTrue(parsedString.contains("Sample Narrative Report"));
+ }
+
+
+ //TODO: need to create a proper docx encrypted file
+ @Ignore
+ @Test
+ public void testExtractWordEncrypted() throws Exception {
+ InputStream stream = utils.getDocumentStream("word_enc_noerror.docx");
+
+ AbstractTikaProcessor processor = getProcessor();
+ TikaProcessingResult result = processor.process(stream);
+
+ // extraction from an encrypted DOCX will succeed, but with empty content and no error message
+ // uses: org.apache.tika.parser.microsoft.OfficeParser
+ //TODO: this one needs an internal fix or further investigation
+ assertTrue(result.getSuccess());
+ }
+}
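
Extending coverage to another input format follows the same pattern used above: process the
fixture and compare against the shared ground-truth text. A sketch of what a test for a
hypothetical HTML fixture could look like (generic/pat_id_1.html is not part of this patch),
added alongside the other pattern-1 tests:

    @Test
    public void testGenericExtractPattern1Html() throws Exception {
        // NOTE: hypothetical fixture; generic/pat_id_1.html does not exist in this patch
        final String docPathPrefix = "generic/pat_id_1";
        final String docExt = ".html";

        TikaProcessingResult result = processDocument(docPathPrefix + docExt);
        assertTrue(result.getSuccess());

        utils.testContentMatch(result, docPathPrefix);

        // markup is extracted as text directly, so no OCR should be applied
        utils.assertOcrApplied(false, result);
    }
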
diff --git a/src/test/java/tika/DocumentTestUtils.java b/src/test/java/tika/DocumentTestUtils.java
new file mode 100644
index 0000000..cae8308
--- /dev/null
+++ b/src/test/java/tika/DocumentTestUtils.java
@@ -0,0 +1,75 @@
+package tika;
+
+import tika.model.MetadataKeys;
+import tika.model.TikaProcessingResult;
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.Map;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+
+import static org.junit.Assert.*;
+
+
+/**
+ * Helper utilities used in tests
+ */
+public class DocumentTestUtils {
+ public InputStream getDocumentStream(final String docName) throws Exception {
+ final String fullPath = "tika/docs/" + docName;
+ InputStream stream = getClass().getClassLoader().getResourceAsStream(fullPath);
+ assertNotNull(stream);
+ return new ByteArrayInputStream(stream.readAllBytes());
+ }
+
+ public InputStream getDocumentZipStream(final String archiveName, final String zipEntry) throws Exception {
+ final String fullPath = "tika/docs/" + archiveName;
+ final ZipEntry entry = new ZipEntry(zipEntry);
+ ZipFile zf = new ZipFile(getClass().getClassLoader().getResource(fullPath).getPath());
+ InputStream stream = zf.getInputStream(entry);
+ assertNotNull(stream);
+ return stream;
+ }
+
+ public String getDocumentText(final String path) throws Exception {
+ return new String(getDocumentStream(path).readAllBytes());
+ }
+
+
+ public void assertContentMatches(final String expected, final String actual) {
+ // note that this check is a very naive method of content comparison: we only
+ // strip all non-alphanumeric characters and compare the remaining content verbatim
+ final String regexPattern = "[^\\dA-Za-z]";
+ final String s1parsed = expected.replaceAll(regexPattern, "");
+ final String s2parsed = actual.replaceAll(regexPattern, "");
+ assertEquals(s1parsed, s2parsed);
+ }
+
+ public void assertPageCount(final int expectedPageCount, TikaProcessingResult result) {
+ Map metadata = result.getMetadata();
+ assertTrue(metadata.containsKey(MetadataKeys.PAGE_COUNT));
+ assertEquals(expectedPageCount, Integer.parseInt(metadata.get(MetadataKeys.PAGE_COUNT).toString()));
+ }
+
+ public void assertOcrApplied(final boolean expectedStatus, TikaProcessingResult result) {
+ Map metadata = result.getMetadata();
+ if (metadata.containsKey(MetadataKeys.OCR_APPLIED)) {
+ assertEquals(expectedStatus, Boolean.parseBoolean(metadata.get(MetadataKeys.OCR_APPLIED).toString()));
+ }
+ else {
+ assertFalse(expectedStatus);
+ }
+ }
+
+
+ public void testContentMatch(final TikaProcessingResult result, final String docPathPrefix) throws Exception {
+ // read the ground-truth document
+ final String sourceText = getDocumentText(docPathPrefix + ".txt");
+
+ // test status and content
+ assertTrue(result.getText().length() > 0);
+ assertContentMatches(sourceText, result.getText());
+ }
+}
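
To illustrate the normalization performed by assertContentMatches: strings that differ only in
punctuation and whitespace compare as equal, while any difference in the remaining characters
(including case) still fails:

    DocumentTestUtils utils = new DocumentTestUtils();

    // passes: both sides reduce to "ThepatientsnameisBartDavidson"
    utils.assertContentMatches("The patient's name is Bart Davidson.",
            "The patients name is Bart Davidson");

    // would fail: the comparison after stripping is case-sensitive
    // utils.assertContentMatches("bart davidson", "Bart Davidson");
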
diff --git a/src/test/java/tika/LegacyTikaProcessorTests.java b/src/test/java/tika/LegacyTikaProcessorTests.java
new file mode 100644
index 0000000..0b9d0d5
--- /dev/null
+++ b/src/test/java/tika/LegacyTikaProcessorTests.java
@@ -0,0 +1,40 @@
+package tika;
+
+import org.junit.*;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.annotation.DirtiesContext;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.legacy.LegacyTikaProcessor;
+import tika.processor.AbstractTikaProcessor;
+
+
+/**
+ * Implements the tests using LegacyTikaProcessor as the document processor.
+ */
+@SpringBootTest(classes = LegacyTikaProcessor.class)
+@RunWith(SpringRunner.class)
+@DirtiesContext
+@ContextConfiguration(classes = {LegacyPdfProcessorConfig.class})
+public class LegacyTikaProcessorTests extends DocumentProcessorTests {
+
+ @Autowired
+ LegacyPdfProcessorConfig defaultConfig;
+
+ @Autowired
+ LegacyTikaProcessor processor;
+
+ @Override
+ protected AbstractTikaProcessor getProcessor() {
+ return processor;
+ }
+
+ @After
+ public void reset() throws Exception {
+ processor.reset();
+ }
+}
+
diff --git a/src/test/resources/application.yaml b/src/test/resources/application.yaml
new file mode 100644
index 0000000..907c149
--- /dev/null
+++ b/src/test/resources/application.yaml
@@ -0,0 +1,47 @@
+# application configuration
+#
+application:
+ version: 0.1.0
+
+
+# general spring boot configuration
+#
+server:
+ port: 8090
+
+spring:
+ servlet:
+ multipart.max-file-size: 100MB
+ multipart.max-request-size: 100MB
+
+
+# tika configuration
+#
+tika:
+ parsers:
+ tesseract-ocr:
+ language: eng
+ timeout: 300
+ enable-image-processing: false
+ apply-rotation: false
+
+ pdf-ocr-parser:
+ ocr-only-strategy: true
+ min-doc-text-length: 100
+ min-doc-byte-size: 10000
+ use-legacy-ocr-parser-for-single-page-doc: false
+
+ legacy-pdf-parser:
+ image-magick:
+ timeout: 300
+ tesseract-ocr:
+ timeout: 300
+ min-doc-text-length: 100
+
+
+# documents processing configuration
+#
+processing:
+ use-legacy-tika-processor-as-default: true
+ fail-on-empty-files: false
+ fail-on-non-document-types: true
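
The processing.* keys above are surfaced through TikaServiceConfig (queried in
ServiceControllerDocumentTests via isFailOnEmptyFiles()). The class itself is not included in
this patch; a minimal sketch of how such a property could bind, assuming standard Spring
@Value injection:

    import org.springframework.beans.factory.annotation.Value;
    import org.springframework.context.annotation.Configuration;

    // Sketch only: the real TikaServiceConfig is not shown in this patch,
    // so the @Value-based binding below is an assumption.
    @Configuration
    public class TikaServiceConfig {

        @Value("${processing.fail-on-empty-files:false}")
        private boolean failOnEmptyFiles;

        public boolean isFailOnEmptyFiles() {
            return failOnEmptyFiles;
        }
    }
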
diff --git a/src/test/resources/tika/config/legacy-parser-config.xml b/src/test/resources/tika/config/legacy-parser-config.xml
new file mode 100644
index 0000000..c414d03
--- /dev/null
+++ b/src/test/resources/tika/config/legacy-parser-config.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- NOTE: the XML tags of this file were lost during extraction; the structure below
+     is a reconstruction and the parser class names are assumptions, not verified -->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <mime-exclude>application/pdf</mime-exclude>
+        </parser>
+        <parser class="tika.legacy.LegacyPdfProcessorParser">
+            <mime>application/pdf</mime>
+        </parser>
+    </parsers>
+</properties>
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.doc b/src/test/resources/tika/docs/generic/pat_id_1.doc
new file mode 100644
index 0000000..1fe2ca2
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.doc differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.docx b/src/test/resources/tika/docs/generic/pat_id_1.docx
new file mode 100644
index 0000000..e9700ce
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.docx differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.odt b/src/test/resources/tika/docs/generic/pat_id_1.odt
new file mode 100644
index 0000000..90db7d9
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.odt differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.pdf b/src/test/resources/tika/docs/generic/pat_id_1.pdf
new file mode 100644
index 0000000..5b42732
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.pdf differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.png b/src/test/resources/tika/docs/generic/pat_id_1.png
new file mode 100644
index 0000000..fb8d321
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.png differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.rtf b/src/test/resources/tika/docs/generic/pat_id_1.rtf
new file mode 100644
index 0000000..020514a
--- /dev/null
+++ b/src/test/resources/tika/docs/generic/pat_id_1.rtf
@@ -0,0 +1,52 @@
+{\rtf1\ansi\deff3\adeflang1025
+{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fswiss\fprq2\fcharset0 Calibri;}{\f6\fnil\fprq2\fcharset0 PingFang SC;}{\f7\fnil\fprq2\fcharset0 Arial Unicode MS;}{\f8\fswiss\fprq0\fcharset128 Arial Unicode MS;}}
+{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
+{\stylesheet{\s0\snext0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057 Normal;}
+{\*\cs15\snext15 Default Paragraph Font;}
+{\s16\sbasedon0\snext17\ql\sl256\slmult1\widctlpar\sb240\sa120\keepn\ltrpar\cf0\dbch\af6\dbch\af7\afs28\alang1025\loch\f4\fs28\lang2057 Heading;}
+{\s17\sbasedon0\snext17\ql\sl276\slmult1\widctlpar\sb0\sa140\ltrpar\cf0\dbch\af5\dbch\af0\afs22\alang1025\loch\f5\fs22\lang2057 Text Body;}
+{\s18\sbasedon17\snext18\ql\sl276\slmult1\widctlpar\sb0\sa140\ltrpar\cf0\dbch\af5\dbch\af8\afs22\alang1025\loch\f5\fs22\lang2057 List;}
+{\s19\sbasedon0\snext19\ql\sl256\slmult1\widctlpar\sb120\sa120\noline\ltrpar\cf0\i\dbch\af5\dbch\af8\afs24\alang1025\ai\loch\f5\fs24\lang2057 Caption;}
+{\s20\sbasedon0\snext20\ql\sl256\slmult1\widctlpar\sb0\sa160\noline\ltrpar\cf0\dbch\af5\dbch\af8\afs22\alang1025\loch\f5\fs22\lang2057 Index;}
+}{\*\generator LibreOffice/6.1.0.3$MacOSX_X86_64 LibreOffice_project/efb621ed25068d70781dc026f7e9c5187a4decd1}{\info{\author Rich}{\creatim\yr2015\mo11\dy2\hr16\min52}{\author Rich}{\revtim\yr2015\mo11\dy2\hr16\min59}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab720
+\viewscale100
+{\*\pgdsctbl
+{\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1440\margrsxn1440\margtsxn1440\margbsxn1440\pgdscnxt0 Default Style;}}
+\formshade{\*\pgdscno0}\paperh16838\paperw11906\margl1440\margr1440\margt1440\margb1440\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1440\margrsxn1440\margtsxn1440\margbsxn1440\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc\htmautsp
+{\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+This is an example of a clinical document}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+The patient\u8217\'92s name is Bart Davidson.}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+His carer\u8217\'92s Name Paul Wayne.}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+His telephone number is 07754828992}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+His Address is 61 Basildon Way, }
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+East Croyhurst, }
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+Angelton, }
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+AL64 9HT}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+His mother\u8217\'92s name is Pauline Smith.}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+He is on 100mg Paracetamol, 20 milligrams clozapine}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\rtlch \ltrch\loch
+
+\par }
\ No newline at end of file
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.txt b/src/test/resources/tika/docs/generic/pat_id_1.txt
new file mode 100644
index 0000000..5d9c770
--- /dev/null
+++ b/src/test/resources/tika/docs/generic/pat_id_1.txt
@@ -0,0 +1,17 @@
+This is an example of a clinical document
+
+The patient’s name is Bart Davidson.
+His carer’s Name Paul Wayne.
+
+His telephone number is 07754828992
+
+His Address is 61 Basildon Way,
+East Croyhurst,
+Angelton,
+AL64 9HT
+
+His mother’s name is Pauline Smith.
+
+He is on 100mg Paracetamol, 20 milligrams clozapine
+
+
diff --git a/src/test/resources/tika/docs/invalid/pdf_empty.pdf b/src/test/resources/tika/docs/invalid/pdf_empty.pdf
new file mode 100644
index 0000000..e69de29
diff --git a/src/test/resources/tika/docs/invalid/tiff_multipage_spp2.tiff.zip b/src/test/resources/tika/docs/invalid/tiff_multipage_spp2.tiff.zip
new file mode 100644
index 0000000..f0627de
Binary files /dev/null and b/src/test/resources/tika/docs/invalid/tiff_multipage_spp2.tiff.zip differ
diff --git a/src/test/resources/tika/docs/invalid/word_enc_noerror.docx b/src/test/resources/tika/docs/invalid/word_enc_noerror.docx
new file mode 100644
index 0000000..2be820e
Binary files /dev/null and b/src/test/resources/tika/docs/invalid/word_enc_noerror.docx differ
diff --git a/src/test/resources/tika/docs/pdf/ex1.pdf b/src/test/resources/tika/docs/pdf/ex1.pdf
new file mode 100644
index 0000000..ab5db00
Binary files /dev/null and b/src/test/resources/tika/docs/pdf/ex1.pdf differ
diff --git a/src/test/resources/tika/docs/pdf/ex1_enc.pdf b/src/test/resources/tika/docs/pdf/ex1_enc.pdf
new file mode 100644
index 0000000..5e3b5d0
Binary files /dev/null and b/src/test/resources/tika/docs/pdf/ex1_enc.pdf differ
diff --git a/src/test/resources/tika/docs/pdf/ex2_ocr.pdf b/src/test/resources/tika/docs/pdf/ex2_ocr.pdf
new file mode 100644
index 0000000..9fd8321
Binary files /dev/null and b/src/test/resources/tika/docs/pdf/ex2_ocr.pdf differ
diff --git a/travis_gradle_build.sh b/travis_gradle_build.sh
new file mode 100644
index 0000000..10d6457
--- /dev/null
+++ b/travis_gradle_build.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+# Abort on error, uninitialized variables, and pipe errors
+set -eEu
+set -o pipefail
+#set -v
+
+export PING_SLEEP=30s
+export WORKDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+export BUILD_OUTPUT=$WORKDIR/build.out
+export TEST_PROC_LOG_OUTPUT=$WORKDIR/test-proc.out
+export TEST_API_LOG_OUTPUT=$WORKDIR/test-api.out
+
+# dump the last N lines of the output files
+DUMP_LINES_BUILD=2000
+DUMP_LINES_TEST_PROC=5000
+DUMP_LINES_TEST_API=2000
+
+touch $BUILD_OUTPUT
+touch $TEST_PROC_LOG_OUTPUT
+touch $TEST_API_LOG_OUTPUT
+
+
+# Helper functions
+#
+print_log_separator() {
+ echo "----------------------------------------------------------------"
+ echo "-"
+ echo "-"
+ echo "-"
+ echo "-"
+ echo "----------------------------------------------------------------"
+}
+
+dump_output() {
+ if [ "$2" -eq "-1" ]; then
+ echo "Printing all the output: $1"
+ cat "$1"
+ else
+ echo "Tailing the last $2 lines of build output: $1"
+ tail -n "$2" "$1"
+ fi
+}
+
+print_logs() {
+ print_log_separator
+ dump_output $BUILD_OUTPUT $DUMP_LINES_BUILD
+
+ print_log_separator
+ dump_output $TEST_PROC_LOG_OUTPUT $DUMP_LINES_TEST_PROC
+
+ print_log_separator
+ dump_output $TEST_API_LOG_OUTPUT $DUMP_LINES_TEST_API
+}
+
+run_build() {
+ #./gradlew build --full-stacktrace --debug 2>&1 | tee >(grep TestEventLogger | grep -P -n "[[:ascii:]]" >> $TEST_LOG_OUTPUT) | grep -P -n "[[:ascii:]]" >> $BUILD_OUTPUT
+ # note the redirection order: '>> file 2>&1' captures both stdout and stderr in the file
+ ./gradlew assemble --full-stacktrace >> $BUILD_OUTPUT 2>&1
+}
+
+run_tests() {
+ # enable debug output here to spot the errors
+ ./gradlew test --full-stacktrace --debug --tests=tika.LegacyTikaProcessorTests 2>&1 | grep TestEventLogger >> $TEST_PROC_LOG_OUTPUT
+ ./gradlew test --full-stacktrace --debug --tests=tika.CompositeTikaProcessorTests 2>&1 | grep TestEventLogger >> $TEST_PROC_LOG_OUTPUT
+ # disable debug here, the output is too verbose
+ ./gradlew test --full-stacktrace --tests=ServiceControllerTests >> $TEST_API_LOG_OUTPUT 2>&1
+ ./gradlew test --full-stacktrace --tests=ServiceControllerDocumentMultipartFileTests >> $TEST_API_LOG_OUTPUT 2>&1
+ ./gradlew test --full-stacktrace --tests=ServiceControllerDocumentStreamTests >> $TEST_API_LOG_OUTPUT 2>&1
+}
+
+error_handler() {
+ echo "ERROR: An error was encountered with the build."
+ print_logs
+ exit 1
+}
+
+
+# The Main
+#
+
+# If an error occurs, run our error handler to output a tail of the build
+trap 'error_handler' ERR SIGPIPE
+
+# Set up a repeating loop to send some output to Travis (so that quiet builds are not killed as inactive)
+bash -c "while true; do echo \$(date) - building ...; sleep $PING_SLEEP; done" &
+PING_LOOP_PID=$!
+
+# Build Commands
+run_build
+run_tests
+
+# 'nicely' terminate the ping output loop
+kill $PING_LOOP_PID
+
+# Print the logs
+echo SUCCESS
+print_logs