+/*
+ * Copyright 2016 King's College London, Richard Jackson.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package tika.legacy;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.Serializable;
+import java.util.Locale;
+import java.util.Properties;
+
+/**
+ * Configuration for the ImageMagick PDF-to-TIFF conversion step used by LegacyPdfProcessorParser
+ * (the field names mirror Tika's TesseractOCRConfig, from which this class was derived).
+ *
+ * This allows enabling the parser and setting its parameters:
+ *
+ * ImageMagickConfig config = new ImageMagickConfig();
+ * config.setImageMagickPath(imageMagickFolder);
+ * parseContext.set(ImageMagickConfig.class, config);
+ *
+ * Parameters can also be set by editing the existing ImageMagickConfig.properties file in the
+ * tika/legacy resources package, or overriding it by creating your own copy
+ * and placing it in the package tika/legacy on the classpath.
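+ *
+ * An illustrative ImageMagickConfig.properties (the keys map to the setters below; the
+ * values shown are this class's built-in defaults, so treat it as a sketch):
+ *
+ * imageMagickPath=
+ * density=300
+ * depth=8
+ * quality=1
+ * timeout=120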
+ *
+ */
+public class ImageMagickConfig implements Serializable {
+
+ private static final long serialVersionUID = -4861942486845757891L;
+
+ // Path to tesseract installation folder, if not on system path.
+ private String tesseractPath = "";
+
+ // Language dictionary to be used.
+ private String language = "eng";
+
+ // Tesseract page segmentation mode.
+ private String pageSegMode = "1";
+
+ // Minimum file size to submit file to ocr.
+ private int minFileSizeToOcr = 0;
+
+ // Maximum file size to submit file to ocr.
+ private int maxFileSizeToOcr = Integer.MAX_VALUE;
+
+ // Maximum time (seconds) to wait for the OCR process to terminate
+ private int timeout = 120;
+ private String imageMagickPath = "";
+ private String density = "300";
+ private String depth = "8";
+ private String quality = "1";
+ private int maxTiffSize = Integer.MAX_VALUE;
+ private int minTiffSize = 0;
+
+
+ /**
+ * Default constructor.
+ */
+ public ImageMagickConfig() {
+ init(this.getClass().getResourceAsStream("ImageMagickConfig.properties"));
+ }
+
+ /**
+ * Loads properties from InputStream and then tries to close InputStream.
+ * If there is an IOException, this silently swallows the exception
+ * and goes back to the default.
+ *
+ * @param is the stream to load the properties from
+ */
+ public ImageMagickConfig(InputStream is) {
+ init(is);
+ }
+
+ private void init(InputStream is) {
+ if (is == null) {
+ return;
+ }
+ Properties props = new Properties();
+ try {
+ props.load(is);
+ } catch (IOException ignored) {
+ } finally {
+ if (is != null) {
+ try {
+ is.close();
+ } catch (IOException e) {
+ //swallow
+ }
+ }
+ }
+ setTesseractPath(
+ getProp(props, "tesseractPath", getTesseractPath()));
+ setLanguage(
+ getProp(props, "language", getLanguage()));
+ setPageSegMode(
+ getProp(props, "pageSegMode", getPageSegMode()));
+ setMinFileSizeToOcr(
+ getProp(props, "minFileSizeToOcr", getMinFileSizeToOcr()));
+ setMaxFileSizeToOcr(
+ getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr()));
+ setImageMagickPath(
+ getProp(props, "imageMagickPath", getImageMagickPath()));
+
+ setTimeout(
+ getProp(props, "timeout", getTimeout()));
+ setDensity(
+ getProp(props, "density", getDensity()));
+ setQuality(
+ getProp(props, "quality", getQuality()));
+ setDepth(
+ getProp(props, "depth", getDepth()));
+ setMinTiffSize(
+ getProp(props, "minTiffSize", getMinTiffSize()));
+ setMaxTiffSize(
+ getProp(props, "maxTiffSize", getMaxTiffSize()));
+ }
+
+ /** @see #setTesseractPath(String tesseractPath)*/
+ public String getTesseractPath() {
+ return tesseractPath;
+ }
+
+ /**
+ * Set tesseract installation folder, needed if it is not on system path.
+ */
+ public void setTesseractPath(String tesseractPath) {
+ if (!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator))
+ tesseractPath += File.separator;
+
+ this.tesseractPath = tesseractPath;
+ }
+
+ /** @see #setLanguage(String language)*/
+ public String getLanguage() {
+ return language;
+ }
+
+ /**
+ * Set tesseract language dictionary to be used. Default is "eng".
+ * Multiple languages may be specified, separated by plus characters (e.g. "eng+fra").
+ */
+ public void setLanguage(String language) {
+ if (!language.matches("([A-Za-z](\\+?))*")) {
+ throw new IllegalArgumentException("Invalid language code");
+ }
+ this.language = language;
+ }
+
+ /** @see #setPageSegMode(String pageSegMode)*/
+ public String getPageSegMode() {
+ return pageSegMode;
+ }
+
+ /**
+ * Set tesseract page segmentation mode.
+ * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
+ */
+ public void setPageSegMode(String pageSegMode) {
+ if (!pageSegMode.matches("[1-9]|10")) {
+ throw new IllegalArgumentException("Invalid language code");
+ }
+ this.pageSegMode = pageSegMode;
+ }
+
+ /** @see #setMinFileSizeToOcr(int minFileSizeToOcr)*/
+ public int getMinFileSizeToOcr() {
+ return minFileSizeToOcr;
+ }
+
+ /**
+ * Set minimum file size to submit file to ocr.
+ * Default is 0.
+ */
+ public void setMinFileSizeToOcr(int minFileSizeToOcr) {
+ this.minFileSizeToOcr = minFileSizeToOcr;
+ }
+
+ /** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)*/
+ public int getMaxFileSizeToOcr() {
+ return maxFileSizeToOcr;
+ }
+
+ /**
+ * Set maximum file size to submit file to ocr.
+ * Default is Integer.MAX_VALUE.
+ */
+ public void setMaxFileSizeToOcr(int maxFileSizeToOcr) {
+ this.maxFileSizeToOcr = maxFileSizeToOcr;
+ }
+
+ /**
+ * Set maximum time (seconds) to wait for the OCR process to terminate.
+ * Default value is 120s.
+ */
+ public void setTimeout(int timeout) {
+ this.timeout = timeout;
+ }
+
+ /** @see #setTimeout(int timeout)*/
+ public int getTimeout() {
+ return timeout;
+ }
+
+ /**
+ * Get an int property from the Properties object passed in.
+ * @param properties the Properties object to read from.
+ * @param property the property to fetch.
+ * @param defaultMissing default value to use when the property is missing.
+ * @return the value.
+ */
+ private int getProp(Properties properties, String property, int defaultMissing) {
+ String p = properties.getProperty(property);
+ if (p == null || p.isEmpty()){
+ return defaultMissing;
+ }
+ try {
+ return Integer.parseInt(p);
+ } catch (NumberFormatException ex) {
+ throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse ImageMagickConfig variable %s, invalid integer value",
+ property), ex);
+ }
+ }
+
+ /**
+ * Get a String property from the Properties object passed in.
+ * @param properties the Properties object to read from.
+ * @param property the property to fetch.
+ * @param defaultMissing default value to use when the property is missing.
+ * @return the value.
+ */
+ private String getProp(Properties properties, String property, String defaultMissing) {
+ return properties.getProperty(property, defaultMissing);
+ }
+
+ public String getImageMagickPath() {
+ return imageMagickPath;
+ }
+
+ public String getDensity() {
+ return density;
+ }
+
+ public String getDepth() {
+ return depth;
+ }
+
+ public String getQuality() {
+ return quality;
+ }
+
+ public int getMaxTiffSize() {
+ return maxTiffSize;
+ }
+
+ public void setMaxTiffSize(int maxTiffSize) {
+ this.maxTiffSize = maxTiffSize;
+ }
+
+ public void setImageMagickPath(String imageMagickPath) {
+ this.imageMagickPath = imageMagickPath;
+ }
+
+ public void setDensity(String density) {
+ this.density = density;
+ }
+
+ public void setDepth(String depth) {
+ this.depth = depth;
+ }
+
+ public void setQuality(String quality) {
+ this.quality = quality;
+ }
+
+ public int getMinTiffSize() {
+ return minTiffSize;
+ }
+
+ public void setMinTiffSize(int minTiffSize) {
+ this.minTiffSize = minTiffSize;
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/tika/legacy/LegacyPdfProcessorConfig.java b/src/main/java/tika/legacy/LegacyPdfProcessorConfig.java
new file mode 100644
index 0000000..6c316f6
--- /dev/null
+++ b/src/main/java/tika/legacy/LegacyPdfProcessorConfig.java
@@ -0,0 +1,51 @@
+package tika.legacy;
+
+import com.fasterxml.jackson.annotation.JsonIgnore;
+import com.fasterxml.jackson.annotation.JsonView;
+import lombok.Data;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.exception.TikaException;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Configuration;
+import org.xml.sax.SAXException;
+import common.JsonPropertyAccessView;
+import javax.annotation.PostConstruct;
+import java.io.IOException;
+
+
+/**
+ * The legacy PDF processor configuration, as used in CogStack-Pipeline
+ * with some minor additions.
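+ *
+ * The values are bound from application.yaml, e.g. (defaults shown):
+ *
+ * tika.parsers.legacy-pdf-parser:
+ *   image-magick.timeout: 120
+ *   tesseract-ocr.timeout: 120
+ *   min-doc-text-length: 100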
+ */
+@Data
+@Configuration
+public class LegacyPdfProcessorConfig {
+
+ @JsonIgnore
+ private TikaConfig tikaConfig;
+
+ // the timeout value (s) when performing PDF->TIFF conversion of the documents
+ // the default value in Tika is 120s, but this may be too short for multi-page documents
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.legacy-pdf-parser.image-magick.timeout:120}")
+ private int conversionTimeout;
+
+ // the timeout value (s) when performing OCR over the documents
+ // the default value in Tika is 120s, but this may be too short for multi-page documents
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.legacy-pdf-parser.tesseract-ocr.timeout:120}")
+ private int ocrTimeout;
+
+ // apply OCR only when the text extracted from a previously parsed document (without OCR)
+ // was shorter than N characters
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.legacy-pdf-parser.min-doc-text-length:100}")
+ private int pdfMinDocTextLength;
+
+
+ @PostConstruct
+ public void init() throws IOException, SAXException, TikaException {
+ tikaConfig = new TikaConfig(this.getClass().getClassLoader()
+ .getResourceAsStream("tika-config/legacy-parser-config.xml"));
+ }
+}
diff --git a/src/main/java/tika/legacy/LegacyPdfProcessorParser.java b/src/main/java/tika/legacy/LegacyPdfProcessorParser.java
new file mode 100644
index 0000000..58ed0a2
--- /dev/null
+++ b/src/main/java/tika/legacy/LegacyPdfProcessorParser.java
@@ -0,0 +1,256 @@
+/*
+ * Copyright 2016 King's College London, Richard Jackson.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package tika.legacy;
+
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.logging.LogFactory;
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.external.ExternalParser;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import java.io.*;
+import java.util.*;
+import java.util.concurrent.*;
+
+
+public class LegacyPdfProcessorParser extends AbstractParser {
+
+ private static final long serialVersionUID = -8167538283213097265L;
+ private static Map<String, Boolean> IMAGEMAGICK_PRESENT = new HashMap<>();
+ private static final ImageMagickConfig DEFAULT_IMAGEMAGICK_CONFIG = new ImageMagickConfig();
+
+ private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
+ new HashSet<>(Arrays.asList(new MediaType[]{
+ MediaType.application("pdf")
+ })));
+ private static final Logger LOG = LoggerFactory.getLogger(LegacyPdfProcessorParser.class);
+
+
+ @Override
+ public Set<MediaType> getSupportedTypes(ParseContext context) {
+ // If ImageMagick is installed, offer our supported image types
+ ImageMagickConfig imconfig = context.get(ImageMagickConfig.class, DEFAULT_IMAGEMAGICK_CONFIG);
+ if (hasImageMagick(imconfig)) {
+ return SUPPORTED_TYPES;
+ }
+
+ // Otherwise don't advertise anything, so the other parsers
+ // can be selected instead
+ return Collections.emptySet();
+ }
+
+ private boolean hasImageMagick(ImageMagickConfig config) {
+ // Fetch where the config says to find ImageMagick
+ String imageMagick = config.getImageMagickPath() + getImageMagickProg();
+
+ // Have we already checked for a copy of ImageMagick there?
+ if (IMAGEMAGICK_PRESENT.containsKey(imageMagick)) {
+ return IMAGEMAGICK_PRESENT.get(imageMagick);
+ }
+
+ // Try running ImageMagick from there, and see if it exists + works
+ String[] checkCmd = {imageMagick};
+ try {
+ boolean hasImageMagick = ExternalParser.check(checkCmd);
+ IMAGEMAGICK_PRESENT.put(imageMagick, hasImageMagick);
+ return hasImageMagick;
+ } catch (NoClassDefFoundError e) {
+ // This happens under OSGi + Fork Parser - see TIKA-1507
+ // As a workaround for now, just say we can't use OCR
+ // TODO Resolve it so we don't need this try/catch block
+ IMAGEMAGICK_PRESENT.put(imageMagick, false);
+ return false;
+ }
+ }
+
+ @Override
+ public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+ throws IOException, SAXException, TikaException {
+ ImageMagickConfig config = context.get(ImageMagickConfig.class, DEFAULT_IMAGEMAGICK_CONFIG);
+
+ // If ImageMagick is not on the path with the current config, do not try to run OCR
+ // getSupportedTypes shouldn't have listed us as handling it, so this should only
+ // occur if someone directly calls this parser, not via DefaultParser or similar
+// TemporaryResources tmp = new TemporaryResources();
+ //TikaInputStream pdfStream = TikaInputStream.get(stream);
+ PDFParser pdfParser = new PDFParser();
+
+ //create temp handlers to investigate object
+ BodyContentHandler body = new BodyContentHandler();
+ Metadata pdfMetadata = new Metadata();
+
+ //needed to reset stream
+ if (stream.markSupported()) {
+ stream.mark(Integer.MAX_VALUE);
+ }
+
+ //first do an initial parse to see if there's substantial content in the pdf already
+ pdfParser.parse(stream, body, pdfMetadata, context);
+ stream.reset();
+ //if there's enough content, re-parse with the official handlers/metadata; also check that ImageMagick is available
+
+ LegacyPdfProcessorConfig generalConfig = context.get(LegacyPdfProcessorConfig.class);
+
+ if (body.toString().length() > generalConfig.getPdfMinDocTextLength() || !hasImageMagick(config)) {
+ pdfParser.parse(stream, handler, metadata, context);
+ //metadata.set("X-PDFPREPROC-OCR-APPLIED", "NA");
+ return;
+ }
+
+ //metadata.set("X-PDFPREPROC-ORIGINAL", body.toString());
+ // "FAIL" will be overwritten if it succeeds later
+
+ //add the PDF metadata to the official metadata object
+ Arrays.asList(pdfMetadata.names()).forEach(name -> {
+ metadata.add(name, pdfMetadata.get(name));
+ });
+
+ //objects to hold file references for manipulation outside of Java
+ File tiffFileOfPDF = null;
+ File pdfFileFromStream = File.createTempFile("tempPDF", ".pdf");
+ try {
+ FileUtils.copyInputStreamToFile(stream, pdfFileFromStream);
+ tiffFileOfPDF = File.createTempFile("tempTIFF", ".tiff");
+ makeTiffFromPDF(pdfFileFromStream,tiffFileOfPDF, config);
+ if (tiffFileOfPDF.exists()) {
+ long tessStartTime = System.currentTimeMillis();
+ TesseractOCRParser tesseract = new TesseractOCRParser();
+
+ tesseract.parse(FileUtils.openInputStream(tiffFileOfPDF), handler, metadata, context);
+
+ //metadata.set("X-OCR-Applied", "true");
+ metadata.add("X-Parsed-By", TesseractOCRParser.class.getName());
+
+ LOG.debug("Document parsing -- OCR processing time: {} ms", System.currentTimeMillis() - tessStartTime);
+ }
+ } catch (Exception e) {
+ LOG.warn("Error while running OCR over the document");
+ throw e;
+ }
+ finally {
+ if (tiffFileOfPDF != null && tiffFileOfPDF.exists()) {
+ tiffFileOfPDF.delete();
+ }
+ if (pdfFileFromStream.exists()) {
+ pdfFileFromStream.delete();
+ }
+ }
+ }
+
+ static String getImageMagickProg() {
+ return System.getProperty("os.name").startsWith("Windows") ? "convert.exe" : "convert";
+ }
+
+ private File makeTiffFromPDF(File input, File output, ImageMagickConfig config) throws IOException, TikaException {
+ String[] cmd = {config.getImageMagickPath() + getImageMagickProg(),
+ "-density", config.getDensity(), input.getPath(),
+ "-depth", config.getDepth(),
+ "-quality", config.getQuality(),
+ "-background", "white", "+matte",
+ output.getPath()};
+
+ ProcessBuilder pb = new ProcessBuilder(cmd);
+ //setEnv(config, pb);
+ final Process process = pb.start();
+
+ process.getOutputStream().close();
+ InputStream out = process.getInputStream();
+ InputStream err = process.getErrorStream();
+
+ logStream("ImageMagick-stdout", out, input);
+ logStream("ImageMagick-stderr", err, input);
+
+ FutureTask<Integer> waitTask = new FutureTask<>(new Callable<Integer>() {
+ public Integer call() throws Exception {
+ return process.waitFor();
+ }
+ });
+
+ Thread waitThread = new Thread(waitTask);
+ waitThread.start();
+
+ try {
+ waitTask.get(config.getTimeout(), TimeUnit.SECONDS);
+ return output;
+ } catch (InterruptedException e) {
+ waitThread.interrupt();
+ process.destroy();
+ Thread.currentThread().interrupt();
+ throw new TikaException("ImageMagick-OCR-PDFParser: interrupted", e);
+
+ } catch (ExecutionException e) {
+ // should not be thrown
+
+ } catch (TimeoutException e) {
+ waitThread.interrupt();
+ process.destroy();
+ throw new TikaException("ImageMagick-OCR-PDFParser: timeout", e);
+ }
+ return null;
+ }
+
+ /**
+ * Starts a thread that reads the contents of the standard output or error
+ * stream of the given process to not block the process. The stream is
+ * closed once fully processed.
+ */
+ private void logStream(final String logType, final InputStream stream, final File file) {
+ new Thread() {
+ public void run() {
+ Reader reader = new InputStreamReader(stream);
+ StringBuilder out = new StringBuilder();
+ char[] buffer = new char[1024];
+ try {
+ for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+ out.append(buffer, 0, n);
+ }
+ } catch (IOException e) {
+ // swallow - this thread only drains the stream for logging
+ } finally {
+ IOUtils.closeQuietly(stream);
+ }
+
+ String msg = out.toString();
+ LogFactory.getLog(LegacyPdfProcessorParser.class).debug(msg);
+ }
+ }.start();
+ }
+}
\ No newline at end of file
diff --git a/src/main/java/tika/legacy/LegacyTikaProcessor.java b/src/main/java/tika/legacy/LegacyTikaProcessor.java
new file mode 100644
index 0000000..5346e82
--- /dev/null
+++ b/src/main/java/tika/legacy/LegacyTikaProcessor.java
@@ -0,0 +1,108 @@
+package tika.legacy;
+
+import java.io.ByteArrayOutputStream;
+import java.time.OffsetDateTime;
+import java.util.*;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+import tika.model.TikaProcessingResult;
+import tika.processor.AbstractTikaProcessor;
+import javax.annotation.PostConstruct;
+
+
+/**
+ * The "legacy" Tika processor, using parser from CogStack-Pipeline
+ * to provide compatibility with the migration of the pipeline.
+ *
+ * Processes PDF documents by running manually:
+ * - 1x ImageMagick - to create one large temporary TIFF image
+ * - 1x Tesseract - to extract the text from the TIFF
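+ *
+ * Conceptually this corresponds to the following external commands (file names are
+ * illustrative; the convert flags mirror LegacyPdfProcessorParser.makeTiffFromPDF):
+ *   convert -density 300 input.pdf -depth 8 -quality 1 -background white +matte output.tiff
+ *   tesseract output.tiff output -l eng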
+ */
+@Component("legacyTikaProcessor")
+public class LegacyTikaProcessor extends AbstractTikaProcessor {
+
+ @Autowired
+ private LegacyPdfProcessorConfig config;
+
+ /**
+ * Document-type based automatic detection of the parser to be used by Tika
+ */
+ private AutoDetectParser defaultParser;
+ private ParseContext defaultParseContext;
+
+ private Logger log = LoggerFactory.getLogger(LegacyTikaProcessor.class);
+
+
+ /**
+ * Initializes the processor using provided (autowired) configuration
+ */
+ @PostConstruct
+ @Override
+ public void init() throws Exception {
+ defaultParseContext = new ParseContext();
+ defaultParseContext.set(TikaConfig.class, config.getTikaConfig());
+ defaultParseContext.set(LegacyPdfProcessorConfig.class, config);
+
+ TesseractOCRConfig tessConfig = new TesseractOCRConfig();
+ tessConfig.setTimeout(config.getOcrTimeout());
+ defaultParseContext.set(TesseractOCRConfig.class, tessConfig);
+
+ ImageMagickConfig imgConfig = new ImageMagickConfig();
+ imgConfig.setTimeout(config.getConversionTimeout());
+ defaultParseContext.set(ImageMagickConfig.class, imgConfig);
+
+ defaultParser = new AutoDetectParser(config.getTikaConfig());
+ }
+
+ /**
+ * Resets the component with any intermediate data used
+ */
+ @Override
+ public void reset() throws Exception {
+ // actually, we only need to re-initialize all the resources apart from the configuration
+ init();
+ }
+
+ /**
+ * Processes the input stream returning the extracted text
+ */
+ protected TikaProcessingResult processStream(TikaInputStream stream) {
+ TikaProcessingResult result;
+
+ try {
+ ByteArrayOutputStream outStream = new ByteArrayOutputStream(64 * 1024);
+ BodyContentHandler handler = new BodyContentHandler(outStream);
+ Metadata metadata = new Metadata();
+
+ defaultParser.parse(stream, handler, metadata, defaultParseContext);
+
+ // parse the metadata and store the result
+ Map<String, Object> resultMetadata = extractMetadata(metadata);
+ result = TikaProcessingResult.builder()
+ .text(outStream.toString())
+ .metadata(resultMetadata)
+ .success(true)
+ .timestamp(OffsetDateTime.now())
+ .build();
+ }
+ catch (Exception e) {
+ log.error(e.getMessage());
+
+ result = TikaProcessingResult.builder()
+ .error("Exception caught while processing the document: " + e.getMessage())
+ .success(false)
+ .build();
+ }
+
+ return result;
+ }
+}
diff --git a/src/main/java/tika/model/MetadataKeys.java b/src/main/java/tika/model/MetadataKeys.java
new file mode 100644
index 0000000..8c607e7
--- /dev/null
+++ b/src/main/java/tika/model/MetadataKeys.java
@@ -0,0 +1,15 @@
+package tika.model;
+
+/**
+ * Metadata keys that are to be used to extract relevant information
+ * from the document alongside the text.
+ * Note that some of these keys may not be available, depending on the document type.
+ */
+public class MetadataKeys {
+ public final static String CONTENT_TYPE = "Content-Type";
+ public final static String CREATION_DATE = "Creation-Date";
+ public final static String LAST_MODIFIED = "Last-Modified";
+ public final static String OCR_APPLIED = "X-OCR-Applied";
+ public final static String PARSED_BY = "X-Parsed-By";
+ public final static String PAGE_COUNT = "Page-Count";
+}
diff --git a/src/main/java/tika/model/TikaBinaryDocument.java b/src/main/java/tika/model/TikaBinaryDocument.java
new file mode 100644
index 0000000..1c4bf2e
--- /dev/null
+++ b/src/main/java/tika/model/TikaBinaryDocument.java
@@ -0,0 +1,12 @@
+package tika.model;
+
+import lombok.Data;
+
+/**
+ * A simplified representation of a binary document
+ * that can be used as a request payload
+ */
+@Data
+public class TikaBinaryDocument {
+ byte[] content;
+}
diff --git a/src/main/java/tika/model/TikaPackageInformation.java b/src/main/java/tika/model/TikaPackageInformation.java
new file mode 100644
index 0000000..23367e2
--- /dev/null
+++ b/src/main/java/tika/model/TikaPackageInformation.java
@@ -0,0 +1,28 @@
+package tika.model;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonView;
+import common.JsonPropertyAccessView;
+import lombok.Data;
+import org.apache.tika.Tika;
+
+/**
+ * A helper class providing information about the implementation details of the Tika package in use
+ */
+@Data
+@JsonIgnoreProperties(value={"specification_version", "implementation_version"}, allowGetters=true)
+public class TikaPackageInformation {
+
+ @JsonProperty("specification_version")
+ @JsonView(JsonPropertyAccessView.Public.class)
+ String getTikaSpecificationVersion() {
+ return Tika.class.getPackage().getSpecificationVersion();
+ }
+
+ @JsonProperty("implementation_version")
+ @JsonView(JsonPropertyAccessView.Public.class)
+ final String getTikaImplementationVersion() {
+ return Tika.class.getPackage().getImplementationVersion();
+ }
+}
diff --git a/src/main/java/tika/model/TikaProcessingResult.java b/src/main/java/tika/model/TikaProcessingResult.java
new file mode 100644
index 0000000..a8802d2
--- /dev/null
+++ b/src/main/java/tika/model/TikaProcessingResult.java
@@ -0,0 +1,38 @@
+package tika.model;
+
+import com.fasterxml.jackson.annotation.JsonFormat;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.Builder;
+import lombok.Data;
+import org.springframework.format.annotation.DateTimeFormat;
+
+import java.time.OffsetDateTime;
+import java.util.Map;
+
+
+/**
+ * Tika processing result payload
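+ *
+ * An illustrative serialized form (all values are examples only):
+ * {
+ *   "text": "extracted document text",
+ *   "metadata": { "Content-Type": "application/pdf", "X-OCR-Applied": "true" },
+ *   "success": true,
+ *   "timestamp": "2020-01-01T12:00:00.000Z"
+ * }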
+ */
+@Data
+@Builder
+//@JsonAutoDetect(fieldVisibility = JsonAutoDetect.Visibility.ANY)
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class TikaProcessingResult {
+
+ // extracted text from the document
+ String text;
+
+ // document metadata
+ Map<String, Object> metadata;
+
+ // processing status
+ Boolean success;
+
+ // the error message in case processing failed
+ String error;
+
+ // when the document was processed
+ @DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
+ @JsonFormat(pattern = "yyyy-MM-dd'T'HH:mm:ss.SSSXXX")
+ OffsetDateTime timestamp;
+}
diff --git a/src/main/java/tika/processor/AbstractTikaProcessor.java b/src/main/java/tika/processor/AbstractTikaProcessor.java
new file mode 100644
index 0000000..fc93368
--- /dev/null
+++ b/src/main/java/tika/processor/AbstractTikaProcessor.java
@@ -0,0 +1,126 @@
+package tika.processor;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import tika.model.MetadataKeys;
+import tika.model.TikaBinaryDocument;
+import tika.model.TikaProcessingResult;
+
+
+/**
+ * An abstract class for a Tika Processor
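+ *
+ * A minimal subclass sketch (hypothetical class name, not part of the service):
+ *
+ * public class NoOpTikaProcessor extends AbstractTikaProcessor {
+ *     protected TikaProcessingResult processStream(TikaInputStream stream) {
+ *         return TikaProcessingResult.builder()
+ *                 .success(false).error("not implemented").build();
+ *     }
+ * }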
+ */
+public abstract class AbstractTikaProcessor {
+
+ /**
+ * The metadata keys that should be extracted by the processor
+ */
+ private static final String[] metaKeysSingleValue = {MetadataKeys.CONTENT_TYPE, MetadataKeys.CREATION_DATE,
+ MetadataKeys.LAST_MODIFIED, MetadataKeys.OCR_APPLIED};
+ private static final String[] metaKeysMultiValue = {MetadataKeys.PARSED_BY};
+
+
+ /**
+ * Processor lifecycle methods
+ */
+ public void init() throws Exception {}
+
+ public void reset() throws Exception {}
+
+
+ /**
+ * The main documents processing method
+ */
+ protected abstract TikaProcessingResult processStream(TikaInputStream stream);
+
+
+ /**
+ * Wrappers over the main document processing method
+ */
+ public TikaProcessingResult process(final TikaBinaryDocument binaryDoc) {
+ return processStream(TikaInputStream.get(binaryDoc.getContent()));
+ }
+
+ public TikaProcessingResult process(InputStream stream) {
+ return processStream(TikaInputStream.get(stream));
+ }
+
+
+
+ /**
+ * Helper methods
+ * TODO: can be moved to utils
+ */
+ public static int getPageCount(final Metadata docMeta) {
+ Map<String, Object> resultMeta = new HashMap<>();
+ extractPageCount(docMeta, resultMeta);
+
+ if (resultMeta.containsKey(MetadataKeys.PAGE_COUNT)) {
+ return Integer.parseInt(resultMeta.get(MetadataKeys.PAGE_COUNT).toString());
+ }
+ return -1;
+ }
+
+ public static boolean isValidDocumentType(final Map<String, Object> resultMeta) {
+ return !( !resultMeta.containsKey(MetadataKeys.CONTENT_TYPE) ||
+ resultMeta.get(MetadataKeys.CONTENT_TYPE).equals(MediaType.OCTET_STREAM.toString()) ||
+ resultMeta.get(MetadataKeys.CONTENT_TYPE).equals(MediaType.EMPTY.toString()));
+ }
+
+ private static void extractPageCount(final Metadata docMeta, Map<String, Object> resultMeta) {
+ String pgValue = null;
+ if (docMeta.get("xmpTPg:NPages") != null) {
+ pgValue = docMeta.get("xmpTPg:NPages");
+ }
+ else if (docMeta.get("meta:page-count") != null) {
+ pgValue = docMeta.get("meta:page-count");
+ }
+ else if (docMeta.get("exif:PageCount") != null) {
+ pgValue = docMeta.get("exif:PageCount");
+ }
+ else if (docMeta.get("Page-Count") != null) {
+ pgValue = docMeta.get("Page-Count");
+ }
+
+ if (pgValue != null) {
+ resultMeta.put(MetadataKeys.PAGE_COUNT, pgValue);
+ }
+ }
+
+ private static void extractOcrApplied(final Metadata docMeta, Map<String, Object> resultMeta) {
+ if (docMeta.get("X-Parsed-By") != null
+ && (Arrays.asList(docMeta.getValues("X-Parsed-By")).contains(TesseractOCRParser.class.getName())
+ // note that some parsers also add a 'class ' prefix to the name: 'class org...
+ || Arrays.asList(docMeta.getValues("X-Parsed-By")).contains(TesseractOCRParser.class.toString()))) {
+ resultMeta.put(MetadataKeys.OCR_APPLIED, "true");
+ }
+ else {
+ resultMeta.put(MetadataKeys.OCR_APPLIED, "false");
+ }
+ }
+
+ protected Map<String, Object> extractMetadata(final Metadata docMeta) {
+ Map<String, Object> resultMeta = new HashMap<>();
+ Arrays.stream(metaKeysSingleValue).forEach(name -> {
+ if (docMeta.get(name) != null)
+ resultMeta.put(name, docMeta.get(name));
+ });
+
+ Arrays.stream(metaKeysMultiValue).forEach(name -> {
+ if (docMeta.getValues(name) != null)
+ resultMeta.put(name, docMeta.getValues(name));
+ });
+
+ extractPageCount(docMeta, resultMeta);
+
+ extractOcrApplied(docMeta, resultMeta);
+
+ return resultMeta;
+ }
+}
diff --git a/src/main/java/tika/processor/CompositeTikaProcessor.java b/src/main/java/tika/processor/CompositeTikaProcessor.java
new file mode 100644
index 0000000..531529f
--- /dev/null
+++ b/src/main/java/tika/processor/CompositeTikaProcessor.java
@@ -0,0 +1,276 @@
+package tika.processor;
+
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.time.OffsetDateTime;
+import java.util.*;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+import tika.legacy.ImageMagickConfig;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.legacy.LegacyPdfProcessorParser;
+import tika.model.TikaProcessingResult;
+import javax.annotation.PostConstruct;
+
+
+/**
+ * A default, composite Tika processor.
+ *
+ * In contrast to "legacy" processor it uses the default approach implemented in Tika, i.e. when
+ * parsing PDF documents, it runs the processing independently per each PDF page,
+ * and hence running Tesseract Page-Count times.
+ */
+@Component("compositeTikaProcessor")
+public class CompositeTikaProcessor extends AbstractTikaProcessor {
+
+ @Autowired
+ private CompositeTikaProcessorConfig compositeTikaProcessorConfig;
+
+ @Autowired
+ private LegacyPdfProcessorConfig legacyPdfProcessorConfig;
+
+ /**
+ In order to properly handle PDF documents and OCR we need three separate parsers:
+ - a generic parser (for any, non-PDF document type),
+ - one that will extract text only from PDFs,
+ - one that will apply OCR on PDFs (when stored only images).
+
+ In the default configuration of PDFParser, OCR is disabled when extracting text from PDFs. However, OCR is
+ enabled when extracting text from documents of image type. When using the default parser with OCR enabled
+ (strategy: extract both text and OCR), it will always apply OCR on PDFs, even when they contain only text.
+
+ We would also like to know when OCR was applied as it will affect the accuracy of the extracted text that will be
+ passed to the downstream analysis applications.
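+
+ A sketch of the resulting flow for a PDF input (names refer to the parser fields below):
+ 1. run pdfTextParser (OCR disabled) to extract any embedded text;
+ 2. if fewer than pdfMinDocTextLength characters were extracted and more than
+ pdfMinDocByteSize bytes were read, re-parse with pdfOcrParser (OCR), or with
+ pdfSinglePageOcrParser for single-page documents (when enabled).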
+ */
+
+ // common tika and parsers configuration
+ private TikaConfig tikaConfig;
+ private TesseractOCRConfig tessConfig;
+
+ // the default, generic parser for handling all document types (except PDF)
+ private AutoDetectParser defaultParser;
+ private ParseContext defaultParseContext;
+
+ // the default parser for PDFs (no OCR)
+ private PDFParser pdfTextParser;
+ private ParseContext pdfTextParseContext;
+
+ // the parser to extract text from PDFs using OCR
+ private PDFParser pdfOcrParser;
+ private ParseContext pdfOcrParseContext;
+
+ // the parser to extract text from single-page PDFs using OCR
+ // (used to strip off clutter from LibreOffice-generated PDFs that contain just images)
+ private LegacyPdfProcessorParser pdfSinglePageOcrParser;
+ private ParseContext pdfSinglePageOcrParseContext;
+
+
+ private Logger log = LoggerFactory.getLogger(CompositeTikaProcessor.class);
+
+
+ @PostConstruct
+ @Override
+ public void init() throws Exception {
+
+ tikaConfig = new TikaConfig();
+
+ initializeTesseractConfig();
+
+ initializeDefaultParser();
+
+ initializePdfTextOnlyParser();
+
+ initializePdfOcrParser();
+
+ if (compositeTikaProcessorConfig.isUseLegacyOcrParserForSinglePageDocuments()) {
+ initializePdfLegacyOcrParser();
+ }
+ }
+
+ @Override
+ public void reset() throws Exception {
+ // actually, we only need to re-initialize all the resources apart from the configuration
+ init();
+ }
+
+ protected TikaProcessingResult processStream(TikaInputStream stream) {
+ final int MIN_TEXT_BUFFER_SIZE = 1024;
+
+ TikaProcessingResult result;
+ try {
+ ByteArrayOutputStream outStream = new ByteArrayOutputStream(MIN_TEXT_BUFFER_SIZE);
+ BodyContentHandler handler = new BodyContentHandler(outStream);
+ Metadata metadata = new Metadata();
+
+ // mark the stream for multi-pass processing
+ if (stream.markSupported()) {
+ stream.mark(Integer.MAX_VALUE);
+ }
+
+ // try to detect whether the document is PDF
+ if (isDocumentOfPdfType(stream)) {
+
+ // firstly try the default parser
+ pdfTextParser.parse(stream, handler, metadata, pdfTextParseContext);
+
+ // check whether enough characters were read / extracted and whether we read enough bytes from the stream
+ // (images embedded in the document occupy much more space than raw text)
+ if (outStream.size() < compositeTikaProcessorConfig.getPdfMinDocTextLength()
+ && stream.getPosition() > compositeTikaProcessorConfig.getPdfMinDocByteSize()) {
+
+ // since we are performing a second pass over the document, we need to reset the cursor position
+ // in both input and output streams
+ stream.reset();
+ outStream.reset();
+
+ final boolean useOcrLegacyParser = compositeTikaProcessorConfig.isUseLegacyOcrParserForSinglePageDocuments()
+ && getPageCount(metadata) == 1;
+
+ // TODO: Q: shall we use a clean metadata or re-use some of the previously parsed fields???
+ handler = new BodyContentHandler(outStream);
+ metadata = new Metadata();
+
+ if (useOcrLegacyParser) {
+ pdfSinglePageOcrParser.parse(stream, handler, metadata, pdfSinglePageOcrParseContext);
+
+ // since we use the parser manually, update the metadata with the name of the parser class used
+ metadata.add("X-Parsed-By", LegacyPdfProcessorParser.class.getName());
+ }
+ else {
+ pdfOcrParser.parse(stream, handler, metadata, pdfOcrParseContext);
+
+ // since we use the parser manually, update the metadata with the name of the parser class used
+ metadata.add("X-Parsed-By", PDFParser.class.getName());
+ }
+ }
+ else {
+ // since we use the parser manually, update the metadata with the name of the parser class used
+ metadata.add("X-Parsed-By", PDFParser.class.getName());
+ }
+ }
+ else {
+ // otherwise, run default documents parser
+ defaultParser.parse(stream, handler, metadata, defaultParseContext);
+ }
+
+ // parse the metadata and store the result
+ Map<String, Object> resultMeta = extractMetadata(metadata);
+ result = TikaProcessingResult.builder()
+ .text(outStream.toString())
+ .metadata(resultMeta)
+ .success(true)
+ .timestamp(OffsetDateTime.now())
+ .build();
+ }
+ catch (Exception e) {
+ log.error(e.getMessage());
+
+ result = TikaProcessingResult.builder()
+ .error("Exception caught while processing the document: " + e.getMessage())
+ .success(false)
+ .build();
+ }
+
+ return result;
+ }
+
+
+ private boolean isDocumentOfPdfType(InputStream stream) throws Exception {
+ Metadata metadata = new Metadata();
+ MediaType mediaType = defaultParser.getDetector().detect(stream, metadata);
+
+ return mediaType.equals(MediaType.application("pdf"));
+ }
+
+
+ private void initializeTesseractConfig() {
+ tessConfig = new TesseractOCRConfig();
+
+ tessConfig.setTimeout(compositeTikaProcessorConfig.getOcrTimeout());
+ tessConfig.setApplyRotation(compositeTikaProcessorConfig.isOcrApplyRotation());
+ if (compositeTikaProcessorConfig.isOcrEnableImageProcessing()) {
+ tessConfig.setEnableImageProcessing(1);
+ }
+ else {
+ tessConfig.setEnableImageProcessing(0);
+ }
+ tessConfig.setLanguage(compositeTikaProcessorConfig.getOcrLanguage());
+ }
+
+
+ private void initializeDefaultParser() {
+ defaultParser = new AutoDetectParser(tikaConfig);
+
+ defaultParseContext = new ParseContext();
+ defaultParseContext.set(TikaConfig.class, tikaConfig);
+ defaultParseContext.set(TesseractOCRConfig.class, tessConfig);
+ defaultParseContext.set(Parser.class, defaultParser); //need to add this to make sure recursive parsing happens!
+ }
+
+
+ private void initializePdfTextOnlyParser() {
+ PDFParserConfig pdfTextOnlyConfig = new PDFParserConfig();
+ pdfTextOnlyConfig.setExtractInlineImages(false);
+ pdfTextOnlyConfig.setExtractUniqueInlineImagesOnly(false); // do not extract multiple inline images
+ pdfTextOnlyConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+
+ pdfTextParser = new PDFParser();
+ pdfTextParseContext = new ParseContext();
+ pdfTextParseContext.set(TikaConfig.class, tikaConfig);
+ pdfTextParseContext.set(PDFParserConfig.class, pdfTextOnlyConfig);
+ //pdfTextParseContext.set(Parser.class, defaultParser); //need to add this to make sure recursive parsing happens!
+ }
+
+
+ private void initializePdfOcrParser() {
+ PDFParserConfig pdfOcrConfig = new PDFParserConfig();
+ pdfOcrConfig.setExtractUniqueInlineImagesOnly(false); // do not extract multiple inline images
+ if (compositeTikaProcessorConfig.isPdfOcrOnlyStrategy()) {
+ pdfOcrConfig.setExtractInlineImages(false);
+ pdfOcrConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+ }
+ else {
+ pdfOcrConfig.setExtractInlineImages(true);
+ // warning: note that with 'OCR_AND_TEXT_EXTRACTION' the content can be duplicated
+ pdfOcrConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION);
+ }
+
+ pdfOcrParser = new PDFParser();
+ pdfOcrParseContext = new ParseContext();
+ pdfOcrParseContext.set(TikaConfig.class, tikaConfig);
+ pdfOcrParseContext.set(PDFParserConfig.class, pdfOcrConfig);
+ pdfOcrParseContext.set(TesseractOCRConfig.class, tessConfig);
+ //pdfOcrParseContext.set(Parser.class, defaultParser); //need to add this to make sure recursive parsing happens!
+ }
+
+ private void initializePdfLegacyOcrParser() {
+ pdfSinglePageOcrParser = new LegacyPdfProcessorParser();
+
+ pdfSinglePageOcrParseContext = new ParseContext();
+ pdfSinglePageOcrParseContext.set(TikaConfig.class, tikaConfig);
+ pdfSinglePageOcrParseContext.set(LegacyPdfProcessorConfig.class, legacyPdfProcessorConfig);
+
+ TesseractOCRConfig tessConfig = new TesseractOCRConfig();
+ tessConfig.setTimeout(legacyPdfProcessorConfig.getOcrTimeout());
+ pdfSinglePageOcrParseContext.set(TesseractOCRConfig.class, tessConfig);
+
+ ImageMagickConfig imgConfig = new ImageMagickConfig();
+ imgConfig.setTimeout(legacyPdfProcessorConfig.getConversionTimeout());
+ pdfSinglePageOcrParseContext.set(ImageMagickConfig.class, imgConfig);
+
+ //pdfOcrParseContext.set(Parser.class, defaultParser); //need to add this to make sure recursive parsing happens!
+ }
+}
diff --git a/src/main/java/tika/processor/CompositeTikaProcessorConfig.java b/src/main/java/tika/processor/CompositeTikaProcessorConfig.java
new file mode 100644
index 0000000..2a88507
--- /dev/null
+++ b/src/main/java/tika/processor/CompositeTikaProcessorConfig.java
@@ -0,0 +1,63 @@
+package tika.processor;
+
+import com.fasterxml.jackson.annotation.JsonView;
+import common.JsonPropertyAccessView;
+import lombok.Data;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Configuration;
+
+
+/**
+ * The composite PDF processor configuration
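+ *
+ * The values are bound from application.yaml, e.g. (defaults shown):
+ *
+ * tika.parsers:
+ *   tesseract-ocr.timeout: 120
+ *   pdf-ocr-parser.ocr-only-strategy: true
+ *   pdf-ocr-parser.min-doc-text-length: 100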
+ */
+@Data
+@Configuration
+public class CompositeTikaProcessorConfig {
+
+ // the timeout value (s) when performing OCR over documents
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.tesseract-ocr.timeout:120}")
+ private int ocrTimeout;
+
+ // apply image processing techniques during documents conversion (using ImageMagick)
+ // required to enable applying rotation (see below)
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.tesseract-ocr.enable-image-processing:false}")
+ private boolean ocrEnableImageProcessing;
+
+ // apply de-rotation of documents before processing
+ // can be quite computationally expensive (runs as an external python script)
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.tesseract-ocr.apply-rotation:false}")
+ private boolean ocrApplyRotation;
+
+ // the language used in the OCR for corrections
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.tesseract-ocr.language:eng}")
+ private String ocrLanguage;
+
+ // whether to apply OCR only on the documents or to also extract the embedded text (if present)
+ // warning: note that with 'OCR_AND_TEXT_EXTRACTION' the content can be duplicated
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.pdf-ocr-parser.ocr-only-strategy:true}")
+ private boolean pdfOcrOnlyStrategy;
+
+ // apply OCR only when the text extracted from a previously parsed document (without OCR)
+ // was shorter than N characters
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.pdf-ocr-parser.min-doc-text-length:100}")
+ private int pdfMinDocTextLength;
+
+ // apply OCR only when at least N bytes were read from the previously parsed document (without OCR)
+ // (images embedded in a document occupy much more space than raw text)
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.pdf-ocr-parser.min-doc-byte-size:10000}")
+ private int pdfMinDocByteSize;
+
+ // use a legacy parser for applying OCR for single-page PDF documents
+ // (NB: when exporting single-page PDFs from LibreOffice that contain only one image,
+ // some additional clutter may be embedded in the PDF content)
+ @JsonView(JsonPropertyAccessView.Public.class)
+ @Value("${tika.parsers.use-legacy-ocr-parser-for-single-page-doc:false}")
+ private boolean useLegacyOcrParserForSinglePageDocuments;
+}
diff --git a/src/main/resources/application.yaml b/src/main/resources/application.yaml
new file mode 100644
index 0000000..907c149
--- /dev/null
+++ b/src/main/resources/application.yaml
@@ -0,0 +1,47 @@
+# application configuration
+#
+application:
+ version: 0.1.0
+
+
+# general spring boot configuration
+#
+server:
+ port: 8090
+
+spring:
+ servlet:
+ multipart.max-file-size: 100MB
+ multipart.max-request-size: 100MB
+
+
+# tika configuration
+#
+tika:
+ parsers:
+ tesseract-ocr:
+ language: eng
+ timeout: 300
+ enable-image-processing: false
+ apply-rotation: false
+
+ pdf-ocr-parser:
+ ocr-only-strategy: true
+ min-doc-text-length: 100
+ min-doc-byte-size: 10000
+ use-legacy-ocr-parser-for-single-page-doc: false
+
+ legacy-pdf-parser:
+ image-magick:
+ timeout: 300
+ tesseract-ocr:
+ timeout: 300
+ min-doc-text-length: 100
+
+
+# documents processing configuration
+#
+processing:
+ use-legacy-tika-processor-as-default: true
+ fail-on-empty-files: false
+ fail-on-non-document-types: true
diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml
new file mode 100644
index 0000000..2527c44
--- /dev/null
+++ b/src/main/resources/logback.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%d{dd-MM-yyyy HH:mm:ss.SSS} -- %highlight(%-5level) : %magenta([%thread]) %logger{36}.%M - %msg%n</pattern>
+        </encoder>
+    </appender>
+    <root level="INFO">
+        <appender-ref ref="STDOUT"/>
+    </root>
+</configuration>
\ No newline at end of file
diff --git a/src/main/resources/tika-config/legacy-parser-config.xml b/src/main/resources/tika-config/legacy-parser-config.xml
new file mode 100644
index 0000000..c414d03
--- /dev/null
+++ b/src/main/resources/tika-config/legacy-parser-config.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <parser-exclude class="org.apache.tika.parser.pdf.PDFParser"/>
+            <mime-exclude>application/pdf</mime-exclude>
+        </parser>
+
+        <parser class="tika.legacy.LegacyPdfProcessorParser">
+            <mime>application/pdf</mime>
+        </parser>
+    </parsers>
+</properties>
\ No newline at end of file
diff --git a/src/test/java/service/ServiceControllerDocumentMultipartFileTests.java b/src/test/java/service/ServiceControllerDocumentMultipartFileTests.java
new file mode 100644
index 0000000..1c92363
--- /dev/null
+++ b/src/test/java/service/ServiceControllerDocumentMultipartFileTests.java
@@ -0,0 +1,69 @@
+package service;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.http.HttpStatus;
+import org.springframework.mock.web.MockMultipartFile;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import org.springframework.test.web.servlet.MockMvc;
+import org.springframework.test.web.servlet.MvcResult;
+import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
+import service.controller.TikaServiceConfig;
+import service.model.ServiceResponseContent;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.model.TikaProcessingResult;
+import tika.processor.CompositeTikaProcessorConfig;
+import java.io.InputStream;
+
+import static org.junit.Assert.*;
+import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
+
+
+/**
+ * Implements document processing tests for the Service Controller.
+ * A document is passed as a multi-part file.
+ */
+@SpringBootTest(classes = TikaServiceApplication.class)
+@RunWith(SpringRunner.class)
+@AutoConfigureMockMvc
+@ContextConfiguration(classes = {TikaServiceConfig.class, LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public class ServiceControllerDocumentMultipartFileTests extends ServiceControllerDocumentTests {
+
+ @Autowired
+ private MockMvc mockMvc;
+
+ final private String PROCESS_FILE_ENDPOINT_URL = "/api/process_file";
+
+ @Override
+ protected TikaProcessingResult sendProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception {
+ return sendMultipartFileProcessingRequest(docPath, expectedStatus);
+ }
+
+ private TikaProcessingResult sendMultipartFileProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception {
+ InputStream stream = utils.getDocumentStream(docPath);
+ MockMultipartFile multipartFile = new MockMultipartFile("file", docPath, "multipart/form-data", stream);
+
+ MvcResult result = mockMvc.perform(MockMvcRequestBuilders.multipart(PROCESS_FILE_ENDPOINT_URL)
+ .file(multipartFile))
+ //.param("some-random", "4"))
+ .andExpect(status().is(expectedStatus.value()))
+ .andReturn();
+ //.andExpect(content().string("success"));
+
+ assertEquals(expectedStatus.value(), result.getResponse().getStatus());
+ assertNotNull(result.getResponse().getContentAsString());
+
+ // parse content
+ ObjectMapper mapper = new ObjectMapper();
+ mapper.registerModule(new JavaTimeModule());
+ TikaProcessingResult tikaResult = mapper.readValue(result.getResponse().getContentAsString(),
+ ServiceResponseContent.class).getResult();
+
+ return tikaResult;
+ }
+}
diff --git a/src/test/java/service/ServiceControllerDocumentStreamTests.java b/src/test/java/service/ServiceControllerDocumentStreamTests.java
new file mode 100644
index 0000000..6bf1ff3
--- /dev/null
+++ b/src/test/java/service/ServiceControllerDocumentStreamTests.java
@@ -0,0 +1,68 @@
+package service;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.http.HttpStatus;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import org.springframework.test.web.servlet.MockMvc;
+import org.springframework.test.web.servlet.MvcResult;
+import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
+import service.controller.TikaServiceConfig;
+import service.model.ServiceResponseContent;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.model.TikaProcessingResult;
+import tika.processor.CompositeTikaProcessorConfig;
+import java.io.InputStream;
+
+import static org.junit.Assert.*;
+import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
+
+
+/**
+ * Implements document processing tests for the Service Controller.
+ * A document is passed as an octet stream.
+ */
+@SpringBootTest(classes = TikaServiceApplication.class)
+@RunWith(SpringRunner.class)
+@AutoConfigureMockMvc
+@ContextConfiguration(classes = {TikaServiceConfig.class, LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public class ServiceControllerDocumentStreamTests extends ServiceControllerDocumentTests {
+
+ @Autowired
+ private MockMvc mockMvc;
+
+ final private String PROCESS_ENDPOINT_URL = "/api/process";
+
+ @Override
+ protected TikaProcessingResult sendProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception {
+ return sendFileProcessingRequest(docPath, expectedStatus);
+ }
+
+ private TikaProcessingResult sendFileProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception {
+ InputStream stream = utils.getDocumentStream(docPath);
+
+ byte[] content = stream.readAllBytes();
+
+ MvcResult result = mockMvc.perform(MockMvcRequestBuilders.post(PROCESS_ENDPOINT_URL)
+ .content(content))
+ //.param("some-random", "4"))
+ .andExpect(status().is(expectedStatus.value()))
+ .andReturn();
+
+ assertEquals(expectedStatus.value(), result.getResponse().getStatus());
+ assertNotNull(result.getResponse().getContentAsString());
+
+ // parse content
+ ObjectMapper mapper = new ObjectMapper();
+ mapper.registerModule(new JavaTimeModule());
+ TikaProcessingResult tikaResult = mapper.readValue(result.getResponse().getContentAsString(),
+ ServiceResponseContent.class).getResult();
+
+ return tikaResult;
+ }
+}
diff --git a/src/test/java/service/ServiceControllerDocumentTests.java b/src/test/java/service/ServiceControllerDocumentTests.java
new file mode 100644
index 0000000..aa3fd88
--- /dev/null
+++ b/src/test/java/service/ServiceControllerDocumentTests.java
@@ -0,0 +1,71 @@
+package service;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.http.HttpStatus;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import service.controller.TikaServiceConfig;
+import tika.DocumentProcessorTests;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.model.TikaProcessingResult;
+import tika.processor.CompositeTikaProcessorConfig;
+
+import static org.junit.Assert.*;
+
+
+/**
+ * Implements document processing tests for the Service Controller, extending the set of available tests
+ * present in DocumentProcessorTests
+ */
+@SpringBootTest(classes = TikaServiceApplication.class)
+@RunWith(SpringRunner.class)
+@AutoConfigureMockMvc
+@ContextConfiguration(classes = {TikaServiceConfig.class, LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public abstract class ServiceControllerDocumentTests extends DocumentProcessorTests {
+
+ @Autowired
+ TikaServiceConfig serviceConfig;
+
+
+ protected abstract TikaProcessingResult sendProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception;
+
+ @Override
+ protected TikaProcessingResult processDocument(final String docPath) throws Exception {
+ return sendProcessingRequest(docPath, HttpStatus.OK);
+ }
+
+
+ /**
+ * The actual tests start from here.
+ */
+
+ @Override
+ public void testExtractPdfEx1Encrypted() throws Exception {
+ final String docPath = "pdf/ex1_enc.pdf";
+
+ TikaProcessingResult result = sendProcessingRequest(docPath, HttpStatus.BAD_REQUEST);
+
+ // extraction from encrypted PDF will fail with the proper error message
+ assertFalse(result.getSuccess());
+ assertTrue(result.getError().contains("document is encrypted"));
+ }
+
+
+ @Test
+ public void testExtractEmptyPdfFile() throws Exception {
+ final String docPath = "invalid/pdf_empty.pdf";
+
+ assertFalse(serviceConfig.isFailOnEmptyFiles());
+
+ // extraction should pass but with error
+ TikaProcessingResult result = sendProcessingRequest(docPath, HttpStatus.OK);
+ assertFalse(result.getSuccess());
+ assertTrue(result.getError().contains("Empty"));
+ }
+}
diff --git a/src/test/java/service/ServiceControllerTests.java b/src/test/java/service/ServiceControllerTests.java
new file mode 100644
index 0000000..f55c3b2
--- /dev/null
+++ b/src/test/java/service/ServiceControllerTests.java
@@ -0,0 +1,62 @@
+package service;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.http.HttpStatus;
+import org.springframework.http.MediaType;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import org.springframework.test.web.servlet.MockMvc;
+import org.springframework.test.web.servlet.MvcResult;
+import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
+import service.controller.TikaServiceConfig;
+import service.model.ServiceInformation;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.processor.CompositeTikaProcessorConfig;
+
+import static org.junit.Assert.assertEquals;
+
+
+/**
+ * Implements general tests for the Service Controller
+ * (no document processing).
+ */
+@SpringBootTest(classes = TikaServiceApplication.class)
+@RunWith(SpringRunner.class)
+@AutoConfigureMockMvc
+@ContextConfiguration(classes = {TikaServiceConfig.class, LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public class ServiceControllerTests {
+
+ @Autowired
+ private MockMvc mockMvc;
+
+ @Autowired
+ private ServiceInformation serviceInfo;
+
+ private final String INFO_ENDPOINT_URL = "/api/info";
+
+
+ @Test
+ public void testGetApplicationInfo() throws Exception {
+ MvcResult result = mockMvc.perform(MockMvcRequestBuilders
+ .get(INFO_ENDPOINT_URL)
+ .accept(MediaType.APPLICATION_JSON_UTF8))
+ .andReturn();
+
+ // check response status
+ int status = result.getResponse().getStatus();
+ assertEquals(HttpStatus.OK.value(), status);
+
+ // parse content
+ ObjectMapper mapper = new ObjectMapper();
+ ServiceInformation response = mapper.readValue(result.getResponse().getContentAsString(),
+ ServiceInformation.class);
+
+ // check example content
+ assertEquals(serviceInfo.getServiceConfig().getAppVersion(), response.getServiceConfig().getAppVersion());
+ }
+}
diff --git a/src/test/java/tika/CompositeTikaProcessorTests.java b/src/test/java/tika/CompositeTikaProcessorTests.java
new file mode 100644
index 0000000..b528219
--- /dev/null
+++ b/src/test/java/tika/CompositeTikaProcessorTests.java
@@ -0,0 +1,43 @@
+package tika;
+
+import org.junit.*;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.processor.AbstractTikaProcessor;
+import tika.processor.CompositeTikaProcessor;
+import tika.processor.CompositeTikaProcessorConfig;
+
+
+/**
+ * Implements the tests using CompositeTikaProcessor as the document processor.
+ */
+@SpringBootTest(classes = CompositeTikaProcessor.class)
+@RunWith(SpringRunner.class)
+@ContextConfiguration(classes = {LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public class CompositeTikaProcessorTests extends DocumentProcessorTests {
+
+ @Autowired
+ LegacyPdfProcessorConfig legacyProcessorConfig;
+
+ @Autowired
+ CompositeTikaProcessorConfig compositeProcessorConfig;
+
+ @Autowired
+ CompositeTikaProcessor processor;
+
+
+ @Override
+ protected AbstractTikaProcessor getProcessor() {
+ return processor;
+ }
+
+ @After
+ public void reset() throws Exception {
+ processor.reset();
+ }
+}
+
diff --git a/src/test/java/tika/DocumentProcessorTests.java b/src/test/java/tika/DocumentProcessorTests.java
new file mode 100644
index 0000000..d752636
--- /dev/null
+++ b/src/test/java/tika/DocumentProcessorTests.java
@@ -0,0 +1,234 @@
+package tika;
+
+import org.junit.Ignore;
+import org.junit.Test;
+import tika.model.TikaProcessingResult;
+import tika.processor.AbstractTikaProcessor;
+import java.io.InputStream;
+
+import static org.junit.Assert.*;
+
+
+/**
+ * All the document processor tests are implemented in this abstract class in order to keep the
+ * rationale behind the tests and the expected results in a single place.
+ */
+public abstract class DocumentProcessorTests {
+
+ protected DocumentTestUtils utils = new DocumentTestUtils();
+
+ /**
+ * Helper methods used in tests that can be overridden in child classes.
+ */
+ protected AbstractTikaProcessor getProcessor() { return null; }
+
+ protected TikaProcessingResult processDocument(final String docPath) throws Exception {
+ AbstractTikaProcessor processor = getProcessor();
+ assertNotNull(processor);
+
+ InputStream stream = utils.getDocumentStream(docPath);
+ return processor.process(stream);
+ }
+
+
+ /**
+ * The actual tests start from here.
+ */
+
+ @Test
+ public void testGenericExtractPattern1SourceTxt() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".txt";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ // test parsing status
+ String parsedString = result.getText();
+ assertEquals(310, parsedString.length());
+
+ // test metadata
+ utils.assertOcrApplied(false, result);
+ }
+
+ @Test
+ public void testGenericExtractPattern1Doc() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".doc";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ utils.testContentMatch(result, docPathPrefix);
+
+ // test metadata
+ utils.assertPageCount(1, result);
+ utils.assertOcrApplied(false, result);
+ }
+
+ @Test
+ public void testGenericExtractPattern1Docx() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".docx";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ utils.testContentMatch(result, docPathPrefix);
+
+ // test metadata
+ utils.assertPageCount(1, result);
+ utils.assertOcrApplied(false, result);
+ }
+
+ @Test
+ public void testGenericExtractPattern1Odt() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".odt";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ utils.testContentMatch(result, docPathPrefix);
+
+ // test metadata
+ utils.assertPageCount(1, result);
+ utils.assertOcrApplied(false, result);
+ }
+
+ @Test
+ public void testGenericExtractPattern1Rtf() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".rtf";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ utils.testContentMatch(result, docPathPrefix);
+
+ // test metadata
+ // rtf does not contain page count
+ utils.assertOcrApplied(false, result);
+ }
+
+ @Test
+ public void testGenericExtractPattern1Png() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".png";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ utils.testContentMatch(result, docPathPrefix);
+
+ // test metadata
+ // png does not contain page count
+ utils.assertOcrApplied(true, result);
+ }
+
+ @Test
+ public void testGenericExtractPattern1Pdf() throws Exception {
+ final String docPathPrefix = "generic/pat_id_1";
+ final String docExt = ".pdf";
+
+ TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+ assertTrue(result.getSuccess());
+
+ utils.testContentMatch(result, docPathPrefix);
+
+ // test metadata
+ utils.assertPageCount(1, result);
+ utils.assertOcrApplied(false, result); // this pdf contains text-only
+ }
+
+ @Test
+ public void testExtractPdfEx1WithoutOcr() throws Exception {
+ final String docPath = "pdf/ex1.pdf";
+
+ TikaProcessingResult result = processDocument(docPath);
+
+ // check an example string
+ assertTrue(result.getSuccess());
+ assertTrue(result.getText().contains("An Example Paper"));
+
+ // test metadata
+ utils.assertPageCount(10, result);
+ utils.assertOcrApplied(false, result); // this pdf contains text-only
+ }
+
+ @Test
+ public void testExtractPdfEx1Encrypted() throws Exception {
+ final String docPath = "pdf/ex1_enc.pdf";
+
+ TikaProcessingResult result = processDocument(docPath);
+
+ // extraction from encrypted PDF will fail with the proper error message
+ assertFalse(result.getSuccess());
+ assertTrue(result.getError().contains("document is encrypted"));
+ }
+
+ @Test
+ public void testExtractPdfEx2WithOcr() throws Exception {
+ final String docPath = "pdf/ex2_ocr.pdf";
+
+ TikaProcessingResult result = processDocument(docPath);
+
+ // check the content
+ assertTrue(result.getSuccess());
+ final String parsedString = result.getText();
+ assertTrue(parsedString.length() > 0);
+
+ // example text from the first page
+ assertTrue(parsedString.contains("Father or mother"));
+ // example text from the second page
+ assertTrue(parsedString.contains("how you have determined who is the Nearest"));
+
+ // test metadata
+ utils.assertPageCount(2, result);
+ utils.assertOcrApplied(true, result);
+ }
+
+
+ // TODO: need to double-check how to handle invalid TIFFs or image files
+ @Ignore
+ @Test
+ public void testExtractTiffWithOCR() throws Exception {
+ InputStream stream = utils.getDocumentZipStream("invalid/tiff_multipage_spp2.tiff.zip", "tiff_multipage_spp2.tiff");
+
+ AbstractTikaProcessor processor = getProcessor();
+ TikaProcessingResult result = processor.process(stream);
+ assertTrue(result.getSuccess());
+
+ // HINT: the test is expected to fail, either because the TIFF file is invalid
+ // or because an additional pre-processing step for the image would be required
+
+ // test parsing status
+ String parsedString = result.getText();
+ assertTrue(parsedString.length() > 0);
+
+ // test metadata
+ utils.assertPageCount(6, result);
+
+ // test example content
+ // - from first page
+ assertTrue(parsedString.contains("Sample Narrative Report"));
+ }
+
+
+ //TODO: need to create a proper docx encrypted file
+ @Ignore
+ @Test
+ public void testExtractWordEncrypted() throws Exception {
+ InputStream stream = utils.getDocumentStream("word_enc_noerror.docx");
+
+ AbstractTikaProcessor processor = getProcessor();
+ TikaProcessingResult result = processor.process(stream);
+
+ // extraction from an encrypted DOCX will succeed, but with empty content and no error message
+ // uses: org.apache.tika.parser.microsoft.OfficeParser
+ //TODO: this one needs an internal fix or further investigation
+ assertTrue(result.getSuccess());
+ }
+}
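
Extending coverage to another input format follows the same pattern used above: process the
fixture and compare against the shared ground-truth text. A sketch of what a test for a
hypothetical HTML fixture could look like (generic/pat_id_1.html is not part of this patch),
added alongside the other pattern-1 tests:

    @Test
    public void testGenericExtractPattern1Html() throws Exception {
        // NOTE: hypothetical fixture; generic/pat_id_1.html does not exist in this patch
        final String docPathPrefix = "generic/pat_id_1";
        final String docExt = ".html";

        TikaProcessingResult result = processDocument(docPathPrefix + docExt);
        assertTrue(result.getSuccess());

        utils.testContentMatch(result, docPathPrefix);

        // markup is extracted as text directly, so no OCR should be applied
        utils.assertOcrApplied(false, result);
    }
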
diff --git a/src/test/java/tika/DocumentTestUtils.java b/src/test/java/tika/DocumentTestUtils.java
new file mode 100644
index 0000000..cae8308
--- /dev/null
+++ b/src/test/java/tika/DocumentTestUtils.java
@@ -0,0 +1,75 @@
+package tika;
+
+import tika.model.MetadataKeys;
+import tika.model.TikaProcessingResult;
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.Map;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+
+import static org.junit.Assert.*;
+
+
+/**
+ * Helper utilities used in tests
+ */
+public class DocumentTestUtils {
+ public InputStream getDocumentStream(final String docName) throws Exception {
+ final String fullPath = "tika/docs/" + docName;
+ InputStream stream = getClass().getClassLoader().getResourceAsStream(fullPath);
+ assertNotNull(stream);
+ return new ByteArrayInputStream(stream.readAllBytes());
+ }
+
+ public InputStream getDocumentZipStream(final String archiveName, final String zipEntry) throws Exception {
+ final String fullPath = "tika/docs/" + archiveName;
+ final ZipEntry entry = new ZipEntry(zipEntry);
+ ZipFile zf = new ZipFile(getClass().getClassLoader().getResource(fullPath).getPath());
+ InputStream stream = zf.getInputStream(entry);
+ assertNotNull(stream);
+ return stream;
+ }
+
+ public String getDocumentText(final String path) throws Exception {
+ return new String(getDocumentStream(path).readAllBytes());
+ }
+
+
+ public void assertContentMatches(final String expected, final String actual) {
+ // note that this check is a very naive method of content comparison: we only
+ // strip all non-alphanumeric characters and compare the remaining content verbatim
+ final String regexPattern = "[^\\dA-Za-z]";
+ final String s1parsed = expected.replaceAll(regexPattern, "");
+ final String s2parsed = actual.replaceAll(regexPattern, "");
+ assertEquals(s1parsed, s2parsed);
+ }
+
+ public void assertPageCount(final int expectedPageCount, TikaProcessingResult result) {
+ Map metadata = result.getMetadata();
+ assertTrue(metadata.containsKey(MetadataKeys.PAGE_COUNT));
+ assertEquals(expectedPageCount, Integer.parseInt(metadata.get(MetadataKeys.PAGE_COUNT).toString()));
+ }
+
+ public void assertOcrApplied(final boolean expectedStatus, TikaProcessingResult result) {
+ Map metadata = result.getMetadata();
+ if (metadata.containsKey(MetadataKeys.OCR_APPLIED)) {
+ assertEquals(expectedStatus, Boolean.parseBoolean(metadata.get(MetadataKeys.OCR_APPLIED).toString()));
+ }
+ else {
+ assertFalse(expectedStatus);
+ }
+ }
+
+
+ public void testContentMatch(final TikaProcessingResult result, final String docPathPrefix) throws Exception {
+ // read the ground-truth document
+ final String sourceText = getDocumentText(docPathPrefix + ".txt");
+
+ // test status and content
+ assertTrue(result.getText().length() > 0);
+ assertContentMatches(sourceText, result.getText());
+ }
+}
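
To illustrate the normalization performed by assertContentMatches: strings that differ only in
punctuation and whitespace compare as equal, while any difference in the remaining characters
(including case) still fails:

    DocumentTestUtils utils = new DocumentTestUtils();

    // passes: both sides reduce to "ThepatientsnameisBartDavidson"
    utils.assertContentMatches("The patient's name is Bart Davidson.",
            "The patients name is Bart Davidson");

    // would fail: the comparison after stripping is case-sensitive
    // utils.assertContentMatches("bart davidson", "Bart Davidson");
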
diff --git a/src/test/java/tika/LegacyTikaProcessorTests.java b/src/test/java/tika/LegacyTikaProcessorTests.java
new file mode 100644
index 0000000..0b9d0d5
--- /dev/null
+++ b/src/test/java/tika/LegacyTikaProcessorTests.java
@@ -0,0 +1,40 @@
+package tika;
+
+import org.junit.*;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.annotation.DirtiesContext;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.legacy.LegacyTikaProcessor;
+import tika.processor.AbstractTikaProcessor;
+
+
+/**
+ * Implements the tests using LegacyTikaProcessor as the document processor.
+ */
+@SpringBootTest(classes = LegacyTikaProcessor.class)
+@RunWith(SpringRunner.class)
+@DirtiesContext
+@ContextConfiguration(classes = {LegacyPdfProcessorConfig.class})
+public class LegacyTikaProcessorTests extends DocumentProcessorTests {
+
+ @Autowired
+ LegacyPdfProcessorConfig defaultConfig;
+
+ @Autowired
+ LegacyTikaProcessor processor;
+
+ @Override
+ protected AbstractTikaProcessor getProcessor() {
+ return processor;
+ }
+
+ @After
+ public void reset() throws Exception {
+ processor.reset();
+ }
+}
+
diff --git a/src/test/resources/application.yaml b/src/test/resources/application.yaml
new file mode 100644
index 0000000..907c149
--- /dev/null
+++ b/src/test/resources/application.yaml
@@ -0,0 +1,47 @@
+# application configuration
+#
+application:
+ version: 0.1.0
+
+
+# general spring boot configuration
+#
+server:
+ port: 8090
+
+spring:
+ servlet:
+ multipart.max-file-size: 100MB
+ multipart.max-request-size: 100MB
+
+
+# tika configuration
+#
+tika:
+ parsers:
+ tesseract-ocr:
+ language: eng
+ timeout: 300
+ enable-image-processing: false
+ apply-rotation: false
+
+ pdf-ocr-parser:
+ ocr-only-strategy: true
+ min-doc-text-length: 100
+ min-doc-byte-size: 10000
+ use-legacy-ocr-parser-for-single-page-doc: false
+
+ legacy-pdf-parser:
+ image-magick:
+ timeout: 300
+ tesseract-ocr:
+ timeout: 300
+ min-doc-text-length: 100
+
+
+# documents processing configuration
+#
+processing:
+ use-legacy-tika-processor-as-default: true
+ fail-on-empty-files: false
+ fail-on-non-document-types: true
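
The processing.* keys above are surfaced through TikaServiceConfig (queried in
ServiceControllerDocumentTests via isFailOnEmptyFiles()). The class itself is not included in
this patch; a minimal sketch of how such a property could bind, assuming standard Spring
@Value injection:

    import org.springframework.beans.factory.annotation.Value;
    import org.springframework.context.annotation.Configuration;

    // Sketch only: the real TikaServiceConfig is not shown in this patch,
    // so the @Value-based binding below is an assumption.
    @Configuration
    public class TikaServiceConfig {

        @Value("${processing.fail-on-empty-files:false}")
        private boolean failOnEmptyFiles;

        public boolean isFailOnEmptyFiles() {
            return failOnEmptyFiles;
        }
    }
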
diff --git a/src/test/resources/tika/config/legacy-parser-config.xml b/src/test/resources/tika/config/legacy-parser-config.xml
new file mode 100644
index 0000000..c414d03
--- /dev/null
+++ b/src/test/resources/tika/config/legacy-parser-config.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8" standalone="no"?>
+<!-- NOTE: the XML tags of this file were lost during extraction; the structure below
+     is a reconstruction and the parser class names are assumptions, not verified -->
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <mime-exclude>application/pdf</mime-exclude>
+        </parser>
+        <parser class="tika.legacy.LegacyPdfProcessorParser">
+            <mime>application/pdf</mime>
+        </parser>
+    </parsers>
+</properties>
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.doc b/src/test/resources/tika/docs/generic/pat_id_1.doc
new file mode 100644
index 0000000..1fe2ca2
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.doc differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.docx b/src/test/resources/tika/docs/generic/pat_id_1.docx
new file mode 100644
index 0000000..e9700ce
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.docx differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.odt b/src/test/resources/tika/docs/generic/pat_id_1.odt
new file mode 100644
index 0000000..90db7d9
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.odt differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.pdf b/src/test/resources/tika/docs/generic/pat_id_1.pdf
new file mode 100644
index 0000000..5b42732
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.pdf differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.png b/src/test/resources/tika/docs/generic/pat_id_1.png
new file mode 100644
index 0000000..fb8d321
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.png differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.rtf b/src/test/resources/tika/docs/generic/pat_id_1.rtf
new file mode 100644
index 0000000..020514a
--- /dev/null
+++ b/src/test/resources/tika/docs/generic/pat_id_1.rtf
@@ -0,0 +1,52 @@
+{\rtf1\ansi\deff3\adeflang1025
+{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2 Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fswiss\fprq2\fcharset0 Calibri;}{\f6\fnil\fprq2\fcharset0 PingFang SC;}{\f7\fnil\fprq2\fcharset0 Arial Unicode MS;}{\f8\fswiss\fprq0\fcharset128 Arial Unicode MS;}}
+{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;}
+{\stylesheet{\s0\snext0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057 Normal;}
+{\*\cs15\snext15 Default Paragraph Font;}
+{\s16\sbasedon0\snext17\ql\sl256\slmult1\widctlpar\sb240\sa120\keepn\ltrpar\cf0\dbch\af6\dbch\af7\afs28\alang1025\loch\f4\fs28\lang2057 Heading;}
+{\s17\sbasedon0\snext17\ql\sl276\slmult1\widctlpar\sb0\sa140\ltrpar\cf0\dbch\af5\dbch\af0\afs22\alang1025\loch\f5\fs22\lang2057 Text Body;}
+{\s18\sbasedon17\snext18\ql\sl276\slmult1\widctlpar\sb0\sa140\ltrpar\cf0\dbch\af5\dbch\af8\afs22\alang1025\loch\f5\fs22\lang2057 List;}
+{\s19\sbasedon0\snext19\ql\sl256\slmult1\widctlpar\sb120\sa120\noline\ltrpar\cf0\i\dbch\af5\dbch\af8\afs24\alang1025\ai\loch\f5\fs24\lang2057 Caption;}
+{\s20\sbasedon0\snext20\ql\sl256\slmult1\widctlpar\sb0\sa160\noline\ltrpar\cf0\dbch\af5\dbch\af8\afs22\alang1025\loch\f5\fs22\lang2057 Index;}
+}{\*\generator LibreOffice/6.1.0.3$MacOSX_X86_64 LibreOffice_project/efb621ed25068d70781dc026f7e9c5187a4decd1}{\info{\author Rich}{\creatim\yr2015\mo11\dy2\hr16\min52}{\author Rich}{\revtim\yr2015\mo11\dy2\hr16\min59}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab720
+\viewscale100
+{\*\pgdsctbl
+{\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1440\margrsxn1440\margtsxn1440\margbsxn1440\pgdscnxt0 Default Style;}}
+\formshade{\*\pgdscno0}\paperh16838\paperw11906\margl1440\margr1440\margt1440\margb1440\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1440\margrsxn1440\margtsxn1440\margbsxn1440\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc\htmautsp
+{\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+This is an example of a clinical document}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+The patient\u8217\'92s name is Bart Davidson.}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+His carer\u8217\'92s Name Paul Wayne.}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+His telephone number is 07754828992}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+His Address is 61 Basildon Way, }
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+East Croyhurst, }
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+Angelton, }
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+AL64 9HT}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+His mother\u8217\'92s name is Pauline Smith.}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch
+He is on 100mg Paracetamol, 20 milligrams clozapine}
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch
+
+\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\rtlch \ltrch\loch
+
+\par }
\ No newline at end of file
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.txt b/src/test/resources/tika/docs/generic/pat_id_1.txt
new file mode 100644
index 0000000..5d9c770
--- /dev/null
+++ b/src/test/resources/tika/docs/generic/pat_id_1.txt
@@ -0,0 +1,17 @@
+This is an example of a clinical document
+
+The patient’s name is Bart Davidson.
+His carer’s Name Paul Wayne.
+
+His telephone number is 07754828992
+
+His Address is 61 Basildon Way,
+East Croyhurst,
+Angelton,
+AL64 9HT
+
+His mother’s name is Pauline Smith.
+
+He is on 100mg Paracetamol, 20 milligrams clozapine
+
+
diff --git a/src/test/resources/tika/docs/invalid/pdf_empty.pdf b/src/test/resources/tika/docs/invalid/pdf_empty.pdf
new file mode 100644
index 0000000..e69de29
diff --git a/src/test/resources/tika/docs/invalid/tiff_multipage_spp2.tiff.zip b/src/test/resources/tika/docs/invalid/tiff_multipage_spp2.tiff.zip
new file mode 100644
index 0000000..f0627de
Binary files /dev/null and b/src/test/resources/tika/docs/invalid/tiff_multipage_spp2.tiff.zip differ
diff --git a/src/test/resources/tika/docs/invalid/word_enc_noerror.docx b/src/test/resources/tika/docs/invalid/word_enc_noerror.docx
new file mode 100644
index 0000000..2be820e
Binary files /dev/null and b/src/test/resources/tika/docs/invalid/word_enc_noerror.docx differ
diff --git a/src/test/resources/tika/docs/pdf/ex1.pdf b/src/test/resources/tika/docs/pdf/ex1.pdf
new file mode 100644
index 0000000..ab5db00
Binary files /dev/null and b/src/test/resources/tika/docs/pdf/ex1.pdf differ
diff --git a/src/test/resources/tika/docs/pdf/ex1_enc.pdf b/src/test/resources/tika/docs/pdf/ex1_enc.pdf
new file mode 100644
index 0000000..5e3b5d0
Binary files /dev/null and b/src/test/resources/tika/docs/pdf/ex1_enc.pdf differ
diff --git a/src/test/resources/tika/docs/pdf/ex2_ocr.pdf b/src/test/resources/tika/docs/pdf/ex2_ocr.pdf
new file mode 100644
index 0000000..9fd8321
Binary files /dev/null and b/src/test/resources/tika/docs/pdf/ex2_ocr.pdf differ
diff --git a/travis_gradle_build.sh b/travis_gradle_build.sh
new file mode 100644
index 0000000..10d6457
--- /dev/null
+++ b/travis_gradle_build.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+# Abort on error, uninitialized variables, and pipe errors
+set -eEu
+set -o pipefail
+#set -v
+
+export PING_SLEEP=30s
+export WORKDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+export BUILD_OUTPUT=$WORKDIR/build.out
+export TEST_PROC_LOG_OUTPUT=$WORKDIR/test-proc.out
+export TEST_API_LOG_OUTPUT=$WORKDIR/test-api.out
+
+# dump the last N lines of the output files
+DUMP_LINES_BUILD=2000
+DUMP_LINES_TEST_PROC=5000
+DUMP_LINES_TEST_API=2000
+
+touch $BUILD_OUTPUT
+touch $TEST_PROC_LOG_OUTPUT
+touch $TEST_API_LOG_OUTPUT
+
+
+# Helper functions
+#
+print_log_separator() {
+ echo "----------------------------------------------------------------"
+ echo "-"
+ echo "-"
+ echo "-"
+ echo "-"
+ echo "----------------------------------------------------------------"
+}
+
+dump_output() {
+ if [ "$2" -eq "-1" ]; then
+ echo "Printing all the output: $1"
+ cat "$1"
+ else
+ echo "Tailing the last $2 lines of build output: $1"
+ tail -n "$2" "$1"
+ fi
+}
+
+print_logs() {
+ print_log_separator
+ dump_output $BUILD_OUTPUT $DUMP_LINES_BUILD
+
+ print_log_separator
+ dump_output $TEST_PROC_LOG_OUTPUT $DUMP_LINES_TEST_PROC
+
+ print_log_separator
+ dump_output $TEST_API_LOG_OUTPUT $DUMP_LINES_TEST_API
+}
+
+run_build() {
+ #./gradlew build --full-stacktrace --debug 2>&1 | tee >(grep TestEventLogger | grep -P -n "[[:ascii:]]" >> $TEST_LOG_OUTPUT) | grep -P -n "[[:ascii:]]" >> $BUILD_OUTPUT
+ # note the redirection order: '>> file 2>&1' captures both stdout and stderr in the file
+ ./gradlew assemble --full-stacktrace >> $BUILD_OUTPUT 2>&1
+}
+
+run_tests() {
+ # enable debug output here to spot the errors
+ ./gradlew test --full-stacktrace --debug --tests=tika.LegacyTikaProcessorTests 2>&1 | grep TestEventLogger >> $TEST_PROC_LOG_OUTPUT
+ ./gradlew test --full-stacktrace --debug --tests=tika.CompositeTikaProcessorTests 2>&1 | grep TestEventLogger >> $TEST_PROC_LOG_OUTPUT
+ # disable debug here, the output is too verbose
+ ./gradlew test --full-stacktrace --tests=ServiceControllerTests >> $TEST_API_LOG_OUTPUT 2>&1
+ ./gradlew test --full-stacktrace --tests=ServiceControllerDocumentMultipartFileTests >> $TEST_API_LOG_OUTPUT 2>&1
+ ./gradlew test --full-stacktrace --tests=ServiceControllerDocumentStreamTests >> $TEST_API_LOG_OUTPUT 2>&1
+}
+
+error_handler() {
+ echo "ERROR: An error was encountered with the build."
+ print_logs
+ exit 1
+}
+
+
+# The Main
+#
+
+# If an error occurs, run our error handler to output a tail of the build
+trap 'error_handler' ERR SIGPIPE
+
+# Set up a repeating loop to send some output to Travis (so that quiet builds are not killed as inactive)
+bash -c "while true; do echo \$(date) - building ...; sleep $PING_SLEEP; done" &
+PING_LOOP_PID=$!
+
+# Build Commands
+run_build
+run_tests
+
+# 'nicely' terminate the ping output loop
+kill $PING_LOOP_PID
+
+# Print the logs
+echo SUCCESS
+print_logs