diff --git a/.gitignore b/.gitignore index a1c2a23..20bf8a4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,23 +1,6 @@ -# Compiled class file -*.class +.DS_Store +.idea +.gradle -# Log file -*.log - -# BlueJ files -*.ctxt - -# Mobile Tools for Java (J2ME) -.mtj.tmp/ - -# Package Files # -*.jar -*.war -*.nar -*.ear -*.zip -*.tar.gz -*.rar - -# virtual machine crash logs, see http://www.java.com/en/download/help/error_hotspot.xml -hs_err_pid* +build +out diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..471f814 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,49 @@ +dist: xenial

+language: java

+jdk:
+  - openjdk11

+env:
+  # limit the number of processing threads used by tesseract
+  - OMP_THREAD_LIMIT=1

+addons:
+  apt:
+    sources:
+      # tesseract-ocr >= 4.0 is not available in the standard Xenial / Trusty distro
+      - sourceline: 'ppa:alex-p/tesseract-ocr'
+    packages:
+      - tesseract-ocr
+      - tesseract-ocr-osd
+      - tesseract-ocr-eng
+      - imagemagick
+      - ghostscript
+      - libtesseract-dev
+      - libmagickcore-dev
+      - libmagickwand-dev
+      - libmagic-dev
+      - apache2-utils

+before_cache:
+  - rm -f $HOME/.gradle/caches/modules-2/modules-2.lock
+  - rm -fr $HOME/.gradle/caches/*/plugin-resolution/
+  - rm -fr $HOME/.gradle/caches/*/scripts/

+cache:
+  directories:
+    - $HOME/.gradle/caches/
+    - $HOME/.gradle/wrapper/

+install:
+  - sudo cp ./extras/ImageMagick/policy.xml /etc/ImageMagick-6/policy.xml

+before_script:
+  - convert --version
+# - convert -list policy
+  - tesseract --version
+# - ./gradlew downloadDependencies > /dev/null

+script:
+  - bash travis_gradle_build.sh diff --git a/CHANGELOG.txt b/CHANGELOG.txt new file mode 100644 index 0000000..1519650 --- /dev/null +++ b/CHANGELOG.txt @@ -0,0 +1,3 @@ +Release 0.1.0 -- 15 Aug 2019 +--------------- +* Initial stable version release diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..803fd23 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,109 @@ +################################################################
+#
+# BUILD STEPS
+#

+################################
+#
+# JDK base
+#
+FROM adoptopenjdk/openjdk11:slim AS jdk-11-base

+# freeze the versions of the Tesseract+ImageMagick for reproducibility
+ENV TESSERACT_VERSION 4.00~git2288-10f4998a-2
+ENV TESSERACT_RES_VERSION 4.00~git24-0e00fe6-1.2
+ENV IMAGEMAGICK_VERSION 8:6.9.7.4+dfsg-16ubuntu6.7

+RUN apt-get update && \
+# apt-get dist-upgrade -y && \
+# apt-get install -y tesseract-ocr && \
+    apt-get update && \
+    apt-get install -y software-properties-common && \
+    apt-get install -y tesseract-ocr=$TESSERACT_VERSION tesseract-ocr-eng=$TESSERACT_RES_VERSION tesseract-ocr-osd=$TESSERACT_RES_VERSION && \
+### apt-get install -y tesseract-ocr-osd=3.04.00-1 tesseract-ocr-eng=3.04.00-1 tesseract-ocr=3.04.01-5 && \
+    apt-get install -y imagemagick=$IMAGEMAGICK_VERSION --fix-missing && \
+    apt-get install -y python3-pip && pip3 install numpy matplotlib scikit-image && \
+    apt-get clean autoclean && \
+    apt-get autoremove -y && \
+    rm -rf /var/lib/apt/lists/*


+################################
+#
+# Tika Server Builder
+#
+FROM jdk-11-base AS service-builder

+# setup the build environment
+RUN mkdir -p /devel
+WORKDIR /devel

+COPY ./gradle/wrapper /devel/gradle/wrapper
+COPY ./gradlew /devel/

+RUN ./gradlew --version

+COPY ./settings.gradle /devel/
+COPY . /devel/

+# build service
+# TIP: uncomment the two lines below to both build the service
+# and run the tests during the build
+#COPY ./extras/ImageMagick/policy.xml /etc/ImageMagick-6/policy.xml
+#RUN ./gradlew build --no-daemon

+RUN ./gradlew bootJar --no-daemon



+################################################################
+#
+# RUN STEPS
+#

+################################
+#
+# JRE base
+#
+FROM adoptopenjdk/openjdk11:jre AS jre-11-base

+# freeze the versions of the Tesseract+ImageMagick for reproducibility
+ENV TESSERACT_VERSION 4.00~git2288-10f4998a-2
+ENV TESSERACT_RES_VERSION 4.00~git24-0e00fe6-1.2
+ENV IMAGEMAGICK_VERSION 8:6.9.7.4+dfsg-16ubuntu6.7

+RUN apt-get update && \
+# apt-get dist-upgrade -y && \
+# apt-get install -y tesseract-ocr && \
+    apt-get update && \
+    apt-get install -y software-properties-common && \
+    apt-get install -y tesseract-ocr=$TESSERACT_VERSION tesseract-ocr-eng=$TESSERACT_RES_VERSION tesseract-ocr-osd=$TESSERACT_RES_VERSION && \
+### apt-get install -y tesseract-ocr-osd=3.04.00-1 tesseract-ocr-eng=3.04.00-1 tesseract-ocr=3.04.01-5 && \
+    apt-get install -y imagemagick=$IMAGEMAGICK_VERSION --fix-missing && \
+    apt-get install -y python3-pip && pip3 install numpy matplotlib scikit-image && \
+    apt-get clean autoclean && \
+    apt-get autoremove -y && \
+    rm -rf /var/lib/apt/lists/*


+################################
+#
+# Tika Service
+#
+FROM jre-11-base AS service-runner

+# setup env
+RUN mkdir -p /app/config
+WORKDIR /app

+# copy tika-server artifacts
+COPY --from=service-builder /devel/build/libs/service-*.jar ./
+COPY --from=service-builder /devel/src/main/resources/application.yaml ./config/

+COPY --from=service-builder /devel/scripts/run.sh ./

+# copy external tools configuration files
+COPY ./extras/ImageMagick/policy.xml /etc/ImageMagick-6/policy.xml

+# entry point
+CMD ["/bin/bash", "/app/run.sh"] diff --git a/README.md b/README.md index 597b867..74ec506 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,162 @@ # Introduction -Apache Tika running as a web service +This project implements Apache Tika running as a web service using Spring Boot. It exposes a REST API so that a client can send a document in binary format and receive back the extracted text. The supported document formats are those supported by Tika.

-# Status
-Work-in-progress ...
+The key motivation behind developing our own wrapper over Tika, instead of using the already available [Tika server](https://cwiki.apache.org/confluence/display/tika/TikaJAXRS), is better control over the document parsers used (such as PDFParser, Tesseract OCR and the legacy parser taken from [CogStack-Pipeline](https://github.com/CogStack/CogStack-Pipeline)) and over the results returned with HTTP status codes.
+
+
+# Building
+To build the application, run in the main directory:
+
+`./gradlew build`
+
+The build artifacts will be placed in the `./build` directory.
+
+
+During the build the tests are run; failing tests can also signify missing third-party dependencies (see below). However, to skip running the tests and just build the application, one can run:
+
+`./gradlew bootJar`
+
+
+## Tests
+To run the available tests, run:
+
+`./gradlew test`
+
+Please note that failed tests may signify missing third-party dependencies.
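+
+Before running the tests, it may help to verify that the required third-party tools (described in the next section) are visible on the system `PATH`. A minimal sanity check, mirroring the checks run in the CI configuration (`gs` is assumed here to be the Ghostscript executable name):
+
+```
+tesseract --version
+convert --version
+gs --version
+```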
+
+
+## Third-party dependencies
+In the minimal setup, for proper text extraction Apache Tika requires the following applications to be present on the system:
+- [Tesseract OCR](https://github.com/tesseract-ocr/tesseract),
+- [ImageMagick](https://imagemagick.org),
+- [Ghostscript](https://www.ghostscript.com/) (required by ImageMagick for document conversion).
+
+ImageMagick also requires its configuration file `policy.xml` to be overridden by the provided `extras/ImageMagick/policy.xml` (in order to increase the available resources for file processing and to override the [security policy](https://stackoverflow.com/questions/52703123/override-default-imagemagick-policy-xml) related to Ghostscript).
+
+Moreover, in order to enable the additional image processing capabilities of Tesseract OCR, a few other dependencies need to be present on the system, such as a Python environment. Please see the provided `Dockerfile` for the full list.
+
+
+# Running the application
+The application can either be run as a standalone Java application or inside a Docker container. The application configuration can be changed in the `application.yaml` file. The default configuration file is embedded in the jar file, but a custom one can be specified manually (see below).
+
+Please note that the recommended way is to use the provided Docker image, since a number of dependencies need to be satisfied on a local machine.
+
+
+## Running as a standalone Java application
+Assuming that the build completed successfully, to run the Tika service on a local machine:
+
+`java -jar build/libs/service-*.jar`
+
+The running service will be listening on port `8090` (by default) on the host machine.
+
+
+## Using the Docker image
+The latest stable Docker image is available on Docker Hub under the `cogstacksystems/tika-service:latest` tag. Alternatively, the latest development version is available under the `cogstacksystems/tika-service:dev-latest` tag. The image can also be built locally using the provided `Dockerfile`.
+
+
+To run the Tika service container:
+
+`docker run -p 8090:8090 cogstacksystems/tika-service:latest`
+
+The service will be listening on port `8090` on the host machine.
+
+
+# API
+
+## API specification
+By default, the Tika service listens on port `8090` and returns the content extraction result in JSON format.
+
+The service exposes the following endpoints:
+- *GET* `/api/info` - returns information about the service with its configuration,
+- *POST* `/api/process` - processes a binary data stream containing the document content,
+- *POST* `/api/process_file` - processes a document file (multi-part request).
+
+## Document extraction result
+The extraction results are represented in JSON format, where the main available fields are:
+- `result` - the content extraction result with metadata,
+- `timestamp` - the content processing timestamp,
+- `success` - specifies whether the extraction completed successfully,
+- `error` - the error message in case of a processing error (implies `success : false`).
+
+The content extraction result can contain the following fields:
+- `text` - the extracted text,
+- `metadata` - the metadata associated with the document and the used parsers.
+
+The provided metadata associated with the document and the used parsers can include the following fields:
+- `X-Parsed-By` - an array of names of the parsers used during the content extraction,
+- `X-OCR-Applied` - a flag specifying whether OCR was applied,
+- `Content-Type` - the content type of the document, as identified by Tika,
+- `Page-Count` - the document page count (extracted from the document metadata by Tika),
+- `Creation-Date` - the document creation date (extracted from the document metadata by Tika).
+
+
+# Example use
+Using `curl` to send a document to a Tika service instance running on localhost on port `8090`:
+
+`curl -F file=@test.pdf http://localhost:8090/api/process_file | jq`
+
+Returned result:
+```
+{
+  "result": {
+    "text": "Sample Type / Medical Specialty: Lab Medicine - Pathology",
+    "metadata": {
+      "X-Parsed-By": [
+        "org.apache.tika.parser.CompositeParser",
+        "org.apache.tika.parser.DefaultParser",
+        "org.apache.tika.parser.microsoft.ooxml.OOXMLParser"
+      ],
+      "X-OCR-Applied": "false",
+      "Content-Type": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+    },
+    "success": true,
+    "timestamp": "2019-08-13T15:14:58.022+01:00"
+  }
+}
+```
+
+# Configuration
+
+## Configuration file
+All the available service and document processor parameters are stored in a single `src/main/resources/application.yaml` file.
+
+Although the initial configuration file is bundled with the application jar file, a modified one can be provided as a parameter when running the Java application. For example, when running the Tika service in the Docker container, the script `scripts/run.sh` starts the Tika service with the custom configuration file `application.yaml` located in the `/app/config/` directory:
+`java -Dspring.config.location=/app/config/ -jar /app/service-*.jar`
+
+
+## Available properties
+The configuration file is stored in YAML format with the following available properties.
+
+### General application properties
+- `application.version` - specifies the application version,
+- `server.port` - the port number on which the service will be run (default: `8090`),
+- `spring.servlet.multipart.max-file-size` and `spring.servlet.multipart.max-request-size` - specify the max file size when processing file requests (default: `100MB`).
+
+
+### Tika service configuration
+The following keys reside under the `tika.processing` node:
+- `use-legacy-tika-processor-as-default` - whether to use the legacy Tika PDF parser (as used in CogStack Pipeline) for backward compatibility (default: `true`),
+- `fail-on-empty-files` - whether to fail the request and report an error when the client provides an empty document (default: `false`),
+- `fail-on-non-document-types` - whether to fail the request and report an error when the client provides unsupported and/or non-document content (default: `true`).
+
+
+### Tika parsers configuration
+The following keys reside under the `tika.parsers` node.
+
+The keys under `tesseract-ocr` define the default behavior of the Tika Tesseract OCR parser:
+- `language` - the language dictionary used by Tesseract (default: `eng`),
+- `timeout` - the max time (in s) to process documents before reporting an error (default: `300`),
+- `enable-image-processing` - whether to use additional pre-processing of the images using ImageMagick (default: `false`),
+- `apply-rotation` - whether to apply de-rotation of the images (default: `false`).
+
+Please note that although enabling `enable-image-processing` and/or `apply-rotation` might improve the quality of the extracted text, it can significantly slow down the extraction process.
+
+The keys under `pdf-ocr-parser` define the default behavior of the PDF parser that uses Tesseract OCR to extract the text:
+- `ocr-only-strategy` - whether to use only OCR or to apply additional text extraction from the content (default: `true`),
+- `min-doc-text-length` - if the amount of text already available in the document (before applying OCR) exceeds this value, OCR is skipped (default: `100`),
+- `min-doc-byte-size` - the minimum size (in bytes) of the image data required for its content to be extracted; smaller content is skipped (default: `10000`),
+- `use-legacy-ocr-parser-for-single-page-doc` - in case of single-page PDF documents, whether to use the legacy parser (default: `false`).
+
+The keys under `legacy-pdf-parser` define the behavior of the Tika PDF parser used in CogStack Pipeline (the 'legacy' parser), which is kept for backward compatibility:
+- `image-magick.timeout` - the max timeout value (in s) when performing document conversion using ImageMagick (default: `300`),
+- `tesseract-ocr.timeout` - the max timeout value (in s) when performing text extraction using Tesseract OCR (default: `300`),
+- `min-doc-text-length` - if the amount of text already available in the document (before applying OCR) exceeds this value, OCR is skipped (default: `100`).
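+
+For reference, below is a minimal `application.yaml` sketch combining the keys described in this section. The nesting is inferred from the key paths listed above and the values shown are the documented defaults; treat it as an illustration and consult the configuration file bundled in `src/main/resources` for the authoritative layout:
+
+```yaml
+tika:
+  processing:
+    use-legacy-tika-processor-as-default: true
+    fail-on-empty-files: false
+    fail-on-non-document-types: true
+  parsers:
+    tesseract-ocr:
+      language: 'eng'
+      timeout: 300
+      enable-image-processing: false
+      apply-rotation: false
+    pdf-ocr-parser:
+      ocr-only-strategy: true
+      min-doc-text-length: 100
+      min-doc-byte-size: 10000
+      use-legacy-ocr-parser-for-single-page-doc: false
+    legacy-pdf-parser:
+      image-magick:
+        timeout: 300
+      tesseract-ocr:
+        timeout: 300
+      min-doc-text-length: 100
+```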
diff --git a/build.gradle b/build.gradle new file mode 100644 index 0000000..2c649b2 --- /dev/null +++ b/build.gradle @@ -0,0 +1,43 @@ +plugins { + id 'org.springframework.boot' version '2.1.6.RELEASE' + id 'java' +} + +apply plugin: 'io.spring.dependency-management' + +group = 'service' +version = '0.1.0-SNAPSHOT' +sourceCompatibility = '11' + +configurations { + compileOnly { + extendsFrom annotationProcessor + } +} + +repositories { + mavenCentral() +} + +dependencies { + implementation 'org.springframework.boot:spring-boot-starter-web' + testImplementation 'org.springframework.boot:spring-boot-starter-test' + + compileOnly 'org.projectlombok:lombok' + annotationProcessor 'org.projectlombok:lombok' + + // json serialization + compile 'com.fasterxml.jackson.module:jackson-module-parameter-names' + compile 'com.fasterxml.jackson.datatype:jackson-datatype-jsr310' + compile 'com.fasterxml.jackson.datatype:jackson-datatype-jdk8' + + // tika + compile group: 'org.apache.tika', name: 'tika', version: '1.21' + compile group: 'org.apache.tika', name: 'tika-core', version: '1.21' + compile group: 'org.apache.tika', name: 'tika-parsers', version: '1.21' + + // tika-dependencies + compile group: 'org.apache.pdfbox', name: 'jbig2-imageio', version: '3.0.2' + compile group: 'com.github.jai-imageio', name: 'jai-imageio-jpeg2000', version: '1.3.0' + compile group: 'org.xerial', name: 'sqlite-jdbc', version: '3.27.2.1' +} diff --git a/extras/ImageMagick/policy.xml b/extras/ImageMagick/policy.xml new file mode 100644 index 0000000..c61e183 --- /dev/null +++ b/extras/ImageMagick/policy.xml @@ -0,0 +1,77 @@ + + + + + + + + +]> + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/gradle/wrapper/gradle-wrapper.jar b/gradle/wrapper/gradle-wrapper.jar new file mode 100644 index 0000000..87b738c Binary files /dev/null and b/gradle/wrapper/gradle-wrapper.jar differ diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 0000000..2a54fa4 --- /dev/null +++ b/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,6 @@ +#Wed Jul 17 08:52:30 BST 2019 +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-5.4.1-all.zip diff --git a/gradlew b/gradlew new file mode 100755 index 0000000..af6708f --- /dev/null +++ b/gradlew @@ -0,0 +1,172 @@ +#!/usr/bin/env sh + +############################################################################## +## +## Gradle start up script for UN*X +## +############################################################################## + +# Attempt to set APP_HOME +# Resolve links: $0 may be a link +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi +done +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null + +APP_NAME="Gradle" +APP_BASE_NAME=`basename "$0"` + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +DEFAULT_JVM_OPTS='"-Xmx64m"' + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD="maximum" + +warn () { + echo "$*" +} + +die () { + echo + echo "$*" + echo + exit 1 +} + +# OS specific support (must be 'true' or 'false'). 
+cygwin=false +msys=false +darwin=false +nonstop=false +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD="java" + which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." +fi + +# Increase the maximum file descriptors if we can. +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? -ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi +fi + +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi + +# For Cygwin, switch paths to Windows format before running java +if $cygwin ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi + # Now convert the arguments - kludge to limit ourselves to /bin/sh + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" + fi + i=$((i+1)) + done + case $i in + (0) set -- ;; + (1) set -- "$args0" ;; + (2) set -- "$args0" "$args1" ;; + (3) set -- "$args0" "$args1" "$args2" ;; + (4) set -- "$args0" "$args1" "$args2" "$args3" ;; + (5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + (6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + (7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + (8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + (9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; + esac +fi + +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=$(save "$@") + +# Collect all arguments for the java command, following the shell quoting 
and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" + +# by default we should be in the correct project dir, but when run from Finder on Mac, the cwd is wrong +if [ "$(uname)" = "Darwin" ] && [ "$HOME" = "$PWD" ]; then + cd "$(dirname "$0")" +fi + +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat new file mode 100644 index 0000000..0f8d593 --- /dev/null +++ b/gradlew.bat @@ -0,0 +1,84 @@ +@if "%DEBUG%" == "" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%" == "" set DIRNAME=. +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS="-Xmx64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if "%ERRORLEVEL%" == "0" goto init + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto init + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:init +@rem Get command-line arguments, handling Windows variants + +if not "%OS%" == "Windows_NT" goto win9xME_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% + +:end +@rem End local scope for the variables with windows NT shell +if "%ERRORLEVEL%"=="0" goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! 
+if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/lombok.config b/lombok.config new file mode 100644 index 0000000..6fe22f9 --- /dev/null +++ b/lombok.config @@ -0,0 +1,2 @@ +lombok.anyConstructor.addConstructorProperties=true +config.stopBubbling = true \ No newline at end of file diff --git a/scripts/run.sh b/scripts/run.sh new file mode 100644 index 0000000..819d7a9 --- /dev/null +++ b/scripts/run.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +java -Dspring.config.location=/app/config/ -jar /app/service-*.jar diff --git a/settings.gradle b/settings.gradle new file mode 100644 index 0000000..ddc8b78 --- /dev/null +++ b/settings.gradle @@ -0,0 +1,6 @@ +pluginManagement { + repositories { + gradlePluginPortal() + } +} +rootProject.name = 'service' diff --git a/src/main/java/common/JsonPropertyAccessView.java b/src/main/java/common/JsonPropertyAccessView.java new file mode 100644 index 0000000..ac8745e --- /dev/null +++ b/src/main/java/common/JsonPropertyAccessView.java @@ -0,0 +1,10 @@ +package common; + +/** + * Implements mechanisms to access only selected members of the class + * during JSON serialization/deserialization + */ +public class JsonPropertyAccessView { + public static class Public {} + public static class Private {} +} diff --git a/src/main/java/service/TikaServiceApplication.java b/src/main/java/service/TikaServiceApplication.java new file mode 100644 index 0000000..8cee59e --- /dev/null +++ b/src/main/java/service/TikaServiceApplication.java @@ -0,0 +1,17 @@ +package service; + +import org.springframework.boot.SpringApplication; +import org.springframework.boot.autoconfigure.SpringBootApplication; + + +/** + * The main application + */ +@SpringBootApplication +public class TikaServiceApplication { + + public static void main(String[] args) { + SpringApplication.run(TikaServiceApplication.class, args); + } + +} diff --git a/src/main/java/service/controller/TikaServiceConfig.java b/src/main/java/service/controller/TikaServiceConfig.java new file mode 100644 index 0000000..98d3740 --- /dev/null +++ b/src/main/java/service/controller/TikaServiceConfig.java @@ -0,0 +1,38 @@ +package service.controller; + +import com.fasterxml.jackson.annotation.JsonView; +import common.JsonPropertyAccessView; +import lombok.Data; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Configuration; + + +/** + * A general Tika Service processing configuration + */ +@Data +@Configuration +public class TikaServiceConfig { + + @JsonView(JsonPropertyAccessView.Public.class) + @Value("${application.version}") + String appVersion; + + // specifies whether to use the legacy Tika processor (as in CogStack-Pipeline) + // as the default documents processor + @JsonView(JsonPropertyAccessView.Public.class) + @Value("${processing.use-legacy-tika-processor-as-default:true}") + boolean useLegacyTikaProcessor; + + // specifies whether providing an empty file shall result in reporting failure + // due to invalid input provided by the client + @JsonView(JsonPropertyAccessView.Public.class) + @Value("${processing.fail-on-empty-files:true}") + boolean failOnEmptyFiles; + + // specifies whether providing a non-document type of data (e.g. 
executable) should fail
+    // due to invalid input provided by the client
+    @JsonView(JsonPropertyAccessView.Public.class)
+    @Value("${processing.fail-on-non-document-types:true}")
+    boolean failOnNonDocumentTypes;
+} diff --git a/src/main/java/service/controller/TikaServiceController.java b/src/main/java/service/controller/TikaServiceController.java new file mode 100644 index 0000000..e48c58f --- /dev/null +++ b/src/main/java/service/controller/TikaServiceController.java @@ -0,0 +1,192 @@ +package service.controller;

+import com.fasterxml.jackson.annotation.JsonView;
+import common.JsonPropertyAccessView;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.beans.factory.annotation.Qualifier;
+import org.springframework.context.annotation.ComponentScan;
+import org.springframework.http.HttpStatus;
+import org.springframework.http.ResponseEntity;
+import org.springframework.web.bind.annotation.*;
+import org.springframework.web.multipart.MultipartFile;
+import service.model.ServiceInformation;
+import service.model.ServiceResponseContent;
+import tika.legacy.LegacyTikaProcessor;
+import tika.model.TikaProcessingResult;
+import tika.processor.AbstractTikaProcessor;
+import tika.processor.CompositeTikaProcessor;
+import javax.annotation.PostConstruct;
+import javax.servlet.http.HttpServletRequest;
+import java.io.ByteArrayInputStream;


+/**
+ * Main Tika Service REST controller
+ */
+@RestController
+@ComponentScan({"tika.legacy", "tika.processor"})
+public class TikaServiceController {

+    private final String apiPathPrefix = "/**/api";
+    //private final String apiVersion = "v1";
+    private final String apiFullPath = apiPathPrefix;

+    private Logger log = LoggerFactory.getLogger(TikaServiceController.class);

+    /**
+     * Tika document processors
+     */
+    @Autowired
+    @Qualifier("legacyTikaProcessor")
+    private LegacyTikaProcessor legacyTikaProcessor;

+    @Autowired
+    @Qualifier("compositeTikaProcessor")
+    private CompositeTikaProcessor compositeTikaProcessor;

+    /**
+     * All the necessary information about the service, incl. config
+     */
+    @Autowired
+    ServiceInformation serviceInfo;


+    private AbstractTikaProcessor tikaProcessor;

+    @PostConstruct
+    void init() {
+        // select the appropriate document processor depending on the configuration
+        if (serviceInfo.getServiceConfig().isUseLegacyTikaProcessor()) {
+            tikaProcessor = legacyTikaProcessor;
+        }
+        else {
+            tikaProcessor = compositeTikaProcessor;
+        }
+    }


+    /**
+     * The endpoint returning service information with configuration
+     */
+    @GetMapping(value = apiFullPath + "/info", produces = "application/json")
+    @JsonView(JsonPropertyAccessView.Public.class)
+    public @ResponseBody
+    ServiceInformation info() {
+        return serviceInfo;
+    }


+    /**
+     * The endpoint used for processing documents (e.g. sent as an octet stream)
+     */
+    @PostMapping(value = apiFullPath + "/process", produces = "application/json")
+    public ResponseEntity<ServiceResponseContent> process(HttpServletRequest request) {
+        try {
+            byte[] streamContent = request.getInputStream().readAllBytes();
+            if (streamContent.length == 0) {
+                final String message = "Empty content";
+                log.info(message);

+                return createEmptyDocumentResponseEntity(message);
+            }

+            // we are buffering the stream using ByteArrayInputStream in order to enable
+            // re-reading the binary document content
+            ByteArrayInputStream bufs = new ByteArrayInputStream(streamContent);
+            TikaProcessingResult result = processStream(bufs);

+            return createProcessedDocumentResponseEntity(result);
+        }
+        catch (Exception e) {
+            final String message = "Error processing the query: " + e.getMessage();
+            log.error(message);

+            return new ResponseEntity<>(createErrorResponse(message), HttpStatus.INTERNAL_SERVER_ERROR);
+        }
+    }


+    /**
+     * The endpoint used for processing documents sent as multi-part files
+     */
+    @PostMapping(value = apiFullPath + "/process_file", consumes = { "multipart/form-data" }, produces = "application/json")
+    public ResponseEntity<ServiceResponseContent> process(@RequestParam("file") MultipartFile file) {
+        // check whether we need to perform any processing
+        if (file.isEmpty()) {
+            final String message = "Empty content";
+            log.info(message);

+            return createEmptyDocumentResponseEntity(message);
+        }

+        // process the content
+        try {
+            // we are buffering the stream using ByteArrayInputStream in order to enable
+            // re-reading the binary document content
+            ByteArrayInputStream bufs = new ByteArrayInputStream(file.getBytes());

+            TikaProcessingResult result = processStream(bufs);
+            return createProcessedDocumentResponseEntity(result);
+        }
+        catch (Exception e) {
+            final String message = "Error processing the query: " + e.getMessage();
+            log.error(message);

+            return new ResponseEntity<>(createErrorResponse(message), HttpStatus.INTERNAL_SERVER_ERROR);
+        }
+    }


+    private ServiceResponseContent createErrorResponse(String message) {
+        ServiceResponseContent response = new ServiceResponseContent();
+        TikaProcessingResult result = TikaProcessingResult.builder()
+                .success(false)
+                .error(message).build();
+        response.setResult(result);
+        return response;
+    }


+    private TikaProcessingResult processStream(ByteArrayInputStream stream) {
+        log.info("Running processor: " + tikaProcessor.getClass().toString());
+        return tikaProcessor.process(stream);
+    }


+    private ResponseEntity<ServiceResponseContent> createEmptyDocumentResponseEntity(String errorMessage) {
+        HttpStatus status;
+        if (serviceInfo.getServiceConfig().isFailOnEmptyFiles()) {
+            status = HttpStatus.BAD_REQUEST;
+        }
+        else {
+            status = HttpStatus.OK;
+        }

+        return new ResponseEntity<>(createErrorResponse(errorMessage), status);
+    }

+    private ResponseEntity<ServiceResponseContent> createProcessedDocumentResponseEntity(TikaProcessingResult result) {
+        // remember to actually check the processing status
+        HttpStatus status;
+        if (result.getSuccess()) {
+            if (serviceInfo.getServiceConfig().isFailOnNonDocumentTypes()
+                    && !AbstractTikaProcessor.isValidDocumentType(result.getMetadata())) {
+                // assume fail on non-document types
+                status = HttpStatus.BAD_REQUEST;
+            }
+            else {
+                status = HttpStatus.OK;
+            }
+        }
+        else {
+            // an error occurred during processing -- assume it's actually a faulty document
+            status = HttpStatus.BAD_REQUEST;
+        }

+        ServiceResponseContent response = new ServiceResponseContent();
+        response.setResult(result);
+        return new ResponseEntity<>(response, status);
+    }
+}
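As a quick illustration of the controller above, both processing endpoints can be exercised with `curl` (a sketch assuming a locally running instance on the default port `8090`; `test.pdf` is a placeholder file name):

```
# raw binary stream endpoint
curl -X POST -H 'Content-Type: application/octet-stream' --data-binary @test.pdf http://localhost:8090/api/process

# multi-part file endpoint (as in the README example)
curl -F file=@test.pdf http://localhost:8090/api/process_file
```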
diff --git a/src/main/java/service/model/ServiceInformation.java b/src/main/java/service/model/ServiceInformation.java new file mode 100644 index 0000000..ae8cfcc --- /dev/null +++ b/src/main/java/service/model/ServiceInformation.java @@ -0,0 +1,42 @@ +package service.model;

+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonView;
+import common.JsonPropertyAccessView;
+import lombok.Data;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.context.annotation.ComponentScan;
+import org.springframework.context.annotation.Configuration;
+import service.controller.TikaServiceConfig;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.model.TikaPackageInformation;
+import tika.processor.CompositeTikaProcessorConfig;


+/**
+ * All the information about Tika Service configuration
+ */
+@Data
+@Configuration
+@ComponentScan({"tika.legacy", "tika.processor"})
+public class ServiceInformation {

+    @Autowired
+    @JsonProperty("legacy_processor_config")
+    @JsonView(JsonPropertyAccessView.Public.class)
+    LegacyPdfProcessorConfig legacyProcessorConfig;

+    @Autowired
+    @JsonProperty("composite_processor_config")
+    @JsonView(JsonPropertyAccessView.Public.class)
+    CompositeTikaProcessorConfig compositeProcesorConfig;

+    @Autowired
+    @JsonProperty("service_config")
+    @JsonView(JsonPropertyAccessView.Public.class)
+    TikaServiceConfig serviceConfig;

+    @JsonProperty("tika_info")
+    @JsonView(JsonPropertyAccessView.Public.class)
+    TikaPackageInformation tikaInfo = new TikaPackageInformation();
+} diff --git a/src/main/java/service/model/ServiceRequestContent.java b/src/main/java/service/model/ServiceRequestContent.java new file mode 100644 index 0000000..e37cc34 --- /dev/null +++ b/src/main/java/service/model/ServiceRequestContent.java @@ -0,0 +1,24 @@ +package service.model;

+import lombok.Data;
+import tika.model.TikaBinaryDocument;

+/**
+ * Service request content when used with JSON-accepting endpoints
+ *
+ * Current status: NOT USED
+ *
+ * NB: for the moment, documents are sent either as:
+ * - octet stream
+ * - multi-part files
+ * as encoding binary document content into JSON may be overkill,
+ * but may be revisited when going forward with gRPC
+ *
+ * [keeping for now as a placeholder]
+ */
+@Data
+public class ServiceRequestContent {
+    TikaBinaryDocument document;

+    // TODO: footer as in NLP
+} diff --git a/src/main/java/service/model/ServiceResponseContent.java b/src/main/java/service/model/ServiceResponseContent.java new file mode 100644 index 0000000..77c448d --- /dev/null +++ b/src/main/java/service/model/ServiceResponseContent.java @@ -0,0 +1,18 @@ +package service.model;

+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.Data;
+import tika.model.TikaProcessingResult;


+/**
+ * The response from the service containing the document processing results
+ */
+@Data
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class ServiceResponseContent {

+    TikaProcessingResult result;

+    // TODO: footer as in NLP
+} diff --git a/src/main/java/tika/legacy/ImageMagickConfig.java b/src/main/java/tika/legacy/ImageMagickConfig.java new file mode 100644 index 0000000..1740a47 --- /dev/null +++ b/src/main/java/tika/legacy/ImageMagickConfig.java @@ -0,0 +1,310 @@ +/*
+ * Copyright 2016 King's College London, Richard Jackson.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package tika.legacy; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.Serializable; +import java.util.Locale; +import java.util.Properties; + +/** + * Configuration for TesseractOCRParser. + * + * This allows to enable TesseractOCRParser and set its parameters: + *

+ * <pre>
+ * TesseractOCRConfig config = new TesseractOCRConfig();
+ * config.setTesseractPath(tesseractFolder);
+ * parseContext.set(TesseractOCRConfig.class, config);
+ * </pre>

+ * + * Parameters can also be set by either editing the existing TesseractOCRConfig.properties file in, + * tika-parser/src/main/resources/org/apache/tika/parser/ocr, or overriding it by creating your own + * and placing it in the package org/apache/tika/parser/ocr on the classpath. + * + */ +public class ImageMagickConfig implements Serializable{ + + private static final long serialVersionUID = -4861942486845757891L; + + // Path to tesseract installation folder, if not on system path. + private String tesseractPath = ""; + + // Language dictionary to be used. + private String language = "eng"; + + // Tesseract page segmentation mode. + private String pageSegMode = "1"; + + // Minimum file size to submit file to ocr. + private int minFileSizeToOcr = 0; + + // Maximum file size to submit file to ocr. + private int maxFileSizeToOcr = Integer.MAX_VALUE; + + // Maximum time (seconds) to wait for the ocring process termination + private int timeout = 120; + private String imageMagickPath = ""; + private String density = "300"; + private String depth = "8"; + private String quality = "1"; + private int maxTiffSize = Integer.MAX_VALUE; + private int minTiffSize = 0; + + + /** + * Default contructor. + */ + public ImageMagickConfig() { + init(this.getClass().getResourceAsStream("ImageMagickConfig.properties")); + } + + /** + * Loads properties from InputStream and then tries to close InputStream. + * If there is an IOException, this silently swallows the exception + * and goes back to the default. + * + * @param is + */ + public ImageMagickConfig(InputStream is) { + init(is); + } + + private void init(InputStream is) { + if (is == null) { + return; + } + Properties props = new Properties(); + try { + props.load(is); + } catch (IOException ignored) { + } finally { + if (is != null) { + try { + is.close(); + } catch (IOException e) { + //swallow + } + } + } + setTesseractPath( + getProp(props, "tesseractPath", getTesseractPath())); + setLanguage( + getProp(props, "language", getLanguage())); + setPageSegMode( + getProp(props, "pageSegMode", getPageSegMode())); + setMinFileSizeToOcr( + getProp(props, "minFileSizeToOcr", getMinFileSizeToOcr())); + setMaxFileSizeToOcr( + getProp(props, "maxFileSizeToOcr", getMaxFileSizeToOcr())); + setImageMagickPath( + getProp(props, "imageMagickPath", getImageMagickPath())); + + setTimeout( + getProp(props, "timeout", getTimeout())); + setDensity( + getProp(props, "density", getDensity())); + setQuality( + getProp(props, "quality", getQuality())); + setDepth( + getProp(props, "depth", getDepth())); + setMinTiffSize( + getProp(props, "minTiffSize", getMinTiffSize())); + setMaxTiffSize( + getProp(props, "maxTiffSize", getMaxTiffSize())); + } + + /** @see #setTesseractPath(String tesseractPath)*/ + public String getTesseractPath() { + return tesseractPath; + } + + /** + * Set tesseract installation folder, needed if it is not on system path. + */ + public void setTesseractPath(String tesseractPath) { + if(!tesseractPath.isEmpty() && !tesseractPath.endsWith(File.separator)) + tesseractPath += File.separator; + + this.tesseractPath = tesseractPath; + } + + /** @see #setLanguage(String language)*/ + public String getLanguage() { + return language; + } + + /** + * Set tesseract language dictionary to be used. Default is "eng". + * Multiple languages may be specified, separated by plus characters. 
+     */
+    public void setLanguage(String language) {
+        if (!language.matches("([A-Za-z](\\+?))*")) {
+            throw new IllegalArgumentException("Invalid language code");
+        }
+        this.language = language;
+    }

+    /** @see #setPageSegMode(String pageSegMode)*/
+    public String getPageSegMode() {
+        return pageSegMode;
+    }

+    /**
+     * Set tesseract page segmentation mode.
+     * Default is 1 = Automatic page segmentation with OSD (Orientation and Script Detection)
+     */
+    public void setPageSegMode(String pageSegMode) {
+        if (!pageSegMode.matches("[1-9]|10")) {
+            throw new IllegalArgumentException("Invalid page segmentation mode");
+        }
+        this.pageSegMode = pageSegMode;
+    }

+    /** @see #setMinFileSizeToOcr(int minFileSizeToOcr)*/
+    public int getMinFileSizeToOcr() {
+        return minFileSizeToOcr;
+    }

+    /**
+     * Set minimum file size to submit file to ocr.
+     * Default is 0.
+     */
+    public void setMinFileSizeToOcr(int minFileSizeToOcr) {
+        this.minFileSizeToOcr = minFileSizeToOcr;
+    }

+    /** @see #setMaxFileSizeToOcr(int maxFileSizeToOcr)*/
+    public int getMaxFileSizeToOcr() {
+        return maxFileSizeToOcr;
+    }

+    /**
+     * Set maximum file size to submit file to ocr.
+     * Default is Integer.MAX_VALUE.
+     */
+    public void setMaxFileSizeToOcr(int maxFileSizeToOcr) {
+        this.maxFileSizeToOcr = maxFileSizeToOcr;
+    }

+    /**
+     * Set maximum time (seconds) to wait for the ocring process to terminate.
+     * Default value is 120s.
+     */
+    public void setTimeout(int timeout) {
+        this.timeout = timeout;
+    }

+    /** @see #setTimeout(int timeout)*/
+    public int getTimeout() {
+        return timeout;
+    }

+    /**
+     * Get property from the properties file passed in.
+     * @param properties properties file to read from.
+     * @param property the property to fetch.
+     * @param defaultMissing default parameter to use.
+     * @return the value.
+     */
+    private int getProp(Properties properties, String property, int defaultMissing) {
+        String p = properties.getProperty(property);
+        if (p == null || p.isEmpty()){
+            return defaultMissing;
+        }
+        try {
+            return Integer.parseInt(p);
+        } catch (Throwable ex) {
+            throw new RuntimeException(String.format(Locale.ROOT, "Cannot parse ImageMagickConfig variable %s, invalid integer value",
+                    property), ex);
+        }
+    }

+    /**
+     * Get property from the properties file passed in.
+     * @param properties properties file to read from.
+     * @param property the property to fetch.
+     * @param defaultMissing default parameter to use.
+     * @return the value.
+ */ + private String getProp(Properties properties, String property, String defaultMissing) { + return properties.getProperty(property, defaultMissing); + } + + public String getImageMagickPath() { + return imageMagickPath; + } + + public String getDensity() { + return density; + } + + public String getDepth() { + return depth; + } + + public String getQuality() { + return quality; + } + + public int getMaxTiffSize() { + return maxTiffSize; + } + + public void setMaxTiffSize(int maxTiffSize) { + this.maxTiffSize = maxTiffSize; + } + + public void setImageMagickPath(String imageMagickPath) { + this.imageMagickPath = imageMagickPath; + } + + public void setDensity(String density) { + this.density = density; + } + + public void setDepth(String depth) { + this.depth = depth; + } + + public void setQuality(String quality) { + this.quality = quality; + } + + public int getMinTiffSize() { + return minTiffSize; + } + + public void setMinTiffSize(int minTiffSize) { + this.minTiffSize = minTiffSize; + } +} \ No newline at end of file diff --git a/src/main/java/tika/legacy/LegacyPdfProcessorConfig.java b/src/main/java/tika/legacy/LegacyPdfProcessorConfig.java new file mode 100644 index 0000000..6c316f6 --- /dev/null +++ b/src/main/java/tika/legacy/LegacyPdfProcessorConfig.java @@ -0,0 +1,51 @@ +package tika.legacy; + +import com.fasterxml.jackson.annotation.JsonIgnore; +import com.fasterxml.jackson.annotation.JsonView; +import lombok.Data; +import org.apache.tika.config.TikaConfig; +import org.apache.tika.exception.TikaException; +import org.springframework.beans.factory.annotation.Value; +import org.springframework.context.annotation.Configuration; +import org.xml.sax.SAXException; +import common.JsonPropertyAccessView; +import javax.annotation.PostConstruct; +import java.io.IOException; + + +/** + * The legacy PDF processor configuration, as used in CogStack-Pipeline + * with some minor additions + */ +@Data +@Configuration +public class LegacyPdfProcessorConfig { + + @JsonIgnore + private TikaConfig tikaConfig; + + // the timeout value (s) when performing PDF->TIFF conversion of the documents + // the default value in Tika is 120s, but this may be too short for multi-page documents + @JsonView(JsonPropertyAccessView.Public.class) + @Value("${tika.parsers.legacy-pdf-parser.image-magick.timeout:120}") + private int conversionTimeout; + + // the timeout value (s) when performing OCR over the documents + // the default value in Tika is 120s, but this may be too short for multi-page documents + @JsonView(JsonPropertyAccessView.Public.class) + @Value("${tika.parsers.legacy-pdf-parser.tesseract-ocr.timeout:120}") + private int ocrTimeout; + + // apply OCR only when trying to extract text from previously parsed document (w/o OCR) + // that extracted characters were less than N + @JsonView(JsonPropertyAccessView.Public.class) + @Value("${tika.parsers.legacy-pdf-parser.min-doc-text-length:100}") + private int pdfMinDocTextLength; + + + @PostConstruct + public void init() throws IOException, SAXException, TikaException { + tikaConfig = new TikaConfig(this.getClass().getClassLoader() + .getResourceAsStream("tika-config/legacy-parser-config.xml")); + } +} diff --git a/src/main/java/tika/legacy/LegacyPdfProcessorParser.java b/src/main/java/tika/legacy/LegacyPdfProcessorParser.java new file mode 100644 index 0000000..58ed0a2 --- /dev/null +++ b/src/main/java/tika/legacy/LegacyPdfProcessorParser.java @@ -0,0 +1,256 @@ +/* + * Copyright 2016 King's College London, Richard Jackson . 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package tika.legacy; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.logging.LogFactory; +import org.apache.tika.exception.TikaException; +import org.apache.tika.io.IOUtils; +import org.apache.tika.metadata.Metadata; +import org.apache.tika.mime.MediaType; +import org.apache.tika.parser.AbstractParser; +import org.apache.tika.parser.ParseContext; +import org.apache.tika.parser.external.ExternalParser; +import org.apache.tika.parser.ocr.TesseractOCRParser; +import org.apache.tika.parser.pdf.PDFParser; +import org.apache.tika.sax.BodyContentHandler; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.xml.sax.ContentHandler; +import org.xml.sax.SAXException; +import java.io.*; +import java.util.*; +import java.util.concurrent.*; + + +public class LegacyPdfProcessorParser extends AbstractParser { + + private static final long serialVersionUID = -8167538283213097265L; + private static Map IMAGEMAGICK_PRESENT = new HashMap(); + private static final ImageMagickConfig DEFAULT_IMAGEMAGICK_CONFIG = new ImageMagickConfig(); + + private static final Set SUPPORTED_TYPES = Collections.unmodifiableSet( + new HashSet<>(Arrays.asList(new MediaType[]{ + MediaType.application("pdf") + }))); + private static final Logger LOG = LoggerFactory.getLogger(LegacyPdfProcessorParser.class); + + + @Override + public Set getSupportedTypes(ParseContext context) { + // If ImageMagick is installed, offer our supported image types + ImageMagickConfig imconfig = context.get(ImageMagickConfig.class, DEFAULT_IMAGEMAGICK_CONFIG); + if (hasImageMagick(imconfig)) { + return SUPPORTED_TYPES; + } + + // Otherwise don't advertise anything, so the other parsers + // can be selected instead + return Collections.emptySet(); + } + + private boolean hasImageMagick(ImageMagickConfig config) { + // Fetch where the config says to find hasImageMagick + String imageMagick = config.getImageMagickPath() + getImageMagickProg(); + + // Have we already checked for a copy of ImageMagick there? 
+        if (IMAGEMAGICK_PRESENT.containsKey(imageMagick)) {
+            return IMAGEMAGICK_PRESENT.get(imageMagick);
+        }

+        // Try running ImageMagick from there, and see if it exists + works
+        String[] checkCmd = {imageMagick};
+        try {
+            boolean hasImageMagick = ExternalParser.check(checkCmd);
+            IMAGEMAGICK_PRESENT.put(imageMagick, hasImageMagick);
+            return hasImageMagick;
+        } catch (NoClassDefFoundError e) {
+            // This happens under OSGi + Fork Parser - see TIKA-1507
+            // As a workaround for now, just say we can't use OCR
+            // TODO Resolve it so we don't need this try/catch block
+            IMAGEMAGICK_PRESENT.put(imageMagick, false);
+            return false;
+        }
+    }

+    @Override
+    public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context)
+            throws IOException, SAXException, TikaException {
+        ImageMagickConfig config = context.get(ImageMagickConfig.class, DEFAULT_IMAGEMAGICK_CONFIG);

+        // If ImageMagick is not on the path with the current config, do not try to run OCR
+        // getSupportedTypes shouldn't have listed us as handling it, so this should only
+        // occur if someone directly calls this parser, not via DefaultParser or similar
+//        TemporaryResources tmp = new TemporaryResources();
+        //TikaInputStream pdfStream = TikaInputStream.get(stream);
+        PDFParser pdfParser = new PDFParser();

+        //create temp handlers to investigate object
+        BodyContentHandler body = new BodyContentHandler();
+        Metadata pdfMetadata = new Metadata();

+        //needed to reset stream
+        if (stream.markSupported()) {
+            stream.mark(Integer.MAX_VALUE);
+        }

+        //first do initial parse to see if there's substantial content in pdf metadata already
+        pdfParser.parse(stream, body, pdfMetadata, context);
+        stream.reset();
+        //if there's content - reparse with official handlers/metadata. What else can you do?
Also check imagemagick is available + + LegacyPdfProcessorConfig generalConfig = context.get(LegacyPdfProcessorConfig.class); + + if (body.toString().length() > generalConfig.getPdfMinDocTextLength() || !hasImageMagick(config)) { + pdfParser.parse(stream, handler, metadata, context); + //metadata.set("X-PDFPREPROC-OCR-APPLIED", "NA"); + return; + } + + //metadata.set("X-PDFPREPROC-ORIGINAL", body.toString()); + // "FAIL" will be overwritten if it succeeds later + + //add the PDF metadata to the official metadata object + Arrays.asList(pdfMetadata.names()).forEach(name -> { + metadata.add(name, pdfMetadata.get(name)); + }); + + //objects to hold file references for manipulation outside of Java + File tiffFileOfPDF = null; + File pdfFileFromStream = File.createTempFile("tempPDF", ".pdf"); + try { + FileUtils.copyInputStreamToFile(stream, pdfFileFromStream); + tiffFileOfPDF = File.createTempFile("tempTIFF", ".tiff"); + makeTiffFromPDF(pdfFileFromStream,tiffFileOfPDF, config); + if (tiffFileOfPDF.exists()) { + long tessStartTime = System.currentTimeMillis(); + TesseractOCRParser tesseract = new TesseractOCRParser(); + + tesseract.parse(FileUtils.openInputStream(tiffFileOfPDF), handler, metadata, context); + + //metadata.set("X-OCR-Applied", "true"); + metadata.add("X-Parsed-By", TesseractOCRParser.class.getName()); + + LOG.debug("Document parsing -- OCR processing time: {} ms", System.currentTimeMillis() - tessStartTime); + } + } catch (Exception e) { + LOG.warn("Error while running OCR over the document"); + throw e; + } + finally { + if (tiffFileOfPDF.exists()) { + tiffFileOfPDF.delete(); + } + if (pdfFileFromStream.exists()) { + pdfFileFromStream.delete(); + } + } + } + + static String getImageMagickProg() { + return System.getProperty("os.name").startsWith("Windows") ? "convert.exe" : "convert"; + } + + private File makeTiffFromPDF(File input, File output, ImageMagickConfig config) throws IOException, TikaException { + String[] cmd = {config.getImageMagickPath() + getImageMagickProg(), + "-density", config.getDensity(), input.getPath(), + "-depth", config.getDepth(), + "-quality", config.getQuality(), + "-background", "white", "+matte", + output.getPath()}; + + ProcessBuilder pb = new ProcessBuilder(cmd); + //setEnv(config, pb); + final Process process = pb.start(); + + process.getOutputStream().close(); + InputStream out = process.getInputStream(); + InputStream err = process.getErrorStream(); + + logStream("ImageMagick-stdout", out, input); + logStream("ImageMagick-stderr", err, input); + + FutureTask waitTask = new FutureTask(new Callable() { + public Integer call() throws Exception { + return process.waitFor(); + } + }); + + Thread waitThread = new Thread(waitTask); + waitThread.start(); + + try { + waitTask.get(config.getTimeout(), TimeUnit.SECONDS); + return output; + } catch (InterruptedException e) { + waitThread.interrupt(); + process.destroy(); + Thread.currentThread().interrupt(); + throw new TikaException("ImageMagick-OCR-PDFParser: interrupted", e); + + } catch (ExecutionException e) { + // should not be thrown + + } catch (TimeoutException e) { + waitThread.interrupt(); + process.destroy(); + throw new TikaException("ImageMagick-OCR-PDFParser: timeout", e); + } + return null; + } + + /** + * Starts a thread that reads the contents of the standard output or error + * stream of the given process to not block the process. The stream is + * closed once fully processed. 
+
+    /**
+     * Starts a thread that reads the contents of the standard output or error
+     * stream of the given process, so as not to block the process. The stream is
+     * closed once fully processed.
+     */
+    private void logStream(final String logType, final InputStream stream, final File file) {
+        new Thread() {
+            public void run() {
+                Reader reader = new InputStreamReader(stream);
+                StringBuilder out = new StringBuilder();
+                char[] buffer = new char[1024];
+                try {
+                    for (int n = reader.read(buffer); n != -1; n = reader.read(buffer)) {
+                        out.append(buffer, 0, n);
+                    }
+                } catch (IOException e) {
+                    // ignore - whatever was read so far will still be logged below
+                } finally {
+                    IOUtils.closeQuietly(stream);
+                }
+
+                String msg = out.toString();
+                LogFactory.getLog(LegacyPdfProcessorParser.class).debug(msg);
+            }
+        }.start();
+    }
+}
\ No newline at end of file
diff --git a/src/main/java/tika/legacy/LegacyTikaProcessor.java b/src/main/java/tika/legacy/LegacyTikaProcessor.java
new file mode 100644
index 0000000..5346e82
--- /dev/null
+++ b/src/main/java/tika/legacy/LegacyTikaProcessor.java
@@ -0,0 +1,108 @@
+package tika.legacy;
+
+import java.io.ByteArrayOutputStream;
+import java.time.OffsetDateTime;
+import java.util.*;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+import tika.model.TikaProcessingResult;
+import tika.processor.AbstractTikaProcessor;
+import javax.annotation.PostConstruct;
+
+
+/**
+ * The "legacy" Tika processor, using the parser from CogStack-Pipeline
+ * to provide compatibility during the migration of the pipeline.
+ *
+ * Processes PDF documents by manually running:
+ * - 1x ImageMagick - to create one large temporary TIFF image
+ * - 1x Tesseract - to extract the text from the TIFF
+ */
+@Component("legacyTikaProcessor")
+public class LegacyTikaProcessor extends AbstractTikaProcessor {
+
+    @Autowired
+    private LegacyPdfProcessorConfig config;
+
+    /**
+     * Document-type based automatic detection of the parser to be used by Tika
+     */
+    private AutoDetectParser defaultParser;
+    private ParseContext defaultParseContext;
+
+    private Logger log = LoggerFactory.getLogger(LegacyTikaProcessor.class);
+
+
+    /**
+     * Initializes the processor using the provided (autowired) configuration
+     */
+    @PostConstruct
+    @Override
+    public void init() throws Exception {
+        defaultParseContext = new ParseContext();
+        defaultParseContext.set(TikaConfig.class, config.getTikaConfig());
+        defaultParseContext.set(LegacyPdfProcessorConfig.class, config);
+
+        TesseractOCRConfig tessConfig = new TesseractOCRConfig();
+        tessConfig.setTimeout(config.getOcrTimeout());
+        defaultParseContext.set(TesseractOCRConfig.class, tessConfig);
+
+        ImageMagickConfig imgConfig = new ImageMagickConfig();
+        imgConfig.setTimeout(config.getConversionTimeout());
+        defaultParseContext.set(ImageMagickConfig.class, imgConfig);
+
+        defaultParser = new AutoDetectParser(config.getTikaConfig());
+    }
+
+    /**
+     * Resets the component with any intermediate data used
+     */
+    @Override
+    public void reset() throws Exception {
+        // actually, we only need to re-initialize all the resources apart from the configuration
+        init();
+    }
+
+    /**
+     * Processes the input stream returning the extracted text
+     */
+    protected TikaProcessingResult processStream(TikaInputStream stream) {
+        TikaProcessingResult result;
+
+        try {
+            ByteArrayOutputStream outStream = new ByteArrayOutputStream(64 * 1024);
+            BodyContentHandler handler = new BodyContentHandler(outStream);
+            Metadata metadata = new Metadata();
+
+            defaultParser.parse(stream, handler, metadata, defaultParseContext);
+
+            // parse the metadata and store the result
+            Map<String, Object> resultMetadata = extractMetadata(metadata);
+            result = TikaProcessingResult.builder()
+                    .text(outStream.toString())
+                    .metadata(resultMetadata)
+                    .success(true)
+                    .timestamp(OffsetDateTime.now())
+                    .build();
+        }
+        catch (Exception e) {
+            log.error(e.getMessage());
+
+            result = TikaProcessingResult.builder()
+                    .error("Exception caught while processing the document: " + e.getMessage())
+                    .success(false)
+                    .build();
+        }
+
+        return result;
+    }
+}
diff --git a/src/main/java/tika/model/MetadataKeys.java b/src/main/java/tika/model/MetadataKeys.java
new file mode 100644
index 0000000..8c607e7
--- /dev/null
+++ b/src/main/java/tika/model/MetadataKeys.java
@@ -0,0 +1,15 @@
+package tika.model;
+
+/**
+ * Metadata keys that are to be used to extract relevant information
+ * from the document alongside the text.
+ * Note that some of these keys may not be available, depending on the document type.
+ */
+public class MetadataKeys {
+    public final static String CONTENT_TYPE = "Content-Type";
+    public final static String CREATION_DATE = "Creation-Date";
+    public final static String LAST_MODIFIED = "Last-Modified";
+    public final static String OCR_APPLIED = "X-OCR-Applied";
+    public final static String PARSED_BY = "X-Parsed-By";
+    public final static String PAGE_COUNT = "Page-Count";
+}
diff --git a/src/main/java/tika/model/TikaBinaryDocument.java b/src/main/java/tika/model/TikaBinaryDocument.java
new file mode 100644
index 0000000..1c4bf2e
--- /dev/null
+++ b/src/main/java/tika/model/TikaBinaryDocument.java
@@ -0,0 +1,12 @@
+package tika.model;
+
+import lombok.Data;
+
+/**
+ * A simplified representation of a Tika binary document
+ * that can be used as a payload for requests
+ */
+@Data
+public class TikaBinaryDocument {
+    byte[] content;
+}
diff --git a/src/main/java/tika/model/TikaPackageInformation.java b/src/main/java/tika/model/TikaPackageInformation.java
new file mode 100644
index 0000000..23367e2
--- /dev/null
+++ b/src/main/java/tika/model/TikaPackageInformation.java
@@ -0,0 +1,28 @@
+package tika.model;
+
+import com.fasterxml.jackson.annotation.JsonIgnoreProperties;
+import com.fasterxml.jackson.annotation.JsonProperty;
+import com.fasterxml.jackson.annotation.JsonView;
+import common.JsonPropertyAccessView;
+import lombok.Data;
+import org.apache.tika.Tika;
+
+/**
+ * A helper class providing information about the implementation details of the used Tika package
+ */
+@Data
+@JsonIgnoreProperties(value={"specification_version", "implementation_version"}, allowGetters=true)
+public class TikaPackageInformation {
+
+    @JsonProperty("specification_version")
+    @JsonView(JsonPropertyAccessView.Public.class)
+    String getTikaSpecificationVersion() {
+        return Tika.class.getPackage().getSpecificationVersion();
+    }
+
+    @JsonProperty("implementation_version")
+    @JsonView(JsonPropertyAccessView.Public.class)
+    final String getTikaImplementationVersion() {
+        return Tika.class.getPackage().getImplementationVersion();
+    }
+}
diff --git a/src/main/java/tika/model/TikaProcessingResult.java b/src/main/java/tika/model/TikaProcessingResult.java
new file mode 100644
index 0000000..a8802d2
--- /dev/null
+++ b/src/main/java/tika/model/TikaProcessingResult.java
@@ -0,0 +1,38 @@
+package tika.model;
+
+import com.fasterxml.jackson.annotation.JsonFormat;
+import com.fasterxml.jackson.annotation.JsonInclude;
+import lombok.Builder;
+import lombok.Data;
+import org.springframework.format.annotation.DateTimeFormat;
+
+import java.time.OffsetDateTime;
+import java.util.Map;
+
+
+/**
+ * Tika processing result payload
+ */
+@Data
+@Builder
+//@JsonAutoDetect(fieldVisibility = JsonAutoDetect.Visibility.ANY)
+@JsonInclude(JsonInclude.Include.NON_NULL)
+public class TikaProcessingResult {
+
+    // extracted text from the document
+    String text;
+
+    // document metadata
+    Map<String, Object> metadata;
+
+    // processing status
+    Boolean success;
+
+    // the error message in case processing failed
+    String error;
+
+    // when the document was processed
+    @DateTimeFormat(iso = DateTimeFormat.ISO.DATE_TIME)
+    @JsonFormat(pattern = "yyyy-MM-dd'T'HH:mm:ss.SSSXXX")
+    OffsetDateTime timestamp;
+}
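For illustration, a successful result serialized to JSON might look as follows (a sketch only: the field values are hypothetical, and the error field is omitted thanks to JsonInclude.Include.NON_NULL):

    {
      "text": "This is an example of a clinical document...",
      "metadata": {
        "Content-Type": "application/pdf",
        "X-Parsed-By": ["org.apache.tika.parser.pdf.PDFParser"],
        "X-OCR-Applied": "false",
        "Page-Count": "1"
      },
      "success": true,
      "timestamp": "2019-08-15T12:00:00.000+01:00"
    }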
diff --git a/src/main/java/tika/processor/AbstractTikaProcessor.java b/src/main/java/tika/processor/AbstractTikaProcessor.java
new file mode 100644
index 0000000..fc93368
--- /dev/null
+++ b/src/main/java/tika/processor/AbstractTikaProcessor.java
@@ -0,0 +1,126 @@
+package tika.processor;
+
+import java.io.InputStream;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.ocr.TesseractOCRParser;
+import tika.model.MetadataKeys;
+import tika.model.TikaBinaryDocument;
+import tika.model.TikaProcessingResult;
+
+
+/**
+ * An abstract class for a Tika Processor
+ */
+public abstract class AbstractTikaProcessor {
+
+    /**
+     * The metadata keys that should be extracted by the processor
+     */
+    private static final String[] metaKeysSingleValue = {MetadataKeys.CONTENT_TYPE, MetadataKeys.CREATION_DATE,
+            MetadataKeys.LAST_MODIFIED, MetadataKeys.OCR_APPLIED};
+    private static final String[] metaKeysMultiValue = {MetadataKeys.PARSED_BY};
+
+
+    /**
+     * Processor lifecycle methods
+     */
+    public void init() throws Exception {}
+
+    public void reset() throws Exception {}
+
+
+    /**
+     * The main document processing method
+     */
+    protected abstract TikaProcessingResult processStream(TikaInputStream stream);
+
+
+    /**
+     * Wrappers over the main document processing method
+     */
+    public TikaProcessingResult process(final TikaBinaryDocument binaryDoc) {
+        return processStream(TikaInputStream.get(binaryDoc.getContent()));
+    }
+
+    public TikaProcessingResult process(InputStream stream) {
+        return processStream(TikaInputStream.get(stream));
+    }
+
+
+    /**
+     * Helper methods
+     * TODO: can be moved to utils
+     */
+    public static int getPageCount(final Metadata docMeta) {
+        Map<String, Object> resultMeta = new HashMap<>();
+        extractPageCount(docMeta, resultMeta);
+
+        if (resultMeta.containsKey(MetadataKeys.PAGE_COUNT)) {
+            return Integer.parseInt(resultMeta.get(MetadataKeys.PAGE_COUNT).toString());
+        }
+        return -1;
+    }
+
+    public static boolean isValidDocumentType(final Map<String, Object> resultMeta) {
+        return !( !resultMeta.containsKey(MetadataKeys.CONTENT_TYPE) ||
+                resultMeta.get(MetadataKeys.CONTENT_TYPE).equals(MediaType.OCTET_STREAM.toString()) ||
+                resultMeta.get(MetadataKeys.CONTENT_TYPE).equals(MediaType.EMPTY.toString()));
+    }
+
+    private static void extractPageCount(final Metadata docMeta, Map<String, Object> resultMeta) {
+        String pgValue = null;
+        if (docMeta.get("xmpTPg:NPages") != null) {
+            pgValue = docMeta.get("xmpTPg:NPages");
+        }
+        else if (docMeta.get("meta:page-count") != null) {
+            pgValue = docMeta.get("meta:page-count");
+        }
+        else if (docMeta.get("exif:PageCount") != null) {
+            pgValue = docMeta.get("exif:PageCount");
+        }
+        else if (docMeta.get("Page-Count") != null) {
+            pgValue = docMeta.get("Page-Count");
+        }
+
+        if (pgValue != null) {
+            resultMeta.put(MetadataKeys.PAGE_COUNT, pgValue);
+        }
+    }
+
+    private static void extractOcrApplied(final Metadata docMeta, Map<String, Object> resultMeta) {
+        if (docMeta.get("X-Parsed-By") != null
+                && (Arrays.asList(docMeta.getValues("X-Parsed-By")).contains(TesseractOCRParser.class.getName())
+                // note that some parsers also add a class prefix to the name: 'class org...'
+                || Arrays.asList(docMeta.getValues("X-Parsed-By")).contains(TesseractOCRParser.class.toString()))) {
+            resultMeta.put(MetadataKeys.OCR_APPLIED, "true");
+        }
+        else {
+            resultMeta.put(MetadataKeys.OCR_APPLIED, "false");
+        }
+    }
+
+    protected Map<String, Object> extractMetadata(final Metadata docMeta) {
+        Map<String, Object> resultMeta = new HashMap<>();
+        Arrays.stream(metaKeysSingleValue).forEach(name -> {
+            if (docMeta.get(name) != null)
+                resultMeta.put(name, docMeta.get(name));
+        });
+
+        Arrays.stream(metaKeysMultiValue).forEach(name -> {
+            if (docMeta.getValues(name) != null)
+                resultMeta.put(name, docMeta.getValues(name));
+        });
+
+        extractPageCount(docMeta, resultMeta);
+
+        extractOcrApplied(docMeta, resultMeta);
+
+        return resultMeta;
+    }
+}
diff --git a/src/main/java/tika/processor/CompositeTikaProcessor.java b/src/main/java/tika/processor/CompositeTikaProcessor.java
new file mode 100644
index 0000000..531529f
--- /dev/null
+++ b/src/main/java/tika/processor/CompositeTikaProcessor.java
@@ -0,0 +1,276 @@
+package tika.processor;
+
+import java.io.ByteArrayOutputStream;
+import java.io.InputStream;
+import java.time.OffsetDateTime;
+import java.util.*;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.parser.ocr.TesseractOCRConfig;
+import org.apache.tika.parser.pdf.PDFParser;
+import org.apache.tika.parser.pdf.PDFParserConfig;
+import org.apache.tika.sax.BodyContentHandler;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.stereotype.Component;
+import tika.legacy.ImageMagickConfig;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.legacy.LegacyPdfProcessorParser;
+import tika.model.TikaProcessingResult;
+import javax.annotation.PostConstruct;
+
+
+/**
+ * A default, composite Tika processor.
+ *
+ * In contrast to the "legacy" processor it uses the default approach implemented in Tika, i.e. when
+ * parsing PDF documents, it runs the processing independently for each PDF page,
+ * hence running Tesseract page-count times.
+ */
+@Component("compositeTikaProcessor")
+public class CompositeTikaProcessor extends AbstractTikaProcessor {
+
+    @Autowired
+    private CompositeTikaProcessorConfig compositeTikaProcessorConfig;
+
+    @Autowired
+    private LegacyPdfProcessorConfig legacyPdfProcessorConfig;
+
+    /**
+     In order to properly handle PDF documents and OCR we need three separate parsers:
+     - a generic parser (for any non-PDF document type),
+     - one that will extract text only from PDFs,
+     - one that will apply OCR on PDFs (when they store only images).
+
+     In the default configuration of PDFParser the OCR is disabled when extracting text from PDFs.
+     However, OCR is enabled when extracting text from documents of image type. When using the default
+     parser with OCR enabled (strategy: extract both text and OCR), it will always apply OCR on PDFs,
+     even when the text is already provided.
+
+     We would also like to know when OCR was applied, as it will affect the accuracy of the extracted text that will be
+     passed to the downstream analysis applications.
+     */
+
+    // common tika and parsers configuration
+    private TikaConfig tikaConfig;
+    private TesseractOCRConfig tessConfig;
+
+    // the default, generic parser for handling all document types (except PDF)
+    private AutoDetectParser defaultParser;
+    private ParseContext defaultParseContext;
+
+    // the default parser for PDFs (no OCR)
+    private PDFParser pdfTextParser;
+    private ParseContext pdfTextParseContext;
+
+    // the parser to extract text from PDFs using OCR
+    private PDFParser pdfOcrParser;
+    private ParseContext pdfOcrParseContext;
+
+    // the parser to extract text from PDFs using OCR only for single pages
+    // (used to strip off clutter from LibreOffice-generated PDFs containing just images)
+    private LegacyPdfProcessorParser pdfSinglePageOcrParser;
+    private ParseContext pdfSinglePageOcrParseContext;
+
+
+    private Logger log = LoggerFactory.getLogger(CompositeTikaProcessor.class);
+
+
+    @PostConstruct
+    @Override
+    public void init() throws Exception {
+
+        tikaConfig = new TikaConfig();
+
+        initializeTesseractConfig();
+
+        initializeDefaultParser();
+
+        initializePdfTextOnlyParser();
+
+        initializePdfOcrParser();
+
+        if (compositeTikaProcessorConfig.isUseLegacyOcrParserForSinglePageDocuments()) {
+            initializePdfLegacyOcrParser();
+        }
+    }
+
+    @Override
+    public void reset() throws Exception {
+        // actually, we only need to re-initialize all the resources apart from the configuration
+        init();
+    }
+
+    protected TikaProcessingResult processStream(TikaInputStream stream) {
+        final int MIN_TEXT_BUFFER_SIZE = 1024;
+
+        TikaProcessingResult result;
+        try {
+            ByteArrayOutputStream outStream = new ByteArrayOutputStream(MIN_TEXT_BUFFER_SIZE);
+            BodyContentHandler handler = new BodyContentHandler(outStream);
+            Metadata metadata = new Metadata();
+
+            // mark the stream for multi-pass processing
+            if (stream.markSupported()) {
+                stream.mark(Integer.MAX_VALUE);
+            }
+
+            // try to detect whether the document is a PDF
+            if (isDocumentOfPdfType(stream)) {
+
+                // firstly, try the default text-only parser
+                pdfTextParser.parse(stream, handler, metadata, pdfTextParseContext);
+
+                // check whether enough characters were extracted and that we read enough bytes from the stream
+                // (images embedded in the documents will occupy much more space than just raw text)
+                if (outStream.size() < compositeTikaProcessorConfig.getPdfMinDocTextLength()
+                        && stream.getPosition() > compositeTikaProcessorConfig.getPdfMinDocByteSize()) {
+
+                    // since we are performing a second pass over the document, we need to reset the cursor position
+                    // in both the input and output streams
+                    stream.reset();
+                    outStream.reset();
+
+                    final boolean useOcrLegacyParser = compositeTikaProcessorConfig.isUseLegacyOcrParserForSinglePageDocuments()
+                            && getPageCount(metadata) == 1;
+
+                    // TODO: Q: shall we use a clean metadata or re-use some of the previously parsed fields?
+                    handler = new BodyContentHandler(outStream);
+                    metadata = new Metadata();
+
+                    if (useOcrLegacyParser) {
+                        pdfSinglePageOcrParser.parse(stream, handler, metadata, pdfSinglePageOcrParseContext);
+
+                        // since we use the parser manually, update the metadata with the name of the parser class used
+                        metadata.add("X-Parsed-By", LegacyPdfProcessorParser.class.getName());
+                    }
+                    else {
+                        pdfOcrParser.parse(stream, handler, metadata, pdfOcrParseContext);
+
+                        // since we use the parser manually, update the metadata with the name of the parser class used
+                        metadata.add("X-Parsed-By", PDFParser.class.getName());
+                    }
+                }
+                else {
+                    // since we use the parser manually, update the metadata with the name of the parser class used
+                    metadata.add("X-Parsed-By", PDFParser.class.getName());
+                }
+            }
+            else {
+                // otherwise, run the default document parser
+                defaultParser.parse(stream, handler, metadata, defaultParseContext);
+            }
+
+            // parse the metadata and store the result
+            Map<String, Object> resultMeta = extractMetadata(metadata);
+            result = TikaProcessingResult.builder()
+                    .text(outStream.toString())
+                    .metadata(resultMeta)
+                    .success(true)
+                    .timestamp(OffsetDateTime.now())
+                    .build();
+        }
+        catch (Exception e) {
+            log.error(e.getMessage());
+
+            result = TikaProcessingResult.builder()
+                    .error("Exception caught while processing the document: " + e.getMessage())
+                    .success(false)
+                    .build();
+        }
+
+        return result;
+    }
+
+
+    private boolean isDocumentOfPdfType(InputStream stream) throws Exception {
+        Metadata metadata = new Metadata();
+        MediaType mediaType = defaultParser.getDetector().detect(stream, metadata);
+
+        return mediaType.equals(MediaType.application("pdf"));
+    }
+
+
+    private void initializeTesseractConfig() {
+        tessConfig = new TesseractOCRConfig();
+
+        tessConfig.setTimeout(compositeTikaProcessorConfig.getOcrTimeout());
+        tessConfig.setApplyRotation(compositeTikaProcessorConfig.isOcrApplyRotation());
+        if (compositeTikaProcessorConfig.isOcrEnableImageProcessing()) {
+            tessConfig.setEnableImageProcessing(1);
+        }
+        else {
+            tessConfig.setEnableImageProcessing(0);
+        }
+        tessConfig.setLanguage(compositeTikaProcessorConfig.getOcrLanguage());
+    }
+
+
+    private void initializeDefaultParser() {
+        defaultParser = new AutoDetectParser(tikaConfig);
+
+        defaultParseContext = new ParseContext();
+        defaultParseContext.set(TikaConfig.class, tikaConfig);
+        defaultParseContext.set(TesseractOCRConfig.class, tessConfig);
+        defaultParseContext.set(Parser.class, defaultParser); // needed to make sure recursive parsing happens!
+    }
+
+
+    private void initializePdfTextOnlyParser() {
+        PDFParserConfig pdfTextOnlyConfig = new PDFParserConfig();
+        pdfTextOnlyConfig.setExtractInlineImages(false);
+        pdfTextOnlyConfig.setExtractUniqueInlineImagesOnly(false); // do not extract multiple inline images
+        pdfTextOnlyConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.NO_OCR);
+
+        pdfTextParser = new PDFParser();
+        pdfTextParseContext = new ParseContext();
+        pdfTextParseContext.set(TikaConfig.class, tikaConfig);
+        pdfTextParseContext.set(PDFParserConfig.class, pdfTextOnlyConfig);
+        //pdfTextParseContext.set(Parser.class, defaultParser); // needed to make sure recursive parsing happens!
+    }
+
+
+    private void initializePdfOcrParser() {
+        PDFParserConfig pdfOcrConfig = new PDFParserConfig();
+        pdfOcrConfig.setExtractUniqueInlineImagesOnly(false); // do not extract multiple inline images
+        if (compositeTikaProcessorConfig.isPdfOcrOnlyStrategy()) {
+            pdfOcrConfig.setExtractInlineImages(false);
+            pdfOcrConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_ONLY);
+        }
+        else {
+            pdfOcrConfig.setExtractInlineImages(true);
+            // warn: note that when applying 'OCR_AND_TEXT_EXTRACTION' the content can be duplicated
+            pdfOcrConfig.setOcrStrategy(PDFParserConfig.OCR_STRATEGY.OCR_AND_TEXT_EXTRACTION);
+        }
+
+        pdfOcrParser = new PDFParser();
+        pdfOcrParseContext = new ParseContext();
+        pdfOcrParseContext.set(TikaConfig.class, tikaConfig);
+        pdfOcrParseContext.set(PDFParserConfig.class, pdfOcrConfig);
+        pdfOcrParseContext.set(TesseractOCRConfig.class, tessConfig);
+        //pdfOcrParseContext.set(Parser.class, defaultParser); // needed to make sure recursive parsing happens!
+    }
+
+    private void initializePdfLegacyOcrParser() {
+        pdfSinglePageOcrParser = new LegacyPdfProcessorParser();
+
+        pdfSinglePageOcrParseContext = new ParseContext();
+        pdfSinglePageOcrParseContext.set(TikaConfig.class, tikaConfig);
+        pdfSinglePageOcrParseContext.set(LegacyPdfProcessorConfig.class, legacyPdfProcessorConfig);
+
+        TesseractOCRConfig tessConfig = new TesseractOCRConfig();
+        tessConfig.setTimeout(legacyPdfProcessorConfig.getOcrTimeout());
+        pdfSinglePageOcrParseContext.set(TesseractOCRConfig.class, tessConfig);
+
+        ImageMagickConfig imgConfig = new ImageMagickConfig();
+        imgConfig.setTimeout(legacyPdfProcessorConfig.getConversionTimeout());
+        pdfSinglePageOcrParseContext.set(ImageMagickConfig.class, imgConfig);
+
+        //pdfSinglePageOcrParseContext.set(Parser.class, defaultParser); // needed to make sure recursive parsing happens!
+    }
+}
diff --git a/src/main/java/tika/processor/CompositeTikaProcessorConfig.java b/src/main/java/tika/processor/CompositeTikaProcessorConfig.java
new file mode 100644
index 0000000..2a88507
--- /dev/null
+++ b/src/main/java/tika/processor/CompositeTikaProcessorConfig.java
@@ -0,0 +1,63 @@
+package tika.processor;
+
+import com.fasterxml.jackson.annotation.JsonView;
+import common.JsonPropertyAccessView;
+import lombok.Data;
+import org.springframework.beans.factory.annotation.Value;
+import org.springframework.context.annotation.Configuration;
+
+
+/**
+ * The composite PDF processor configuration
+ */
+@Data
+@Configuration
+public class CompositeTikaProcessorConfig {
+
+    // the timeout value (s) when performing OCR over documents
+    @JsonView(JsonPropertyAccessView.Public.class)
+    @Value("${tika.parsers.tesseract-ocr.timeout:120}")
+    private int ocrTimeout;
+
+    // apply image processing techniques during document conversion (using ImageMagick);
+    // required to enable applying rotation (see below)
+    @JsonView(JsonPropertyAccessView.Public.class)
+    @Value("${tika.parsers.tesseract-ocr.enable-image-processing:false}")
+    private boolean ocrEnableImageProcessing;
+
+    // apply de-rotation of documents before processing;
+    // can be quite computationally expensive (runs as an external python script)
+    @JsonView(JsonPropertyAccessView.Public.class)
+    @Value("${tika.parsers.tesseract-ocr.apply-rotation:false}")
+    private boolean ocrApplyRotation;
+
+    // the language used in the OCR for corrections
+    @JsonView(JsonPropertyAccessView.Public.class)
+    @Value("${tika.parsers.tesseract-ocr.language:eng}")
+    private String ocrLanguage;
+
+    // whether to apply OCR only on the documents or to also extract the embedded text (if present)
+    // warn: note that when applying 'OCR_AND_TEXT_EXTRACTION' the content can be duplicated
+    @JsonView(JsonPropertyAccessView.Public.class)
+    @Value("${tika.parsers.pdf-ocr-parser.ocr-only-strategy:true}")
+    private boolean pdfOcrOnlyStrategy;
+
+    // apply OCR only when the text extracted from the previously parsed document (w/o OCR)
+    // is shorter than N characters
+    @JsonView(JsonPropertyAccessView.Public.class)
+    @Value("${tika.parsers.pdf-ocr-parser.min-doc-text-length:100}")
+    private int pdfMinDocTextLength;
+
+    // apply OCR only when at least N bytes were read from the previously parsed document (w/o OCR)
+    @JsonView(JsonPropertyAccessView.Public.class)
+    @Value("${tika.parsers.pdf-ocr-parser.min-doc-byte-size:10000}")
+    private int pdfMinDocByteSize;
+
+    // use a legacy parser for applying OCR on single-page PDF documents
+    // (NB: when exporting single-page PDFs from LibreOffice that contain only one image,
+    // some additional clutter may be embedded in the PDF content)
+    @JsonView(JsonPropertyAccessView.Public.class)
+    @Value("${tika.parsers.use-legacy-ocr-parser-for-single-page-doc:false}")
+    private boolean useLegacyOcrParserForSinglePageDocuments;
+}
diff --git a/src/main/resources/application.yaml b/src/main/resources/application.yaml
new file mode 100644
index 0000000..907c149
--- /dev/null
+++ b/src/main/resources/application.yaml
@@ -0,0 +1,47 @@
+# application configuration
+#
+application:
+  version: 0.1.0
+
+
+# general spring boot configuration
+#
+server:
+  port: 8090
+
+spring:
+  servlet:
+    multipart.max-file-size: 100MB
+    multipart.max-request-size: 100MB
+
+
+# tika configuration
+#
+tika:
+  parsers:
+    tesseract-ocr:
+      language: eng
+      timeout: 300
+      enable-image-processing: false
+      apply-rotation: false
+
+    pdf-ocr-parser:
+      ocr-only-strategy: true
+      min-doc-text-length: 100
+      min-doc-byte-size: 10000
+      use-legacy-ocr-parser-for-single-page-doc: false
+
+    legacy-pdf-parser:
+      image-magick:
+        timeout: 300
+      tesseract-ocr:
+        timeout: 300
+      min-doc-text-length: 100
+
+
+# documents processing configuration
+#
+processing:
+  use-legacy-tika-processor-as-default: true
+  fail-on-empty-files: false
+  fail-on-non-document-types: true
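With the configuration above the service listens on port 8090. As a quick smoke test, the endpoints exercised by the controller tests further below can be called e.g. with curl (a sketch; sample.pdf is a hypothetical input file):

    # process a document sent as a raw binary stream
    curl -X POST --data-binary @sample.pdf http://localhost:8090/api/process

    # process a document sent as a multipart file
    curl -F file=@sample.pdf http://localhost:8090/api/process_file

    # retrieve service and configuration information
    curl http://localhost:8090/api/info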
diff --git a/src/main/resources/logback.xml b/src/main/resources/logback.xml
new file mode 100644
index 0000000..2527c44
--- /dev/null
+++ b/src/main/resources/logback.xml
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<configuration>
+    <appender name="STDOUT" class="ch.qos.logback.core.ConsoleAppender">
+        <encoder>
+            <pattern>%d{dd-MM-yyyy HH:mm:ss.SSS} -- %highlight(%-5level) : %magenta([%thread]) %logger{36}.%M - %msg%n</pattern>
+        </encoder>
+    </appender>
+    <root level="info">
+        <appender-ref ref="STDOUT"/>
+    </root>
+</configuration>
\ No newline at end of file
diff --git a/src/main/resources/tika-config/legacy-parser-config.xml b/src/main/resources/tika-config/legacy-parser-config.xml
new file mode 100644
index 0000000..c414d03
--- /dev/null
+++ b/src/main/resources/tika-config/legacy-parser-config.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <mime-exclude>application/pdf</mime-exclude>
+        </parser>
+
+        <parser class="tika.legacy.LegacyPdfProcessorParser">
+            <mime>application/pdf</mime>
+        </parser>
+    </parsers>
+</properties>
\ No newline at end of file
diff --git a/src/test/java/service/ServiceControllerDocumentMultipartFileTests.java b/src/test/java/service/ServiceControllerDocumentMultipartFileTests.java
new file mode 100644
index 0000000..1c92363
--- /dev/null
+++ b/src/test/java/service/ServiceControllerDocumentMultipartFileTests.java
@@ -0,0 +1,69 @@
+package service;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.http.HttpStatus;
+import org.springframework.mock.web.MockMultipartFile;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import org.springframework.test.web.servlet.MockMvc;
+import org.springframework.test.web.servlet.MvcResult;
+import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
+import service.controller.TikaServiceConfig;
+import service.model.ServiceResponseContent;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.model.TikaProcessingResult;
+import tika.processor.CompositeTikaProcessorConfig;
+import java.io.InputStream;
+
+import static org.junit.Assert.*;
+import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
+
+
+/**
+ * Implements document processing tests for the Service Controller
+ * A document is passed as a multipart file
+ */
+@SpringBootTest(classes = TikaServiceApplication.class)
+@RunWith(SpringRunner.class)
+@AutoConfigureMockMvc
+@ContextConfiguration(classes = {TikaServiceConfig.class, LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public class ServiceControllerDocumentMultipartFileTests extends ServiceControllerDocumentTests {
+
+    @Autowired
+    private MockMvc mockMvc;
+
+    final private String PROCESS_FILE_ENDPOINT_URL = "/api/process_file";
+
+    @Override
+    protected TikaProcessingResult sendProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception {
+        return sendMultipartFileProcessingRequest(docPath, expectedStatus);
+    }
+
+    private TikaProcessingResult sendMultipartFileProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception {
+        InputStream stream = utils.getDocumentStream(docPath);
+        MockMultipartFile multipartFile = new MockMultipartFile("file", docPath, "multipart/form-data", stream);
+
+        MvcResult result = mockMvc.perform(MockMvcRequestBuilders.multipart(PROCESS_FILE_ENDPOINT_URL)
+                .file(multipartFile))
+                //.param("some-random", "4"))
+                .andExpect(status().is(expectedStatus.value()))
+                .andReturn();
+                //.andExpect(content().string("success"));
+
+        assertEquals(expectedStatus.value(), result.getResponse().getStatus());
+        assertNotNull(result.getResponse().getContentAsString());
+
+        // parse the content
+        ObjectMapper mapper = new ObjectMapper();
+        mapper.registerModule(new JavaTimeModule());
+        TikaProcessingResult tikaResult = mapper.readValue(result.getResponse().getContentAsString(),
+                ServiceResponseContent.class).getResult();
+
+        return tikaResult;
+    }
+}
diff --git a/src/test/java/service/ServiceControllerDocumentStreamTests.java b/src/test/java/service/ServiceControllerDocumentStreamTests.java
new file mode 100644
index 0000000..6bf1ff3
--- /dev/null
+++ b/src/test/java/service/ServiceControllerDocumentStreamTests.java
@@ -0,0 +1,68 @@
+package service;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import com.fasterxml.jackson.datatype.jsr310.JavaTimeModule;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.http.HttpStatus;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import org.springframework.test.web.servlet.MockMvc;
+import org.springframework.test.web.servlet.MvcResult;
+import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
+import service.controller.TikaServiceConfig;
+import service.model.ServiceResponseContent;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.model.TikaProcessingResult;
+import tika.processor.CompositeTikaProcessorConfig;
+import java.io.InputStream;
+
+import static org.junit.Assert.*;
+import static org.springframework.test.web.servlet.result.MockMvcResultMatchers.status;
+
+
+/**
+ * Implements document processing tests for the Service Controller
+ * A document is passed as an octet stream
+ */
+@SpringBootTest(classes = TikaServiceApplication.class)
+@RunWith(SpringRunner.class)
+@AutoConfigureMockMvc
+@ContextConfiguration(classes = {TikaServiceConfig.class, LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public class ServiceControllerDocumentStreamTests extends ServiceControllerDocumentTests {
+
+    @Autowired
+    private MockMvc mockMvc;
+
+    final private String PROCESS_ENDPOINT_URL = "/api/process";
+
+    @Override
+    protected TikaProcessingResult sendProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception {
+        return sendFileProcessingRequest(docPath, expectedStatus);
+    }
+
+    private TikaProcessingResult sendFileProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception {
+        InputStream stream = utils.getDocumentStream(docPath);
+
+        byte[] content = stream.readAllBytes();
+
+        MvcResult result = mockMvc.perform(MockMvcRequestBuilders.post(PROCESS_ENDPOINT_URL)
+                .content(content))
+                //.param("some-random", "4"))
+                .andExpect(status().is(expectedStatus.value()))
+                .andReturn();
+
+        assertEquals(expectedStatus.value(), result.getResponse().getStatus());
+        assertNotNull(result.getResponse().getContentAsString());
+
+        // parse the content
+        ObjectMapper mapper = new ObjectMapper();
+        mapper.registerModule(new JavaTimeModule());
+        TikaProcessingResult tikaResult = mapper.readValue(result.getResponse().getContentAsString(),
+                ServiceResponseContent.class).getResult();
+
+        return tikaResult;
+    }
+}
diff --git a/src/test/java/service/ServiceControllerDocumentTests.java b/src/test/java/service/ServiceControllerDocumentTests.java
new file mode 100644
index 0000000..aa3fd88
--- /dev/null
+++ b/src/test/java/service/ServiceControllerDocumentTests.java
@@ -0,0 +1,71 @@
+package service;
+
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.http.HttpStatus;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import service.controller.TikaServiceConfig;
+import tika.DocumentProcessorTests;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.model.TikaProcessingResult;
+import tika.processor.CompositeTikaProcessorConfig;
+
+import static org.junit.Assert.*;
+
+
+/**
+ * Implements document processing tests for the Service Controller, extending the set of available tests
+ * present in DocumentProcessorTests
+ */
+@SpringBootTest(classes = TikaServiceApplication.class)
+@RunWith(SpringRunner.class)
+@AutoConfigureMockMvc
+@ContextConfiguration(classes = {TikaServiceConfig.class, LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public abstract class ServiceControllerDocumentTests extends DocumentProcessorTests {
+
+    @Autowired
+    TikaServiceConfig serviceConfig;
+
+
+    protected abstract TikaProcessingResult sendProcessingRequest(final String docPath, HttpStatus expectedStatus) throws Exception;
+
+    @Override
+    protected TikaProcessingResult processDocument(final String docPath) throws Exception {
+        return sendProcessingRequest(docPath, HttpStatus.OK);
+    }
+
+
+    /**
+     * The actual tests start from here
+     */
+
+    @Override
+    public void testExtractPdfEx1Encrypted() throws Exception {
+        final String docPath = "pdf/ex1_enc.pdf";
+
+        TikaProcessingResult result = sendProcessingRequest(docPath, HttpStatus.BAD_REQUEST);
+
+        // extraction from an encrypted PDF will fail with a proper error message
+        assertFalse(result.getSuccess());
+        assertTrue(result.getError().contains("document is encrypted"));
+    }
+
+
+    @Test
+    public void testExtractEmptyPdfFile() throws Exception {
+        final String docPath = "invalid/pdf_empty.pdf";
+
+        assertFalse(serviceConfig.isFailOnEmptyFiles());
+
+        // extraction should pass, but with an error reported
+        TikaProcessingResult result = sendProcessingRequest(docPath, HttpStatus.OK);
+        assertFalse(result.getSuccess());
+        assertTrue(result.getError().contains("Empty"));
+    }
+}
diff --git a/src/test/java/service/ServiceControllerTests.java b/src/test/java/service/ServiceControllerTests.java
new file mode 100644
index 0000000..f55c3b2
--- /dev/null
+++ b/src/test/java/service/ServiceControllerTests.java
@@ -0,0 +1,62 @@
+package service;
+
+import com.fasterxml.jackson.databind.ObjectMapper;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.autoconfigure.web.servlet.AutoConfigureMockMvc;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.http.HttpStatus;
+import org.springframework.http.MediaType;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import org.springframework.test.web.servlet.MockMvc;
+import org.springframework.test.web.servlet.MvcResult;
+import org.springframework.test.web.servlet.request.MockMvcRequestBuilders;
+import service.controller.TikaServiceConfig;
+import service.model.ServiceInformation;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.processor.CompositeTikaProcessorConfig;
+
+import static org.junit.Assert.assertEquals;
+
+
+/**
+ * Implements general tests for the Service Controller
+ * (no document processing)
+ */
+@SpringBootTest(classes = TikaServiceApplication.class)
+@RunWith(SpringRunner.class)
+@AutoConfigureMockMvc
+@ContextConfiguration(classes = {TikaServiceConfig.class, LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public class ServiceControllerTests {
+
+    @Autowired
+    private MockMvc mockMvc;
+
+    @Autowired
+    private ServiceInformation serviceinfo;
+
+    final private String INFO_ENDPOINT_URL = "/api/info";
+
+
+    @Test
+    public void testGetApplicationInfo() throws Exception {
+        MvcResult result = mockMvc.perform(MockMvcRequestBuilders
+                .get(INFO_ENDPOINT_URL)
+                .accept(MediaType.APPLICATION_JSON_UTF8))
+                .andReturn();
+
+        // check the response status
+        int status = result.getResponse().getStatus();
+        assertEquals(HttpStatus.OK.value(), status);
+
+        // parse the content
+        ObjectMapper mapper = new ObjectMapper();
+        ServiceInformation response = mapper.readValue(result.getResponse().getContentAsString(),
+                ServiceInformation.class);
+
+        // check example content
+        assertEquals(response.getServiceConfig().getAppVersion(), serviceinfo.getServiceConfig().getAppVersion());
+    }
+}
diff --git a/src/test/java/tika/CompositeTikaProcessorTests.java b/src/test/java/tika/CompositeTikaProcessorTests.java
new file mode 100644
index 0000000..b528219
--- /dev/null
+++ b/src/test/java/tika/CompositeTikaProcessorTests.java
@@ -0,0 +1,43 @@
+package tika;
+
+import org.junit.*;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.processor.AbstractTikaProcessor;
+import tika.processor.CompositeTikaProcessor;
+import tika.processor.CompositeTikaProcessorConfig;
+
+
+/**
+ * Implements the tests using CompositeTikaProcessor as the document processor
+ */
+@SpringBootTest(classes = CompositeTikaProcessor.class)
+@RunWith(SpringRunner.class)
+@ContextConfiguration(classes = {LegacyPdfProcessorConfig.class, CompositeTikaProcessorConfig.class})
+public class CompositeTikaProcessorTests extends DocumentProcessorTests {
+
+    @Autowired
+    LegacyPdfProcessorConfig legacyProcessorConfig;
+
+    @Autowired
+    CompositeTikaProcessorConfig compositeProcessorConfig;
+
+    @Autowired
+    CompositeTikaProcessor processor;
+
+
+    @Override
+    protected AbstractTikaProcessor getProcessor() {
+        return processor;
+    }
+
+    @After
+    public void reset() throws Exception {
+        processor.reset();
+    }
+}
diff --git a/src/test/java/tika/DocumentProcessorTests.java b/src/test/java/tika/DocumentProcessorTests.java
new file mode 100644
index 0000000..d752636
--- /dev/null
+++ b/src/test/java/tika/DocumentProcessorTests.java
@@ -0,0 +1,234 @@
+package tika;
+
+import org.junit.Ignore;
+import org.junit.Test;
+import tika.model.TikaProcessingResult;
+import tika.processor.AbstractTikaProcessor;
+import java.io.InputStream;
+
+import static org.junit.Assert.*;
+
+
+/**
+ * All the document processor tests are implemented in this abstract class in order to keep the
+ * rationale behind the tests and their results in one single place.
+ */
+public abstract class DocumentProcessorTests {
+
+    protected DocumentTestUtils utils = new DocumentTestUtils();
+
+    /**
+     * Helper methods used in tests that can be overridden in child classes
+     */
+    protected AbstractTikaProcessor getProcessor() { return null; }
+
+    protected TikaProcessingResult processDocument(final String docPath) throws Exception {
+        AbstractTikaProcessor processor = getProcessor();
+        assertNotNull(processor);
+
+        InputStream stream = utils.getDocumentStream(docPath);
+        return processor.process(stream);
+    }
+
+
+    /**
+     * The actual tests start from here
+     */
+
+    @Test
+    public void testGenericExtractPattern1SourceTxt() throws Exception {
+        final String docPathPrefix = "generic/pat_id_1";
+        final String docExt = ".txt";
+
+        TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+        assertTrue(result.getSuccess());
+
+        // test the parsing status
+        String parsedString = result.getText();
+        assertEquals(310, parsedString.length());
+
+        // test metadata
+        utils.assertOcrApplied(false, result);
+    }
+
+    @Test
+    public void testGenericExtractPattern1Doc() throws Exception {
+        final String docPathPrefix = "generic/pat_id_1";
+        final String docExt = ".doc";
+
+        TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+        assertTrue(result.getSuccess());
+
+        utils.testContentMatch(result, docPathPrefix);
+
+        // test metadata
+        utils.assertPageCount(1, result);
+        utils.assertOcrApplied(false, result);
+    }
+
+    @Test
+    public void testGenericExtractPattern1Docx() throws Exception {
+        final String docPathPrefix = "generic/pat_id_1";
+        final String docExt = ".docx";
+
+        TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+        assertTrue(result.getSuccess());
+
+        utils.testContentMatch(result, docPathPrefix);
+
+        // test metadata
+        utils.assertPageCount(1, result);
+        utils.assertOcrApplied(false, result);
+    }
+
+    @Test
+    public void testGenericExtractPattern1Odt() throws Exception {
+        final String docPathPrefix = "generic/pat_id_1";
+        final String docExt = ".odt";
+
+        TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+        assertTrue(result.getSuccess());
+
+        utils.testContentMatch(result, docPathPrefix);
+
+        // test metadata
+        utils.assertPageCount(1, result);
+        utils.assertOcrApplied(false, result);
+    }
+
+    @Test
+    public void testGenericExtractPattern1Rtf() throws Exception {
+        final String docPathPrefix = "generic/pat_id_1";
+        // note: use the actual .rtf sample here (the previous value ".txt" tested the wrong document)
+        final String docExt = ".rtf";
+
+        TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+        assertTrue(result.getSuccess());
+
+        utils.testContentMatch(result, docPathPrefix);
+
+        // test metadata
+        // rtf does not contain page count
+        utils.assertOcrApplied(false, result);
+    }
+
+    @Test
+    public void testGenericExtractPattern1Png() throws Exception {
+        final String docPathPrefix = "generic/pat_id_1";
+        final String docExt = ".png";
+
+        TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+        assertTrue(result.getSuccess());
+
+        utils.testContentMatch(result, docPathPrefix);
+
+        // test metadata
+        // png does not contain page count
+        utils.assertOcrApplied(true, result);
+    }
+
+    @Test
+    public void testGenericExtractPattern1Pdf() throws Exception {
+        final String docPathPrefix = "generic/pat_id_1";
+        final String docExt = ".pdf";
+
+        TikaProcessingResult result = processDocument(docPathPrefix + docExt);
+        assertTrue(result.getSuccess());
+
+        utils.testContentMatch(result, docPathPrefix);
+
+        // test metadata
+        utils.assertPageCount(1, result);
+        utils.assertOcrApplied(false, result); // this pdf contains text-only
+    }
+
+    @Test
+    public void testExtractPdfEx1WithoutOcr() throws Exception {
+        final String docPath = "pdf/ex1.pdf";
+
+        TikaProcessingResult result = processDocument(docPath);
+
+        // check an example string
+        assertTrue(result.getSuccess());
+        assertTrue(result.getText().contains("An Example Paper"));
+
+        // test metadata
+        utils.assertPageCount(10, result);
+        utils.assertOcrApplied(false, result); // this pdf contains text-only
+    }
+
+    @Test
+    public void testExtractPdfEx1Encrypted() throws Exception {
+        final String docPath = "pdf/ex1_enc.pdf";
+
+        TikaProcessingResult result = processDocument(docPath);
+
+        // extraction from an encrypted PDF will fail with a proper error message
+        assertFalse(result.getSuccess());
+        assertTrue(result.getError().contains("document is encrypted"));
+    }
+
+    @Test
+    public void testExtractPdfEx2WithOcr() throws Exception {
+        final String docPath = "pdf/ex2_ocr.pdf";
+
+        TikaProcessingResult result = processDocument(docPath);
+
+        // check the content
+        assertTrue(result.getSuccess());
+        final String parsedString = result.getText();
+        assertTrue(parsedString.length() > 0);
+
+        // example text from the first page
+        assertTrue(parsedString.contains("Father or mother"));
+        // example text from the second page
+        assertTrue(parsedString.contains("how you have determined who is the Nearest"));
+
+        // test metadata
+        utils.assertPageCount(2, result);
+        utils.assertOcrApplied(true, result);
+    }
+
+
+    // TODO: need to double-check how to handle invalid TIFFs or image files
+    @Ignore
+    @Test
+    public void testExtractTiffWithOCR() throws Exception {
+        InputStream stream = utils.getDocumentZipStream("invalid/tiff_multipage_spp2.tiff.zip", "tiff_multipage_spp2.tiff");
+
+        AbstractTikaProcessor processor = getProcessor();
+        TikaProcessingResult result = processor.process(stream);
+        assertTrue(result.getSuccess());
+
+        // HINT: the test should fail, either because the TIFF is invalid
+        // or because additional pre-processing of the image should happen first
+
+        // test the parsing status
+        String parsedString = result.getText();
+        assertTrue(parsedString.length() > 0);
+
+        // test metadata
+        utils.assertPageCount(6, result);
+
+        // test example content
+        // - from the first page
+        assertTrue(parsedString.contains("Sample Narrative Report"));
+    }
+
+
+    // TODO: need to create a proper docx encrypted file
+    @Ignore
+    @Test
+    public void testExtractWordEncrypted() throws Exception {
+        InputStream stream = utils.getDocumentStream("word_enc_noerror.docx");
+
+        AbstractTikaProcessor processor = getProcessor();
+        TikaProcessingResult result = processor.process(stream);
+
+        // extraction from an encrypted DOCX will succeed, but with the content empty and no error message
+        // uses: org.apache.tika.parser.microsoft.OfficeParser
+        // TODO: this one needs an internal fix or further investigation
+        assertTrue(result.getSuccess());
+    }
+}
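The concrete processor test classes extending the one above can be run selectively through Gradle, which is also how the CI script at the end of this changeset invokes them:

    ./gradlew test --tests=tika.LegacyTikaProcessorTests
    ./gradlew test --tests=tika.CompositeTikaProcessorTests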
diff --git a/src/test/java/tika/DocumentTestUtils.java b/src/test/java/tika/DocumentTestUtils.java
new file mode 100644
index 0000000..cae8308
--- /dev/null
+++ b/src/test/java/tika/DocumentTestUtils.java
@@ -0,0 +1,75 @@
+package tika;
+
+import tika.model.MetadataKeys;
+import tika.model.TikaProcessingResult;
+import java.io.ByteArrayInputStream;
+import java.io.InputStream;
+import java.util.Map;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+
+import static org.junit.Assert.*;
+
+
+/**
+ * Helper utilities used in tests
+ */
+public class DocumentTestUtils {
+
+    public InputStream getDocumentStream(final String docName) throws Exception {
+        final String fullPath = "tika/docs/" + docName;
+        InputStream stream = getClass().getClassLoader().getResourceAsStream(fullPath);
+        assertNotNull(stream);
+        return new ByteArrayInputStream(stream.readAllBytes());
+    }
+
+    public InputStream getDocumentZipStream(final String archiveName, final String zipEntry) throws Exception {
+        final String fullPath = "tika/docs/" + archiveName;
+        final ZipEntry entry = new ZipEntry(zipEntry);
+        ZipFile zf = new ZipFile(getClass().getClassLoader().getResource(fullPath).getPath());
+        InputStream stream = zf.getInputStream(entry);
+        assertNotNull(stream);
+        return stream;
+    }
+
+    public String getDocumentText(final String path) throws Exception {
+        return new String(getDocumentStream(path).readAllBytes());
+    }
+
+
+    public void assertContentMatches(final String expected, final String actual) {
+        // note that this check is a very naive method of content comparison, as we only
+        // strip all the special characters and compare the content in lowercase
+        final String regexPattern = "[^\\dA-Za-z]";
+        final String s1parsed = expected.replaceAll(regexPattern, "");
+        final String s2parsed = actual.replaceAll(regexPattern, "");
+        assertEquals(s1parsed, s2parsed);
+    }
+
+    public void assertPageCount(final int expectedPageCount, TikaProcessingResult result) {
+        Map<String, Object> metadata = result.getMetadata();
+        assertTrue(metadata.containsKey(MetadataKeys.PAGE_COUNT));
+        // note: in JUnit the expected value goes first
+        assertEquals(expectedPageCount, Integer.parseInt(metadata.get(MetadataKeys.PAGE_COUNT).toString()));
+    }
+
+    public void assertOcrApplied(final boolean expectedStatus, TikaProcessingResult result) {
+        Map<String, Object> metadata = result.getMetadata();
+        if (metadata.containsKey(MetadataKeys.OCR_APPLIED)) {
+            assertEquals(expectedStatus, Boolean.parseBoolean(metadata.get(MetadataKeys.OCR_APPLIED).toString()));
+        }
+        else {
+            assertFalse(expectedStatus);
+        }
+    }
+
+
+    public void testContentMatch(final TikaProcessingResult result, final String docPathPrefix) throws Exception {
+        // read the ground-truth document
+        final String sourceText = getDocumentText(docPathPrefix + ".txt");
+
+        // test the status and content
+        assertTrue(result.getText().length() > 0);
+        assertContentMatches(sourceText, result.getText());
+    }
+}
diff --git a/src/test/java/tika/LegacyTikaProcessorTests.java b/src/test/java/tika/LegacyTikaProcessorTests.java
new file mode 100644
index 0000000..0b9d0d5
--- /dev/null
+++ b/src/test/java/tika/LegacyTikaProcessorTests.java
@@ -0,0 +1,40 @@
+package tika;
+
+import org.junit.*;
+import org.junit.runner.RunWith;
+import org.springframework.beans.factory.annotation.Autowired;
+import org.springframework.boot.test.context.SpringBootTest;
+import org.springframework.test.annotation.DirtiesContext;
+import org.springframework.test.context.ContextConfiguration;
+import org.springframework.test.context.junit4.SpringRunner;
+import tika.legacy.LegacyPdfProcessorConfig;
+import tika.legacy.LegacyTikaProcessor;
+import tika.processor.AbstractTikaProcessor;
+
+
+/**
+ * Implements the tests using LegacyTikaProcessor as the document processor
+ */
+@SpringBootTest(classes = LegacyTikaProcessor.class)
+@RunWith(SpringRunner.class)
+@DirtiesContext
+@ContextConfiguration(classes = {LegacyPdfProcessorConfig.class})
+public class LegacyTikaProcessorTests extends DocumentProcessorTests {
+
+    @Autowired
+    LegacyPdfProcessorConfig defaultConfig;
+
+    @Autowired
+    LegacyTikaProcessor processor;
+
+    @Override
+    protected AbstractTikaProcessor getProcessor() {
+        return processor;
+    }
+
+    @After
+    public void reset() throws Exception {
+        processor.reset();
+    }
+}
diff --git a/src/test/resources/application.yaml b/src/test/resources/application.yaml
new file mode 100644
index 0000000..907c149
--- /dev/null
+++ b/src/test/resources/application.yaml
@@ -0,0 +1,47 @@
+# application configuration
+#
+application:
+  version: 0.1.0
+
+
+# general spring boot configuration
+#
+server:
+  port: 8090
+
+spring:
+  servlet:
+    multipart.max-file-size: 100MB
+    multipart.max-request-size: 100MB
+
+
+# tika configuration
+#
+tika:
+  parsers:
+    tesseract-ocr:
+      language: eng
+      timeout: 300
+      enable-image-processing: false
+      apply-rotation: false
+
+    pdf-ocr-parser:
+      ocr-only-strategy: true
+      min-doc-text-length: 100
+      min-doc-byte-size: 10000
+      use-legacy-ocr-parser-for-single-page-doc: false
+
+    legacy-pdf-parser:
+      image-magick:
+        timeout: 300
+      tesseract-ocr:
+        timeout: 300
+      min-doc-text-length: 100
+
+
+# documents processing configuration
+#
+processing:
+  use-legacy-tika-processor-as-default: true
+  fail-on-empty-files: false
+  fail-on-non-document-types: true
diff --git a/src/test/resources/tika/config/legacy-parser-config.xml b/src/test/resources/tika/config/legacy-parser-config.xml
new file mode 100644
index 0000000..c414d03
--- /dev/null
+++ b/src/test/resources/tika/config/legacy-parser-config.xml
@@ -0,0 +1,14 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<properties>
+    <parsers>
+        <parser class="org.apache.tika.parser.DefaultParser">
+            <mime-exclude>application/pdf</mime-exclude>
+        </parser>
+
+        <parser class="tika.legacy.LegacyPdfProcessorParser">
+            <mime>application/pdf</mime>
+        </parser>
+    </parsers>
+</properties>
\ No newline at end of file
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.doc b/src/test/resources/tika/docs/generic/pat_id_1.doc
new file mode 100644
index 0000000..1fe2ca2
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.doc differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.docx b/src/test/resources/tika/docs/generic/pat_id_1.docx
new file mode 100644
index 0000000..e9700ce
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.docx differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.odt b/src/test/resources/tika/docs/generic/pat_id_1.odt
new file mode 100644
index 0000000..90db7d9
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.odt differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.pdf b/src/test/resources/tika/docs/generic/pat_id_1.pdf
new file mode 100644
index 0000000..5b42732
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.pdf differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.png b/src/test/resources/tika/docs/generic/pat_id_1.png
new file mode 100644
index 0000000..fb8d321
Binary files /dev/null and b/src/test/resources/tika/docs/generic/pat_id_1.png differ
diff --git a/src/test/resources/tika/docs/generic/pat_id_1.rtf b/src/test/resources/tika/docs/generic/pat_id_1.rtf
new file mode 100644
index 0000000..020514a
--- /dev/null
+++ b/src/test/resources/tika/docs/generic/pat_id_1.rtf
@@ -0,0 +1,52 @@
+{\rtf1\ansi\deff3\adeflang1025
+{\fonttbl{\f0\froman\fprq2\fcharset0 Times New Roman;}{\f1\froman\fprq2\fcharset2
Symbol;}{\f2\fswiss\fprq2\fcharset0 Arial;}{\f3\froman\fprq2\fcharset0 Liberation Serif{\*\falt Times New Roman};}{\f4\fswiss\fprq2\fcharset0 Liberation Sans{\*\falt Arial};}{\f5\fswiss\fprq2\fcharset0 Calibri;}{\f6\fnil\fprq2\fcharset0 PingFang SC;}{\f7\fnil\fprq2\fcharset0 Arial Unicode MS;}{\f8\fswiss\fprq0\fcharset128 Arial Unicode MS;}} +{\colortbl;\red0\green0\blue0;\red0\green0\blue255;\red0\green255\blue255;\red0\green255\blue0;\red255\green0\blue255;\red255\green0\blue0;\red255\green255\blue0;\red255\green255\blue255;\red0\green0\blue128;\red0\green128\blue128;\red0\green128\blue0;\red128\green0\blue128;\red128\green0\blue0;\red128\green128\blue0;\red128\green128\blue128;\red192\green192\blue192;} +{\stylesheet{\s0\snext0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057 Normal;} +{\*\cs15\snext15 Default Paragraph Font;} +{\s16\sbasedon0\snext17\ql\sl256\slmult1\widctlpar\sb240\sa120\keepn\ltrpar\cf0\dbch\af6\dbch\af7\afs28\alang1025\loch\f4\fs28\lang2057 Heading;} +{\s17\sbasedon0\snext17\ql\sl276\slmult1\widctlpar\sb0\sa140\ltrpar\cf0\dbch\af5\dbch\af0\afs22\alang1025\loch\f5\fs22\lang2057 Text Body;} +{\s18\sbasedon17\snext18\ql\sl276\slmult1\widctlpar\sb0\sa140\ltrpar\cf0\dbch\af5\dbch\af8\afs22\alang1025\loch\f5\fs22\lang2057 List;} +{\s19\sbasedon0\snext19\ql\sl256\slmult1\widctlpar\sb120\sa120\noline\ltrpar\cf0\i\dbch\af5\dbch\af8\afs24\alang1025\ai\loch\f5\fs24\lang2057 Caption;} +{\s20\sbasedon0\snext20\ql\sl256\slmult1\widctlpar\sb0\sa160\noline\ltrpar\cf0\dbch\af5\dbch\af8\afs22\alang1025\loch\f5\fs22\lang2057 Index;} +}{\*\generator LibreOffice/6.1.0.3$MacOSX_X86_64 LibreOffice_project/efb621ed25068d70781dc026f7e9c5187a4decd1}{\info{\author Rich}{\creatim\yr2015\mo11\dy2\hr16\min52}{\author Rich}{\revtim\yr2015\mo11\dy2\hr16\min59}{\printim\yr0\mo0\dy0\hr0\min0}}{\*\userprops}\deftab720 +\viewscale100 +{\*\pgdsctbl +{\pgdsc0\pgdscuse451\pgwsxn11906\pghsxn16838\marglsxn1440\margrsxn1440\margtsxn1440\margbsxn1440\pgdscnxt0 Default Style;}} +\formshade{\*\pgdscno0}\paperh16838\paperw11906\margl1440\margr1440\margt1440\margb1440\sectd\sbknone\sectunlocked1\pgndec\pgwsxn11906\pghsxn16838\marglsxn1440\margrsxn1440\margtsxn1440\margbsxn1440\ftnbj\ftnstart1\ftnrstcont\ftnnar\aenddoc\aftnrstcont\aftnstart1\aftnnrlc\htmautsp +{\*\ftnsep\chftnsep}\pgndec\pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch +This is an example of a clinical document} +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch + +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch +The patient\u8217\'92s name is Bart Davidson.} +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch +His carer\u8217\'92s Name Paul Wayne.} +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch + +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch +His telephone number is 07754828992} +\par \pard\plain 
\s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch + +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch +His Address is 61 Basildon Way, } +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch +East Croyhurst, } +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch +Angelton, } +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch +AL64 9HT} +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch + +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch +His mother\u8217\'92s name is Pauline Smith.} +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch + +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057{\rtlch \ltrch\loch +He is on 100mg Paracetamol, 20 milligrams clozapine} +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch + +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\rtlch \ltrch\loch + +\par \pard\plain \s0\ql\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\hyphpar0\cf0\dbch\af5\dbch\af0\afs22\alang1025\langfe2052\loch\f5\fs22\lang2057\sl256\slmult1\widctlpar\sb0\sa160\ltrpar\rtlch \ltrch\loch + +\par } \ No newline at end of file diff --git a/src/test/resources/tika/docs/generic/pat_id_1.txt b/src/test/resources/tika/docs/generic/pat_id_1.txt new file mode 100644 index 0000000..5d9c770 --- /dev/null +++ b/src/test/resources/tika/docs/generic/pat_id_1.txt @@ -0,0 +1,17 @@ +This is an example of a clinical document + +The patient’s name is Bart Davidson. +His carer’s Name Paul Wayne. + +His telephone number is 07754828992 + +His Address is 61 Basildon Way, +East Croyhurst, +Angelton, +AL64 9HT + +His mother’s name is Pauline Smith. 
+
+He is on 100mg Paracetamol, 20 milligrams clozapine
+
+
diff --git a/src/test/resources/tika/docs/invalid/pdf_empty.pdf b/src/test/resources/tika/docs/invalid/pdf_empty.pdf
new file mode 100644
index 0000000..e69de29
diff --git a/src/test/resources/tika/docs/invalid/tiff_multipage_spp2.tiff.zip b/src/test/resources/tika/docs/invalid/tiff_multipage_spp2.tiff.zip
new file mode 100644
index 0000000..f0627de
Binary files /dev/null and b/src/test/resources/tika/docs/invalid/tiff_multipage_spp2.tiff.zip differ
diff --git a/src/test/resources/tika/docs/invalid/word_enc_noerror.docx b/src/test/resources/tika/docs/invalid/word_enc_noerror.docx
new file mode 100644
index 0000000..2be820e
Binary files /dev/null and b/src/test/resources/tika/docs/invalid/word_enc_noerror.docx differ
diff --git a/src/test/resources/tika/docs/pdf/ex1.pdf b/src/test/resources/tika/docs/pdf/ex1.pdf
new file mode 100644
index 0000000..ab5db00
Binary files /dev/null and b/src/test/resources/tika/docs/pdf/ex1.pdf differ
diff --git a/src/test/resources/tika/docs/pdf/ex1_enc.pdf b/src/test/resources/tika/docs/pdf/ex1_enc.pdf
new file mode 100644
index 0000000..5e3b5d0
Binary files /dev/null and b/src/test/resources/tika/docs/pdf/ex1_enc.pdf differ
diff --git a/src/test/resources/tika/docs/pdf/ex2_ocr.pdf b/src/test/resources/tika/docs/pdf/ex2_ocr.pdf
new file mode 100644
index 0000000..9fd8321
Binary files /dev/null and b/src/test/resources/tika/docs/pdf/ex2_ocr.pdf differ
diff --git a/travis_gradle_build.sh b/travis_gradle_build.sh
new file mode 100644
index 0000000..10d6457
--- /dev/null
+++ b/travis_gradle_build.sh
@@ -0,0 +1,97 @@
+#!/bin/bash
+
+# Abort on error, uninitialized variables and pipe errors
+set -eEu
+set -o pipefail
+#set -v
+
+export PING_SLEEP=30s
+export WORKDIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+export BUILD_OUTPUT=$WORKDIR/build.out
+export TEST_PROC_LOG_OUTPUT=$WORKDIR/test-proc.out
+export TEST_API_LOG_OUTPUT=$WORKDIR/test-api.out
+
+# dump only the last N lines of each output file
+DUMP_LINES_BUILD=2000
+DUMP_LINES_TEST_PROC=5000
+DUMP_LINES_TEST_API=2000
+
+touch $BUILD_OUTPUT
+touch $TEST_PROC_LOG_OUTPUT
+touch $TEST_API_LOG_OUTPUT
+
+
+# Helper functions
+#
+print_log_separator() {
+  echo "----------------------------------------------------------------"
+  echo "-"
+  echo "-"
+  echo "-"
+  echo "-"
+  echo "----------------------------------------------------------------"
+}
+
+dump_output() {
+  if [ "$2" -eq "-1" ]; then
+    echo "Printing all the output: $1"
+    cat "$1"
+  else
+    echo "Tailing the last $2 lines of output: $1"
+    tail -n "$2" "$1"
+  fi
+}
+
+print_logs() {
+  print_log_separator
+  dump_output $BUILD_OUTPUT $DUMP_LINES_BUILD
+
+  print_log_separator
+  dump_output $TEST_PROC_LOG_OUTPUT $DUMP_LINES_TEST_PROC
+
+  print_log_separator
+  dump_output $TEST_API_LOG_OUTPUT $DUMP_LINES_TEST_API
+}
+
+run_build() {
+  #./gradlew build --full-stacktrace --debug 2>&1 | tee >(grep TestEventLogger | grep -P -n "[[:ascii:]]" >> $TEST_LOG_OUTPUT) | grep -P -n "[[:ascii:]]" >> $BUILD_OUTPUT
+  # note the redirection order: '>> file 2>&1' captures stderr in the log as well
+  ./gradlew assemble --full-stacktrace >> $BUILD_OUTPUT 2>&1
+}
+
+run_tests() {
+  # enable debug output here to spot the errors
+  ./gradlew test --full-stacktrace --debug --tests=tika.LegacyTikaProcessorTests 2>&1 | grep TestEventLogger >> $TEST_PROC_LOG_OUTPUT
+  ./gradlew test --full-stacktrace --debug --tests=tika.CompositeTikaProcessorTests 2>&1 | grep TestEventLogger >> $TEST_PROC_LOG_OUTPUT
+  # disable debug output here -- it is far too verbose
+  ./gradlew test --full-stacktrace --tests=ServiceControllerTests >> $TEST_API_LOG_OUTPUT 2>&1
+  ./gradlew test --full-stacktrace --tests=ServiceControllerDocumentMultipartFileTests >> $TEST_API_LOG_OUTPUT 2>&1
+  ./gradlew test --full-stacktrace --tests=ServiceControllerDocumentStreamTests >> $TEST_API_LOG_OUTPUT 2>&1
+}
+
+error_handler() {
+  echo "ERROR: an error was encountered during the build."
+  print_logs
+  exit 1
+}
+
+
+# The Main
+#
+
+# If an error occurs, run our error handler to output a tail of the build logs
+trap 'error_handler' ERR SIGPIPE
+
+# Set up a repeating loop to send some output to Travis (so the job is not killed for inactivity)
+bash -c "while true; do echo \$(date) - building ...; sleep $PING_SLEEP; done" &
+PING_LOOP_PID=$!
+
+# Build Commands
+run_build
+run_tests
+
+# 'nicely' terminate the ping output loop
+kill $PING_LOOP_PID
+
+# Print the logs
+echo SUCCESS
+print_logs
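
Side note on the script above: Travis terminates jobs that produce no console output for a while, so `travis_gradle_build.sh` sends the noisy Gradle output to log files, keeps a background heartbeat loop printing to the console, and dumps the tails of the captured logs from an `ERR` trap on failure. A minimal, self-contained sketch of that pattern (the log file name and 30-second interval here are illustrative, not taken from the script):

```bash
#!/bin/bash
# Minimal sketch of the keep-alive + error-trap pattern used by travis_gradle_build.sh.
set -eE -o pipefail

LOG=build.out    # illustrative log file name

on_error() {
    echo "Build failed; last 50 lines of $LOG:"
    tail -n 50 "$LOG"
    exit 1
}
# 'set -E' above lets the ERR trap fire inside functions and subshells too
trap on_error ERR

# Heartbeat: periodic console output so CI does not kill the job as inactive
while true; do echo "$(date) - still building ..."; sleep 30; done &
HEARTBEAT_PID=$!

# '>> file 2>&1' (in this order) captures stderr in the log as well as stdout
./gradlew build >> "$LOG" 2>&1

kill "$HEARTBEAT_PID"
echo "SUCCESS"
```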
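
Relatedly, the test `application.yaml` earlier in this changeset binds the service to port 8090, so a manual smoke test against a locally running instance could look like the sketch below. The endpoint path is a placeholder: the actual routes live in the `ServiceController` classes exercised by the API tests, not in this part of the diff.

```bash
# NOTE: '/api/process' is a hypothetical path -- replace it with the route
# actually exposed by the ServiceController in this repository.
curl -X POST 'http://localhost:8090/api/process' \
     -H 'Content-Type: application/octet-stream' \
     --data-binary '@src/test/resources/tika/docs/generic/pat_id_1.pdf'
```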