From 923ecda706bd72c651fbbe5840a25711e83d0409 Mon Sep 17 00:00:00 2001 From: Sahara Yousuf Date: Fri, 23 Feb 2024 15:21:02 -0600 Subject: [PATCH 1/2] Add RegexExtractText Kotlin sample --- .github/workflows/test-kotlin-samples.yml | 2 +- .../runConfigurations/RegexExtractText.xml | 14 ++ RegexExtractText/pom.xml | 236 ++++++++++++++++++ .../pdfl/samples/RegexExtractText.kt | 139 +++++++++++ 4 files changed, 390 insertions(+), 1 deletion(-) create mode 100644 RegexExtractText/.idea/runConfigurations/RegexExtractText.xml create mode 100644 RegexExtractText/pom.xml create mode 100644 RegexExtractText/src/main/kotlin/com/datalogics/pdfl/samples/RegexExtractText.kt diff --git a/.github/workflows/test-kotlin-samples.yml b/.github/workflows/test-kotlin-samples.yml index 0ee2a5c..9edc212 100644 --- a/.github/workflows/test-kotlin-samples.yml +++ b/.github/workflows/test-kotlin-samples.yml @@ -17,7 +17,7 @@ jobs: strategy: matrix: os: [windows-latest, ubuntu-latest] - dir: ['TextExtract', 'FlattenTransparency', 'SplitPDF', 'ConvertToOffice', 'MergePDF', 'ListWords', 'PDFOptimize', 'PDFAConverter', 'Redactions'] + dir: ['TextExtract', 'FlattenTransparency', 'SplitPDF', 'ConvertToOffice', 'MergePDF', 'ListWords', 'PDFOptimize', 'PDFAConverter', 'Redactions', 'RegexExtractText'] steps: - name: Checkout uses: actions/checkout@v4 diff --git a/RegexExtractText/.idea/runConfigurations/RegexExtractText.xml b/RegexExtractText/.idea/runConfigurations/RegexExtractText.xml new file mode 100644 index 0000000..b758f36 --- /dev/null +++ b/RegexExtractText/.idea/runConfigurations/RegexExtractText.xml @@ -0,0 +1,14 @@ + + + + + + + \ No newline at end of file diff --git a/RegexExtractText/pom.xml b/RegexExtractText/pom.xml new file mode 100644 index 0000000..525c870 --- /dev/null +++ b/RegexExtractText/pom.xml @@ -0,0 +1,236 @@ + + + 4.0.0 + com.datalogics.pdfl.samples + RegexExtractText + 1.0-SNAPSHOT + + + + mavenCentral + https://repo1.maven.org/maven2/ + + + + + UTF-8 + official + 1.8 + 
1.8 + 1.8 + + + + + Windows64 + + + windows + amd64 + + + + win-x86-64-jni + + + + MacArm + + + mac + aarch64 + + + + mac-arm-64-jni + + + + Linux64 + + + + Linux + amd64 + + + + linux-x86-64-jni + + + + + + + org.jetbrains.kotlin + kotlin-stdlib-jdk8 + 1.9.21 + + + com.datalogics.pdfl + pdfl + 18.31.0 + pom + + + com.datalogics.pdfl + pdfl + 18.31.0 + + + com.datalogics.pdfl + pdfl + 18.31.0 + zip + ${jni.classifier} + + + com.datalogics.pdfl + pdfl + 18.31.0 + zip + resources + + + com.datalogics.pdfl + pdfl + 18.31.0 + javadoc + + + org.json + json + [20230227,) + + + + + src/main/kotlin + + + org.jetbrains.kotlin + kotlin-maven-plugin + 1.9.21 + + + compile + compile + + compile + + + + + + maven-surefire-plugin + 2.22.2 + + + maven-failsafe-plugin + 2.22.2 + + + org.codehaus.mojo + exec-maven-plugin + 1.6.0 + + RegexExtractText + + + + org.apache.maven.plugins + maven-dependency-plugin + + + unpack-resources + generate-resources + + unpack + + + + + com.datalogics.pdfl + pdfl + resources + zip + ${project.build.directory}/lib/Resources + + + + + + unpack-jni + generate-resources + + unpack + + + + + com.datalogics.pdfl + pdfl + ${jni.classifier} + zip + ${project.build.directory}/lib + + + + + + unpack-license + generate-resources + + unpack + + + + + com.datalogics.pdfl + pdfl + license + zip + ${project.build.directory}/lib + + + + + + + + maven-assembly-plugin + + + package + + single + + + + + + + true + com.datalogics.pdfl.samples.RegexExtractTextKt + + + + jar-with-dependencies + + + + + + + + org.apache.maven.plugins + maven-dependency-plugin + 3.0.2 + + + + + + \ No newline at end of file diff --git a/RegexExtractText/src/main/kotlin/com/datalogics/pdfl/samples/RegexExtractText.kt b/RegexExtractText/src/main/kotlin/com/datalogics/pdfl/samples/RegexExtractText.kt new file mode 100644 index 0000000..eee2b79 --- /dev/null +++ b/RegexExtractText/src/main/kotlin/com/datalogics/pdfl/samples/RegexExtractText.kt @@ -0,0 +1,139 @@ +package 
com.datalogics.pdfl.samples + +import com.datalogics.PDFL.* +import org.json.JSONArray +import org.json.JSONObject +import java.io.FileWriter + +/* + * + * This sample demonstrates using DocTextFinder to find instances of a phrase + * that matches a user-supplied regular expression. The output is a JSON file that + * has the match information. + * + * Copyright (c) 2024, Datalogics, Inc. All rights reserved. + * + */ + +// This Datalogics sample uses the org.json (JSON-Java) library to generate JSON output. Below is the JSON license for the org.json software: +/* +Copyright (c) 2002 JSON.org +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), +to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, +and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: +The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. +The Software shall be used for Good, not Evil. +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +IN THE SOFTWARE. 
/**
 * Builds the JSON object used for one corner of a quad.
 *
 * @param x horizontal coordinate, stored under the key `x`
 * @param y vertical coordinate, stored under the key `y`
 * @return a new [JSONObject] holding the two coordinates
 */
private fun cornerToJson(x: Double, y: Double): JSONObject {
    val corner = JSONObject()
    corner.put("x", x)
    corner.put("y", y)
    return corner
}

/**
 * Entry point of the RegexExtractText sample.
 *
 * Opens a PDF, asks [DocTextFinder] for every match of a regular expression on pages
 * `0` through `numPages - 1`, and writes the matches to `RegexExtractText-out.json`
 * (a relative path). The output is a JSON array with one object per match: the matched
 * text under `match-phrase`, and under `match-quads` an array holding the page number
 * and the four corner coordinates of every quad reported for that match.
 *
 * @param args optional command-line arguments; `args[0]`, when present, is the path of
 * the input PDF. Otherwise `Sample_Input/RegexExtractText.pdf` below the library's
 * resource directory is used.
 */
fun main(args: Array<String>) {
    println("RegexExtractText sample:")

    val lib = Library()

    try {
        val sInput = args.firstOrNull()
            ?: (Library.getResourceDirectory() + "Sample_Input/RegexExtractText.pdf")

        val sOutput = "RegexExtractText-out.json"

        // Phone numbers
        val sRegex = "((1-)?(\\()?\\d{3}(\\))?(\\s)?(-)?\\d{3}-\\d{4})"
        // Email addresses
        //val sRegex = "(\\b[\\w.!#$%&'*+\\/=?^`{|}~-]+@[\\w-]+(?:\\.[\\w-]+)*\\b)"
        // URLs
        //val sRegex = "((https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,}))"

        println("Reading $sInput")

        val doc = Document(sInput)

        // Nested try/finally so the document is closed even when matching or writing fails.
        try {
            // This array will hold the JSON stream that we will print to the output JSON file.
            val result = JSONArray()

            val nPages = doc.numPages

            println("Opened document $sInput")

            val wordConfig = WordFinderConfig()

            // Need to set this to true so phrases will be concatenated properly.
            wordConfig.noHyphenDetection = true

            val docTextFinder = DocTextFinder(doc, wordConfig)

            val docMatches = docTextFinder.getMatchList(0, nPages - 1, sRegex)

            for (wInfo in docMatches) {
                // This JSON array will store the page number and quad location for each match quad.
                val matchQuadInformation = JSONArray()

                // Iterate through the quad info of the match.
                for (qInfo in wInfo.quadInfo) {
                    for (quad in qInfo.quads) {
                        // One JSON object per quad, holding its four corner coordinates.
                        val quadLocation = JSONObject()
                        quadLocation.put("bottom-left", cornerToJson(quad.bottomLeft.h, quad.bottomLeft.v))
                        quadLocation.put("bottom-right", cornerToJson(quad.bottomRight.h, quad.bottomRight.v))
                        quadLocation.put("top-left", cornerToJson(quad.topLeft.h, quad.topLeft.v))
                        quadLocation.put("top-right", cornerToJson(quad.topRight.h, quad.topRight.v))

                        val quadInformationObject = JSONObject()
                        quadInformationObject.put("page-number", qInfo.pageNum)
                        quadInformationObject.put("quad-location", quadLocation)

                        // Insert the match's page number and quad location in the matchQuadInformation JSON array.
                        matchQuadInformation.put(quadInformationObject)
                    }
                }

                // This JSON object stores the match phrase and the array of quads for the match.
                val matchObject = JSONObject()
                matchObject.put("match-phrase", wInfo.matchString)
                matchObject.put("match-quads", matchQuadInformation)

                result.put(matchObject)
            }

            // Write the match information to the output JSON file. The charset is given
            // explicitly so the result does not depend on the platform default encoding.
            println("Writing JSON to $sOutput")
            java.io.File(sOutput).writeText(result.toString(4), Charsets.UTF_8)
        } finally {
            doc.close()
        }
    } finally {
        lib.delete()
    }
}