Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add RegexTextSearch Kotlin sample #14

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/test-kotlin-samples.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ jobs:
strategy:
matrix:
os: [windows-latest, ubuntu-latest]
dir: ['TextExtract', 'FlattenTransparency', 'SplitPDF', 'ConvertToOffice', 'MergePDF', 'ListWords', 'PDFOptimize', 'PDFAConverter', 'Redactions', 'Watermark']
dir: ['TextExtract', 'FlattenTransparency', 'SplitPDF', 'ConvertToOffice', 'MergePDF', 'ListWords', 'PDFOptimize', 'PDFAConverter', 'Redactions', 'Watermark', 'RegexTextSearch']
steps:
- name: Checkout
uses: actions/checkout@v4
Expand Down
14 changes: 14 additions & 0 deletions RegexTextSearch/.idea/runConfigurations/RegexTextSearch.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

231 changes: 231 additions & 0 deletions RegexTextSearch/pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>com.datalogics.pdfl.samples</groupId>
<artifactId>RegexTextSearch</artifactId>
<version>1.0-SNAPSHOT</version>

<repositories>
<repository>
<id>mavenCentral</id>
<url>https://repo1.maven.org/maven2/</url>
</repository>
</repositories>

<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<kotlin.code.style>official</kotlin.code.style>
<kotlin.compiler.jvmTarget>1.8</kotlin.compiler.jvmTarget>
<maven.compiler.source>1.8</maven.compiler.source>
<maven.compiler.target>1.8</maven.compiler.target>
</properties>

<profiles>
<!-- Each profile activates on one OS/architecture pair and sets the jni.classifier property.
     That property is used below as the classifier of the native pdfl zip dependency and of the
     artifact unpacked by the unpack-jni execution. -->
<!-- NOTE(review): only Windows amd64, Mac aarch64 and Linux amd64 are covered here; on any other
     platform no profile activates and jni.classifier is left undefined. Confirm this is intended. -->
<profile>
<id>Windows64</id>
<activation>
<os>
<family>windows</family>
<arch>amd64</arch>
</os>
</activation>
<properties>
<jni.classifier>win-x86-64-jni</jni.classifier>
</properties>
</profile>
<profile>
<id>MacArm</id>
<activation>
<os>
<family>mac</family>
<arch>aarch64</arch>
</os>
</activation>
<properties>
<jni.classifier>mac-arm-64-jni</jni.classifier>
</properties>
</profile>
<profile>
<id>Linux64</id>
<activation>
<os>
<!-- Use OS <name> instead of <family> because the "unix" <family> also includes Mac -->
<name>Linux</name>
<arch>amd64</arch>
</os>
</activation>
<properties>
<jni.classifier>linux-x86-64-jni</jni.classifier>
</properties>
</profile>
</profiles>

<dependencies>
<dependency>
<groupId>org.jetbrains.kotlin</groupId>
<artifactId>kotlin-stdlib-jdk8</artifactId>
<version>1.9.21</version>
</dependency>
<dependency>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<version>18.31.0</version>
<type>pom</type>
</dependency>
<dependency>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<version>18.31.0</version>
</dependency>
<dependency>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<version>18.31.0</version>
<type>zip</type>
<classifier>${jni.classifier}</classifier>
</dependency>
<dependency>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<version>18.31.0</version>
<type>zip</type>
<classifier>resources</classifier>
</dependency>
<dependency>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<version>18.31.0</version>
<classifier>javadoc</classifier>
</dependency>
</dependencies>

<build>
<sourceDirectory>src/main/kotlin</sourceDirectory>
<plugins>
<plugin>
<groupId>org.jetbrains.kotlin</groupId>
<artifactId>kotlin-maven-plugin</artifactId>
<version>1.9.21</version>
<executions>
<execution>
<id>compile</id>
<phase>compile</phase>
<goals>
<goal>compile</goal>
</goals>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-surefire-plugin</artifactId>
<version>2.22.2</version>
</plugin>
<plugin>
<artifactId>maven-failsafe-plugin</artifactId>
<version>2.22.2</version>
</plugin>
<plugin>
    <groupId>org.codehaus.mojo</groupId>
    <artifactId>exec-maven-plugin</artifactId>
    <version>1.6.0</version>
    <configuration>
        <!-- The sample's main() is a top-level Kotlin function in package
             com.datalogics.pdfl.samples, so the JVM entry class is the file-facade class
             (<FileName>Kt), not a class named RegexTextSearch. This matches the manifest
             mainClass configured for maven-assembly-plugin.
             NOTE(review): assumes the source file is named RegexTextSearch.kt. -->
        <mainClass>com.datalogics.pdfl.samples.RegexTextSearchKt</mainClass>
    </configuration>
</plugin>
<plugin>
<!-- Unpacks the pdfl zip artifacts into the build directory during generate-resources.
     The artifactItems carry no <version>; presumably the plugin resolves it from the pdfl
     entries in <dependencies> (TODO confirm). -->
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<executions>
<execution>
<!-- Extracts the "resources" zip into ${project.build.directory}/lib/Resources. -->
<id>unpack-resources</id>
<phase>generate-resources</phase>
<goals>
<goal>unpack</goal>
</goals>
<configuration>
<artifactItems>
<artifactItem>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<classifier>resources</classifier>
<type>zip</type>
<outputDirectory>${project.build.directory}/lib/Resources</outputDirectory>
</artifactItem>
</artifactItems>
</configuration>
</execution>
<execution>
<!-- Extracts the platform-specific JNI zip (classifier chosen by the active OS profile)
     into ${project.build.directory}/lib. -->
<id>unpack-jni</id>
<phase>generate-resources</phase>
<goals>
<goal>unpack</goal>
</goals>
<configuration>
<artifactItems>
<artifactItem>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<classifier>${jni.classifier}</classifier>
<type>zip</type>
<outputDirectory>${project.build.directory}/lib</outputDirectory>
</artifactItem>
</artifactItems>
</configuration>
</execution>
<execution>
<!-- Extracts the "license" zip into ${project.build.directory}/lib.
     NOTE(review): unlike the other two items, no dependency with classifier "license" is
     declared in <dependencies>; confirm the artifact and its version resolve as expected. -->
<id>unpack-license</id>
<phase>generate-resources</phase>
<goals>
<goal>unpack</goal>
</goals>
<configuration>
<artifactItems>
<artifactItem>
<groupId>com.datalogics.pdfl</groupId>
<artifactId>pdfl</artifactId>
<classifier>license</classifier>
<type>zip</type>
<outputDirectory>${project.build.directory}/lib</outputDirectory>
</artifactItem>
</artifactItems>
</configuration>
</execution>
</executions>
</plugin>
<plugin>
<artifactId>maven-assembly-plugin</artifactId>
<executions>
<execution>
<phase>package</phase>
<goals>
<goal>single</goal>
</goals>
</execution>
</executions>
<configuration>
<archive>
<manifest>
<addClasspath>true</addClasspath>
<mainClass>com.datalogics.pdfl.samples.RegexTextSearchKt</mainClass>
</manifest>
</archive>
<descriptorRefs>
<descriptorRef>jar-with-dependencies</descriptorRef>
</descriptorRefs>
</configuration>
</plugin>
</plugins>
<pluginManagement>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-dependency-plugin</artifactId>
<version>3.0.2</version>
</plugin>
</plugins>
</pluginManagement>
</build>

</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
package com.datalogics.pdfl.samples

import com.datalogics.PDFL.*
import java.util.*

/*
*
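* This sample shows how to search a PDF document using regex pattern matching. The program opens an input PDF, searches for
* text matching a regular expression using the DocTextFinder, prints each match to the console, highlights it in the
* document, and saves the annotated document as RegexTextSearch-out.pdf.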
*
* Copyright (c) 2024, Datalogics, Inc. All rights reserved.
*
*/

/**
 * Entry point for the RegexTextSearch sample.
 *
 * Opens a PDF, collects every phrase matching a regular expression via [DocTextFinder], prints
 * each match, adds a [HighlightAnnotation] over each match's quads, and saves the result to
 * `RegexTextSearch-out.pdf` (relative to the working directory).
 *
 * @param args optional command-line arguments; `args[0]`, when present, is the input PDF path.
 * Otherwise the sample input under the library resource directory is used.
 */
fun main(args: Array<String>) {
    println("RegexTextSearch sample:")
    val lib = Library()

    try {
        // Fall back to the bundled sample input when no path is given on the command line.
        val sInput: String =
            args.firstOrNull() ?: (Library.getResourceDirectory() + "Sample_Input/RegexTextSearch.pdf")

        val sOutput = "RegexTextSearch-out.pdf"

        // Highlight occurrences of the words that match this regular expression.
        // Phone numbers
        val sRegex = "((1-)?(\\()?\\d{3}(\\))?(\\s)?(-)?\\d{3}-\\d{4})"
        // Email addresses
        //val sRegex = "(\\b[\\w.!#$%&'*+\\/=?^`{|}~-]+@[\\w-]+(?:\\.[\\w-]+)*\\b)"
        // URLs
        //val sRegex = "((https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|www\\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\\.[^\\s]{2,}|https?:\\/\\/(?:www\\.|(?!www))[a-zA-Z0-9]+\\.[^\\s]{2,}|www\\.[a-zA-Z0-9]+\\.[^\\s]{2,}))"

        println("Reading $sInput")

        val doc = Document(sInput)
        // Close the document even if searching, annotating, or saving throws, so the handle is
        // released before the library itself is torn down in the outer finally.
        try {
            val nPages = doc.numPages
            println("Opened document $sInput")

            val wordConfig = WordFinderConfig()

            // Need to set this to true so phrases will be concatenated properly
            wordConfig.noHyphenDetection = true

            // Search the whole document: first page index 0 through last page index nPages - 1.
            val docTextFinder = DocTextFinder(doc, wordConfig)
            val docMatches = docTextFinder.getMatchList(0, nPages - 1, sRegex)

            for (wInfo in docMatches) {
                // Show the matching phrase
                println(wInfo.matchString)

                // Iterate through the match's quad info and create a highlight for each entry.
                // NOTE(review): the Page objects returned by getPage are not explicitly released
                // here; confirm whether the PDFL API expects that.
                for (qInfo in wInfo.quadInfo) {
                    val docPage = doc.getPage(qInfo.pageNum)
                    val highlight = HighlightAnnotation(docPage, qInfo.quads)
                    highlight.normalAppearance = highlight.generateAppearance()
                }
            }

            // Save the document with the highlighted matched strings
            doc.save(EnumSet.of(SaveFlags.FULL), sOutput)
        } finally {
            doc.close()
        }
    } finally {
        lib.delete()
    }
}
Loading