From 98fe018bd2fc71b7ce39280d6a1edc2849ecba02 Mon Sep 17 00:00:00 2001 From: Simon Welsch Date: Thu, 7 Dec 2023 13:22:30 +0100 Subject: [PATCH 1/2] feat(deps): upgrade PDFBox to 3.0.1 --- pom.xml | 2 +- src/main/java/org/jadice/filetype/matchers/PDFMatcher.java | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 046c717..97cf273 100644 --- a/pom.xml +++ b/pom.xml @@ -58,7 +58,7 @@ 1.22 1.9.4 - 2.0.27 + 3.0.1 2.3.1 2.3.6 diff --git a/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java b/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java index b130a4a..af58c75 100644 --- a/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java +++ b/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java @@ -15,6 +15,8 @@ import javax.xml.transform.stream.StreamResult; import javax.xml.transform.stream.StreamSource; +import org.apache.pdfbox.Loader; +import org.apache.pdfbox.io.IOUtils; import org.apache.pdfbox.pdmodel.PDDocument; import org.apache.pdfbox.pdmodel.PDDocumentCatalog; import org.apache.pdfbox.pdmodel.PDDocumentInformation; @@ -82,10 +84,10 @@ public boolean matches(final Context context) { SeekableInputStream sis = context.getStream(); try { sis.seek(0); - try (PDDocument document = PDDocument.load(sis)) { + try (PDDocument document = Loader.loadPDF(IOUtils.toByteArray(sis))) { context.setProperty(MimeTypeAction.KEY, PDF_MIME_TYPE); - Map pdfDetails = new HashMap(); + Map pdfDetails = new HashMap<>(); context.setProperty(DETAILS_KEY, pdfDetails); pdfDetails.put(NUMBER_OF_PAGES_KEY, Integer.valueOf(document.getNumberOfPages())); From 10fffc5654daa1e14f469a3adab980bd50940de7 Mon Sep 17 00:00:00 2001 From: Simon Welsch Date: Thu, 7 Dec 2023 13:35:03 +0100 Subject: [PATCH 2/2] fix: fix code smells in PDFMatcher --- .../jadice/filetype/matchers/PDFMatcher.java | 53 ++++++------------- 1 file changed, 17 insertions(+), 36 deletions(-) diff --git a/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java b/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java index af58c75..03d714e 100644 --- a/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java +++ b/src/main/java/org/jadice/filetype/matchers/PDFMatcher.java @@ -3,9 +3,13 @@ import java.io.IOException; import java.io.InputStream; import java.io.StringWriter; -import java.util.*; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; import java.util.Map.Entry; +import javax.xml.XMLConstants; import javax.xml.transform.OutputKeys; import javax.xml.transform.Source; import javax.xml.transform.Transformer; @@ -43,7 +47,7 @@ * A {@link Matcher} for PDF documents . *

* Caveat: for performance reasons, this should only be called from a context where the stream has - * already be identified as a PDF file/stream. + * already been identified as a PDF file/stream. */ public class PDFMatcher extends Matcher { private static final Logger LOGGER = LoggerFactory.getLogger(PDFMatcher.class); @@ -90,7 +94,7 @@ public boolean matches(final Context context) { Map pdfDetails = new HashMap<>(); context.setProperty(DETAILS_KEY, pdfDetails); - pdfDetails.put(NUMBER_OF_PAGES_KEY, Integer.valueOf(document.getNumberOfPages())); + pdfDetails.put(NUMBER_OF_PAGES_KEY, document.getNumberOfPages()); PDDocumentInformation info = document.getDocumentInformation(); if (null != info) { @@ -114,7 +118,7 @@ public boolean matches(final Context context) { pdfDetails.put(IS_ENCRYPTED_KEY, false); - final List filenames = new ArrayList(); + final List filenames = new ArrayList<>(); PDDocumentNameDictionary namesDictionary = new PDDocumentNameDictionary(document.getDocumentCatalog()); PDEmbeddedFilesNameTreeNode efTree = namesDictionary.getEmbeddedFiles(); @@ -166,7 +170,10 @@ private void provideXMPMetadata(final Map pdfDetails, final PDMe StreamResult xmlOutput = new StreamResult(new StringWriter()); // Configure transformer - Transformer transformer = TransformerFactory.newInstance().newTransformer(); + TransformerFactory tf = TransformerFactory.newInstance(); + tf.setAttribute(XMLConstants.ACCESS_EXTERNAL_DTD, ""); + tf.setAttribute(XMLConstants.ACCESS_EXTERNAL_STYLESHEET, ""); + Transformer transformer = tf.newTransformer(); transformer.setOutputProperty(OutputKeys.INDENT, "yes"); transformer.transform(xmlInput, xmlOutput); @@ -185,7 +192,7 @@ private static void extractFilesFromPage(final PDPage page, final List f PDComplexFileSpecification complexFileSpec = (PDComplexFileSpecification) fileSpec; PDEmbeddedFile embeddedFile = getEmbeddedFile(complexFileSpec); if (embeddedFile != null) { - extractFile(filenames, complexFileSpec.getFilename(), embeddedFile); + extractFile(filenames, complexFileSpec.getFilename()); } } } @@ -206,19 +213,17 @@ private static void extractFilesFromEFTree(final PDEmbeddedFilesNameTreeNode efT } } - private static void extractFiles(final Map names, final List filenames) - throws IOException { + private static void extractFiles(final Map names, final List filenames) { for (Entry entry : names.entrySet()) { PDComplexFileSpecification fileSpec = entry.getValue(); PDEmbeddedFile embeddedFile = getEmbeddedFile(fileSpec); if (embeddedFile != null) { - extractFile(filenames, fileSpec.getFilename(), embeddedFile); + extractFile(filenames, fileSpec.getFilename()); } } } - private static void extractFile(final List filenames, final String filename, - final PDEmbeddedFile embeddedFile) throws IOException { + private static void extractFile(final List filenames, final String filename) { filenames.add(filename); } @@ -263,7 +268,7 @@ private static void addTextInfo(final Map pdfDetails, final PDDo reader.setEndPage(i); final String pdfText = reader.getText(doc).replaceAll("([\\r\\n])", ""); textLengthPerPages.add(pdfText.length()); - if (pdfText.length() > 0) { + if (!pdfText.isEmpty()) { containsText = true; } } @@ -275,28 +280,4 @@ private static void addTextInfo(final Map pdfDetails, final PDDo } } - /** - * Reads the whole stream to determine the length of it. - * - * @param sis stream - * @return length of given stream or -1 if any error occurred - */ - private static long getFileLength(final SeekableInputStream sis) { - try { - sis.seek(0); - int read = 0; - final byte[] buffer = new byte[4096]; - do { - synchronized (sis) { // perform synchronization inside while loop! See DOCPV-932 - read = sis.read(buffer); - } - } while (read != -1); - - // whole sis is read now - return sis.length(); - } catch (Exception e) { - LOGGER.warn("Failed to determine file length.", e); - return -1; - } - } }