Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Using FILE_ABSOLUTEPATH and Original-Filename to populate FILEXT #600

Merged
merged 7 commits into from
Dec 27, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions src/main/java/emissary/core/constants/Parameters.java
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ public class Parameters {
// Common parameters
public static final String DOCUMENT_TITLE = "DocumentTitle";
public static final String EVENT_DATE = "EventDate";
public static final String FILEXT = "FILEXT";
public static final String FILE_ABSOLUTEPATH = "FILE_ABSOLUTEPATH";
public static final String FILE_DATE = "FILE_DATE";
public static final String FILE_NAME = "FILE_NAME";
public static final String INPUT_FILEDATE = "INPUT_FILEDATE";
Expand Down
79 changes: 62 additions & 17 deletions src/main/java/emissary/output/DropOffUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import emissary.util.TimeUtil;
import emissary.util.shell.Executrix;

import org.apache.commons.io.FilenameUtils;
import org.apache.commons.lang3.StringUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
Expand Down Expand Up @@ -38,6 +39,8 @@
import static emissary.core.Form.PREFIXES_LANG;
import static emissary.core.Form.TEXT;
import static emissary.core.Form.UNKNOWN;
import static emissary.core.constants.Parameters.FILEXT;
import static emissary.core.constants.Parameters.FILE_ABSOLUTEPATH;
import static emissary.core.constants.Parameters.ORIGINAL_FILENAME;

public class DropOffUtil {
Expand Down Expand Up @@ -82,6 +85,8 @@ public class DropOffUtil {
private static final String DEFAULT_EVENT_DATE_TO_NOW = "DEFAULT_EVENT_DATE_TO_NOW";
protected boolean defaultEventDateToNow = true;

private static List<String> defaultFilenameFields;

/**
* Create with the default configuration
*/
Expand Down Expand Up @@ -145,6 +150,15 @@ protected void configure(final Configurator configG) {
if (this.maxFilextLen < 0) {
this.maxFilextLen = Integer.MAX_VALUE;
}

defaultFilenameFields = new ArrayList<>();
List<String> configuredFilenameFields = actualConfigG.findEntries("FILENAME_FIELDS");
if (!configuredFilenameFields.isEmpty()) {
defaultFilenameFields.addAll(configuredFilenameFields);
} else {
defaultFilenameFields.add(ORIGINAL_FILENAME);
defaultFilenameFields.add(FILE_ABSOLUTEPATH);
}
} else {
logger.debug("Configuration is null for DropOffUtil, using defaults");
this.executrix = new Executrix();
Expand Down Expand Up @@ -969,32 +983,63 @@ public Date extractEventDateFrom(final IBaseDataObject d, final boolean lastReso
}

/**
* Extracts from the provided {@link IBaseDataObject} the last file extension from each value in the "Original-Filename"
* parameter, if that value contains "." before its last character. If one or more file extensions are extracted, the
* IBaseDataObject's "FILEXT" parameter is set as the unique set of extracted file extensions, converted to lowercase.
* Utilizes the static methods getFullFilepathsFromParams and getFileExtensions to extract the file extensions from all
* the filenames of the object of a given {@link IBaseDataObject}. If one or more file extensions are extracted, the
* IBaseDataObject's FILEXT parameter is set as the unique set of extracted file extensions, converted to lowercase.
*
* @param p IBaseDataObject to process
*
*/
void extractUniqueFileExtensions(IBaseDataObject p) {
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved
if (p.hasParameter(ORIGINAL_FILENAME)) {
final Set<String> extensions = new HashSet<>();
for (Object filename : p.getParameter(ORIGINAL_FILENAME)) {
final String fn = (String) filename;
if (StringUtils.isNotEmpty(fn) && fn.lastIndexOf('.') > -1) {
final int pos = fn.lastIndexOf('.') + 1;
if (pos < fn.length()) {
final String fext = fn.substring(pos).toLowerCase();
if (fext.length() > 0 && fext.length() <= this.maxFilextLen) {
extensions.add(fext);
}
List<String> filenames = getFullFilepathsFromParams(p);
Set<String> extensions = getFileExtensions(filenames, this.maxFilextLen);
if (!extensions.isEmpty()) {
p.setParameter(FILEXT, extensions);
}
}

/**
 * Given a list of filenames, extract and return a set of non-blank file extensions converted to lowercase.
 * <p>
 * Extensions are obtained with {@link FilenameUtils#getExtension(String)}. Entries that yield a null or blank extension,
 * or an extension longer than {@code maxFilextLen}, are skipped. Lowercasing uses {@link java.util.Locale#ROOT} so the
 * result does not depend on the JVM's default locale.
 *
 * @param filenames The list of filenames to examine; a null list is treated as empty
 * @param maxFilextLen The maximum size we want a file extension to be
 * @return A set of unique file extensions from the filename list; never null
 */
public static Set<String> getFileExtensions(List<String> filenames, int maxFilextLen) {
    final Set<String> extensions = new HashSet<>();
    if (filenames == null) {
        return extensions;
    }
    for (String filename : filenames) {
        // keep the extension only if it is non-blank and no longer than maxFilextLen
        final String fext = FilenameUtils.getExtension(filename);
        if (StringUtils.isNotBlank(fext) && fext.length() <= maxFilextLen) {
            extensions.add(fext.toLowerCase(java.util.Locale.ROOT));
        }
    }
    return extensions;
}

/**
* Checks the Original-Filename and FILE_ABSOLUTEPATH for the filename of the object. Returns a list with of the
* non-empty strings found in these fields. If nothing is found in either field, return an empty list.
*
* @param d The IBDO
* @return The list of filenames found in the field Original-Filename or FILE_ABSOLUTEPATH
*/
public static List<String> getFullFilepathsFromParams(IBaseDataObject d) {
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved

List<String> filenames = new ArrayList<>();

for (String ibdoField : defaultFilenameFields) {
if (d.hasParameter(ibdoField)) {
for (Object filename : d.getParameter(ibdoField)) {
String stringFileName = (String) filename;
if (StringUtils.isNotBlank(stringFileName)) {
filenames.add(stringFileName);
}
}
}
if (!extensions.isEmpty()) {
p.setParameter("FILEXT", extensions);
}
}
return filenames;
}

/**
Expand Down
56 changes: 55 additions & 1 deletion src/test/java/emissary/output/DropOffUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
import static emissary.core.Form.TEXT;
import static emissary.core.Form.UNKNOWN;
import static emissary.core.constants.Parameters.EVENT_DATE;
import static emissary.core.constants.Parameters.FILEXT;
import static emissary.core.constants.Parameters.FILE_ABSOLUTEPATH;
import static emissary.core.constants.Parameters.FILE_DATE;
import static emissary.core.constants.Parameters.ORIGINAL_FILENAME;
import static org.junit.jupiter.api.Assertions.assertEquals;
Expand Down Expand Up @@ -576,7 +578,6 @@ void testGetFileType() {
@Test
void testExtractUniqueFileExtensions() {
// these should be constants
final String FILEXT = "FILEXT";
DropOffUtil util = new DropOffUtil();

final IBaseDataObject bdo = new BaseDataObject();
Expand All @@ -601,6 +602,37 @@ void testExtractUniqueFileExtensions() {
assertTrue(fileExts.contains("mp3"), "FILEXT values should contain \"mp3\"");
}

@Test
void testExtractFileExtensionsWithFullFilepaths() {
    DropOffUtil util = new DropOffUtil();

    // tests a combination of either FILE_ABSOLUTEPATH or Original-Filename, neither, and both set at once;
    // index i of each array below describes one scenario
    String[] fileAbsolutepaths = {"D:\\Users\\jdoe\\Documents\\Taxes 2023.csv", "", "/paper.abc.zzz",
            "/home/jdoe/SHARED_D.IR/cat.mov", "/home/user/.bashrc", ""};
    String[] originalFilenames = {"", "D:\\Users\\jdoe\\interesting.folder\\a.table", "flowers.456.123",
            "/home/jdoe/SHARED_D.IR/cat", "taxes.thisfileextensionistoolong", ""};

    String[][] extensions = {{"csv"}, {"table"}, {"zzz", "123"}, {"mov"}, {"bashrc"}, {}};

    final IBaseDataObject ibdo = new BaseDataObject();

    for (int i = 0; i < fileAbsolutepaths.length; i++) {
        ibdo.setParameter(FILE_ABSOLUTEPATH, fileAbsolutepaths[i]);
        ibdo.setParameter(ORIGINAL_FILENAME, originalFilenames[i]);
        util.extractUniqueFileExtensions(ibdo);
        if (extensions[i].length == 0) {
            assertFalse(ibdo.hasParameter(FILEXT), "FILEXT should not be set when no extension is expected");
        } else {
            // check presence first so a missing FILEXT fails as an assertion rather than a NullPointerException
            assertTrue(ibdo.hasParameter(FILEXT), "FILEXT should be set");
            assertEquals(extensions[i].length, ibdo.getParameter(FILEXT).size(), "Only "
                    + extensions[i].length + " file extensions should have been extracted");
            for (String extension : extensions[i]) {
                assertTrue(ibdo.getParameter(FILEXT).contains(extension), "FILEXT should be extracted");
            }
        }
        // reset for the next test
        ibdo.clearParameters();
    }
}

@Test
void testCleanSpecPath() {
assertEquals("/this/is/fine", util.cleanSpecPath("/this/is/fine"));
Expand All @@ -609,6 +641,28 @@ void testCleanSpecPath() {
assertEquals("/this/./././/./is/fine", util.cleanSpecPath("/this/....../../..//./is/fine"));
}

@Test
void testGetFullFilepathsFromParams() {
    IBaseDataObject ibdo = new BaseDataObject();
    List<String> bestFilenames;

    // blank values in both fields yield no filenames
    ibdo.setParameter(ORIGINAL_FILENAME, "");
    ibdo.setParameter(FILE_ABSOLUTEPATH, "");
    bestFilenames = DropOffUtil.getFullFilepathsFromParams(ibdo);
    assertEquals(0, bestFilenames.size(), "No filename should have been found");

    // only FILE_ABSOLUTEPATH populated
    ibdo.setParameter(FILE_ABSOLUTEPATH, "theOtherFile.csv");
    bestFilenames = DropOffUtil.getFullFilepathsFromParams(ibdo);
    assertEquals(1, bestFilenames.size(), "There should be one filename extracted");
    assertEquals("theOtherFile.csv", bestFilenames.get(0), "The FILE_ABSOLUTEPATH should have been extracted");

    // both populated: Original-Filename is expected first, then FILE_ABSOLUTEPATH
    ibdo.setParameter(ORIGINAL_FILENAME, "file.docx");
    bestFilenames = DropOffUtil.getFullFilepathsFromParams(ibdo);
    assertEquals(2, bestFilenames.size(), "There should be two filenames extracted");
    assertEquals("file.docx", bestFilenames.get(0), "The Original-Filename should have been extracted");
    assertEquals("theOtherFile.csv", bestFilenames.get(1), "The FILE_ABSOLUTEPATH should have been extracted");
}

private void setupMetadata(IBaseDataObject bdo, String fieldValue, DropOffUtil.FileTypeCheckParameter fileTypeCheckParameter) {
bdo.clearParameters();
bdo.putParameter(fileTypeCheckParameter.getFieldName(), fieldValue);
Expand Down