Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Using FILE_ABSOLUTEPATH and Original-Filename to populate FILEXT #600

Merged
merged 7 commits into from
Dec 27, 2023
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/main/java/emissary/core/constants/Parameters.java
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ public class Parameters {
public static final String INPUT_FILENAME = "INPUT_FILENAME";
public static final String ORIGINAL_FILENAME = "Original-Filename";
public static final String SUMMARY = "SUMMARY";
public static final String FILE_ABSOLUTEPATH = "FILE_ABSOLUTEPATH";
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved

// Common parameter prefixes

Expand Down
65 changes: 51 additions & 14 deletions src/main/java/emissary/output/DropOffUtil.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,11 +33,14 @@
import java.util.Set;
import java.util.SimpleTimeZone;
import java.util.UUID;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.annotation.Nullable;

import static emissary.core.Form.PREFIXES_LANG;
import static emissary.core.Form.TEXT;
import static emissary.core.Form.UNKNOWN;
import static emissary.core.constants.Parameters.FILE_ABSOLUTEPATH;
import static emissary.core.constants.Parameters.ORIGINAL_FILENAME;

public class DropOffUtil {
Expand Down Expand Up @@ -82,6 +85,8 @@ public class DropOffUtil {
private static final String DEFAULT_EVENT_DATE_TO_NOW = "DEFAULT_EVENT_DATE_TO_NOW";
protected boolean defaultEventDateToNow = true;

/**
 * Matches the last path segment of a string: one or more characters that are neither a backslash nor a forward slash,
 * anchored to the end of the input and preceded either by the start of the input or by a single path separator. When a
 * separator is present it is part of the match (group 0). Compiled once and shared, since a Pattern is immutable.
 */
private static final Pattern JUST_FILENAME_PATTERN = Pattern.compile("(^|\\\\|/)[^\\\\/]+$");

/**
* Create with the default configuration
*/
Expand Down Expand Up @@ -969,25 +974,32 @@ public Date extractEventDateFrom(final IBaseDataObject d, final boolean lastReso
}

/**
* Extracts from the provided {@link IBaseDataObject} the last file extension from each value in the "Original-Filename"
* parameter, if that value contains "." before its last character. If one or more file extensions are extracted, the
* IBaseDataObject's "FILEXT" parameter is set as the unique set of extracted file extensions, converted to lowercase.
* Extracts from the provided {@link IBaseDataObject} the last file extension from each filename, if that value contains
* "." before its last few characters. If one or more file extensions are extracted, the IBaseDataObject's "FILEXT"
* parameter is set as the unique set of extracted file extensions, converted to lowercase.
*
* @param p IBaseDataObject to process
*
*/
void extractUniqueFileExtensions(IBaseDataObject p) {
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved
if (p.hasParameter(ORIGINAL_FILENAME)) {
final Set<String> extensions = new HashSet<>();
for (Object filename : p.getParameter(ORIGINAL_FILENAME)) {
final String fn = (String) filename;
if (StringUtils.isNotEmpty(fn) && fn.lastIndexOf('.') > -1) {
final int pos = fn.lastIndexOf('.') + 1;
if (pos < fn.length()) {
final String fext = fn.substring(pos).toLowerCase();
if (fext.length() > 0 && fext.length() <= this.maxFilextLen) {
extensions.add(fext);
}
List<String> filenames = getBestFilenames(p);
final Set<String> extensions = new HashSet<>();
for (String filename : filenames) {

// if what we have is a full filepath, extract just the filename (text after the file path separator)
Matcher matcher = JUST_FILENAME_PATTERN.matcher(filename);
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved
String justFilename = filename;
if (matcher.find()) {
justFilename = matcher.group(0);
}

// get the text after the last period
if (justFilename.lastIndexOf('.') > -1) {
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved
final int pos = justFilename.lastIndexOf('.') + 1;
if (pos < justFilename.length()) {
final String fext = justFilename.substring(pos).toLowerCase();
if (fext.length() > 0 && fext.length() <= this.maxFilextLen) {
extensions.add(fext);
}
}
}
Expand All @@ -997,6 +1009,31 @@ void extractUniqueFileExtensions(IBaseDataObject p) {
}
}

/**
* Checks the Original-Filename and FILE_ABSOLUTEPATH parameters for the filename of the object. Returns a list of the
* non-empty strings found in these fields. If nothing is found in either field, returns an empty list.
*
* @param d The IBDO
* @return The list of filenames found in the field Original-Filename or FILE_ABSOLUTEPATH
*/
public static List<String> getBestFilenames(IBaseDataObject d) {
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved
String[] fieldsToTry = {ORIGINAL_FILENAME, FILE_ABSOLUTEPATH};
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved

List<String> filenames = new ArrayList<>();

for (String ibdoField : fieldsToTry) {
if (d.hasParameter(ibdoField)) {
for (Object filename : d.getParameter(ibdoField)) {
String stringFileName = (String) filename;
if (StringUtils.isNotEmpty(stringFileName)) {
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved
filenames.add(stringFileName);
}
}
}
}
return filenames;
}

/**
* Process metadata before doing any output
*
Expand Down
60 changes: 59 additions & 1 deletion src/test/java/emissary/output/DropOffUtilTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
import static emissary.core.Form.TEXT;
import static emissary.core.Form.UNKNOWN;
import static emissary.core.constants.Parameters.EVENT_DATE;
import static emissary.core.constants.Parameters.FILE_ABSOLUTEPATH;
import static emissary.core.constants.Parameters.FILE_DATE;
import static emissary.core.constants.Parameters.ORIGINAL_FILENAME;
import static org.junit.jupiter.api.Assertions.assertEquals;
Expand All @@ -38,6 +39,7 @@
class DropOffUtilTest extends UnitTest {
private DropOffUtil util = null;
private IBaseDataObject payload = null;
static final String FILEXT = "FILEXT";
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved

@BeforeEach
public void createUtil() {
Expand Down Expand Up @@ -576,7 +578,6 @@ void testGetFileType() {
@Test
void testExtractUniqueFileExtensions() {
// these should be constants
final String FILEXT = "FILEXT";
DropOffUtil util = new DropOffUtil();

final IBaseDataObject bdo = new BaseDataObject();
Expand All @@ -601,6 +602,41 @@ void testExtractUniqueFileExtensions() {
assertTrue(fileExts.contains("mp3"), "FILEXT values should contain \"mp3\"");
}

@Test
void testExtractFullFilepaths() {
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved
DropOffUtil util = new DropOffUtil();

final IBaseDataObject ibdo = new BaseDataObject();
ibdo.setParameter(FILE_ABSOLUTEPATH, "D:\\Users\\jdoe\\Documents\\Taxes 2023.csv");
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved
util.extractUniqueFileExtensions(ibdo);
assertEquals("csv", ibdo.getStringParameter(FILEXT), "FILEXT should be extracted");
ibdo.setParameter(FILEXT, "");

ibdo.setParameter(FILE_ABSOLUTEPATH, "D:\\Users\\jdoe\\interesting.folder\\a.table");
util.extractUniqueFileExtensions(ibdo);
assertEquals("table", ibdo.getStringParameter(FILEXT), "FILEXT should be extracted");
ibdo.setParameter(FILEXT, "");

ibdo.setParameter(FILE_ABSOLUTEPATH, "/paper.abc.zzz");
util.extractUniqueFileExtensions(ibdo);
assertEquals("zzz", ibdo.getStringParameter(FILEXT), "FILEXT should be extracted");
ibdo.setParameter(FILEXT, "");

ibdo.setParameter(FILE_ABSOLUTEPATH, "flowers.456.123");
util.extractUniqueFileExtensions(ibdo);
assertEquals("123", ibdo.getStringParameter(FILEXT), "FILEXT should be extracted");
ibdo.setParameter(FILEXT, "");

ibdo.setParameter(FILE_ABSOLUTEPATH, "/home/jdoe/SHARED_D.IR/cat.mov");
util.extractUniqueFileExtensions(ibdo);
assertEquals("mov", ibdo.getStringParameter(FILEXT), "FILEXT should be extracted");
ibdo.setParameter(FILEXT, "");

ibdo.setParameter(FILE_ABSOLUTEPATH, "/home/jdoe/SHARED_D.IR/cat");
util.extractUniqueFileExtensions(ibdo);
assertEquals("", ibdo.getStringParameter(FILEXT), "FILEXT should not be extracted if there is no period in the filename");
}

@Test
void testCleanSpecPath() {
assertEquals("/this/is/fine", util.cleanSpecPath("/this/is/fine"));
Expand All @@ -609,6 +645,28 @@ void testCleanSpecPath() {
assertEquals("/this/./././/./is/fine", util.cleanSpecPath("/this/....../../..//./is/fine"));
}

@Test
void testGetBestFilename() {
arp-0984 marked this conversation as resolved.
Show resolved Hide resolved
IBaseDataObject ibdo = new BaseDataObject();
List<String> bestFilenames;

ibdo.setParameter(ORIGINAL_FILENAME, "");
ibdo.setParameter(FILE_ABSOLUTEPATH, "");
bestFilenames = DropOffUtil.getBestFilenames(ibdo);
assertEquals(0, bestFilenames.size(), "No filename should have been found");

ibdo.setParameter(FILE_ABSOLUTEPATH, "theOtherFile.csv");
bestFilenames = DropOffUtil.getBestFilenames(ibdo);
assertEquals(1, bestFilenames.size(), "There should be one filename extracted");
assertEquals("theOtherFile.csv", bestFilenames.get(0), "The FILE_ABSOLUTEPATH should have been extracted");

ibdo.setParameter(ORIGINAL_FILENAME, "file.docx");
bestFilenames = DropOffUtil.getBestFilenames(ibdo);
assertEquals(2, bestFilenames.size(), "There should be two filenames extracted");
assertEquals("file.docx", bestFilenames.get(0), "The Original-Filename should have been extracted");
assertEquals("theOtherFile.csv", bestFilenames.get(1), "The Original-Filename should have been extracted");
}

private void setupMetadata(IBaseDataObject bdo, String fieldValue, DropOffUtil.FileTypeCheckParameter fileTypeCheckParameter) {
bdo.clearParameters();
bdo.putParameter(fileTypeCheckParameter.getFieldName(), fieldValue);
Expand Down
Loading