Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

make FlexibleDateTimeParser remove regex configurable #828

Merged
merged 2 commits into from
Jun 18, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 19 additions & 4 deletions src/main/java/emissary/util/FlexibleDateTimeParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
// Configuration keys looked up on the Configurator when the parser is configured
private static final String CFG_FORMAT_MAIN = "FORMAT_DATETIME_MAIN";
private static final String CFG_FORMAT_EXTRA = "FORMAT_DATETIME_EXTRA";
private static final String CFG_TIMEZONE = "TIMEZONE";
// Key for the regex whose matches are stripped (replaced with EMPTY) while cleaning a date string
private static final String CFG_REMOVE_REGEX = "REMOVE_REGEX";
// Key for the regex used on the second parsing attempt to drop text after the numeric time zone offset
private static final String CFG_EXTRA_TEXT_REMOVE_REGEX = "EXTRA_TEXT_REMOVE_REGEX";
// Zone id passed alongside CFG_TIMEZONE to the config lookup and used for the initial timezone value
private static final String DEFAULT_TIMEZONE = "GMT";
// Replacement strings used by the cleanup patterns
private static final String SPACE = " ";
private static final String EMPTY = "";
Expand All @@ -58,12 +60,12 @@
* Remove other junk -- anything in an html tag, all parentheses, square brackets and quotes, and any non-word
* characters at the beginning or end
*/
private static final Pattern REMOVE = Pattern.compile("<.+?>$|=0D$|\\(|\\)|\"|\\[|]|\\W+$|^\\W+", Pattern.DOTALL);
private static Pattern remove;
/*
* This is our last ditch parsing effort if we failed to parse the string - remove all extra text after the numeric time
* zone offset
*/
private static final Pattern EXTRA_TEXT_REMOVE = Pattern.compile("(\\+\\d{4}).*$");
private static Pattern extraTextRemove;

/* timezone - config var: TIMEZONE */
private static ZoneId timezone = ZoneId.of(DEFAULT_TIMEZONE);
Expand Down Expand Up @@ -131,7 +133,7 @@

// Attempt to remove all text after the numeric offset and try again - this should give us a valid date string
// to work with
Matcher matcher = EXTRA_TEXT_REMOVE.matcher(date);
Matcher matcher = extraTextRemove.matcher(date);
if (matcher.find()) {
String secondChanceDate = matcher.replaceAll(matcher.group(1));
// if we removed text, attempt to parse again to see if we are more successful this time
Expand Down Expand Up @@ -223,11 +225,24 @@
Configurator configurator = ConfigUtil.getConfigInfo(FlexibleDateTimeParser.class);
setupTimezone(configurator.findStringEntry(CFG_TIMEZONE, DEFAULT_TIMEZONE));
setupDateFormats(configurator.findStringMatchEntries(CFG_FORMAT_MAIN), configurator.findStringMatchEntries(CFG_FORMAT_EXTRA));

setupRemoveRegex(configurator);
} catch (IOException e) {
throw new IllegalArgumentException("Could not configure parser!!", e);
}
}

/**
 * Compile the two cleanup patterns from configuration and store them in the static {@code remove} and
 * {@code extraTextRemove} fields. Each lookup passes a built-in expression as the second argument
 * (presumably the fallback when the key is not configured -- same call shape as the timezone lookup).
 *
 * @param configurator the configuration to read the REMOVE_REGEX and EXTRA_TEXT_REMOVE_REGEX entries from
 */
private static void setupRemoveRegex(Configurator configurator) {
    // Junk-stripping expression; DOTALL lets '.' match line terminators as well
    final String junkExpression = configurator.findStringEntry(CFG_REMOVE_REGEX, "<.+?>$|=0D$|\\(|\\)|\"|\\[|]|\\W+$|^\\W+");
    final Pattern junkPattern = Pattern.compile(junkExpression, Pattern.DOTALL);
    remove = junkPattern;

    // Last ditch parsing effort when the date string failed to parse: match the numeric time zone offset
    // (captured as group 1) together with all text that follows it
    final String trailingTextExpression = configurator.findStringEntry(CFG_EXTRA_TEXT_REMOVE_REGEX, "((\\+|-)\\d{4}).*$");
    final Pattern trailingTextPattern = Pattern.compile(trailingTextExpression);
    extraTextRemove = trailingTextPattern;
}

/**
* Set the timezone to use for parsing (needed for DateTimes that do not have timezone information)
*
Expand Down Expand Up @@ -317,7 +332,7 @@
// date strings over 100 characters are more than likely invalid
String cleanedDateString = StringUtils.substring(date, 0, 100);
cleanedDateString = REPLACE.matcher(cleanedDateString).replaceAll(SPACE);
cleanedDateString = REMOVE.matcher(cleanedDateString).replaceAll(EMPTY);
cleanedDateString = remove.matcher(cleanedDateString).replaceAll(EMPTY);
Dismissed Show dismissed Hide dismissed

return StringUtils.trimToNull(cleanedDateString);
}
Expand Down