From 45d9ff80acf06086b0bee08d394e656f5516546a Mon Sep 17 00:00:00 2001 From: fbruton <19573915+fbruton@users.noreply.github.com> Date: Tue, 18 Jun 2024 17:41:08 -0400 Subject: [PATCH] make FlexibleDateTimeParser remove regex configurable (#828) --- .../emissary/util/FlexibleDateTimeParser.java | 23 +++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/src/main/java/emissary/util/FlexibleDateTimeParser.java b/src/main/java/emissary/util/FlexibleDateTimeParser.java index cb8fd0761e..67893b3a9c 100644 --- a/src/main/java/emissary/util/FlexibleDateTimeParser.java +++ b/src/main/java/emissary/util/FlexibleDateTimeParser.java @@ -47,6 +47,8 @@ public final class FlexibleDateTimeParser { private static final String CFG_FORMAT_MAIN = "FORMAT_DATETIME_MAIN"; private static final String CFG_FORMAT_EXTRA = "FORMAT_DATETIME_EXTRA"; private static final String CFG_TIMEZONE = "TIMEZONE"; + private static final String CFG_REMOVE_REGEX = "REMOVE_REGEX"; + private static final String CFG_EXTRA_TEXT_REMOVE_REGEX = "EXTRA_TEXT_REMOVE_REGEX"; private static final String DEFAULT_TIMEZONE = "GMT"; private static final String SPACE = " "; private static final String EMPTY = ""; @@ -58,12 +60,12 @@ public final class FlexibleDateTimeParser { * Remove other junk -- anything in an html tag, all parenthesis and quotes, and any non-word characters at the * beginning or end */ - private static final Pattern REMOVE = Pattern.compile("<.+?>$|=0D$|\\(|\\)|\"|\\[|]|\\W+$|^\\W+", Pattern.DOTALL); + private static Pattern remove; /* * This is our last ditch parsing effort if we failed to parse the string - remove all extra text after the numeric time * zone offset */ - private static final Pattern EXTRA_TEXT_REMOVE = Pattern.compile("(\\+\\d{4}).*$"); + private static Pattern extraTextRemove; /* timezone - config var: TIMEZONE */ private static ZoneId timezone = ZoneId.of(DEFAULT_TIMEZONE); @@ -131,7 +133,7 @@ static ZonedDateTime lastDitchParsingEffort(final String date) { // Attempt to remove all text after the numeric offset and try again - this should give us a valid date string // to work with - Matcher matcher = EXTRA_TEXT_REMOVE.matcher(date); + Matcher matcher = extraTextRemove.matcher(date); if (matcher.find()) { String secondChanceDate = matcher.replaceAll(matcher.group(1)); // if we removed text, attempt to parse again to see if we are more successful this time @@ -223,11 +225,24 @@ private static void configure() { Configurator configurator = ConfigUtil.getConfigInfo(FlexibleDateTimeParser.class); setupTimezone(configurator.findStringEntry(CFG_TIMEZONE, DEFAULT_TIMEZONE)); setupDateFormats(configurator.findStringMatchEntries(CFG_FORMAT_MAIN), configurator.findStringMatchEntries(CFG_FORMAT_EXTRA)); + + setupRemoveRegex(configurator); } catch (IOException e) { throw new IllegalArgumentException("Could not configure parser!!", e); } } + private static void setupRemoveRegex(Configurator configurator) { + String removeRegex = configurator.findStringEntry(CFG_REMOVE_REGEX, "<.+?>$|=0D$|\\(|\\)|\"|\\[|]|\\W+$|^\\W+"); + remove = Pattern.compile(removeRegex, Pattern.DOTALL); + /* + * This is our last ditch parsing effort if we failed to parse the string - remove all extra text after the numeric time + * zone offset + */ + String extraTextRemoveRegex = configurator.findStringEntry(CFG_EXTRA_TEXT_REMOVE_REGEX, "((\\+|-)\\d{4}).*$"); + extraTextRemove = Pattern.compile(extraTextRemoveRegex); + } + /** * Set the timezone to use for parsing (needed for DateTimes that do not have timezone information) * @@ -317,7 +332,7 @@ private static String cleanDateString(final String date) { // date strings over 100 characters are more than likely invalid String cleanedDateString = StringUtils.substring(date, 0, 100); cleanedDateString = REPLACE.matcher(cleanedDateString).replaceAll(SPACE); - cleanedDateString = REMOVE.matcher(cleanedDateString).replaceAll(EMPTY); + cleanedDateString = remove.matcher(cleanedDateString).replaceAll(EMPTY); return StringUtils.trimToNull(cleanedDateString); }