From 4c281d31256ae347aaa25206ac1886db6212c477 Mon Sep 17 00:00:00 2001 From: Vasyl Khrystiuk Date: Fri, 6 Dec 2024 01:19:50 +0200 Subject: [PATCH] [WIP] --- .../liqp/filters/date/FuzzyDateParser.java | 406 ++++++++++++++---- .../date/FuzzyDateParserParametrizedTest.java | 92 ++++ .../filters/date/FuzzyDateParserTest.java | 56 +-- 3 files changed, 427 insertions(+), 127 deletions(-) create mode 100644 src/test/java/liqp/filters/date/FuzzyDateParserParametrizedTest.java diff --git a/src/main/java/liqp/filters/date/FuzzyDateParser.java b/src/main/java/liqp/filters/date/FuzzyDateParser.java index f2b65b79..8f92aba9 100644 --- a/src/main/java/liqp/filters/date/FuzzyDateParser.java +++ b/src/main/java/liqp/filters/date/FuzzyDateParser.java @@ -1,5 +1,7 @@ package liqp.filters.date; +import static liqp.LValue.isBlank; + import java.text.DateFormatSymbols; import java.time.ZoneId; import java.time.ZonedDateTime; @@ -12,8 +14,25 @@ import java.util.regex.Pattern; import java.util.stream.Collectors; +/** + * This class is writen for fun. + * Even if it may be extended to be very powerful, this is not intention. For example, it may + * use locale information to guess some pattern parts, + * like "leading zeros appear to be less commonly used in Germany than in Austria and Switzerland". + * But it is not implemented. And not going to be. + * It simply must cover all possible cases of format that been used in previous + * implementation of BasicDateParser. Which in fact is quite poor yet did the work well. + * So not expect super-magic here. + * And yes. + * It is called "fuzzy" but is having strict and well-defined rules of "guessing". + * So this class is quite exact. See test for details. + */ public class FuzzyDateParser extends BasicDateParser { + public static boolean CLDR_LOADED = new DateFormatSymbols(Locale.GERMANY) + .getShortMonths()[1] + .endsWith("."); + @Override public ZonedDateTime parse(String valAsString, Locale locale, ZoneId defaultZone) { String normalized = valAsString.toLowerCase(); @@ -45,6 +64,53 @@ String guessPattern(String normalized, Locale locale) { return reconstructPattern(parts); } + private List parsePart(List parts, DateParseContext ctx) { + + if (notSet(ctx.hasYear)) { + LookupResult result = lookup(parts, yearWithEraExtractor); + if (result.found) { + ctx.hasYear = true; + return result.parts; + } + } + if (notSet(ctx.hasTime)) { + LookupResult result = lookup(parts, regularTimeExtractor); + if (result.found) { + ctx.hasTime = true; + return result.parts; + } + ctx.hasTime = false; + } + + if (notSet(ctx.hasYear)) { + LookupResult result = lookup(parts, plainYearExtractor); + if (result.found) { + ctx.hasYear = true; + return result.parts; + } + // last "year check" and since we are here - there is no year + ctx.hasYear = false; + } + if (notSet(ctx.hasMonthName)) { + LookupResult result = lookup(parts, fullMonthExtractor(ctx.locale)); + if (result.found) { + ctx.hasMonthName = true; + return result.parts; + } + + result = lookup(parts, shortMonthExtractor(ctx.locale)); + if (result.found) { + ctx.hasMonthName = true; + return result.parts; + } + + ctx.hasMonthName = false; + } + + return markAsUnrecognized(parts); + } + + private String reconstructPattern(List parts) { return parts.stream().map(p -> { @@ -68,17 +134,21 @@ public DateParseContext(Locale locale) { } } static class PartExtractorResult { + public PartExtractorResult(){} + public PartExtractorResult(String formatterPattern){ + this.formatterPattern = formatterPattern; + } boolean found; int start; int end; + String formatterPattern; } interface PartExtractor { PartExtractorResult extract(String source); - String formatterPattern(); } static class RegexPartExtractor implements PartExtractor { - private final Pattern pattern; - private final String formatterPattern; + protected final Pattern pattern; + protected final String formatterPattern; RegexPartExtractor(String regex, String formatterPattern) { this.pattern = Pattern.compile(regex); @@ -89,7 +159,7 @@ static class RegexPartExtractor implements PartExtractor { public PartExtractorResult extract(String source) { Matcher matcher = pattern.matcher(source); if (matcher.find()) { - PartExtractorResult result = new PartExtractorResult(); + PartExtractorResult result = new PartExtractorResult(formatterPattern); result.found = true; result.start = matcher.start(1); result.end = matcher.end(1); @@ -97,12 +167,42 @@ public PartExtractorResult extract(String source) { } return new PartExtractorResult(); } + } + static class YearWithEra extends RegexPartExtractor { + YearWithEra() { + super("(?:^|.*?\\D)(?\\d+)(?\\s*)(?AD|BC|Anno Domini|Before Christ)(?:$|\\D.*?)", null); + } @Override - public String formatterPattern() { - return formatterPattern; + public PartExtractorResult extract(String source) { + Matcher matcher = pattern.matcher(source); + if (matcher.find()) { + PartExtractorResult result = new PartExtractorResult(); + result.found = true; + result.start = matcher.start("year"); + result.formatterPattern = "yyyy"; + String era = matcher.group("era"); + if(!isBlank(era)) { + String eraSeparator = matcher.group("eraSeparator"); + if (eraSeparator != null) { + result.formatterPattern += eraSeparator; + } + result.end = matcher.end("era"); + if (era.length() == 2) { + result.formatterPattern += "GG"; + } else { + result.formatterPattern += "GGGG"; + } + } else { + result.end = matcher.end("year"); + } + return result; + } + return new PartExtractorResult(); } } + PartExtractor yearWithEraExtractor = new YearWithEra(); + PartExtractor plainYearExtractor = new RegexPartExtractor(".*\\b?(\\d{4})\\b?.*", "yyyy"); static class PartExtractorDelegate implements PartExtractor { @@ -112,11 +212,6 @@ static class PartExtractorDelegate implements PartExtractor { public PartExtractorResult extract(String source) { return delegate.extract(source); } - - @Override - public String formatterPattern() { - return delegate.formatterPattern(); - } } static class FullMonthExtractor extends PartExtractorDelegate { public FullMonthExtractor(Locale locale, String formatterPattern) { @@ -132,21 +227,16 @@ protected String[] getMonthsNamesFromLocale(Locale locale) { return new DateFormatSymbols(locale).getMonths(); } - private static String[] withoutNulls(String[] shortMonths) { + protected String[] withoutNulls(String[] shortMonths) { return Arrays.stream(shortMonths) .filter(month -> month != null && !month.isEmpty()) - .map(el -> { - // after jdk 9 it's normal (not counted as a bug) - // so here need to check version and work around - // https://bugs.openjdk.org/browse/JDK-8194289 - while (el.endsWith(".")) { - el = el.substring(0, el.length() - 1); - } - return el; - }) .map(Pattern::quote) + .map(this::convertMonthName) .toArray(String[]::new); } + protected String convertMonthName(String monthName) { + return monthName; + } } private PartExtractor fullMonthExtractor(Locale locale) { @@ -168,67 +258,85 @@ private PartExtractor shortMonthExtractor(Locale locale) { return new ShortMonthExtractor(locale); } + static class RegularTimeExtractor extends RegexPartExtractor { - static class LookupResult { - final List parts; - final boolean found; - LookupResult(List parts, boolean found) { - this.parts = parts; - this.found = found; + RegularTimeExtractor() { + super("(?:^|.*?\\D)" + + "(" + + "(?(\\d|0\\d|1\\d|2[0-3]))" + + ":" + + "(?[0-5]\\d)" + + "(?:" + + ":(?[0-5]\\d)" + + "(?:\\.(?\\d{1,9})?)?" + + ")?" // end of seconds + + "(?\\s*[AaPp][Mm])?" + + ")" // end of main group + + "(?:$|\\D.*?)", null); } - } - private List parsePart(List parts, DateParseContext ctx) { - if (notSet(ctx.hasYear)) { - LookupResult result = lookup(parts, plainYearExtractor); - if (result.found) { - ctx.hasYear = true; - return result.parts; - } - ctx.hasYear = false; - } - if (notSet(ctx.hasMonthName)) { - LookupResult result = lookup(parts, fullMonthExtractor(ctx.locale)); - if (result.found) { - ctx.hasMonthName = true; - return result.parts; - } - result = lookup(parts, shortMonthExtractor(ctx.locale)); - if (result.found) { - ctx.hasMonthName = true; - return result.parts; - } + @Override + public PartExtractorResult extract(String source) { + Matcher m = pattern.matcher(source); + if (m.matches()) { + PartExtractorResult r = new PartExtractorResult(); + r.found = true; + + String ampmPart = ""; + String ampm = m.group("ampm"); + if (ampm != null) { + ampmPart = ampm.substring(0, ampm.length() - 2) + "a"; + } - ctx.hasMonthName = false; - } + boolean hasAmPm = !ampmPart.isEmpty(); - if (notSet(ctx.hasTime)) { - LookupResult result = new LookupResult(parts, false); - if (result.found) { - ctx.hasTime = true; - return result.parts; - } - ctx.hasTime = false; - } - return markAsUnrecognized(parts); - } + String hourPart; + if (hasAmPm) { + if (m.group("hours").startsWith("0")) { + hourPart = "hh"; + } else { + hourPart = "h"; // most often time with ampm is without leading zero + } + } else { + hourPart = m.group("hours").length() == 1 ? "H" : "HH"; + } - private List markAsUnrecognized(List parts) { - return parts.stream().map(p -> { - if (p.state() == PartState.UNPARSED) { - return new UnrecognizedPart(p); - } else { - return p; + r.start = m.start("hours"); + if (m.group("milliseconds") != null) { + int millisecondsLength = m.group("milliseconds").length(); + r.end = m.end("milliseconds"); + r.formatterPattern = hourPart + ":mm:ss." + repeat("S", millisecondsLength); + } else if (m.group("seconds") != null) { + r.end = m.end("seconds"); + r.formatterPattern = hourPart + ":mm:ss"; + } else if (m.group("minutes") != null) { + r.end = m.end("minutes"); + r.formatterPattern = hourPart + ":mm"; + } else { + r.end = m.end("hours"); + r.formatterPattern = hourPart; + } + if (hasAmPm) { + r.formatterPattern += ampmPart; + r.end = m.end("ampm"); + } + return r; } - }).collect(Collectors.toList()); - } - - private boolean notSet(Boolean val) { - return val == null; + return new PartExtractorResult(); + } } + static PartExtractor regularTimeExtractor = new RegularTimeExtractor(); + static class LookupResult { + final List parts; + final boolean found; + LookupResult(List parts, boolean found) { + this.parts = parts; + this.found = found; + } + } private LookupResult lookup(List parts, PartExtractor partExtractor) { for (int i = 0; i < parts.size(); i++) { Part part = parts.get(i); @@ -244,7 +352,7 @@ private LookupResult lookup(List parts, PartExtractor partExtractor) { parts.add(i, after); } - ParsedPart parsed = new ParsedPart(part.start() + per.start, part.start() + per.end, partExtractor.formatterPattern()); + ParsedPart parsed = new ParsedPart(part.start() + per.start, part.start() + per.end, per.formatterPattern); parts.add(i, parsed); if (per.start != 0) { @@ -252,7 +360,6 @@ private LookupResult lookup(List parts, PartExtractor partExtractor) { parts.add(i, before); } - return new LookupResult(parts, true); } } @@ -264,6 +371,20 @@ private boolean haveUnparsed(List parts) { return parts.stream().anyMatch(p -> p.state() == PartState.UNPARSED); } + private List markAsUnrecognized(List parts) { + return parts.stream().map(p -> { + if (p.state() == PartState.UNPARSED) { + return new UnrecognizedPart(p); + } else { + return p; + } + }).collect(Collectors.toList()); + } + + private boolean notSet(Boolean val) { + return val == null; + } + enum PartState { UNPARSED, PARSED, @@ -280,7 +401,7 @@ interface Part { static class UnparsedPart implements Part { final int start; final int end; - private final String source; + protected final String source; UnparsedPart(int start, int end, String source) { this.start = start; @@ -325,6 +446,15 @@ public UnrecognizedPart(Part p) { public PartState state() { return PartState.UNRECOGNIZED; } + + @Override + public String toString() { + return "UnrecognizedPart{" + + "start=" + start + + ", end=" + end + + ", source='" + source + '\'' + + '}'; + } } static class ParsedPart implements Part { final int start; @@ -362,17 +492,123 @@ public String getPattern() { } } - enum PartKind { - CONSTANT, - YEAR, - MONTH, - DAY, - HOUR, - MINUTE, - SECOND, - MILLISECOND, - MICROSECOND, - NANOSECOND + static String repeat(String key, int count) { + return new String(new char[count]).replace("\0", key); + } + +// public static void main(String[] args) { +// String key = "z"; +// for (int i = 1; i < 10; i++) { +// printPattern(key, i); +// } +// } + +// static void printPattern(String key, int count) { +// String fullKey = new String(new char[count]).replace("\0", key); +//// ZonedDateTime now = ZonedDateTime.now(); +// ZonedDateTime now = ZonedDateTime.of( +// LocalDate.of(2020, 1, 1), +// LocalTime.of(1, 1, 1), +// ZoneId.of("Europe/Kiev") +//// ZoneOffset.systemDefault() +//// ZoneOffset.UTC +// ); +// try { +// String formatted = now.format(DateTimeFormatter.ofPattern(fullKey)); +// System.out.println(fullKey + " -> " + formatted); +// } catch (Exception e) { +// System.out.println(fullKey + " -> " + "Error: " + e.getMessage()); +// } +// } + private class Chart { + /** + G - era designator + GG -> AD / BC + GGGG -> Anno Domini / Before Christ + + yy -> 24 + yyyy -> 2024 + + Y - do not use (year of the week) + + MM -> 12 + MMM -> Dec/груд. + MMMM -> December/грудня + + L - do not use (non-contextual month) + LLLL -> December/грудень + + w - do not use (week of the year) + W - do not use (Week in month) + + D - do not use (day of the year) + + d - day of the month + d -> 5 + dd -> 05 + + F - do not use (day of the week in month) + + EEE -> Thu + EEEE -> Thursday + + u - day of the week (1 = Monday, ..., 7 = Sunday) + u - do not use (day of the week) + + a - am/pm marker + a -> PM + + H - hour in day (0-23) + H -> 9 + HH -> 09 + + k - do not use (hour in day (1-24)) + + K - do not use (hour in am/pm (0-11)) + + h Hour in am/pm (1-12) ONLY IF am/pm marker is present + h -> 1 + hh -> 01 + + m - minute in hour + m -> 1 + mm -> 01 + + s - second in minute + s -> 1 + ss -> 01 + + S - millisecond (already defined) + + z - General time zone + + z -> UTC / EET BUT ZoneOffset.UTC -> Z + zzz -> UTC / EET BUT ZoneOffset.UTC -> Z + zzzz -> "Coordinated Universal Time" / "Eastern European Standard" Time BUT ZoneOffset.UTC -> Z + + Z - RFC 822 time zone + Z -> +0200 + ZZ -> +0200 + ZZZ -> +0200 + ZZZZ -> GMT+02:00 + ZZZZZ -> +02:00 + + X - ISO 8601 time zone + X -> +02 + XX -> +0200 + XXX -> +02:00 + XXXX -> +0200 + XXXXX -> +02:00 + + V time-zone ID + VV -> Europe/Kiev / Z for ZoneOffset.UTC and "Etc/UTC" for systemDefault + + v generic time-zone name + v -> EET + vvvv -> Eastern European Time + + + */ } } diff --git a/src/test/java/liqp/filters/date/FuzzyDateParserParametrizedTest.java b/src/test/java/liqp/filters/date/FuzzyDateParserParametrizedTest.java new file mode 100644 index 00000000..3d0e1f09 --- /dev/null +++ b/src/test/java/liqp/filters/date/FuzzyDateParserParametrizedTest.java @@ -0,0 +1,92 @@ +package liqp.filters.date; + +import java.util.Locale; +import java.util.regex.Pattern; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.util.Arrays; +import java.util.Collection; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +@RunWith(Parameterized.class) +public class FuzzyDateParserParametrizedTest { + + private final String input; + private final String expectedPattern; + private final Locale locale; + + @Parameterized.Parameters + public static Collection data() { + return Arrays.asList(new Object[][]{ + {null, "1995", "yyyy"}, + {null, " 1995 ", " yyyy "}, + {null, " 1995", " yyyy"}, + {null, "1995 ", "yyyy "}, + {null, "January 1995", "MMMM yyyy"}, + {null, "January 1995 ", "MMMM yyyy "}, + {null, " January 1995", " MMMM yyyy"}, + {null, " 1995 January", " yyyy MMMM"}, + {null, "Jan 1995", "MMM yyyy"}, + {null, "1995 Jan ", "yyyy MMM "}, + {Locale.GERMAN, "1995 Mai", "yyyy MMMM"}, + FuzzyDateParser.CLDR_LOADED ? + new Object[]{ + Locale.GERMAN, "??1995-----Dez.!", "??yyyy-----MMM!"} + : new Object[]{ + Locale.GERMAN, "??1995-----Dez!", "??yyyy-----MMM!"} + , + {null, "1:23", "H:mm"}, + {null, "01:23", "HH:mm"}, + {null, "1:23:45", "H:mm:ss"}, + {null, "01:23:45", "HH:mm:ss"}, + {null, "1:23:45.6", "H:mm:ss.S"}, + {null, "01:23:45.6", "HH:mm:ss.S"}, + {null, "1:23:45.67", "H:mm:ss.SS"}, + {null, "1:23:45.678", "H:mm:ss.SSS"}, + {null, "1:23:45.6789", "H:mm:ss.SSSS"}, + {null, "1:23:45.67890", "H:mm:ss.SSSSS"}, + {null, "1:23:45.678901", "H:mm:ss.SSSSSS"}, + {null, "1:23:45.6789012", "H:mm:ss.SSSSSSS"}, + {null, "1:23:45.67890123", "H:mm:ss.SSSSSSSS"}, + {null, "1:23:45.678901234", "H:mm:ss.SSSSSSSSS"}, + {null, "1:23:45.678901234am", "h:mm:ss.SSSSSSSSSa"}, // correct + {null, "1:23:45.678901234a", "H:mm:ss.SSSSSSSSSa"}, // incorrect + {null, "1:23:45.678901234p", "H:mm:ss.SSSSSSSSSp"}, // incorrect + {null, "1:23:45.678901234pm", "h:mm:ss.SSSSSSSSSa"}, // correct + {null, "1:23:45.678901234 pm", "h:mm:ss.SSSSSSSSS a"}, // correct + {null, " 1:23:45.678", " H:mm:ss.SSS"}, + {null, " 1:23:45.678 ", " H:mm:ss.SSS "}, + {null, " 01:23:45.678 ", " HH:mm:ss.SSS "}, + {null, " 1:23:45.678 am ", " h:mm:ss.SSS a "}, + {null, " 1:23:45.678 PM ", " h:mm:ss.SSS a "}, + {null, "12 Jan 1995T01:23:45.678", "12 MMM yyyyTHH:mm:ss.SSS"}, + {null, "12 AD", "yyyy GG"}, + {null, " 12 AD ", " yyyy GG "}, + {null, " 12 Anno Domini ", " yyyy GGGG "}, + {null, " 12345 Before Christ ", " yyyy GGGG "}, + {null, " 0 BC ", " yyyy GG "}, + + {null, " 12 BC 12 Jan 01:23:45.678 ", " yyyy GG 12 MMM HH:mm:ss.SSS "}, + {null, "12 Jan 01:23:45.678 12 Anno Domini", "12 MMM HH:mm:ss.SSS yyyy GGGG"}, + }); + } + + public FuzzyDateParserParametrizedTest(Locale locale, String input, String expectedPattern) { + this.locale = locale; + this.input = input; + this.expectedPattern = expectedPattern; + } + + @Test + public void shouldParse() { + final FuzzyDateParser parser = new FuzzyDateParser(); + String pattern = parser.guessPattern(input, locale); + assertEquals(expectedPattern, pattern); + } + + +} diff --git a/src/test/java/liqp/filters/date/FuzzyDateParserTest.java b/src/test/java/liqp/filters/date/FuzzyDateParserTest.java index 8d74e658..6dcfacf9 100644 --- a/src/test/java/liqp/filters/date/FuzzyDateParserTest.java +++ b/src/test/java/liqp/filters/date/FuzzyDateParserTest.java @@ -1,50 +1,22 @@ package liqp.filters.date; -import java.util.Locale; -import org.junit.Test; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; - -import java.util.Arrays; -import java.util.Collection; - import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; -@RunWith(Parameterized.class) -public class FuzzyDateParserTest { - private final String input; - private final String expectedPattern; - private final Locale locale; - - @Parameterized.Parameters - public static Collection data() { - return Arrays.asList(new Object[][] { - {null, "1995", "yyyy" }, - {null, " 1995 ", " yyyy "}, - {null, " 1995", " yyyy"}, - {null, "1995 ", "yyyy "}, - {null, "January 1995", "MMMM yyyy"}, - {null, "January 1995 ", "MMMM yyyy "}, - {null, " January 1995", " MMMM yyyy"}, - {null, " 1995 January", " yyyy MMMM"}, - {null, "Jan 1995", "MMM yyyy"}, - {null, "1995 Jan ", "yyyy MMM "}, - {Locale.GERMAN, "1995 Mai", "yyyy MMMM"}, - {Locale.GERMAN, "??1995-----Dez!", "??yyyy-----MMM!"}, - }); - } - - public FuzzyDateParserTest(Locale locale, String input, String expectedPattern) { - this.locale = locale; - this.input = input; - this.expectedPattern = expectedPattern; - } +import java.util.regex.Matcher; +import java.util.regex.Pattern; +import liqp.filters.date.FuzzyDateParser.PartExtractor; +import liqp.filters.date.FuzzyDateParser.PartExtractorResult; +import org.junit.Test; +public class FuzzyDateParserTest { @Test - public void shouldParse() { - final FuzzyDateParser parser = new FuzzyDateParser(); - String pattern = parser.guessPattern(input, locale); - assertEquals(expectedPattern, pattern); + public void testTimeRegexp() { + PartExtractor partExtractor = FuzzyDateParser.regularTimeExtractor; + PartExtractorResult result = partExtractor.extract(" 12:34 "); + assertTrue(result.found); + assertEquals( 1, result.start); + assertEquals( 6, result.end); + assertEquals(result.formatterPattern, "HH:mm"); } - }