From 715753203218b699d99c85fd2677eb0a2a34f535 Mon Sep 17 00:00:00 2001 From: Nikolay Date: Tue, 21 Mar 2017 00:51:31 +0200 Subject: [PATCH] NonStrictPhoneTokenizer update - added generateNGrams parameter to work in two modes: generate strict match phone tokens or ngrams --- .../analysis/NonStrictPhoneTokenizer.java | 50 +++++++++++++------ src/test/java/tests/PhoneUtilsTest.java | 5 +- 2 files changed, 40 insertions(+), 15 deletions(-) diff --git a/src/main/java/org/elasticsearch/index/analysis/NonStrictPhoneTokenizer.java b/src/main/java/org/elasticsearch/index/analysis/NonStrictPhoneTokenizer.java index 2ef2246..9269722 100644 --- a/src/main/java/org/elasticsearch/index/analysis/NonStrictPhoneTokenizer.java +++ b/src/main/java/org/elasticsearch/index/analysis/NonStrictPhoneTokenizer.java @@ -22,6 +22,7 @@ public class NonStrictPhoneTokenizer extends Tokenizer { private String defaultRegion = "US"; private boolean addCountryCode = false; private boolean addExtension = false; + private boolean generateNGrams = false; // The raw input private String stringToTokenize = null; @@ -87,17 +88,16 @@ private void generateTokens() { String number = parts[0]; - // Add a token for the raw unmanipulated address. Note this could be a username (sip) instead of - // telephone number so take it as is - tokens.add(number); // Let google's libphone try to parse it PhoneNumberUtil phoneUtil = PhoneNumberUtil.getInstance(); PhoneNumber numberProto; String countryCode = null; + boolean validPhone = false; try { numberProto = phoneUtil.parse(number, defaultRegion); - if (numberProto != null) { + validPhone = phoneUtil.isValidNumber(numberProto); + if (numberProto != null && validPhone) { // Libphone likes it! 
countryCode = String.valueOf(numberProto.getCountryCode()); number = String.valueOf(numberProto.getNationalNumber()); @@ -107,8 +107,6 @@ private void generateTokens() { if (addExtension && !StringUtils.isEmpty(numberProto.getExtension())) { tokens.add(numberProto.getExtension()); } - - tokens.add(number); } } catch (NumberParseException e) { // Libphone didn't like it, no biggie. We'll just ngram the number as it is. @@ -117,19 +115,43 @@ private void generateTokens() { } // ngram the phone number EG 19198243333 produces 9, 91, 919, etc - if (NumberUtils.isNumber(number)) { - for (int count = 1; count <= number.length(); count++) { - String token = number.substring(0, count); - tokens.add(token); - if (countryCode != null) { - // If there was a country code, add more ngrams such that 19198243333 produces 19, 191, - // 1919, etc - tokens.add(countryCode + token); + if (validPhone) { + boolean hasCountryCode = StringUtils.isNotBlank(countryCode); + if (generateNGrams) { + for (int count = 1; count <= number.length(); count++) { + String token = number.substring(0, count); + tokens.add(token); + if (hasCountryCode) { + // If there was a country code, add more ngrams such that 19198243333 produces 19, 191, + // 1919, etc + tokens.add(countryCode + token); + } + } + } else { + tokens.add(number); + if (hasCountryCode) { + tokens.add(countryCode + number); } } + } else { + number = cleanNumber(number); + if (generateNGrams) { + for (int count = 1; count <= number.length(); count++) { + String token = number.substring(0, count); + tokens.add(token); + } + } else { + tokens.add(number); + } + tokens.add(number); } } + private String cleanNumber(String number) { + //todo: implement + return number; + } + /** * Read the input into a local variable * diff --git a/src/test/java/tests/PhoneUtilsTest.java b/src/test/java/tests/PhoneUtilsTest.java index 7da2e2b..86a7e29 100644 --- a/src/test/java/tests/PhoneUtilsTest.java +++ b/src/test/java/tests/PhoneUtilsTest.java @@ -9,7 
+9,10 @@ import static org.hamcrest.CoreMatchers.notNullValue; import static org.hamcrest.MatcherAssert.assertThat; - +/** + * Test for {@link PhoneNumberUtil#parse(String, String)} which verifies that the util parses typical valid phone numbers + * from the exelare db and does not parse typical invalid phone numbers. + */ public class PhoneUtilsTest { PhoneNumberUtil phoneUtil = PhoneNumberUtil.getInstance();